1 /*-------------------------------------------------------------------------
2  *
3  * dsa.c
4  * Dynamic shared memory areas.
5  *
6  * This module provides dynamic shared memory areas which are built on top of
7  * DSM segments. While dsm.c allows segments of shared memory to be
8  * created and shared between backends, it isn't designed to deal with small
9  * objects. A DSA area is a shared memory heap usually backed by one or more
10  * DSM segments which can allocate memory using dsa_allocate() and dsa_free().
11  * Alternatively, it can be created in pre-existing shared memory, including a
12  * DSM segment, and then create extra DSM segments as required. Unlike the
13  * regular system heap, it deals in pseudo-pointers which must be converted to
14  * backend-local pointers before they are dereferenced. These pseudo-pointers
15  * can however be shared with other backends, and can be used to construct
16  * shared data structures.
17  *
18  * Each DSA area manages a set of DSM segments, adding new segments as
19  * required and detaching them when they are no longer needed. Each segment
20  * contains a number of 4KB pages, a free page manager for tracking
21  * consecutive runs of free pages, and a page map for tracking the source of
22  * objects allocated on each page. Allocation requests above 8KB are handled
23  * by choosing a segment and finding consecutive free pages in its free page
24  * manager. Allocation requests for smaller sizes are handled using pools of
25  * objects of a selection of sizes. Each pool consists of a number of 16 page
26  * (64KB) superblocks allocated in the same way as large objects. Allocation
27  * of large objects and new superblocks is serialized by a single LWLock, but
28  * allocation of small objects from pre-existing superblocks uses one LWLock
29  * per pool. Currently there is one pool, and therefore one lock, per size
30  * class. Per-core pools to increase concurrency and strategies for reducing
31  * the resulting fragmentation are areas for future research. Each superblock
32  * is managed with a 'span', which tracks the superblock's freelist. Free
33  * requests are handled by looking in the page map to find which span an
34  * address was allocated from, so that small objects can be returned to the
35  * appropriate free list, and large object pages can be returned directly to
36  * the free page map. When allocating, simple heuristics for selecting
37  * segments and superblocks try to encourage occupied memory to be
38  * concentrated, increasing the likelihood that whole superblocks can become
39  * empty and be returned to the free page manager, and whole segments can
40  * become empty and be returned to the operating system.
41  *
42  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
43  * Portions Copyright (c) 1994, Regents of the University of California
44  *
45  * IDENTIFICATION
46  * src/backend/utils/mmgr/dsa.c
47  *
48  *-------------------------------------------------------------------------
49  */
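/*
 * A minimal usage sketch (illustrative only; 'my_tranche_id' is a
 * placeholder for a tranche ID that the caller must obtain and register
 * itself):
 *
 *     dsa_area   *area = dsa_create(my_tranche_id);
 *     dsa_pointer dp = dsa_allocate0(area, 128);
 *     char       *p = (char *) dsa_get_address(area, dp);
 *
 *     ... dp can be stored in shared memory and handed to other backends,
 *     which attach with dsa_attach(dsa_get_handle(area)) and translate it
 *     with their own dsa_get_address() calls ...
 *
 *     dsa_free(area, dp);
 *     dsa_detach(area);
 */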
50 
51 #include "postgres.h"
52 
53 #include "port/atomics.h"
54 #include "port/pg_bitutils.h"
55 #include "storage/dsm.h"
56 #include "storage/ipc.h"
57 #include "storage/lwlock.h"
58 #include "storage/shmem.h"
59 #include "utils/dsa.h"
60 #include "utils/freepage.h"
61 #include "utils/memutils.h"
62 
63 /*
64  * The size of the initial DSM segment that backs a dsa_area created by
65  * dsa_create. After creating some number of segments of this size we'll
66  * double this size, and so on. Larger segments may be created if necessary
67  * to satisfy large requests.
68  */
69 #define DSA_INITIAL_SEGMENT_SIZE ((size_t) (1 * 1024 * 1024))
70 
71 /*
72  * How many segments to create before we double the segment size. If this is
73  * low, then there is likely to be a lot of wasted space in the largest
74  * segment. If it is high, then we risk running out of segment slots (see
75  * dsm.c's limits on total number of segments), or limiting the total size
76  * an area can manage when using small pointers.
77  */
78 #define DSA_NUM_SEGMENTS_AT_EACH_SIZE 2
79 
80 /*
81  * The number of bits used to represent the offset part of a dsa_pointer.
82  * This controls the maximum size of a segment, the maximum possible
83  * allocation size and also the maximum number of segments per area.
84  */
85 #if SIZEOF_DSA_POINTER == 4
86 #define DSA_OFFSET_WIDTH 27 /* 32 segments of size up to 128MB */
87 #else
88 #define DSA_OFFSET_WIDTH 40 /* 1024 segments of size up to 1TB */
89 #endif
90 
91 /*
92  * The maximum number of DSM segments that an area can own, determined by
93  * the number of bits remaining (but capped at 1024).
94  */
95 #define DSA_MAX_SEGMENTS \
96  Min(1024, (1 << ((SIZEOF_DSA_POINTER * 8) - DSA_OFFSET_WIDTH)))
97 
98 /* The bitmask for extracting the offset from a dsa_pointer. */
99 #define DSA_OFFSET_BITMASK (((dsa_pointer) 1 << DSA_OFFSET_WIDTH) - 1)
100 
101 /* The maximum size of a DSM segment. */
102 #define DSA_MAX_SEGMENT_SIZE ((size_t) 1 << DSA_OFFSET_WIDTH)
103 
104 /* Number of pages (see FPM_PAGE_SIZE) per regular superblock. */
105 #define DSA_PAGES_PER_SUPERBLOCK 16
106 
107 /*
108  * A magic number used as a sanity check for following DSM segments belonging
109  * to a DSA area (this number will be XORed with the area handle and
110  * the segment index).
111  */
112 #define DSA_SEGMENT_HEADER_MAGIC 0x0ce26608
113 
114 /* Build a dsa_pointer given a segment number and offset. */
115 #define DSA_MAKE_POINTER(segment_number, offset) \
116  (((dsa_pointer) (segment_number) << DSA_OFFSET_WIDTH) | (offset))
117 
118 /* Extract the segment number from a dsa_pointer. */
119 #define DSA_EXTRACT_SEGMENT_NUMBER(dp) ((dp) >> DSA_OFFSET_WIDTH)
120 
121 /* Extract the offset from a dsa_pointer. */
122 #define DSA_EXTRACT_OFFSET(dp) ((dp) & DSA_OFFSET_BITMASK)
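/*
 * For example, DSA_MAKE_POINTER(3, 0x2010) denotes offset 0x2010 within the
 * area's segment number 3; DSA_EXTRACT_SEGMENT_NUMBER and DSA_EXTRACT_OFFSET
 * recover those two components from the resulting dsa_pointer.
 */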
123 
124 /* The type used for segment indexes (zero based). */
125 typedef size_t dsa_segment_index;
126 
127 /* Sentinel value for dsa_segment_index indicating 'none' or 'end'. */
128 #define DSA_SEGMENT_INDEX_NONE (~(dsa_segment_index)0)
129 
130 /*
131  * How many bins of segments do we have? The bins are used to categorize
132  * segments by their largest contiguous run of free pages.
133  */
134 #define DSA_NUM_SEGMENT_BINS 16
135 
136 /*
137  * What is the lowest bin that holds segments that *might* have n contiguous
138  * free pages? There is no point in looking in segments in lower bins; they
139  * definitely can't service a request for n free pages.
140  */
141 static inline size_t
142 contiguous_pages_to_segment_bin(size_t n)
143 {
144  size_t bin;
145 
146  if (n == 0)
147  bin = 0;
148  else
149  bin = pg_leftmost_one_pos_size_t(n) + 1;
150 
151  return Min(bin, DSA_NUM_SEGMENT_BINS - 1);
152 }
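/*
 * With the definition above, n = 0 maps to bin 0, n = 1 to bin 1, n = 2..3
 * to bin 2, n = 4..7 to bin 3, and so on; requests of 2^14 pages and more
 * all share the last bin, DSA_NUM_SEGMENT_BINS - 1.
 */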
153 
154 /* Macros for access to locks. */
155 #define DSA_AREA_LOCK(area) (&area->control->lock)
156 #define DSA_SCLASS_LOCK(area, sclass) (&area->control->pools[sclass].lock)
157 
158 /*
159  * The header for an individual segment. This lives at the start of each DSM
160  * segment owned by a DSA area including the first segment (where it appears
161  * as part of the dsa_area_control struct).
162  */
163 typedef struct
164 {
165  /* Sanity check magic value. */
166  uint32 magic;
167  /* Total number of pages in this segment (excluding metadata area). */
168  size_t usable_pages;
169  /* Total size of this segment in bytes. */
170  size_t size;
171 
172  /*
173  * Index of the segment that precedes this one in the same segment bin, or
174  * DSA_SEGMENT_INDEX_NONE if this is the first one.
175  */
176  dsa_segment_index prev;
177 
178  /*
179  * Index of the segment that follows this one in the same segment bin, or
180  * DSA_SEGMENT_INDEX_NONE if this is the last one.
181  */
182  dsa_segment_index next;
183  /* The index of the bin that contains this segment. */
184  size_t bin;
185 
186  /*
187  * A flag raised to indicate that this segment is being returned to the
188  * operating system and has been unpinned.
189  */
190  bool freed;
191 } dsa_segment_header;
192 
193 /*
194  * Metadata for one superblock.
195  *
196  * For most blocks, span objects are stored out-of-line; that is, the span
197  * object is not stored within the block itself. But, as an exception, for a
198  * "span of spans", the span object is stored "inline". The allocation is
199  * always exactly one page, and the dsa_area_span object is located at
200  * the beginning of that page. The size class is DSA_SCLASS_BLOCK_OF_SPANS,
201  * and the remaining fields are used just as they would be in an ordinary
202  * block. We can't allocate spans out of ordinary superblocks because
203  * creating an ordinary superblock requires us to be able to allocate a span
204  * *first*. Doing it this way avoids that circularity.
205  */
206 typedef struct
207 {
208  dsa_pointer pool; /* Containing pool. */
209  dsa_pointer prevspan; /* Previous span. */
210  dsa_pointer nextspan; /* Next span. */
211  dsa_pointer start; /* Starting address. */
212  size_t npages; /* Length of span in pages. */
213  uint16 size_class; /* Size class. */
214  uint16 ninitialized; /* Maximum number of objects ever allocated. */
215  uint16 nallocatable; /* Number of objects currently allocatable. */
216  uint16 firstfree; /* First object on free list. */
217  uint16 nmax; /* Maximum number of objects ever possible. */
218  uint16 fclass; /* Current fullness class. */
219 } dsa_area_span;
220 
221 /*
222  * Given a pointer to an object in a span, access the index of the next free
223  * object in the same span (ie in the span's freelist) as an L-value.
224  */
225 #define NextFreeObjectIndex(object) (* (uint16 *) (object))
226 
227 /*
228  * Small allocations are handled by dividing a single block of memory into
229  * many small objects of equal size. The possible allocation sizes are
230  * defined by the following array. Larger size classes are spaced more widely
231  * than smaller size classes. We fudge the spacing for size classes >1kB to
232  * avoid space wastage: based on the knowledge that we plan to allocate 64kB
233  * blocks, we bump the maximum object size up to the largest multiple of
234  * 8 bytes that still lets us fit the same number of objects into one block.
235  *
236  * NB: Because of this fudging, if we were ever to use differently-sized blocks
237  * for small allocations, these size classes would need to be reworked to be
238  * optimal for the new size.
239  *
240  * NB: The optimal spacing for size classes, as well as the size of the blocks
241  * out of which small objects are allocated, is not a question that has one
242  * right answer. Some allocators (such as tcmalloc) use more closely-spaced
243  * size classes than we do here, while others (like aset.c) use more
244  * widely-spaced classes. Spacing the classes more closely avoids wasting
245  * memory within individual chunks, but also means a larger number of
246  * potentially-unfilled blocks.
247  */
248 static const uint16 dsa_size_classes[] = {
249  sizeof(dsa_area_span), 0, /* special size classes */
250  8, 16, 24, 32, 40, 48, 56, 64, /* 8 classes separated by 8 bytes */
251  80, 96, 112, 128, /* 4 classes separated by 16 bytes */
252  160, 192, 224, 256, /* 4 classes separated by 32 bytes */
253  320, 384, 448, 512, /* 4 classes separated by 64 bytes */
254  640, 768, 896, 1024, /* 4 classes separated by 128 bytes */
255  1280, 1560, 1816, 2048, /* 4 classes separated by ~256 bytes */
256  2616, 3120, 3640, 4096, /* 4 classes separated by ~512 bytes */
257  5456, 6552, 7280, 8192 /* 4 classes separated by ~1024 bytes */
258 };
259 #define DSA_NUM_SIZE_CLASSES lengthof(dsa_size_classes)
260 
261 /* Special size classes. */
262 #define DSA_SCLASS_BLOCK_OF_SPANS 0
263 #define DSA_SCLASS_SPAN_LARGE 1
264 
265 /*
266  * The following lookup table is used to map the size of small objects
267  * (less than 1kB) onto the corresponding size class. To use this table,
268  * round the size of the object up to the next multiple of 8 bytes, and then
269  * index into this array.
270  */
271 static const uint8 dsa_size_class_map[] = {
272  2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13,
273  14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17,
274  18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
275  20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21,
276  22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
277  23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
278  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
279  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25
280 };
281 #define DSA_SIZE_CLASS_MAP_QUANTUM 8
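/*
 * For example, a 60-byte request rounds up to 64 bytes: the lookup index is
 * ((60 + DSA_SIZE_CLASS_MAP_QUANTUM - 1) / DSA_SIZE_CLASS_MAP_QUANTUM) - 1 = 7,
 * dsa_size_class_map[7] is 9, and dsa_size_classes[9] is the 64-byte class.
 * This mirrors the lookup performed in dsa_allocate_extended() below.
 */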
282 
283 /*
284  * Superblocks are binned by how full they are. Generally, each fullness
285  * class corresponds to one quartile, but the block being used for
286  * allocations is always at the head of the list for fullness class 1,
287  * regardless of how full it really is.
288  */
289 #define DSA_FULLNESS_CLASSES 4
290 
291 /*
292  * A dsa_area_pool represents a set of objects of a given size class.
293  *
294  * Perhaps there should be multiple pools for the same size class for
295  * contention avoidance, but for now there is just one!
296  */
297 typedef struct
298 {
299  /* A lock protecting access to this pool. */
300  LWLock lock;
301  /* A set of linked lists of spans, arranged by fullness. */
302  dsa_pointer spans[DSA_FULLNESS_CLASSES];
303  /* Should we pad this out to a cacheline boundary? */
304 } dsa_area_pool;
305 
306 /*
307  * The control block for an area. This lives in shared memory, at the start of
308  * the first DSM segment controlled by this area.
309  */
310 typedef struct
311 {
312  /* The segment header for the first segment. */
313  dsa_segment_header segment_header;
314  /* The handle for this area. */
315  dsa_handle handle;
316  /* The handles of the segments owned by this area. */
317  dsm_handle segment_handles[DSA_MAX_SEGMENTS];
318  /* Lists of segments, binned by maximum contiguous run of free pages. */
319  dsa_segment_index segment_bins[DSA_NUM_SEGMENT_BINS];
320  /* The object pools for each size class. */
321  dsa_area_pool pools[DSA_NUM_SIZE_CLASSES];
322  /* The total size of all active segments. */
323  size_t total_segment_size;
324  /* The maximum total size of backing storage we are allowed. */
325  size_t max_total_segment_size;
326  /* Highest used segment index in the history of this area. */
327  dsa_segment_index high_segment_index;
328  /* The reference count for this area. */
329  int refcnt;
330  /* A flag indicating that this area has been pinned. */
331  bool pinned;
332  /* The number of times that segments have been freed. */
333  size_t freed_segment_counter;
334  /* The LWLock tranche ID. */
335  int lwlock_tranche_id;
336  /* The general lock (protects everything except object pools). */
337  LWLock lock;
338 } dsa_area_control;
339 
340 /* Given a pointer to a pool, find a dsa_pointer. */
341 #define DsaAreaPoolToDsaPointer(area, p) \
342  DSA_MAKE_POINTER(0, (char *) p - (char *) area->control)
343 
344 /*
345  * A dsa_segment_map is stored within the backend-private memory of each
346  * individual backend. It holds the base address of the segment within that
347  * backend, plus the addresses of key objects within the segment. Those
348  * could instead be derived from the base address but it's handy to have them
349  * around.
350  */
351 typedef struct
352 {
353  dsm_segment *segment; /* DSM segment */
354  char *mapped_address; /* Address at which segment is mapped */
355  dsa_segment_header *header; /* Header (same as mapped_address) */
356  FreePageManager *fpm; /* Free page manager within segment. */
357  dsa_pointer *pagemap; /* Page map within segment. */
358 } dsa_segment_map;
359 
360 /*
361  * Per-backend state for a storage area. Backends obtain one of these by
362  * creating an area or attaching to an existing one using a handle. Each
363  * process that needs to use an area uses its own object to track where the
364  * segments are mapped.
365  */
366 struct dsa_area
367 {
368  /* Pointer to the control object in shared memory. */
369  dsa_area_control *control;
370 
371  /* Has the mapping been pinned? */
372  bool mapping_pinned;
373 
374  /*
375  * This backend's array of segment maps, ordered by segment index
376  * corresponding to control->segment_handles. Some of the area's segments
377  * may not be mapped in this backend yet, and some slots may have been
378  * freed and need to be detached; these operations happen on demand.
379  */
380  dsa_segment_map segment_maps[DSA_MAX_SEGMENTS];
381 
382  /* The highest segment index this backend has ever mapped. */
383  dsa_segment_index high_segment_index;
384 
385  /* The last observed freed_segment_counter. */
386  size_t freed_segment_counter;
387 };
388 
389 #define DSA_SPAN_NOTHING_FREE ((uint16) -1)
390 #define DSA_SUPERBLOCK_SIZE (DSA_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE)
391 
392 /* Given a pointer to a segment_map, obtain a segment index number. */
393 #define get_segment_index(area, segment_map_ptr) \
394  (segment_map_ptr - &area->segment_maps[0])
395 
396 static void init_span(dsa_area *area, dsa_pointer span_pointer,
397  dsa_area_pool *pool, dsa_pointer start, size_t npages,
398  uint16 size_class);
399 static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool,
400  int fromclass, int toclass);
401 static inline dsa_pointer alloc_object(dsa_area *area, int size_class);
402 static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
403  int size_class);
404 static dsa_segment_map *get_segment_by_index(dsa_area *area,
405  dsa_segment_index index);
406 static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer);
407 static void unlink_span(dsa_area *area, dsa_area_span *span);
408 static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
409  dsa_pointer span_pointer, int fclass);
410 static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map);
411 static dsa_segment_map *get_best_segment(dsa_area *area, size_t npages);
412 static dsa_segment_map *make_new_segment(dsa_area *area, size_t requested_pages);
413 static dsa_area *create_internal(void *place, size_t size,
414  int tranche_id,
415  dsm_handle control_handle,
416  dsm_segment *control_segment);
417 static dsa_area *attach_internal(void *place, dsm_segment *segment,
418  dsa_handle handle);
419 static void check_for_freed_segments(dsa_area *area);
420 static void check_for_freed_segments_locked(dsa_area *area);
421 static void rebin_segment(dsa_area *area, dsa_segment_map *segment_map);
422 
423 /*
424  * Create a new shared area in a new DSM segment. Further DSM segments will
425  * be allocated as required to extend the available space.
426  *
427  * We can't allocate a LWLock tranche_id within this function, because tranche
428  * IDs are a scarce resource; there are only 64k available, using low numbers
429  * when possible matters, and we have no provision for recycling them. So,
430  * we require the caller to provide one.
431  */
432 dsa_area *
433 dsa_create(int tranche_id)
434 {
435  dsm_segment *segment;
436  dsa_area *area;
437 
438  /*
439  * Create the DSM segment that will hold the shared control object and the
440  * first segment of usable space.
441  */
442  segment = dsm_create(DSA_INITIAL_SEGMENT_SIZE, 0);
443 
444  /*
445  * All segments backing this area are pinned, so that DSA can explicitly
446  * control their lifetime (otherwise a newly created segment belonging to
447  * this area might be freed when the only backend that happens to have it
448  * mapped in ends, corrupting the area).
449  */
450  dsm_pin_segment(segment);
451 
452  /* Create a new DSA area with the control object in this segment. */
453  area = create_internal(dsm_segment_address(segment),
454  DSA_INITIAL_SEGMENT_SIZE,
455  tranche_id,
456  dsm_segment_handle(segment), segment);
457 
458  /* Clean up when the control segment detaches. */
459  on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
460  PointerGetDatum(dsm_segment_address(segment)));
461 
462  return area;
463 }
464 
465 /*
466  * Create a new shared area in an existing shared memory space, which may be
467  * either DSM or Postmaster-initialized memory. DSM segments will be
468  * allocated as required to extend the available space, though that can be
469  * prevented with dsa_set_size_limit(area, size) using the same size provided
470  * to dsa_create_in_place.
471  *
472  * Areas created in-place must eventually be released by the backend that
473  * created them and all backends that attach to them. This can be done
474  * explicitly with dsa_release_in_place, or, in the special case that 'place'
475  * happens to be in a pre-existing DSM segment, by passing in a pointer to the
476  * segment so that a detach hook can be registered with the containing DSM
477  * segment.
478  *
479  * See dsa_create() for a note about the tranche arguments.
480  */
481 dsa_area *
482 dsa_create_in_place(void *place, size_t size,
483  int tranche_id, dsm_segment *segment)
484 {
485  dsa_area *area;
486 
487  area = create_internal(place, size, tranche_id,
488  DSM_HANDLE_INVALID, NULL);
489 
490  /*
491  * Clean up when the control segment detaches, if a containing DSM segment
492  * was provided.
493  */
494  if (segment != NULL)
495  on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
496  PointerGetDatum(place));
497 
498  return area;
499 }
500 
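/*
 * An illustrative sketch (names are placeholders, not part of this file): a
 * module owning a piece of traditional shared memory can carve a DSA area
 * out of it, and cap the area at that same size so that no extra DSM
 * segments are ever created:
 *
 *     size_t    size = 1024 * 1024;
 *     void     *place = ShmemAlloc(size);
 *     dsa_area *area = dsa_create_in_place(place, size, my_tranche_id, NULL);
 *
 *     dsa_set_size_limit(area, size);
 *     on_shmem_exit(dsa_on_shmem_exit_release_in_place, PointerGetDatum(place));
 */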
501 /*
502  * Obtain a handle that can be passed to other processes so that they can
503  * attach to the given area. Cannot be called for areas created with
504  * dsa_create_in_place.
505  */
506 dsa_handle
507 dsa_get_handle(dsa_area *area)
508 {
509  Assert(area->control->handle != DSA_HANDLE_INVALID);
510  return area->control->handle;
511 }
512 
513 /*
514  * Attach to an area given a handle generated (possibly in another process) by
515  * dsa_get_handle. The area must have been created with dsa_create (not
516  * dsa_create_in_place).
517  */
518 dsa_area *
519 dsa_attach(dsa_handle handle)
520 {
521  dsm_segment *segment;
522  dsa_area *area;
523 
524  /*
525  * An area handle is really a DSM segment handle for the first segment, so
526  * we go ahead and attach to that.
527  */
528  segment = dsm_attach(handle);
529  if (segment == NULL)
530  ereport(ERROR,
531  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
532  errmsg("could not attach to dynamic shared area")));
533 
534  area = attach_internal(dsm_segment_address(segment), segment, handle);
535 
536  /* Clean up when the control segment detaches. */
537  on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
538  PointerGetDatum(dsm_segment_address(segment)));
539 
540  return area;
541 }
542 
543 /*
544  * Attach to an area that was created with dsa_create_in_place. The caller
545  * must somehow know the location in memory that was used when the area was
546  * created, though it may be mapped at a different virtual address in this
547  * process.
548  *
549  * See dsa_create_in_place for note about releasing in-place areas, and the
550  * optional 'segment' argument which can be provided to allow automatic
551  * release if the containing memory happens to be a DSM segment.
552  */
553 dsa_area *
554 dsa_attach_in_place(void *place, dsm_segment *segment)
555 {
556  dsa_area *area;
557 
558  area = attach_internal(place, NULL, DSA_HANDLE_INVALID);
559 
560  /*
561  * Clean up when the control segment detaches, if a containing DSM segment
562  * was provided.
563  */
564  if (segment != NULL)
565  on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
566  PointerGetDatum(place));
567 
568  return area;
569 }
570 
571 /*
572  * Release a DSA area that was produced by dsa_create_in_place or
573  * dsa_attach_in_place. The 'segment' argument is ignored but provides an
574  * interface suitable for on_dsm_detach, for the convenience of users who want
575  * to create a DSA segment inside an existing DSM segment and have it
576  * automatically released when the containing DSM segment is detached.
577  * 'place' should be the address of the place where the area was created.
578  *
579  * This callback is automatically registered for the DSM segment containing
580  * the control object of in-place areas when a segment is provided to
581  * dsa_create_in_place or dsa_attach_in_place, and also for all areas created
582  * with dsa_create.
583  */
584 void
585 dsa_on_dsm_detach_release_in_place(dsm_segment *segment, Datum place)
586 {
587  dsa_release_in_place(DatumGetPointer(place));
588 }
589 
590 /*
591  * Release a DSA area that was produced by dsa_create_in_place or
592  * dsa_attach_in_place. The 'code' argument is ignored but provides an
593  * interface suitable for on_shmem_exit or before_shmem_exit, for the
594  * convenience of users who want to create a DSA segment inside shared memory
595  * other than a DSM segment and have it automatically release at backend exit.
596  * 'place' should be the address of the place where the area was created.
597  */
598 void
599 dsa_on_shmem_exit_release_in_place(int code, Datum place)
600 {
601  dsa_release_in_place(DatumGetPointer(place));
602 }
603 
604 /*
605  * Release a DSA area that was produced by dsa_create_in_place or
606  * dsa_attach_in_place. It is preferable to use one of the 'dsa_on_XXX'
607  * callbacks so that this is managed automatically, because failure to release
608  * an area created in-place leaks its segments permanently.
609  *
610  * This is also called automatically for areas produced by dsa_create or
611  * dsa_attach as an implementation detail.
612  */
613 void
614 dsa_release_in_place(void *place)
615 {
616  dsa_area_control *control = (dsa_area_control *) place;
617  int i;
618 
619  LWLockAcquire(&control->lock, LW_EXCLUSIVE);
620  Assert(control->segment_header.magic ==
621  (DSA_SEGMENT_HEADER_MAGIC ^ control->handle ^ 0));
622  Assert(control->refcnt > 0);
623  if (--control->refcnt == 0)
624  {
625  for (i = 0; i <= control->high_segment_index; ++i)
626  {
627  dsm_handle handle;
628 
629  handle = control->segment_handles[i];
630  if (handle != DSM_HANDLE_INVALID)
631  dsm_unpin_segment(handle);
632  }
633  }
634  LWLockRelease(&control->lock);
635 }
636 
637 /*
638  * Keep a DSA area attached until end of session or explicit detach.
639  *
640  * By default, areas are owned by the current resource owner, which means they
641  * are detached automatically when that scope ends.
642  */
643 void
644 dsa_pin_mapping(dsa_area *area)
645 {
646  int i;
647 
648  Assert(!area->mapping_pinned);
649  area->mapping_pinned = true;
650 
651  for (i = 0; i <= area->high_segment_index; ++i)
652  if (area->segment_maps[i].segment != NULL)
653  dsm_pin_mapping(area->segment_maps[i].segment);
654 }
655 
656 /*
657  * Allocate memory in this storage area. The return value is a dsa_pointer
658  * that can be passed to other processes, and converted to a local pointer
659  * with dsa_get_address. 'flags' is a bitmap which should be constructed
660  * from the following values:
661  *
662  * DSA_ALLOC_HUGE allows allocations >= 1GB. Otherwise, such allocations
663  * will result in an ERROR.
664  *
665  * DSA_ALLOC_NO_OOM causes this function to return InvalidDsaPointer when
666  * no memory is available or a size limit established by dsa_set_size_limit
667  * would be exceeded. Otherwise, such allocations will result in an ERROR.
668  *
669  * DSA_ALLOC_ZERO causes the allocated memory to be zeroed. Otherwise, the
670  * contents of newly-allocated memory are indeterminate.
671  *
672  * These flags correspond to similarly named flags used by
673  * MemoryContextAllocExtended(). See also the macros dsa_allocate and
674  * dsa_allocate0 which expand to a call to this function with commonly used
675  * flags.
676  */
677 dsa_pointer
678 dsa_allocate_extended(dsa_area *area, size_t size, int flags)
679 {
680  uint16 size_class;
681  dsa_pointer start_pointer;
682  dsa_segment_map *segment_map;
683  dsa_pointer result;
684 
685  Assert(size > 0);
686 
687  /* Sanity check on huge individual allocation size. */
688  if (((flags & DSA_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) ||
689  ((flags & DSA_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size)))
690  elog(ERROR, "invalid DSA memory alloc request size %zu", size);
691 
692  /*
693  * If bigger than the largest size class, just grab a run of pages from
694  * the free page manager, instead of allocating an object from a pool.
695  * There will still be a span, but it's a special class of span that
696  * manages this whole allocation and simply gives all pages back to the
697  * free page manager when dsa_free is called.
698  */
699  if (size > dsa_size_classes[lengthof(dsa_size_classes) - 1])
700  {
701  size_t npages = fpm_size_to_pages(size);
702  size_t first_page;
703  dsa_pointer span_pointer;
704  dsa_area_pool *pool = &area->control->pools[DSA_SCLASS_SPAN_LARGE];
705 
706  /* Obtain a span object. */
707  span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
708  if (!DsaPointerIsValid(span_pointer))
709  {
710  /* Raise error unless asked not to. */
711  if ((flags & DSA_ALLOC_NO_OOM) == 0)
712  ereport(ERROR,
713  (errcode(ERRCODE_OUT_OF_MEMORY),
714  errmsg("out of memory"),
715  errdetail("Failed on DSA request of size %zu.",
716  size)));
717  return InvalidDsaPointer;
718  }
719 
720  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
721 
722  /* Find a segment from which to allocate. */
723  segment_map = get_best_segment(area, npages);
724  if (segment_map == NULL)
725  segment_map = make_new_segment(area, npages);
726  if (segment_map == NULL)
727  {
728  /* Can't make any more segments: game over. */
729  LWLockRelease(DSA_AREA_LOCK(area));
730  dsa_free(area, span_pointer);
731 
732  /* Raise error unless asked not to. */
733  if ((flags & DSA_ALLOC_NO_OOM) == 0)
734  ereport(ERROR,
735  (errcode(ERRCODE_OUT_OF_MEMORY),
736  errmsg("out of memory"),
737  errdetail("Failed on DSA request of size %zu.",
738  size)));
739  return InvalidDsaPointer;
740  }
741 
742  /*
743  * Ask the free page manager for a run of pages. This should always
744  * succeed, since both get_best_segment and make_new_segment should
745  * only return a non-NULL pointer if it actually contains enough
746  * contiguous freespace. If it does fail, something in our backend
747  * private state is out of whack, so use FATAL to kill the process.
748  */
749  if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
750  elog(FATAL,
751  "dsa_allocate could not find %zu free pages", npages);
752  LWLockRelease(DSA_AREA_LOCK(area));
753 
754  start_pointer = DSA_MAKE_POINTER(get_segment_index(area, segment_map),
755  first_page * FPM_PAGE_SIZE);
756 
757  /* Initialize span and pagemap. */
758  LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
759  LW_EXCLUSIVE);
760  init_span(area, span_pointer, pool, start_pointer, npages,
761  DSA_SCLASS_SPAN_LARGE);
762  segment_map->pagemap[first_page] = span_pointer;
763  LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
764 
765  /* Zero-initialize the memory if requested. */
766  if ((flags & DSA_ALLOC_ZERO) != 0)
767  memset(dsa_get_address(area, start_pointer), 0, size);
768 
769  return start_pointer;
770  }
771 
772  /* Map allocation to a size class. */
773  if (size < lengthof(dsa_size_class_map) * DSA_SIZE_CLASS_MAP_QUANTUM)
774  {
775  int mapidx;
776 
777  /* For smaller sizes we have a lookup table... */
778  mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) /
779  DSA_SIZE_CLASS_MAP_QUANTUM) - 1;
780  size_class = dsa_size_class_map[mapidx];
781  }
782  else
783  {
784  uint16 min;
785  uint16 max;
786 
787  /* ... and for the rest we search by binary chop. */
788  min = dsa_size_class_map[lengthof(dsa_size_class_map) - 1];
789  max = lengthof(dsa_size_classes) - 1;
790 
791  while (min < max)
792  {
793  uint16 mid = (min + max) / 2;
794  uint16 class_size = dsa_size_classes[mid];
795 
796  if (class_size < size)
797  min = mid + 1;
798  else
799  max = mid;
800  }
801 
802  size_class = min;
803  }
804  Assert(size <= dsa_size_classes[size_class]);
805  Assert(size_class == 0 || size > dsa_size_classes[size_class - 1]);
806 
807  /* Attempt to allocate an object from the appropriate pool. */
808  result = alloc_object(area, size_class);
809 
810  /* Check for failure to allocate. */
811  if (!DsaPointerIsValid(result))
812  {
813  /* Raise error unless asked not to. */
814  if ((flags & DSA_ALLOC_NO_OOM) == 0)
815  ereport(ERROR,
816  (errcode(ERRCODE_OUT_OF_MEMORY),
817  errmsg("out of memory"),
818  errdetail("Failed on DSA request of size %zu.", size)));
819  return InvalidDsaPointer;
820  }
821 
822  /* Zero-initialize the memory if requested. */
823  if ((flags & DSA_ALLOC_ZERO) != 0)
824  memset(dsa_get_address(area, result), 0, size);
825 
826  return result;
827 }
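/*
 * For example (illustrative only), a caller that wants zeroed memory and a
 * soft failure instead of an ERROR on out-of-memory might write:
 *
 *     dsa_pointer dp = dsa_allocate_extended(area, size,
 *                                            DSA_ALLOC_ZERO | DSA_ALLOC_NO_OOM);
 *
 *     if (!DsaPointerIsValid(dp))
 *         ... fall back: shrink the request, evict something, etc ...
 *
 * The common cases are covered by the dsa_allocate() and dsa_allocate0()
 * macros in utils/dsa.h.
 */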
828 
829 /*
830  * Free memory obtained with dsa_allocate.
831  */
832 void
833 dsa_free(dsa_area *area, dsa_pointer dp)
834 {
835  dsa_segment_map *segment_map;
836  int pageno;
837  dsa_pointer span_pointer;
838  dsa_area_span *span;
839  char *superblock;
840  char *object;
841  size_t size;
842  int size_class;
843 
844  /* Make sure we don't have a stale segment in the slot 'dp' refers to. */
845  check_for_freed_segments(area);
846 
847  /* Locate the object, span and pool. */
848  segment_map = get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(dp));
849  pageno = DSA_EXTRACT_OFFSET(dp) / FPM_PAGE_SIZE;
850  span_pointer = segment_map->pagemap[pageno];
851  span = dsa_get_address(area, span_pointer);
852  superblock = dsa_get_address(area, span->start);
853  object = dsa_get_address(area, dp);
854  size_class = span->size_class;
855  size = dsa_size_classes[size_class];
856 
857  /*
858  * Special case for large objects that live in a special span: we return
859  * those pages directly to the free page manager and free the span.
860  */
861  if (span->size_class == DSA_SCLASS_SPAN_LARGE)
862  {
863 
864 #ifdef CLOBBER_FREED_MEMORY
865  memset(object, 0x7f, span->npages * FPM_PAGE_SIZE);
866 #endif
867 
868  /* Give pages back to free page manager. */
869  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
870  FreePageManagerPut(segment_map->fpm,
871  DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
872  span->npages);
873 
874  /* Move segment to appropriate bin if necessary. */
875  rebin_segment(area, segment_map);
876  LWLockRelease(DSA_AREA_LOCK(area));
877 
878  /* Unlink span. */
879  LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
880  LW_EXCLUSIVE);
881  unlink_span(area, span);
882  LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
883  /* Free the span object so it can be reused. */
884  dsa_free(area, span_pointer);
885  return;
886  }
887 
888 #ifdef CLOBBER_FREED_MEMORY
889  memset(object, 0x7f, size);
890 #endif
891 
892  LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
893 
894  /* Put the object on the span's freelist. */
895  Assert(object >= superblock);
896  Assert(object < superblock + DSA_SUPERBLOCK_SIZE);
897  Assert((object - superblock) % size == 0);
898  NextFreeObjectIndex(object) = span->firstfree;
899  span->firstfree = (object - superblock) / size;
900  ++span->nallocatable;
901 
902  /*
903  * See if the span needs to moved to a different fullness class, or be
904  * freed so its pages can be given back to the segment.
905  */
906  if (span->nallocatable == 1 && span->fclass == DSA_FULLNESS_CLASSES - 1)
907  {
908  /*
909  * The block was completely full and is located in the
910  * highest-numbered fullness class, which is never scanned for free
911  * chunks. We must move it to the next-lower fullness class.
912  */
913  unlink_span(area, span);
914  add_span_to_fullness_class(area, span, span_pointer,
915  DSA_FULLNESS_CLASSES - 2);
916 
917  /*
918  * If this is the only span, and there is no active span, then we
919  * should probably move this span to fullness class 1. (Otherwise if
920  * you allocate exactly all the objects in the only span, it moves to
921  * class 3, then you free them all, it moves to 2, and then is given
922  * back, leaving no active span).
923  */
924  }
925  else if (span->nallocatable == span->nmax &&
926  (span->fclass != 1 || span->prevspan != InvalidDsaPointer))
927  {
928  /*
929  * This entire block is free, and it's not the active block for this
930  * size class. Return the memory to the free page manager. We don't
931  * do this for the active block to prevent hysteresis: if we
932  * repeatedly allocate and free the only chunk in the active block, it
933  * will be very inefficient if we deallocate and reallocate the block
934  * every time.
935  */
936  destroy_superblock(area, span_pointer);
937  }
938 
939  LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
940 }
941 
942 /*
943  * Obtain a backend-local address for a dsa_pointer. 'dp' must point to
944  * memory allocated by the given area (possibly in another process) that
945  * hasn't yet been freed. This may cause a segment to be mapped into the
946  * current process if required, and may cause freed segments to be unmapped.
947  */
948 void *
949 dsa_get_address(dsa_area *area, dsa_pointer dp)
950 {
951  dsa_segment_index index;
952  size_t offset;
953 
954  /* Convert InvalidDsaPointer to NULL. */
955  if (!DsaPointerIsValid(dp))
956  return NULL;
957 
958  /* Process any requests to detach from freed segments. */
959  check_for_freed_segments(area);
960 
961  /* Break the dsa_pointer into its components. */
962  index = DSA_EXTRACT_SEGMENT_NUMBER(dp);
963  offset = DSA_EXTRACT_OFFSET(dp);
964  Assert(index < DSA_MAX_SEGMENTS);
965 
966  /* Check if we need to cause this segment to be mapped in. */
967  if (unlikely(area->segment_maps[index].mapped_address == NULL))
968  {
969  /* Call for effect (we don't need the result). */
970  (void) get_segment_by_index(area, index);
971  }
972 
973  return area->segment_maps[index].mapped_address + offset;
974 }
975 
976 /*
977  * Pin this area, so that it will continue to exist even if all backends
978  * detach from it. In that case, the area can still be reattached to if a
979  * handle has been recorded somewhere.
980  */
981 void
982 dsa_pin(dsa_area *area)
983 {
984  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
985  if (area->control->pinned)
986  {
987  LWLockRelease(DSA_AREA_LOCK(area));
988  elog(ERROR, "dsa_area already pinned");
989  }
990  area->control->pinned = true;
991  ++area->control->refcnt;
992  LWLockRelease(DSA_AREA_LOCK(area));
993 }
994 
995 /*
996  * Undo the effects of dsa_pin, so that the given area can be freed when no
997  * backends are attached to it. May be called only if dsa_pin has been
998  * called.
999  */
1000 void
1001 dsa_unpin(dsa_area *area)
1002 {
1003  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1004  Assert(area->control->refcnt > 1);
1005  if (!area->control->pinned)
1006  {
1007  LWLockRelease(DSA_AREA_LOCK(area));
1008  elog(ERROR, "dsa_area not pinned");
1009  }
1010  area->control->pinned = false;
1011  --area->control->refcnt;
1012  LWLockRelease(DSA_AREA_LOCK(area));
1013 }
1014 
1015 /*
1016  * Set the total size limit for this area. This limit is checked whenever new
1017  * segments need to be allocated from the operating system. If the new size
1018  * limit is already exceeded, this has no immediate effect.
1019  *
1020  * Note that the total virtual memory usage may be temporarily larger than
1021  * this limit when segments have been freed, but not yet detached by all
1022  * backends that have attached to them.
1023  */
1024 void
1025 dsa_set_size_limit(dsa_area *area, size_t limit)
1026 {
1027  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1028  area->control->max_total_segment_size = limit;
1029  LWLockRelease(DSA_AREA_LOCK(area));
1030 }
1031 
1032 /*
1033  * Aggressively free all spare memory in the hope of returning DSM segments to
1034  * the operating system.
1035  */
1036 void
1037 dsa_trim(dsa_area *area)
1038 {
1039  int size_class;
1040 
1041  /*
1042  * Trim in reverse pool order so we get to the spans-of-spans last, just
1043  * in case any become entirely free while processing all the other pools.
1044  */
1045  for (size_class = DSA_NUM_SIZE_CLASSES - 1; size_class >= 0; --size_class)
1046  {
1047  dsa_area_pool *pool = &area->control->pools[size_class];
1048  dsa_pointer span_pointer;
1049 
1050  if (size_class == DSA_SCLASS_SPAN_LARGE)
1051  {
1052  /* Large object frees give back segments aggressively already. */
1053  continue;
1054  }
1055 
1056  /*
1057  * Search fullness class 1 only. That is where we expect to find an
1058  * entirely empty superblock (entirely empty superblocks in other
1059  * fullness classes are returned to the free page map by dsa_free).
1060  */
1061  LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1062  span_pointer = pool->spans[1];
1063  while (DsaPointerIsValid(span_pointer))
1064  {
1065  dsa_area_span *span = dsa_get_address(area, span_pointer);
1066  dsa_pointer next = span->nextspan;
1067 
1068  if (span->nallocatable == span->nmax)
1069  destroy_superblock(area, span_pointer);
1070 
1071  span_pointer = next;
1072  }
1073  LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1074  }
1075 }
1076 
1077 /*
1078  * Print out debugging information about the internal state of the shared
1079  * memory area.
1080  */
1081 void
1082 dsa_dump(dsa_area *area)
1083 {
1084  size_t i,
1085  j;
1086 
1087  /*
1088  * Note: This gives an inconsistent snapshot as it acquires and releases
1089  * individual locks as it goes...
1090  */
1091 
1092  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1093  check_for_freed_segments_locked(area);
1094  fprintf(stderr, "dsa_area handle %x:\n", area->control->handle);
1095  fprintf(stderr, " max_total_segment_size: %zu\n",
1096  area->control->max_total_segment_size);
1097  fprintf(stderr, " total_segment_size: %zu\n",
1098  area->control->total_segment_size);
1099  fprintf(stderr, " refcnt: %d\n", area->control->refcnt);
1100  fprintf(stderr, " pinned: %c\n", area->control->pinned ? 't' : 'f');
1101  fprintf(stderr, " segment bins:\n");
1102  for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1103  {
1104  if (area->control->segment_bins[i] != DSA_SEGMENT_INDEX_NONE)
1105  {
1106  dsa_segment_index segment_index;
1107 
1108  fprintf(stderr,
1109  " segment bin %zu (at least %d contiguous pages free):\n",
1110  i, 1 << (i - 1));
1111  segment_index = area->control->segment_bins[i];
1112  while (segment_index != DSA_SEGMENT_INDEX_NONE)
1113  {
1114  dsa_segment_map *segment_map;
1115 
1116  segment_map =
1117  get_segment_by_index(area, segment_index);
1118 
1119  fprintf(stderr,
1120  " segment index %zu, usable_pages = %zu, "
1121  "contiguous_pages = %zu, mapped at %p\n",
1122  segment_index,
1123  segment_map->header->usable_pages,
1124  fpm_largest(segment_map->fpm),
1125  segment_map->mapped_address);
1126  segment_index = segment_map->header->next;
1127  }
1128  }
1129  }
1130  LWLockRelease(DSA_AREA_LOCK(area));
1131 
1132  fprintf(stderr, " pools:\n");
1133  for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1134  {
1135  bool found = false;
1136 
1137  LWLockAcquire(DSA_SCLASS_LOCK(area, i), LW_EXCLUSIVE);
1138  for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1139  if (DsaPointerIsValid(area->control->pools[i].spans[j]))
1140  found = true;
1141  if (found)
1142  {
1143  if (i == DSA_SCLASS_BLOCK_OF_SPANS)
1144  fprintf(stderr, " pool for blocks of span objects:\n");
1145  else if (i == DSA_SCLASS_SPAN_LARGE)
1146  fprintf(stderr, " pool for large object spans:\n");
1147  else
1148  fprintf(stderr,
1149  " pool for size class %zu (object size %hu bytes):\n",
1150  i, dsa_size_classes[i]);
1151  for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1152  {
1153  if (!DsaPointerIsValid(area->control->pools[i].spans[j]))
1154  fprintf(stderr, " fullness class %zu is empty\n", j);
1155  else
1156  {
1157  dsa_pointer span_pointer = area->control->pools[i].spans[j];
1158 
1159  fprintf(stderr, " fullness class %zu:\n", j);
1160  while (DsaPointerIsValid(span_pointer))
1161  {
1162  dsa_area_span *span;
1163 
1164  span = dsa_get_address(area, span_pointer);
1165  fprintf(stderr,
1166  " span descriptor at "
1167  DSA_POINTER_FORMAT ", superblock at "
1168  DSA_POINTER_FORMAT
1169  ", pages = %zu, objects free = %hu/%hu\n",
1170  span_pointer, span->start, span->npages,
1171  span->nallocatable, span->nmax);
1172  span_pointer = span->nextspan;
1173  }
1174  }
1175  }
1176  }
1177  LWLockRelease(DSA_SCLASS_LOCK(area, i));
1178  }
1179 }
1180 
1181 /*
1182  * Return the smallest size that you can successfully provide to
1183  * dsa_create_in_place.
1184  */
1185 size_t
1186 dsa_minimum_size(void)
1187 {
1188  size_t size;
1189  int pages = 0;
1190 
1191  size = MAXALIGN(sizeof(dsa_area_control)) +
1192  MAXALIGN(sizeof(FreePageManager));
1193 
1194  /* Figure out how many pages we need, including the page map... */
1195  while (((size + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE) > pages)
1196  {
1197  ++pages;
1198  size += sizeof(dsa_pointer);
1199  }
1200 
1201  return pages * FPM_PAGE_SIZE;
1202 }
1203 
1204 /*
1205  * Workhorse function for dsa_create and dsa_create_in_place.
1206  */
1207 static dsa_area *
1208 create_internal(void *place, size_t size,
1209  int tranche_id,
1210  dsm_handle control_handle,
1211  dsm_segment *control_segment)
1212 {
1213  dsa_area_control *control;
1214  dsa_area *area;
1215  dsa_segment_map *segment_map;
1216  size_t usable_pages;
1217  size_t total_pages;
1218  size_t metadata_bytes;
1219  int i;
1220 
1221  /* Sanity check on the space we have to work in. */
1222  if (size < dsa_minimum_size())
1223  elog(ERROR, "dsa_area space must be at least %zu, but %zu provided",
1224  dsa_minimum_size(), size);
1225 
1226  /* Now figure out how much space is usable */
1227  total_pages = size / FPM_PAGE_SIZE;
1228  metadata_bytes =
1229  MAXALIGN(sizeof(dsa_area_control)) +
1230  MAXALIGN(sizeof(FreePageManager)) +
1231  total_pages * sizeof(dsa_pointer);
1232  /* Add padding up to next page boundary. */
1233  if (metadata_bytes % FPM_PAGE_SIZE != 0)
1234  metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
1235  Assert(metadata_bytes <= size);
1236  usable_pages = (size - metadata_bytes) / FPM_PAGE_SIZE;
1237 
1238  /*
1239  * Initialize the dsa_area_control object located at the start of the
1240  * space.
1241  */
1242  control = (dsa_area_control *) place;
1243  memset(place, 0, sizeof(*control));
1244  control->segment_header.magic =
1245  DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0;
1246  control->segment_header.next = DSA_SEGMENT_INDEX_NONE;
1247  control->segment_header.prev = DSA_SEGMENT_INDEX_NONE;
1248  control->segment_header.usable_pages = usable_pages;
1249  control->segment_header.freed = false;
1250  control->segment_header.size = size;
1251  control->handle = control_handle;
1252  control->max_total_segment_size = (size_t) -1;
1253  control->total_segment_size = size;
1254  control->segment_handles[0] = control_handle;
1255  for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1256  control->segment_bins[i] = DSA_SEGMENT_INDEX_NONE;
1257  control->refcnt = 1;
1258  control->lwlock_tranche_id = tranche_id;
1259 
1260  /*
1261  * Create the dsa_area object that this backend will use to access the
1262  * area. Other backends will need to obtain their own dsa_area object by
1263  * attaching.
1264  */
1265  area = palloc(sizeof(dsa_area));
1266  area->control = control;
1267  area->mapping_pinned = false;
1268  memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1269  area->high_segment_index = 0;
1270  area->freed_segment_counter = 0;
1271  LWLockInitialize(&control->lock, control->lwlock_tranche_id);
1272  for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1273  LWLockInitialize(DSA_SCLASS_LOCK(area, i),
1274  control->lwlock_tranche_id);
1275 
1276  /* Set up the segment map for this process's mapping. */
1277  segment_map = &area->segment_maps[0];
1278  segment_map->segment = control_segment;
1279  segment_map->mapped_address = place;
1280  segment_map->header = (dsa_segment_header *) place;
1281  segment_map->fpm = (FreePageManager *)
1282  (segment_map->mapped_address +
1283  MAXALIGN(sizeof(dsa_area_control)));
1284  segment_map->pagemap = (dsa_pointer *)
1285  (segment_map->mapped_address +
1286  MAXALIGN(sizeof(dsa_area_control)) +
1287  MAXALIGN(sizeof(FreePageManager)));
1288 
1289  /* Set up the free page map. */
1290  FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
1291  /* There can be 0 usable pages if size is dsa_minimum_size(). */
1292 
1293  if (usable_pages > 0)
1294  FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
1295  usable_pages);
1296 
1297  /* Put this segment into the appropriate bin. */
1298  control->segment_bins[contiguous_pages_to_segment_bin(usable_pages)] = 0;
1299  segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
1300 
1301  return area;
1302 }
1303 
1304 /*
1305  * Workhorse function for dsa_attach and dsa_attach_in_place.
1306  */
1307 static dsa_area *
1308 attach_internal(void *place, dsm_segment *segment, dsa_handle handle)
1309 {
1310  dsa_area_control *control;
1311  dsa_area *area;
1312  dsa_segment_map *segment_map;
1313 
1314  control = (dsa_area_control *) place;
1315  Assert(control->handle == handle);
1316  Assert(control->segment_handles[0] == handle);
1317  Assert(control->segment_header.magic ==
1318  (DSA_SEGMENT_HEADER_MAGIC ^ handle ^ 0));
1319 
1320  /* Build the backend-local area object. */
1321  area = palloc(sizeof(dsa_area));
1322  area->control = control;
1323  area->mapping_pinned = false;
1324  memset(&area->segment_maps[0], 0,
1325  sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1326  area->high_segment_index = 0;
1327 
1328  /* Set up the segment map for this process's mapping. */
1329  segment_map = &area->segment_maps[0];
1330  segment_map->segment = segment; /* NULL for in-place */
1331  segment_map->mapped_address = place;
1332  segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
1333  segment_map->fpm = (FreePageManager *)
1334  (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)));
1335  segment_map->pagemap = (dsa_pointer *)
1336  (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)) +
1337  MAXALIGN(sizeof(FreePageManager)));
1338 
1339  /* Bump the reference count. */
1340  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1341  if (control->refcnt == 0)
1342  {
1343  /* We can't attach to a DSA area that has already been destroyed. */
1344  ereport(ERROR,
1345  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1346  errmsg("could not attach to dynamic shared area")));
1347  }
1348  ++control->refcnt;
1349  area->freed_segment_counter = control->freed_segment_counter;
1350  LWLockRelease(DSA_AREA_LOCK(area));
1351 
1352  return area;
1353 }
1354 
1355 /*
1356  * Add a new span to fullness class 1 of the indicated pool.
1357  */
1358 static void
1359 init_span(dsa_area *area,
1360  dsa_pointer span_pointer,
1361  dsa_area_pool *pool, dsa_pointer start, size_t npages,
1362  uint16 size_class)
1363 {
1364  dsa_area_span *span = dsa_get_address(area, span_pointer);
1365  size_t obsize = dsa_size_classes[size_class];
1366 
1367  /*
1368  * The per-pool lock must be held because we manipulate the span list for
1369  * this pool.
1370  */
1371  Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1372 
1373  /* Push this span onto the front of the span list for fullness class 1. */
1374  if (DsaPointerIsValid(pool->spans[1]))
1375  {
1376  dsa_area_span *head = (dsa_area_span *)
1377  dsa_get_address(area, pool->spans[1]);
1378 
1379  head->prevspan = span_pointer;
1380  }
1381  span->pool = DsaAreaPoolToDsaPointer(area, pool);
1382  span->nextspan = pool->spans[1];
1383  span->prevspan = InvalidDsaPointer;
1384  pool->spans[1] = span_pointer;
1385 
1386  span->start = start;
1387  span->npages = npages;
1388  span->size_class = size_class;
1389  span->ninitialized = 0;
1390  if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1391  {
1392  /*
1393  * A block-of-spans contains its own descriptor, so mark one object as
1394  * initialized and reduce the count of allocatable objects by one.
1395  * Doing this here has the side effect of also reducing nmax by one,
1396  * which is important to make sure we free this object at the correct
1397  * time.
1398  */
1399  span->ninitialized = 1;
1400  span->nallocatable = FPM_PAGE_SIZE / obsize - 1;
1401  }
1402  else if (size_class != DSA_SCLASS_SPAN_LARGE)
1403  span->nallocatable = DSA_SUPERBLOCK_SIZE / obsize;
1404  span->firstfree = DSA_SPAN_NOTHING_FREE;
1405  span->nmax = span->nallocatable;
1406  span->fclass = 1;
1407 }
1408 
1409 /*
1410  * Transfer the first span in one fullness class to the head of another
1411  * fullness class.
1412  */
1413 static bool
1414 transfer_first_span(dsa_area *area,
1415  dsa_area_pool *pool, int fromclass, int toclass)
1416 {
1417  dsa_pointer span_pointer;
1418  dsa_area_span *span;
1419  dsa_area_span *nextspan;
1420 
1421  /* Can't do it if source list is empty. */
1422  span_pointer = pool->spans[fromclass];
1423  if (!DsaPointerIsValid(span_pointer))
1424  return false;
1425 
1426  /* Remove span from head of source list. */
1427  span = dsa_get_address(area, span_pointer);
1428  pool->spans[fromclass] = span->nextspan;
1429  if (DsaPointerIsValid(span->nextspan))
1430  {
1431  nextspan = (dsa_area_span *)
1432  dsa_get_address(area, span->nextspan);
1433  nextspan->prevspan = InvalidDsaPointer;
1434  }
1435 
1436  /* Add span to head of target list. */
1437  span->nextspan = pool->spans[toclass];
1438  pool->spans[toclass] = span_pointer;
1439  if (DsaPointerIsValid(span->nextspan))
1440  {
1441  nextspan = (dsa_area_span *)
1442  dsa_get_address(area, span->nextspan);
1443  nextspan->prevspan = span_pointer;
1444  }
1445  span->fclass = toclass;
1446 
1447  return true;
1448 }
1449 
1450 /*
1451  * Allocate one object of the requested size class from the given area.
1452  */
1453 static inline dsa_pointer
1454 alloc_object(dsa_area *area, int size_class)
1455 {
1456  dsa_area_pool *pool = &area->control->pools[size_class];
1457  dsa_area_span *span;
1458  dsa_pointer block;
1459  dsa_pointer result;
1460  char *object;
1461  size_t size;
1462 
1463  /*
1464  * Even though ensure_active_superblock can in turn call alloc_object if
1465  * it needs to allocate a new span, that's always from a different pool,
1466  * and the order of lock acquisition is always the same, so it's OK that
1467  * we hold this lock for the duration of this function.
1468  */
1469  Assert(!LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1470  LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1471 
1472  /*
1473  * If there's no active superblock, we must successfully obtain one or
1474  * fail the request.
1475  */
1476  if (!DsaPointerIsValid(pool->spans[1]) &&
1477  !ensure_active_superblock(area, pool, size_class))
1478  {
1479  result = InvalidDsaPointer;
1480  }
1481  else
1482  {
1483  /*
1484  * There should be a block in fullness class 1 at this point, and it
1485  * should never be completely full. Thus we can either pop an object
1486  * from the free list or, failing that, initialize a new object.
1487  */
1488  Assert(DsaPointerIsValid(pool->spans[1]));
1489  span = (dsa_area_span *)
1490  dsa_get_address(area, pool->spans[1]);
1491  Assert(span->nallocatable > 0);
1492  block = span->start;
1493  Assert(size_class < DSA_NUM_SIZE_CLASSES);
1494  size = dsa_size_classes[size_class];
1495  if (span->firstfree != DSA_SPAN_NOTHING_FREE)
1496  {
1497  result = block + span->firstfree * size;
1498  object = dsa_get_address(area, result);
1499  span->firstfree = NextFreeObjectIndex(object);
1500  }
1501  else
1502  {
1503  result = block + span->ninitialized * size;
1504  ++span->ninitialized;
1505  }
1506  --span->nallocatable;
1507 
1508  /* If it's now full, move it to the highest-numbered fullness class. */
1509  if (span->nallocatable == 0)
1510  transfer_first_span(area, pool, 1, DSA_FULLNESS_CLASSES - 1);
1511  }
1512 
1513  Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1514  LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1515 
1516  return result;
1517 }
1518 
1519 /*
1520  * Ensure an active (i.e. fullness class 1) superblock, unless all existing
1521  * superblocks are completely full and no more can be allocated.
1522  *
1523  * Fullness classes K of 0..N are loosely intended to represent blocks whose
1524  * utilization percentage is at least K/N, but we only enforce this rigorously
1525  * for the highest-numbered fullness class, which always contains exactly
1526  * those blocks that are completely full. It's otherwise acceptable for a
1527  * block to be in a higher-numbered fullness class than the one to which it
1528  * logically belongs. In addition, the active block, which is always the
1529  * first block in fullness class 1, is permitted to have a higher allocation
1530  * percentage than would normally be allowable for that fullness class; we
1531  * don't move it until it's completely full, and then it goes to the
1532  * highest-numbered fullness class.
1533  *
1534  * It might seem odd that the active block is the head of fullness class 1
1535  * rather than fullness class 0, but experience with other allocators has
1536  * shown that it's usually better to allocate from a block that's moderately
1537  * full rather than one that's nearly empty. Insofar as is reasonably
1538  * possible, we want to avoid performing new allocations in a block that would
1539  * otherwise become empty soon.
1540  */
1541 static bool
1542 ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
1543  int size_class)
1544 {
1545  dsa_pointer span_pointer;
1546  dsa_pointer start_pointer;
1547  size_t obsize = dsa_size_classes[size_class];
1548  size_t nmax;
1549  int fclass;
1550  size_t npages = 1;
1551  size_t first_page;
1552  size_t i;
1553  dsa_segment_map *segment_map;
1554 
1555  Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1556 
1557  /*
1558  * Compute the number of objects that will fit in a block of this size
1559  * class. Span-of-spans blocks are just a single page, and the first
1560  * object isn't available for use because it describes the block-of-spans
1561  * itself.
1562  */
1563  if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1564  nmax = FPM_PAGE_SIZE / obsize - 1;
1565  else
1566  nmax = DSA_SUPERBLOCK_SIZE / obsize;
1567 
1568  /*
1569  * If fullness class 1 is empty, try to find a span to put in it by
1570  * scanning higher-numbered fullness classes (excluding the last one,
1571  * whose blocks are certain to all be completely full).
1572  */
1573  for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1574  {
1575  span_pointer = pool->spans[fclass];
1576 
1577  while (DsaPointerIsValid(span_pointer))
1578  {
1579  int tfclass;
1580  dsa_area_span *span;
1581  dsa_area_span *nextspan;
1582  dsa_area_span *prevspan;
1583  dsa_pointer next_span_pointer;
1584 
1585  span = (dsa_area_span *)
1586  dsa_get_address(area, span_pointer);
1587  next_span_pointer = span->nextspan;
1588 
1589  /* Figure out what fullness class should contain this span. */
1590  tfclass = (nmax - span->nallocatable)
1591  * (DSA_FULLNESS_CLASSES - 1) / nmax;
1592 
1593  /* Look up next span. */
1594  if (DsaPointerIsValid(span->nextspan))
1595  nextspan = (dsa_area_span *)
1596  dsa_get_address(area, span->nextspan);
1597  else
1598  nextspan = NULL;
1599 
1600  /*
1601  * If utilization has dropped enough that this now belongs in some
1602  * other fullness class, move it there.
1603  */
1604  if (tfclass < fclass)
1605  {
1606  /* Remove from the current fullness class list. */
1607  if (pool->spans[fclass] == span_pointer)
1608  {
1609  /* It was the head; remove it. */
1610  Assert(!DsaPointerIsValid(span->prevspan));
1611  pool->spans[fclass] = span->nextspan;
1612  if (nextspan != NULL)
1613  nextspan->prevspan = InvalidDsaPointer;
1614  }
1615  else
1616  {
1617  /* It was not the head. */
1618  Assert(DsaPointerIsValid(span->prevspan));
1619  prevspan = (dsa_area_span *)
1620  dsa_get_address(area, span->prevspan);
1621  prevspan->nextspan = span->nextspan;
1622  }
1623  if (nextspan != NULL)
1624  nextspan->prevspan = span->prevspan;
1625 
1626  /* Push onto the head of the new fullness class list. */
1627  span->nextspan = pool->spans[tfclass];
1628  pool->spans[tfclass] = span_pointer;
1629  span->prevspan = InvalidDsaPointer;
1630  if (DsaPointerIsValid(span->nextspan))
1631  {
1632  nextspan = (dsa_area_span *)
1633  dsa_get_address(area, span->nextspan);
1634  nextspan->prevspan = span_pointer;
1635  }
1636  span->fclass = tfclass;
1637  }
1638 
1639  /* Advance to next span on list. */
1640  span_pointer = next_span_pointer;
1641  }
1642 
1643  /* Stop now if we found a suitable block. */
1644  if (DsaPointerIsValid(pool->spans[1]))
1645  return true;
1646  }
1647 
1648  /*
1649  * If there are no blocks that properly belong in fullness class 1, pick
1650  * one from some other fullness class and move it there anyway, so that we
1651  * have an allocation target. Our last choice is to transfer a block
1652  * that's almost empty (and might become completely empty soon if left
1653  * alone), but even that is better than failing, which is what we must do
1654  * if there are no blocks at all with freespace.
1655  */
1656  Assert(!DsaPointerIsValid(pool->spans[1]));
1657  for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1658  if (transfer_first_span(area, pool, fclass, 1))
1659  return true;
1660  if (!DsaPointerIsValid(pool->spans[1]) &&
1661  transfer_first_span(area, pool, 0, 1))
1662  return true;
1663 
1664  /*
1665  * We failed to find an existing span with free objects, so we need to
1666  * allocate a new superblock and construct a new span to manage it.
1667  *
1668  * First, get a dsa_area_span object to describe the new superblock
1669  * ... unless this allocation is for a dsa_area_span object, in which case
1670  * that's surely not going to work. We handle that case by storing the
1671  * span describing a block-of-spans inline.
1672  */
1673  if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1674  {
1675  span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
1676  if (!DsaPointerIsValid(span_pointer))
1677  return false;
1678  npages = DSA_PAGES_PER_SUPERBLOCK;
1679  }
1680 
1681  /* Find or create a segment and allocate the superblock. */
1682  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1683  segment_map = get_best_segment(area, npages);
1684  if (segment_map == NULL)
1685  {
1686  segment_map = make_new_segment(area, npages);
1687  if (segment_map == NULL)
1688  {
1689  LWLockRelease(DSA_AREA_LOCK(area));
1690  return false;
1691  }
1692  }
1693 
1694  /*
1695  * This shouldn't happen: get_best_segment() or make_new_segment()
1696  * promised that we can successfully allocate npages.
1697  */
1698  if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
1699  elog(FATAL,
1700  "dsa_allocate could not find %zu free pages for superblock",
1701  npages);
1702  LWLockRelease(DSA_AREA_LOCK(area));
1703 
1704  /* Compute the start of the superblock. */
1705  start_pointer =
1706  DSA_MAKE_POINTER(get_segment_index(area, segment_map),
1707  first_page * FPM_PAGE_SIZE);
1708 
1709  /*
1710  * If this is a block-of-spans, carve the descriptor right out of the
1711  * allocated space.
1712  */
1713  if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1714  {
1715  /*
1716  * We have a pointer into the segment. We need to build a dsa_pointer
1717  * from the segment index and offset into the segment.
1718  */
1719  span_pointer = start_pointer;
1720  }
1721 
1722  /* Initialize span and pagemap. */
1723  init_span(area, span_pointer, pool, start_pointer, npages, size_class);
1724  for (i = 0; i < npages; ++i)
1725  segment_map->pagemap[first_page + i] = span_pointer;
1726 
1727  return true;
1728 }
1729 
1730 /*
1731  * Return the segment map corresponding to a given segment index, mapping the
1732  * segment in if necessary. For internal segment book-keeping, this is called
1733  * with the area lock held. It is also called by dsa_free and dsa_get_address
1734  * without any locking, relying on the fact that they have a known live
1735  * segment index and always call check_for_freed_segments to ensure that any
1736  * freed segment occupying the same slot is detached first.
1737  */
1738 static dsa_segment_map *
1739 get_segment_by_index(dsa_area *area, dsa_segment_index index)
1740 {
1741  if (unlikely(area->segment_maps[index].mapped_address == NULL))
1742  {
1743  dsm_handle handle;
1744  dsm_segment *segment;
1745  dsa_segment_map *segment_map;
1746 
1747  /*
1748  * If we are reached by dsa_free or dsa_get_address, there must be at
1749  * least one object allocated in the referenced segment. Otherwise,
1750  * their caller has a double-free or access-after-free bug, which we
1751  * have no hope of detecting. So we know it's safe to access this
1752  * array slot without holding a lock; it won't change underneath us.
1753  * Furthermore, we know that we can see the latest contents of the
1754  * slot, as explained in check_for_freed_segments, which those
1755  * functions call before arriving here.
1756  */
1757  handle = area->control->segment_handles[index];
1758 
1759  /* It's an error to try to access an unused slot. */
1760  if (handle == DSM_HANDLE_INVALID)
1761  elog(ERROR,
1762  "dsa_area could not attach to a segment that has been freed");
1763 
1764  segment = dsm_attach(handle);
1765  if (segment == NULL)
1766  elog(ERROR, "dsa_area could not attach to segment");
1767  if (area->mapping_pinned)
1768  dsm_pin_mapping(segment);
1769  segment_map = &area->segment_maps[index];
1770  segment_map->segment = segment;
1771  segment_map->mapped_address = dsm_segment_address(segment);
1772  segment_map->header =
1773  (dsa_segment_header *) segment_map->mapped_address;
1774  segment_map->fpm = (FreePageManager *)
1775  (segment_map->mapped_address +
1776  MAXALIGN(sizeof(dsa_segment_header)));
1777  segment_map->pagemap = (dsa_pointer *)
1778  (segment_map->mapped_address +
1779  MAXALIGN(sizeof(dsa_segment_header)) +
1780  MAXALIGN(sizeof(FreePageManager)));
1781 
1782  /* Remember the highest index this backend has ever mapped. */
1783  if (area->high_segment_index < index)
1784  area->high_segment_index = index;
1785 
1786  Assert(segment_map->header->magic ==
1787  DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ index);
1788  }
1789 
1790  /*
1791  * Callers of dsa_get_address() and dsa_free() don't hold the area lock,
1792  * but it's a bug in the calling code and undefined behavior if the
1793  * address is not live (ie if the segment might possibly have been freed,
1794  * they're trying to use a dangling pointer).
1795  *
1796  * For dsa.c code that holds the area lock to manipulate segment_bins
1797  * lists, it would be a bug if we ever reach a freed segment here. After
1798  * it's marked as freed, the only thing any backend should do with it is
1799  * unmap it, and it should always have done that in
1800  * check_for_freed_segments_locked() before arriving here to resolve an
1801  * index to a segment_map.
1802  *
1803  * Either way we can assert that we aren't returning a freed segment.
1804  */
1805  Assert(!area->segment_maps[index].header->freed);
1806 
1807  return &area->segment_maps[index];
1808 }
1809 
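/*
 * For orientation, dsa_get_address() (defined earlier in this file) resolves
 * a dsa_pointer roughly as follows, which is why the reasoning above about
 * check_for_freed_segments() applies to it:
 *
 *     check_for_freed_segments(area);
 *     index = DSA_EXTRACT_SEGMENT_NUMBER(dp);
 *     if (area->segment_maps[index].mapped_address == NULL)
 *         get_segment_by_index(area, index);
 *     return area->segment_maps[index].mapped_address + DSA_EXTRACT_OFFSET(dp);
 */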
1810 /*
1811  * Return a superblock to the free page manager. If the underlying segment
1812  * has become entirely free, then return it to the operating system.
1813  *
1814  * The appropriate pool lock must be held.
1815  */
1816 static void
1817 destroy_superblock(dsa_area *area, dsa_pointer span_pointer)
1818 {
1819  dsa_area_span *span = dsa_get_address(area, span_pointer);
1820  int size_class = span->size_class;
1821  dsa_segment_map *segment_map;
1822 
1823 
1824  /* Remove it from its fullness class list. */
1825  unlink_span(area, span);
1826 
1827  /*
1828  * Note: Here we acquire the area lock while we already hold a per-pool
1829  * lock. We never hold the area lock and then take a pool lock, or we
1830  * could deadlock.
1831  */
1832  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1833  check_for_freed_segments_locked(area);
1834  segment_map =
1835  get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start));
1836  FreePageManagerPut(segment_map->fpm,
1837  DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
1838  span->npages);
1839  /* Check if the segment is now entirely free. */
1840  if (fpm_largest(segment_map->fpm) == segment_map->header->usable_pages)
1841  {
1842  dsa_segment_index index = get_segment_index(area, segment_map);
1843 
1844  /* If it's not the segment with extra control data, free it. */
1845  if (index != 0)
1846  {
1847  /*
1848  * Give it back to the OS, and allow other backends to detect that
1849  * they need to detach.
1850  */
1851  unlink_segment(area, segment_map);
1852  segment_map->header->freed = true;
1853  Assert(area->control->total_segment_size >=
1854  segment_map->header->size);
1855  area->control->total_segment_size -=
1856  segment_map->header->size;
1857  dsm_unpin_segment(dsm_segment_handle(segment_map->segment));
1858  dsm_detach(segment_map->segment);
1859  area->control->segment_handles[index] = DSM_HANDLE_INVALID;
1860  ++area->control->freed_segment_counter;
1861  segment_map->segment = NULL;
1862  segment_map->header = NULL;
1863  segment_map->mapped_address = NULL;
1864  }
1865  }
1866 
1867  /* Move segment to appropriate bin if necessary. */
1868  if (segment_map->header != NULL)
1869  rebin_segment(area, segment_map);
1870 
1871  LWLockRelease(DSA_AREA_LOCK(area));
1872 
1873  /*
1874  * Span-of-spans blocks store the span which describes them within the
1875  * block itself, so freeing the storage implicitly frees the descriptor
1876  * also. If this is a block of any other type, we need to separately free
1877  * the span object also. This recursive call to dsa_free will acquire the
1878  * span pool's lock. We can't deadlock because the acquisition order is
1879  * always some other pool and then the span pool.
1880  */
1881  if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1882  dsa_free(area, span_pointer);
1883 }
1884 
1885 static void
1886 unlink_span(dsa_area *area, dsa_area_span *span)
1887 {
1888  if (DsaPointerIsValid(span->nextspan))
1889  {
1890  dsa_area_span *next = dsa_get_address(area, span->nextspan);
1891 
1892  next->prevspan = span->prevspan;
1893  }
1894  if (DsaPointerIsValid(span->prevspan))
1895  {
1896  dsa_area_span *prev = dsa_get_address(area, span->prevspan);
1897 
1898  prev->nextspan = span->nextspan;
1899  }
1900  else
1901  {
1902  dsa_area_pool *pool = dsa_get_address(area, span->pool);
1903 
1904  pool->spans[span->fclass] = span->nextspan;
1905  }
1906 }
1907 
1908 static void
1909 add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
1910  dsa_pointer span_pointer,
1911  int fclass)
1912 {
1913  dsa_area_pool *pool = dsa_get_address(area, span->pool);
1914 
1915  if (DsaPointerIsValid(pool->spans[fclass]))
1916  {
1917  dsa_area_span *head = dsa_get_address(area,
1918  pool->spans[fclass]);
1919 
1920  head->prevspan = span_pointer;
1921  }
1922  span->prevspan = InvalidDsaPointer;
1923  span->nextspan = pool->spans[fclass];
1924  pool->spans[fclass] = span_pointer;
1925  span->fclass = fclass;
1926 }
1927 
1928 /*
1929  * Detach from an area that was either created or attached to by this process.
1930  */
1931 void
1932 dsa_detach(dsa_area *area)
1933 {
1934  int i;
1935 
1936  /* Detach from all segments. */
1937  for (i = 0; i <= area->high_segment_index; ++i)
1938  if (area->segment_maps[i].segment != NULL)
1939  dsm_detach(area->segment_maps[i].segment);
1940 
1941  /*
1942  * Note that 'detaching' (= detaching from DSM segments) doesn't include
1943  * 'releasing' (= adjusting the reference count). It would be nice to
1944  * combine these operations, but client code might never get around to
1945  * calling dsa_detach because of an error path, and a detach hook on any
1946  * particular segment is too late to detach other segments in the area
1947  * without risking a 'leak' warning in the non-error path.
1948  */
1949 
1950  /* Free the backend-local area object. */
1951  pfree(area);
1952 }
1953 
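/*
 * A minimal lifecycle sketch for the public API (illustrative only;
 * "my_tranche_id" stands for a caller-chosen LWLock tranche id and error
 * handling is omitted):
 *
 *     dsa_area   *area = dsa_create(my_tranche_id);
 *     dsa_pointer dp = dsa_allocate(area, 100);
 *     char       *p = (char *) dsa_get_address(area, dp);
 *
 *     ... use p locally, or share dp with other backends that have attached
 *     to the same area via dsa_attach() ...
 *
 *     dsa_free(area, dp);
 *     dsa_detach(area);
 */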
1954 /*
1955  * Unlink a segment from the bin that contains it.
1956  */
1957 static void
1958 unlink_segment(dsa_area *area, dsa_segment_map *segment_map)
1959 {
1960  if (segment_map->header->prev != DSA_SEGMENT_INDEX_NONE)
1961  {
1962  dsa_segment_map *prev;
1963 
1964  prev = get_segment_by_index(area, segment_map->header->prev);
1965  prev->header->next = segment_map->header->next;
1966  }
1967  else
1968  {
1969  Assert(area->control->segment_bins[segment_map->header->bin] ==
1970  get_segment_index(area, segment_map));
1971  area->control->segment_bins[segment_map->header->bin] =
1972  segment_map->header->next;
1973  }
1974  if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
1975  {
1976  dsa_segment_map *next;
1977 
1978  next = get_segment_by_index(area, segment_map->header->next);
1979  next->header->prev = segment_map->header->prev;
1980  }
1981 }
1982 
1983 /*
1984  * Find a segment that could satisfy a request for 'npages' of contiguous
1985  * memory, or return NULL if none can be found. This may involve attaching to
1986  * segments that weren't previously attached so that we can query their free
1987  * pages map.
1988  */
1989 static dsa_segment_map *
1990 get_best_segment(dsa_area *area, size_t npages)
1991 {
1992  size_t bin;
1993 
1993 
1994  Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
1995  check_for_freed_segments_locked(area);
1996 
1997  /*
1998  * Start searching from the first bin that *might* have enough contiguous
1999  * pages.
2000  */
2001  for (bin = contiguous_pages_to_segment_bin(npages);
2002  bin < DSA_NUM_SEGMENT_BINS;
2003  ++bin)
2004  {
2005  /*
2006  * The minimum contiguous size that any segment in this bin should
2007  * have. We'll re-bin if we see segments with fewer.
2008  */
2009  size_t threshold = (size_t) 1 << (bin - 1);
2010  dsa_segment_index segment_index;
2011 
2012  /* Search this bin for a segment with enough contiguous space. */
2013  segment_index = area->control->segment_bins[bin];
2014  while (segment_index != DSA_SEGMENT_INDEX_NONE)
2015  {
2016  dsa_segment_map *segment_map;
2017  dsa_segment_index next_segment_index;
2018  size_t contiguous_pages;
2019 
2020  segment_map = get_segment_by_index(area, segment_index);
2021  next_segment_index = segment_map->header->next;
2022  contiguous_pages = fpm_largest(segment_map->fpm);
2023 
2024  /* Not enough for the request, still enough for this bin. */
2025  if (contiguous_pages >= threshold && contiguous_pages < npages)
2026  {
2027  segment_index = next_segment_index;
2028  continue;
2029  }
2030 
2031  /* Re-bin it if it's no longer in the appropriate bin. */
2032  if (contiguous_pages < threshold)
2033  {
2034  rebin_segment(area, segment_map);
2035 
2036  /*
2037  * But fall through to see if it's enough to satisfy this
2038  * request anyway....
2039  */
2040  }
2041 
2042  /* Check if we are done. */
2043  if (contiguous_pages >= npages)
2044  return segment_map;
2045 
2046  /* Continue searching the same bin. */
2047  segment_index = next_segment_index;
2048  }
2049  }
2050 
2051  /* Not found. */
2052  return NULL;
2053 }
2054 
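/*
 * Bin arithmetic, by way of example: a segment in bin 'b' is expected to
 * have at least ((size_t) 1 << (b - 1)) contiguous free pages, so a request
 * for 16 pages starts scanning at the bin whose threshold is 16 and works
 * upward.  A 17-page request starts at that same bin (the threshold is the
 * largest power of two not exceeding the request), which is why the
 * per-segment comparison against npages above is still needed.
 */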
2055 /*
2056  * Create a new segment that can handle at least requested_pages. Returns
2057  * NULL if the requested total size limit or maximum allowed number of
2058  * segments would be exceeded.
2059  */
2060 static dsa_segment_map *
2061 make_new_segment(dsa_area *area, size_t requested_pages)
2062 {
2063  dsa_segment_index new_index;
2064  size_t metadata_bytes;
2065  size_t total_size;
2066  size_t total_pages;
2067  size_t usable_pages;
2068  dsa_segment_map *segment_map;
2069  dsm_segment *segment;
2070 
2071  Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2072 
2073  /* Find a segment slot that is not in use (linearly for now). */
2074  for (new_index = 1; new_index < DSA_MAX_SEGMENTS; ++new_index)
2075  {
2076  if (area->control->segment_handles[new_index] == DSM_HANDLE_INVALID)
2077  break;
2078  }
2079  if (new_index == DSA_MAX_SEGMENTS)
2080  return NULL;
2081 
2082  /*
2083  * If the total size limit is already exceeded, then we exit early and
2084  * avoid arithmetic wraparound in the unsigned expressions below.
2085  */
2086  if (area->control->total_segment_size >=
2087  area->control->max_total_segment_size)
2088  return NULL;
2089 
2090  /*
2091  * The size should be at least as big as requested, and at least big
2092  * enough to follow a geometric series that approximately doubles the
2093  * total storage each time we create a new segment. We use geometric
2094  * growth because the underlying DSM system isn't designed for large
2095  * numbers of segments (otherwise we might even consider just using one
2096  * DSM segment for each large allocation and for each superblock, and then
2097  * we wouldn't need to use FreePageManager).
2098  *
2099  * We decide on a total segment size first, so that we produce tidy
2100  * power-of-two sized segments. This is a good property to have if we
2101  * move to huge pages in the future. Then we work back to the number of
2102  * pages we can fit.
2103  */
2104  total_size = DSA_INITIAL_SEGMENT_SIZE *
2105  ((size_t) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE));
2106  total_size = Min(total_size, DSA_MAX_SEGMENT_SIZE);
2107  total_size = Min(total_size,
2108  area->control->max_total_segment_size -
2109  area->control->total_segment_size);
2110 
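 /*
  * For example, if DSA_INITIAL_SEGMENT_SIZE were 1MB and
  * DSA_NUM_SEGMENTS_AT_EACH_SIZE were 2, slot 1 would get a 1MB segment,
  * slots 2-3 2MB segments, slots 4-5 4MB segments, and so on, capped by
  * DSA_MAX_SEGMENT_SIZE and by the headroom left under
  * max_total_segment_size.
  */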
2111  total_pages = total_size / FPM_PAGE_SIZE;
2112  metadata_bytes =
2113  MAXALIGN(sizeof(dsa_segment_header)) +
2114  MAXALIGN(sizeof(FreePageManager)) +
2115  sizeof(dsa_pointer) * total_pages;
2116 
2117  /* Add padding up to next page boundary. */
2118  if (metadata_bytes % FPM_PAGE_SIZE != 0)
2119  metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2120  if (total_size <= metadata_bytes)
2121  return NULL;
2122  usable_pages = (total_size - metadata_bytes) / FPM_PAGE_SIZE;
2123  Assert(metadata_bytes + usable_pages * FPM_PAGE_SIZE <= total_size);
2124 
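 /*
  * As a rough illustration: a 1MB segment has total_pages = 256, so its page
  * map needs 256 * sizeof(dsa_pointer) = 2KB; if that plus the two headers
  * rounds up to a single 4KB metadata page, usable_pages works out to
  * 256 - 1 = 255.  The exact number depends on the header sizes.
  */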
2125  /* See if that is enough... */
2126  if (requested_pages > usable_pages)
2127  {
2128  /*
2129  * We'll make an odd-sized segment, working forward from the requested
2130  * number of pages.
2131  */
2132  usable_pages = requested_pages;
2133  metadata_bytes =
2134  MAXALIGN(sizeof(dsa_segment_header)) +
2135  MAXALIGN(sizeof(FreePageManager)) +
2136  usable_pages * sizeof(dsa_pointer);
2137 
2138  /* Add padding up to next page boundary. */
2139  if (metadata_bytes % FPM_PAGE_SIZE != 0)
2140  metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2141  total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE;
2142 
2143  /* Is that too large for dsa_pointer's addressing scheme? */
2144  if (total_size > DSA_MAX_SEGMENT_SIZE)
2145  return NULL;
2146 
2147  /* Would that exceed the limit? */
2148  if (total_size > area->control->max_total_segment_size -
2149  area->control->total_segment_size)
2150  return NULL;
2151  }
2152 
2153  /* Create the segment. */
2154  segment = dsm_create(total_size, 0);
2155  if (segment == NULL)
2156  return NULL;
2157  dsm_pin_segment(segment);
2158  if (area->mapping_pinned)
2159  dsm_pin_mapping(segment);
2160 
2161  /* Store the handle in shared memory to be found by index. */
2162  area->control->segment_handles[new_index] =
2163  dsm_segment_handle(segment);
2164  /* Track the highest segment index in the history of the area. */
2165  if (area->control->high_segment_index < new_index)
2166  area->control->high_segment_index = new_index;
2167  /* Track the highest segment index this backend has ever mapped. */
2168  if (area->high_segment_index < new_index)
2169  area->high_segment_index = new_index;
2170  /* Track total size of all segments. */
2171  area->control->total_segment_size += total_size;
2172  Assert(area->control->total_segment_size <=
2173  area->control->max_total_segment_size);
2174 
2175  /* Build a segment map for this segment in this backend. */
2176  segment_map = &area->segment_maps[new_index];
2177  segment_map->segment = segment;
2178  segment_map->mapped_address = dsm_segment_address(segment);
2179  segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
2180  segment_map->fpm = (FreePageManager *)
2181  (segment_map->mapped_address +
2182  MAXALIGN(sizeof(dsa_segment_header)));
2183  segment_map->pagemap = (dsa_pointer *)
2184  (segment_map->mapped_address +
2185  MAXALIGN(sizeof(dsa_segment_header)) +
2186  MAXALIGN(sizeof(FreePageManager)));
2187 
2188  /* Set up the free page map. */
2189  FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
2190  FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
2191  usable_pages);
2192 
2193  /* Set up the segment header and put it in the appropriate bin. */
2194  segment_map->header->magic =
2195  DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ new_index;
2196  segment_map->header->usable_pages = usable_pages;
2197  segment_map->header->size = total_size;
2198  segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
2199  segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2200  segment_map->header->next =
2201  area->control->segment_bins[segment_map->header->bin];
2202  segment_map->header->freed = false;
2203  area->control->segment_bins[segment_map->header->bin] = new_index;
2204  if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2205  {
2206  dsa_segment_map *next =
2207  get_segment_by_index(area, segment_map->header->next);
2208 
2209  Assert(next->header->bin == segment_map->header->bin);
2210  next->header->prev = new_index;
2211  }
2212 
2213  return segment_map;
2214 }
2215 
2216 /*
2217  * Check if any segments have been freed by destroy_superblock, so we can
2218  * detach from them in this backend. This function is called by
2219  * dsa_get_address and dsa_free to make sure that a dsa_pointer they have
2220  * received can be resolved to the correct segment.
2221  *
2222  * The danger we want to defend against is that there could be an old segment
2223  * mapped into a given slot in this backend, and the dsa_pointer they have
2224  * might refer to some new segment in the same slot. So those functions must
2225  * be sure to process all instructions to detach from a freed segment that had
2226  * been generated by the time this process received the dsa_pointer, before
2227  * they call get_segment_by_index.
2228  */
2229 static void
2230 check_for_freed_segments(dsa_area *area)
2231 {
2232  size_t freed_segment_counter;
2233 
2234  /*
2235  * Any other process that has freed a segment has incremented
2236  * freed_segment_counter while holding an LWLock, and that must precede
2237  * any backend creating a new segment in the same slot while holding an
2238  * LWLock, and that must precede the creation of any dsa_pointer pointing
2239  * into the new segment which might reach us here, and the caller must
2240  * have sent the dsa_pointer to this process using appropriate memory
2241  * synchronization (some kind of locking or atomic primitive or system
2242  * call). So all we need to do on the reading side is ask for the load of
2243  * freed_segment_counter to follow the caller's load of the dsa_pointer it
2244  * has, and we can be sure to detect any segments that had been freed as
2245  * of the time that the dsa_pointer reached this process.
2246  */
2247  pg_read_barrier();
2248  freed_segment_counter = area->control->freed_segment_counter;
2249  if (unlikely(area->freed_segment_counter != freed_segment_counter))
2250  {
2251  /* Check all currently mapped segments to find what's been freed. */
2252  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
2253  check_for_freed_segments_locked(area);
2254  LWLockRelease(DSA_AREA_LOCK(area));
2255  }
2256 }
2257 
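/*
 * By way of example, the "appropriate memory synchronization" mentioned
 * above is satisfied if the backend that produced the dsa_pointer publishes
 * it in a shared structure while holding an LWLock and this backend reads it
 * under the same lock, or if the pointer arrives in a shm_mq message; either
 * provides the ordering that the barrier-based reasoning above depends on.
 */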
2258 /*
2259  * Workhorse for check_for_freed_segments(), and also used directly in path
2260  * where the area lock is already held. This should be called after acquiring
2261  * the lock but before looking up any segment by index number, to make sure we
2262  * unmap any stale segments that might have previously had the same index as a
2263  * current segment.
2264  */
2265 static void
2266 check_for_freed_segments_locked(dsa_area *area)
2267 {
2268  size_t freed_segment_counter;
2269  int i;
2270 
2271  Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2272  freed_segment_counter = area->control->freed_segment_counter;
2273  if (unlikely(area->freed_segment_counter != freed_segment_counter))
2274  {
2275  for (i = 0; i <= area->high_segment_index; ++i)
2276  {
2277  if (area->segment_maps[i].header != NULL &&
2278  area->segment_maps[i].header->freed)
2279  {
2280  dsm_detach(area->segment_maps[i].segment);
2281  area->segment_maps[i].segment = NULL;
2282  area->segment_maps[i].header = NULL;
2283  area->segment_maps[i].mapped_address = NULL;
2284  }
2285  }
2286  area->freed_segment_counter = freed_segment_counter;
2287  }
2288 }
2289 
2290 /*
2291  * Re-bin segment if it's no longer in the appropriate bin.
2292  */
2293 static void
2294 rebin_segment(dsa_area *area, dsa_segment_map *segment_map)
2295 {
2296  size_t new_bin;
2297  dsa_segment_index segment_index;
2298 
2299  new_bin = contiguous_pages_to_segment_bin(fpm_largest(segment_map->fpm));
2300  if (segment_map->header->bin == new_bin)
2301  return;
2302 
2303  /* Remove it from its current bin. */
2304  unlink_segment(area, segment_map);
2305 
2306  /* Push it onto the front of its new bin. */
2307  segment_index = get_segment_index(area, segment_map);
2308  segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2309  segment_map->header->next = area->control->segment_bins[new_bin];
2310  segment_map->header->bin = new_bin;
2311  area->control->segment_bins[new_bin] = segment_index;
2312  if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2313  {
2314  dsa_segment_map *next;
2315 
2316  next = get_segment_by_index(area, segment_map->header->next);
2317  Assert(next->header->bin == new_bin);
2318  next->header->prev = segment_index;
2319  }
2320 }