PostgreSQL Source Code  git master
dsa.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * dsa.c
4  * Dynamic shared memory areas.
5  *
6  * This module provides dynamic shared memory areas which are built on top of
7  * DSM segments. While dsm.c allows segments of memory of shared memory to be
8  * created and shared between backends, it isn't designed to deal with small
9  * objects. A DSA area is a shared memory heap usually backed by one or more
10  * DSM segments which can allocate memory using dsa_allocate() and dsa_free().
11  * Alternatively, it can be created in pre-existing shared memory, including a
12  * DSM segment, and then create extra DSM segments as required. Unlike the
13  * regular system heap, it deals in pseudo-pointers which must be converted to
14  * backend-local pointers before they are dereferenced. These pseudo-pointers
15  * can however be shared with other backends, and can be used to construct
16  * shared data structures.
17  *
18  * Each DSA area manages a set of DSM segments, adding new segments as
19  * required and detaching them when they are no longer needed. Each segment
20  * contains a number of 4KB pages, a free page manager for tracking
21  * consecutive runs of free pages, and a page map for tracking the source of
22  * objects allocated on each page. Allocation requests above 8KB are handled
23  * by choosing a segment and finding consecutive free pages in its free page
24  * manager. Allocation requests for smaller sizes are handled using pools of
25  * objects of a selection of sizes. Each pool consists of a number of 16 page
26  * (64KB) superblocks allocated in the same way as large objects. Allocation
27  * of large objects and new superblocks is serialized by a single LWLock, but
28  * allocation of small objects from pre-existing superblocks uses one LWLock
29  * per pool. Currently there is one pool, and therefore one lock, per size
30  * class. Per-core pools to increase concurrency and strategies for reducing
31  * the resulting fragmentation are areas for future research. Each superblock
32  * is managed with a 'span', which tracks the superblock's freelist. Free
33  * requests are handled by looking in the page map to find which span an
34  * address was allocated from, so that small objects can be returned to the
35  * appropriate free list, and large object pages can be returned directly to
36  * the free page map. When allocating, simple heuristics for selecting
37  * segments and superblocks try to encourage occupied memory to be
38  * concentrated, increasing the likelihood that whole superblocks can become
39  * empty and be returned to the free page manager, and whole segments can
40  * become empty and be returned to the operating system.
41  *
42  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
43  * Portions Copyright (c) 1994, Regents of the University of California
44  *
45  * IDENTIFICATION
46  * src/backend/utils/mmgr/dsa.c
47  *
48  *-------------------------------------------------------------------------
49  */
50 
51 #include "postgres.h"
52 
53 #include "port/atomics.h"
54 #include "port/pg_bitutils.h"
55 #include "storage/dsm.h"
56 #include "storage/ipc.h"
57 #include "storage/lwlock.h"
58 #include "storage/shmem.h"
59 #include "utils/dsa.h"
60 #include "utils/freepage.h"
61 #include "utils/memutils.h"
62 
63 /*
64  * The size of the initial DSM segment that backs a dsa_area created by
65  * dsa_create. After creating some number of segments of this size we'll
66  * double this size, and so on. Larger segments may be created if necessary
67  * to satisfy large requests.
68  */
69 #define DSA_INITIAL_SEGMENT_SIZE ((size_t) (1 * 1024 * 1024))
70 
71 /*
72  * How many segments to create before we double the segment size. If this is
73  * low, then there is likely to be a lot of wasted space in the largest
74  * segment. If it is high, then we risk running out of segment slots (see
75  * dsm.c's limits on total number of segments), or limiting the total size
76  * an area can manage when using small pointers.
77  */
78 #define DSA_NUM_SEGMENTS_AT_EACH_SIZE 2
79 
80 /*
81  * The number of bits used to represent the offset part of a dsa_pointer.
82  * This controls the maximum size of a segment, the maximum possible
83  * allocation size and also the maximum number of segments per area.
84  */
85 #if SIZEOF_DSA_POINTER == 4
86 #define DSA_OFFSET_WIDTH 27 /* 32 segments of size up to 128MB */
87 #else
88 #define DSA_OFFSET_WIDTH 40 /* 1024 segments of size up to 1TB */
89 #endif
90 
91 /*
92  * The maximum number of DSM segments that an area can own, determined by
93  * the number of bits remaining (but capped at 1024).
94  */
95 #define DSA_MAX_SEGMENTS \
96  Min(1024, (1 << ((SIZEOF_DSA_POINTER * 8) - DSA_OFFSET_WIDTH)))
97 
98 /* The bitmask for extracting the offset from a dsa_pointer. */
99 #define DSA_OFFSET_BITMASK (((dsa_pointer) 1 << DSA_OFFSET_WIDTH) - 1)
100 
101 /* The maximum size of a DSM segment. */
102 #define DSA_MAX_SEGMENT_SIZE ((size_t) 1 << DSA_OFFSET_WIDTH)
103 
104 /* Number of pages (see FPM_PAGE_SIZE) per regular superblock. */
105 #define DSA_PAGES_PER_SUPERBLOCK 16
106 
107 /*
108  * A magic number used as a sanity check for following DSM segments belonging
109  * to a DSA area (this number will be XORed with the area handle and
110  * the segment index).
111  */
112 #define DSA_SEGMENT_HEADER_MAGIC 0x0ce26608
113 
114 /* Build a dsa_pointer given a segment number and offset. */
115 #define DSA_MAKE_POINTER(segment_number, offset) \
116  (((dsa_pointer) (segment_number) << DSA_OFFSET_WIDTH) | (offset))
117 
118 /* Extract the segment number from a dsa_pointer. */
119 #define DSA_EXTRACT_SEGMENT_NUMBER(dp) ((dp) >> DSA_OFFSET_WIDTH)
120 
121 /* Extract the offset from a dsa_pointer. */
122 #define DSA_EXTRACT_OFFSET(dp) ((dp) & DSA_OFFSET_BITMASK)
123 
124 /* The type used for index segment indexes (zero based). */
125 typedef size_t dsa_segment_index;
126 
127 /* Sentinel value for dsa_segment_index indicating 'none' or 'end'. */
128 #define DSA_SEGMENT_INDEX_NONE (~(dsa_segment_index)0)
129 
130 /*
131  * How many bins of segments do we have? The bins are used to categorize
132  * segments by their largest contiguous run of free pages.
133  */
134 #define DSA_NUM_SEGMENT_BINS 16
135 
136 /*
137  * What is the lowest bin that holds segments that *might* have n contiguous
138  * free pages? There is no point in looking in segments in lower bins; they
139  * definitely can't service a request for n free pages.
140  */
141 static inline size_t
143 {
144  size_t bin;
145 
146  if (n == 0)
147  bin = 0;
148  else
149  bin = pg_leftmost_one_pos_size_t(n) + 1;
150 
151  return Min(bin, DSA_NUM_SEGMENT_BINS - 1);
152 }
153 
154 /* Macros for access to locks. */
155 #define DSA_AREA_LOCK(area) (&area->control->lock)
156 #define DSA_SCLASS_LOCK(area, sclass) (&area->control->pools[sclass].lock)
157 
158 /*
159  * The header for an individual segment. This lives at the start of each DSM
160  * segment owned by a DSA area including the first segment (where it appears
161  * as part of the dsa_area_control struct).
162  */
163 typedef struct
164 {
165  /* Sanity check magic value. */
167  /* Total number of pages in this segment (excluding metadata area). */
168  size_t usable_pages;
169  /* Total size of this segment in bytes. */
170  size_t size;
171 
172  /*
173  * Index of the segment that precedes this one in the same segment bin, or
174  * DSA_SEGMENT_INDEX_NONE if this is the first one.
175  */
177 
178  /*
179  * Index of the segment that follows this one in the same segment bin, or
180  * DSA_SEGMENT_INDEX_NONE if this is the last one.
181  */
183  /* The index of the bin that contains this segment. */
184  size_t bin;
185 
186  /*
187  * A flag raised to indicate that this segment is being returned to the
188  * operating system and has been unpinned.
189  */
190  bool freed;
192 
193 /*
194  * Metadata for one superblock.
195  *
196  * For most blocks, span objects are stored out-of-line; that is, the span
197  * object is not stored within the block itself. But, as an exception, for a
198  * "span of spans", the span object is stored "inline". The allocation is
199  * always exactly one page, and the dsa_area_span object is located at
200  * the beginning of that page. The size class is DSA_SCLASS_BLOCK_OF_SPANS,
201  * and the remaining fields are used just as they would be in an ordinary
202  * block. We can't allocate spans out of ordinary superblocks because
203  * creating an ordinary superblock requires us to be able to allocate a span
204  * *first*. Doing it this way avoids that circularity.
205  */
206 typedef struct
207 {
208  dsa_pointer pool; /* Containing pool. */
209  dsa_pointer prevspan; /* Previous span. */
210  dsa_pointer nextspan; /* Next span. */
211  dsa_pointer start; /* Starting address. */
212  size_t npages; /* Length of span in pages. */
213  uint16 size_class; /* Size class. */
214  uint16 ninitialized; /* Maximum number of objects ever allocated. */
215  uint16 nallocatable; /* Number of objects currently allocatable. */
216  uint16 firstfree; /* First object on free list. */
217  uint16 nmax; /* Maximum number of objects ever possible. */
218  uint16 fclass; /* Current fullness class. */
219 } dsa_area_span;
220 
221 /*
222  * Given a pointer to an object in a span, access the index of the next free
223  * object in the same span (ie in the span's freelist) as an L-value.
224  */
225 #define NextFreeObjectIndex(object) (* (uint16 *) (object))
226 
227 /*
228  * Small allocations are handled by dividing a single block of memory into
229  * many small objects of equal size. The possible allocation sizes are
230  * defined by the following array. Larger size classes are spaced more widely
231  * than smaller size classes. We fudge the spacing for size classes >1kB to
232  * avoid space wastage: based on the knowledge that we plan to allocate 64kB
233  * blocks, we bump the maximum object size up to the largest multiple of
234  * 8 bytes that still lets us fit the same number of objects into one block.
235  *
236  * NB: Because of this fudging, if we were ever to use differently-sized blocks
237  * for small allocations, these size classes would need to be reworked to be
238  * optimal for the new size.
239  *
240  * NB: The optimal spacing for size classes, as well as the size of the blocks
241  * out of which small objects are allocated, is not a question that has one
242  * right answer. Some allocators (such as tcmalloc) use more closely-spaced
243  * size classes than we do here, while others (like aset.c) use more
244  * widely-spaced classes. Spacing the classes more closely avoids wasting
245  * memory within individual chunks, but also means a larger number of
246  * potentially-unfilled blocks.
247  */
248 static const uint16 dsa_size_classes[] = {
249  sizeof(dsa_area_span), 0, /* special size classes */
250  8, 16, 24, 32, 40, 48, 56, 64, /* 8 classes separated by 8 bytes */
251  80, 96, 112, 128, /* 4 classes separated by 16 bytes */
252  160, 192, 224, 256, /* 4 classes separated by 32 bytes */
253  320, 384, 448, 512, /* 4 classes separated by 64 bytes */
254  640, 768, 896, 1024, /* 4 classes separated by 128 bytes */
255  1280, 1560, 1816, 2048, /* 4 classes separated by ~256 bytes */
256  2616, 3120, 3640, 4096, /* 4 classes separated by ~512 bytes */
257  5456, 6552, 7280, 8192 /* 4 classes separated by ~1024 bytes */
258 };
259 #define DSA_NUM_SIZE_CLASSES lengthof(dsa_size_classes)
260 
261 /* Special size classes. */
262 #define DSA_SCLASS_BLOCK_OF_SPANS 0
263 #define DSA_SCLASS_SPAN_LARGE 1
264 
265 /*
266  * The following lookup table is used to map the size of small objects
267  * (less than 1kB) onto the corresponding size class. To use this table,
268  * round the size of the object up to the next multiple of 8 bytes, and then
269  * index into this array.
270  */
271 static const uint8 dsa_size_class_map[] = {
272  2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13,
273  14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17,
274  18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
275  20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21,
276  22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
277  23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
278  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
279  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25
280 };
281 #define DSA_SIZE_CLASS_MAP_QUANTUM 8
282 
283 /*
284  * Superblocks are binned by how full they are. Generally, each fullness
285  * class corresponds to one quartile, but the block being used for
286  * allocations is always at the head of the list for fullness class 1,
287  * regardless of how full it really is.
288  */
289 #define DSA_FULLNESS_CLASSES 4
290 
291 /*
292  * A dsa_area_pool represents a set of objects of a given size class.
293  *
294  * Perhaps there should be multiple pools for the same size class for
295  * contention avoidance, but for now there is just one!
296  */
297 typedef struct
298 {
299  /* A lock protecting access to this pool. */
301  /* A set of linked lists of spans, arranged by fullness. */
303  /* Should we pad this out to a cacheline boundary? */
304 } dsa_area_pool;
305 
306 /*
307  * The control block for an area. This lives in shared memory, at the start of
308  * the first DSM segment controlled by this area.
309  */
310 typedef struct
311 {
312  /* The segment header for the first segment. */
314  /* The handle for this area. */
316  /* The handles of the segments owned by this area. */
317  dsm_handle segment_handles[DSA_MAX_SEGMENTS];
318  /* Lists of segments, binned by maximum contiguous run of free pages. */
320  /* The object pools for each size class. */
322  /* The total size of all active segments. */
324  /* The maximum total size of backing storage we are allowed. */
326  /* Highest used segment index in the history of this area. */
328  /* The reference count for this area. */
329  int refcnt;
330  /* A flag indicating that this area has been pinned. */
331  bool pinned;
332  /* The number of times that segments have been freed. */
334  /* The LWLock tranche ID. */
336  /* The general lock (protects everything except object pools). */
339 
340 /* Given a pointer to a pool, find a dsa_pointer. */
341 #define DsaAreaPoolToDsaPointer(area, p) \
342  DSA_MAKE_POINTER(0, (char *) p - (char *) area->control)
343 
344 /*
345  * A dsa_segment_map is stored within the backend-private memory of each
346  * individual backend. It holds the base address of the segment within that
347  * backend, plus the addresses of key objects within the segment. Those
348  * could instead be derived from the base address but it's handy to have them
349  * around.
350  */
351 typedef struct
352 {
353  dsm_segment *segment; /* DSM segment */
354  char *mapped_address; /* Address at which segment is mapped */
355  dsa_segment_header *header; /* Header (same as mapped_address) */
356  FreePageManager *fpm; /* Free page manager within segment. */
357  dsa_pointer *pagemap; /* Page map within segment. */
359 
360 /*
361  * Per-backend state for a storage area. Backends obtain one of these by
362  * creating an area or attaching to an existing one using a handle. Each
363  * process that needs to use an area uses its own object to track where the
364  * segments are mapped.
365  */
366 struct dsa_area
367 {
368  /* Pointer to the control object in shared memory. */
370 
371  /* Has the mapping been pinned? */
373 
374  /*
375  * This backend's array of segment maps, ordered by segment index
376  * corresponding to control->segment_handles. Some of the area's segments
377  * may not be mapped in this backend yet, and some slots may have been
378  * freed and need to be detached; these operations happen on demand.
379  */
381 
382  /* The highest segment index this backend has ever mapped. */
384 
385  /* The last observed freed_segment_counter. */
387 };
388 
389 #define DSA_SPAN_NOTHING_FREE ((uint16) -1)
390 #define DSA_SUPERBLOCK_SIZE (DSA_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE)
391 
392 /* Given a pointer to a segment_map, obtain a segment index number. */
393 #define get_segment_index(area, segment_map_ptr) \
394  (segment_map_ptr - &area->segment_maps[0])
395 
396 static void init_span(dsa_area *area, dsa_pointer span_pointer,
397  dsa_area_pool *pool, dsa_pointer start, size_t npages,
398  uint16 size_class);
399 static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool,
400  int fromclass, int toclass);
401 static inline dsa_pointer alloc_object(dsa_area *area, int size_class);
402 static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
403  int size_class);
406 static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer);
407 static void unlink_span(dsa_area *area, dsa_area_span *span);
408 static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
409  dsa_pointer span_pointer, int fclass);
410 static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map);
411 static dsa_segment_map *get_best_segment(dsa_area *area, size_t npages);
412 static dsa_segment_map *make_new_segment(dsa_area *area, size_t requested_pages);
413 static dsa_area *create_internal(void *place, size_t size,
414  int tranche_id,
415  dsm_handle control_handle,
416  dsm_segment *control_segment);
417 static dsa_area *attach_internal(void *place, dsm_segment *segment,
418  dsa_handle handle);
419 static void check_for_freed_segments(dsa_area *area);
420 static void check_for_freed_segments_locked(dsa_area *area);
421 
422 /*
423  * Create a new shared area in a new DSM segment. Further DSM segments will
424  * be allocated as required to extend the available space.
425  *
426  * We can't allocate a LWLock tranche_id within this function, because tranche
427  * IDs are a scarce resource; there are only 64k available, using low numbers
428  * when possible matters, and we have no provision for recycling them. So,
429  * we require the caller to provide one.
430  */
431 dsa_area *
432 dsa_create(int tranche_id)
433 {
434  dsm_segment *segment;
435  dsa_area *area;
436 
437  /*
438  * Create the DSM segment that will hold the shared control object and the
439  * first segment of usable space.
440  */
441  segment = dsm_create(DSA_INITIAL_SEGMENT_SIZE, 0);
442 
443  /*
444  * All segments backing this area are pinned, so that DSA can explicitly
445  * control their lifetime (otherwise a newly created segment belonging to
446  * this area might be freed when the only backend that happens to have it
447  * mapped in ends, corrupting the area).
448  */
449  dsm_pin_segment(segment);
450 
451  /* Create a new DSA area with the control object in this segment. */
452  area = create_internal(dsm_segment_address(segment),
454  tranche_id,
455  dsm_segment_handle(segment), segment);
456 
457  /* Clean up when the control segment detaches. */
460 
461  return area;
462 }
463 
464 /*
465  * Create a new shared area in an existing shared memory space, which may be
466  * either DSM or Postmaster-initialized memory. DSM segments will be
467  * allocated as required to extend the available space, though that can be
468  * prevented with dsa_set_size_limit(area, size) using the same size provided
469  * to dsa_create_in_place.
470  *
471  * Areas created in-place must eventually be released by the backend that
472  * created them and all backends that attach to them. This can be done
473  * explicitly with dsa_release_in_place, or, in the special case that 'place'
474  * happens to be in a pre-existing DSM segment, by passing in a pointer to the
475  * segment so that a detach hook can be registered with the containing DSM
476  * segment.
477  *
478  * See dsa_create() for a note about the tranche arguments.
479  */
480 dsa_area *
481 dsa_create_in_place(void *place, size_t size,
482  int tranche_id, dsm_segment *segment)
483 {
484  dsa_area *area;
485 
486  area = create_internal(place, size, tranche_id,
487  DSM_HANDLE_INVALID, NULL);
488 
489  /*
490  * Clean up when the control segment detaches, if a containing DSM segment
491  * was provided.
492  */
493  if (segment != NULL)
495  PointerGetDatum(place));
496 
497  return area;
498 }
499 
500 /*
501  * Obtain a handle that can be passed to other processes so that they can
502  * attach to the given area. Cannot be called for areas created with
503  * dsa_create_in_place.
504  */
507 {
509  return area->control->handle;
510 }
511 
512 /*
513  * Attach to an area given a handle generated (possibly in another process) by
514  * dsa_get_handle. The area must have been created with dsa_create (not
515  * dsa_create_in_place).
516  */
517 dsa_area *
519 {
520  dsm_segment *segment;
521  dsa_area *area;
522 
523  /*
524  * An area handle is really a DSM segment handle for the first segment, so
525  * we go ahead and attach to that.
526  */
527  segment = dsm_attach(handle);
528  if (segment == NULL)
529  ereport(ERROR,
530  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
531  errmsg("could not attach to dynamic shared area")));
532 
533  area = attach_internal(dsm_segment_address(segment), segment, handle);
534 
535  /* Clean up when the control segment detaches. */
538 
539  return area;
540 }
541 
542 /*
543  * Attach to an area that was created with dsa_create_in_place. The caller
544  * must somehow know the location in memory that was used when the area was
545  * created, though it may be mapped at a different virtual address in this
546  * process.
547  *
548  * See dsa_create_in_place for note about releasing in-place areas, and the
549  * optional 'segment' argument which can be provided to allow automatic
550  * release if the containing memory happens to be a DSM segment.
551  */
552 dsa_area *
553 dsa_attach_in_place(void *place, dsm_segment *segment)
554 {
555  dsa_area *area;
556 
557  area = attach_internal(place, NULL, DSA_HANDLE_INVALID);
558 
559  /*
560  * Clean up when the control segment detaches, if a containing DSM segment
561  * was provided.
562  */
563  if (segment != NULL)
565  PointerGetDatum(place));
566 
567  return area;
568 }
569 
570 /*
571  * Release a DSA area that was produced by dsa_create_in_place or
572  * dsa_attach_in_place. The 'segment' argument is ignored but provides an
573  * interface suitable for on_dsm_detach, for the convenience of users who want
574  * to create a DSA segment inside an existing DSM segment and have it
575  * automatically released when the containing DSM segment is detached.
576  * 'place' should be the address of the place where the area was created.
577  *
578  * This callback is automatically registered for the DSM segment containing
579  * the control object of in-place areas when a segment is provided to
580  * dsa_create_in_place or dsa_attach_in_place, and also for all areas created
581  * with dsa_create.
582  */
583 void
585 {
587 }
588 
589 /*
590  * Release a DSA area that was produced by dsa_create_in_place or
591  * dsa_attach_in_place. The 'code' argument is ignored but provides an
592  * interface suitable for on_shmem_exit or before_shmem_exit, for the
593  * convenience of users who want to create a DSA segment inside shared memory
594  * other than a DSM segment and have it automatically release at backend exit.
595  * 'place' should be the address of the place where the area was created.
596  */
597 void
599 {
601 }
602 
603 /*
604  * Release a DSA area that was produced by dsa_create_in_place or
605  * dsa_attach_in_place. It is preferable to use one of the 'dsa_on_XXX'
606  * callbacks so that this is managed automatically, because failure to release
607  * an area created in-place leaks its segments permanently.
608  *
609  * This is also called automatically for areas produced by dsa_create or
610  * dsa_attach as an implementation detail.
611  */
612 void
614 {
615  dsa_area_control *control = (dsa_area_control *) place;
616  int i;
617 
618  LWLockAcquire(&control->lock, LW_EXCLUSIVE);
619  Assert(control->segment_header.magic ==
620  (DSA_SEGMENT_HEADER_MAGIC ^ control->handle ^ 0));
621  Assert(control->refcnt > 0);
622  if (--control->refcnt == 0)
623  {
624  for (i = 0; i <= control->high_segment_index; ++i)
625  {
626  dsm_handle handle;
627 
628  handle = control->segment_handles[i];
629  if (handle != DSM_HANDLE_INVALID)
630  dsm_unpin_segment(handle);
631  }
632  }
633  LWLockRelease(&control->lock);
634 }
635 
636 /*
637  * Keep a DSA area attached until end of session or explicit detach.
638  *
639  * By default, areas are owned by the current resource owner, which means they
640  * are detached automatically when that scope ends.
641  */
642 void
644 {
645  int i;
646 
647  Assert(!area->mapping_pinned);
648  area->mapping_pinned = true;
649 
650  for (i = 0; i <= area->high_segment_index; ++i)
651  if (area->segment_maps[i].segment != NULL)
653 }
654 
655 /*
656  * Allocate memory in this storage area. The return value is a dsa_pointer
657  * that can be passed to other processes, and converted to a local pointer
658  * with dsa_get_address. 'flags' is a bitmap which should be constructed
659  * from the following values:
660  *
661  * DSA_ALLOC_HUGE allows allocations >= 1GB. Otherwise, such allocations
662  * will result in an ERROR.
663  *
664  * DSA_ALLOC_NO_OOM causes this function to return InvalidDsaPointer when
665  * no memory is available or a size limit established by dsa_set_size_limit
666  * would be exceeded. Otherwise, such allocations will result in an ERROR.
667  *
668  * DSA_ALLOC_ZERO causes the allocated memory to be zeroed. Otherwise, the
669  * contents of newly-allocated memory are indeterminate.
670  *
671  * These flags correspond to similarly named flags used by
672  * MemoryContextAllocExtended(). See also the macros dsa_allocate and
673  * dsa_allocate0 which expand to a call to this function with commonly used
674  * flags.
675  */
677 dsa_allocate_extended(dsa_area *area, size_t size, int flags)
678 {
679  uint16 size_class;
680  dsa_pointer start_pointer;
681  dsa_segment_map *segment_map;
682  dsa_pointer result;
683 
684  Assert(size > 0);
685 
686  /* Sanity check on huge individual allocation size. */
687  if (((flags & DSA_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) ||
688  ((flags & DSA_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size)))
689  elog(ERROR, "invalid DSA memory alloc request size %zu", size);
690 
691  /*
692  * If bigger than the largest size class, just grab a run of pages from
693  * the free page manager, instead of allocating an object from a pool.
694  * There will still be a span, but it's a special class of span that
695  * manages this whole allocation and simply gives all pages back to the
696  * free page manager when dsa_free is called.
697  */
698  if (size > dsa_size_classes[lengthof(dsa_size_classes) - 1])
699  {
700  size_t npages = fpm_size_to_pages(size);
701  size_t first_page;
702  dsa_pointer span_pointer;
704 
705  /* Obtain a span object. */
706  span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
707  if (!DsaPointerIsValid(span_pointer))
708  {
709  /* Raise error unless asked not to. */
710  if ((flags & DSA_ALLOC_NO_OOM) == 0)
711  ereport(ERROR,
712  (errcode(ERRCODE_OUT_OF_MEMORY),
713  errmsg("out of memory"),
714  errdetail("Failed on DSA request of size %zu.",
715  size)));
716  return InvalidDsaPointer;
717  }
718 
720 
721  /* Find a segment from which to allocate. */
722  segment_map = get_best_segment(area, npages);
723  if (segment_map == NULL)
724  segment_map = make_new_segment(area, npages);
725  if (segment_map == NULL)
726  {
727  /* Can't make any more segments: game over. */
729  dsa_free(area, span_pointer);
730 
731  /* Raise error unless asked not to. */
732  if ((flags & DSA_ALLOC_NO_OOM) == 0)
733  ereport(ERROR,
734  (errcode(ERRCODE_OUT_OF_MEMORY),
735  errmsg("out of memory"),
736  errdetail("Failed on DSA request of size %zu.",
737  size)));
738  return InvalidDsaPointer;
739  }
740 
741  /*
742  * Ask the free page manager for a run of pages. This should always
743  * succeed, since both get_best_segment and make_new_segment should
744  * only return a non-NULL pointer if it actually contains enough
745  * contiguous freespace. If it does fail, something in our backend
746  * private state is out of whack, so use FATAL to kill the process.
747  */
748  if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
749  elog(FATAL,
750  "dsa_allocate could not find %zu free pages", npages);
752 
753  start_pointer = DSA_MAKE_POINTER(get_segment_index(area, segment_map),
754  first_page * FPM_PAGE_SIZE);
755 
756  /* Initialize span and pagemap. */
758  LW_EXCLUSIVE);
759  init_span(area, span_pointer, pool, start_pointer, npages,
761  segment_map->pagemap[first_page] = span_pointer;
763 
764  /* Zero-initialize the memory if requested. */
765  if ((flags & DSA_ALLOC_ZERO) != 0)
766  memset(dsa_get_address(area, start_pointer), 0, size);
767 
768  return start_pointer;
769  }
770 
771  /* Map allocation to a size class. */
773  {
774  int mapidx;
775 
776  /* For smaller sizes we have a lookup table... */
777  mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) /
779  size_class = dsa_size_class_map[mapidx];
780  }
781  else
782  {
783  uint16 min;
784  uint16 max;
785 
786  /* ... and for the rest we search by binary chop. */
788  max = lengthof(dsa_size_classes) - 1;
789 
790  while (min < max)
791  {
792  uint16 mid = (min + max) / 2;
793  uint16 class_size = dsa_size_classes[mid];
794 
795  if (class_size < size)
796  min = mid + 1;
797  else
798  max = mid;
799  }
800 
801  size_class = min;
802  }
803  Assert(size <= dsa_size_classes[size_class]);
804  Assert(size_class == 0 || size > dsa_size_classes[size_class - 1]);
805 
806  /* Attempt to allocate an object from the appropriate pool. */
807  result = alloc_object(area, size_class);
808 
809  /* Check for failure to allocate. */
810  if (!DsaPointerIsValid(result))
811  {
812  /* Raise error unless asked not to. */
813  if ((flags & DSA_ALLOC_NO_OOM) == 0)
814  ereport(ERROR,
815  (errcode(ERRCODE_OUT_OF_MEMORY),
816  errmsg("out of memory"),
817  errdetail("Failed on DSA request of size %zu.", size)));
818  return InvalidDsaPointer;
819  }
820 
821  /* Zero-initialize the memory if requested. */
822  if ((flags & DSA_ALLOC_ZERO) != 0)
823  memset(dsa_get_address(area, result), 0, size);
824 
825  return result;
826 }
827 
828 /*
829  * Free memory obtained with dsa_allocate.
830  */
831 void
833 {
834  dsa_segment_map *segment_map;
835  int pageno;
836  dsa_pointer span_pointer;
837  dsa_area_span *span;
838  char *superblock;
839  char *object;
840  size_t size;
841  int size_class;
842 
843  /* Make sure we don't have a stale segment in the slot 'dp' refers to. */
845 
846  /* Locate the object, span and pool. */
847  segment_map = get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(dp));
848  pageno = DSA_EXTRACT_OFFSET(dp) / FPM_PAGE_SIZE;
849  span_pointer = segment_map->pagemap[pageno];
850  span = dsa_get_address(area, span_pointer);
851  superblock = dsa_get_address(area, span->start);
852  object = dsa_get_address(area, dp);
853  size_class = span->size_class;
854  size = dsa_size_classes[size_class];
855 
856  /*
857  * Special case for large objects that live in a special span: we return
858  * those pages directly to the free page manager and free the span.
859  */
860  if (span->size_class == DSA_SCLASS_SPAN_LARGE)
861  {
862 
863 #ifdef CLOBBER_FREED_MEMORY
864  memset(object, 0x7f, span->npages * FPM_PAGE_SIZE);
865 #endif
866 
867  /* Give pages back to free page manager. */
869  FreePageManagerPut(segment_map->fpm,
871  span->npages);
873  /* Unlink span. */
875  LW_EXCLUSIVE);
876  unlink_span(area, span);
878  /* Free the span object so it can be reused. */
879  dsa_free(area, span_pointer);
880  return;
881  }
882 
883 #ifdef CLOBBER_FREED_MEMORY
884  memset(object, 0x7f, size);
885 #endif
886 
887  LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
888 
889  /* Put the object on the span's freelist. */
890  Assert(object >= superblock);
891  Assert(object < superblock + DSA_SUPERBLOCK_SIZE);
892  Assert((object - superblock) % size == 0);
893  NextFreeObjectIndex(object) = span->firstfree;
894  span->firstfree = (object - superblock) / size;
895  ++span->nallocatable;
896 
897  /*
898  * See if the span needs to moved to a different fullness class, or be
899  * freed so its pages can be given back to the segment.
900  */
901  if (span->nallocatable == 1 && span->fclass == DSA_FULLNESS_CLASSES - 1)
902  {
903  /*
904  * The block was completely full and is located in the
905  * highest-numbered fullness class, which is never scanned for free
906  * chunks. We must move it to the next-lower fullness class.
907  */
908  unlink_span(area, span);
909  add_span_to_fullness_class(area, span, span_pointer,
911 
912  /*
913  * If this is the only span, and there is no active span, then we
914  * should probably move this span to fullness class 1. (Otherwise if
915  * you allocate exactly all the objects in the only span, it moves to
916  * class 3, then you free them all, it moves to 2, and then is given
917  * back, leaving no active span).
918  */
919  }
920  else if (span->nallocatable == span->nmax &&
921  (span->fclass != 1 || span->prevspan != InvalidDsaPointer))
922  {
923  /*
924  * This entire block is free, and it's not the active block for this
925  * size class. Return the memory to the free page manager. We don't
926  * do this for the active block to prevent hysteresis: if we
927  * repeatedly allocate and free the only chunk in the active block, it
928  * will be very inefficient if we deallocate and reallocate the block
929  * every time.
930  */
931  destroy_superblock(area, span_pointer);
932  }
933 
934  LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
935 }
936 
937 /*
938  * Obtain a backend-local address for a dsa_pointer. 'dp' must point to
939  * memory allocated by the given area (possibly in another process) that
940  * hasn't yet been freed. This may cause a segment to be mapped into the
941  * current process if required, and may cause freed segments to be unmapped.
942  */
943 void *
945 {
947  size_t offset;
948 
949  /* Convert InvalidDsaPointer to NULL. */
950  if (!DsaPointerIsValid(dp))
951  return NULL;
952 
953  /* Process any requests to detach from freed segments. */
955 
956  /* Break the dsa_pointer into its components. */
958  offset = DSA_EXTRACT_OFFSET(dp);
960 
961  /* Check if we need to cause this segment to be mapped in. */
962  if (unlikely(area->segment_maps[index].mapped_address == NULL))
963  {
964  /* Call for effect (we don't need the result). */
966  }
967 
968  return area->segment_maps[index].mapped_address + offset;
969 }
970 
971 /*
972  * Pin this area, so that it will continue to exist even if all backends
973  * detach from it. In that case, the area can still be reattached to if a
974  * handle has been recorded somewhere.
975  */
976 void
978 {
980  if (area->control->pinned)
981  {
983  elog(ERROR, "dsa_area already pinned");
984  }
985  area->control->pinned = true;
986  ++area->control->refcnt;
988 }
989 
990 /*
991  * Undo the effects of dsa_pin, so that the given area can be freed when no
992  * backends are attached to it. May be called only if dsa_pin has been
993  * called.
994  */
995 void
997 {
999  Assert(area->control->refcnt > 1);
1000  if (!area->control->pinned)
1001  {
1003  elog(ERROR, "dsa_area not pinned");
1004  }
1005  area->control->pinned = false;
1006  --area->control->refcnt;
1008 }
1009 
1010 /*
1011  * Set the total size limit for this area. This limit is checked whenever new
1012  * segments need to be allocated from the operating system. If the new size
1013  * limit is already exceeded, this has no immediate effect.
1014  *
1015  * Note that the total virtual memory usage may be temporarily larger than
1016  * this limit when segments have been freed, but not yet detached by all
1017  * backends that have attached to them.
1018  */
1019 void
1020 dsa_set_size_limit(dsa_area *area, size_t limit)
1021 {
1023  area->control->max_total_segment_size = limit;
1025 }
1026 
1027 /*
1028  * Aggressively free all spare memory in the hope of returning DSM segments to
1029  * the operating system.
1030  */
1031 void
1033 {
1034  int size_class;
1035 
1036  /*
1037  * Trim in reverse pool order so we get to the spans-of-spans last, just
1038  * in case any become entirely free while processing all the other pools.
1039  */
1040  for (size_class = DSA_NUM_SIZE_CLASSES - 1; size_class >= 0; --size_class)
1041  {
1042  dsa_area_pool *pool = &area->control->pools[size_class];
1043  dsa_pointer span_pointer;
1044 
1045  if (size_class == DSA_SCLASS_SPAN_LARGE)
1046  {
1047  /* Large object frees give back segments aggressively already. */
1048  continue;
1049  }
1050 
1051  /*
1052  * Search fullness class 1 only. That is where we expect to find an
1053  * entirely empty superblock (entirely empty superblocks in other
1054  * fullness classes are returned to the free page map by dsa_free).
1055  */
1056  LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1057  span_pointer = pool->spans[1];
1058  while (DsaPointerIsValid(span_pointer))
1059  {
1060  dsa_area_span *span = dsa_get_address(area, span_pointer);
1061  dsa_pointer next = span->nextspan;
1062 
1063  if (span->nallocatable == span->nmax)
1064  destroy_superblock(area, span_pointer);
1065 
1066  span_pointer = next;
1067  }
1068  LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1069  }
1070 }
1071 
1072 /*
1073  * Print out debugging information about the internal state of the shared
1074  * memory area.
1075  */
1076 void
1078 {
1079  size_t i,
1080  j;
1081 
1082  /*
1083  * Note: This gives an inconsistent snapshot as it acquires and releases
1084  * individual locks as it goes...
1085  */
1086 
1089  fprintf(stderr, "dsa_area handle %x:\n", area->control->handle);
1090  fprintf(stderr, " max_total_segment_size: %zu\n",
1092  fprintf(stderr, " total_segment_size: %zu\n",
1093  area->control->total_segment_size);
1094  fprintf(stderr, " refcnt: %d\n", area->control->refcnt);
1095  fprintf(stderr, " pinned: %c\n", area->control->pinned ? 't' : 'f');
1096  fprintf(stderr, " segment bins:\n");
1097  for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1098  {
1100  {
1101  dsa_segment_index segment_index;
1102 
1103  fprintf(stderr,
1104  " segment bin %zu (at least %d contiguous pages free):\n",
1105  i, 1 << (i - 1));
1106  segment_index = area->control->segment_bins[i];
1107  while (segment_index != DSA_SEGMENT_INDEX_NONE)
1108  {
1109  dsa_segment_map *segment_map;
1110 
1111  segment_map =
1112  get_segment_by_index(area, segment_index);
1113 
1114  fprintf(stderr,
1115  " segment index %zu, usable_pages = %zu, "
1116  "contiguous_pages = %zu, mapped at %p\n",
1117  segment_index,
1118  segment_map->header->usable_pages,
1119  fpm_largest(segment_map->fpm),
1120  segment_map->mapped_address);
1121  segment_index = segment_map->header->next;
1122  }
1123  }
1124  }
1126 
1127  fprintf(stderr, " pools:\n");
1128  for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1129  {
1130  bool found = false;
1131 
1133  for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1134  if (DsaPointerIsValid(area->control->pools[i].spans[j]))
1135  found = true;
1136  if (found)
1137  {
1139  fprintf(stderr, " pool for blocks of span objects:\n");
1140  else if (i == DSA_SCLASS_SPAN_LARGE)
1141  fprintf(stderr, " pool for large object spans:\n");
1142  else
1143  fprintf(stderr,
1144  " pool for size class %zu (object size %hu bytes):\n",
1145  i, dsa_size_classes[i]);
1146  for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1147  {
1148  if (!DsaPointerIsValid(area->control->pools[i].spans[j]))
1149  fprintf(stderr, " fullness class %zu is empty\n", j);
1150  else
1151  {
1152  dsa_pointer span_pointer = area->control->pools[i].spans[j];
1153 
1154  fprintf(stderr, " fullness class %zu:\n", j);
1155  while (DsaPointerIsValid(span_pointer))
1156  {
1157  dsa_area_span *span;
1158 
1159  span = dsa_get_address(area, span_pointer);
1160  fprintf(stderr,
1161  " span descriptor at "
1162  DSA_POINTER_FORMAT ", superblock at "
1164  ", pages = %zu, objects free = %hu/%hu\n",
1165  span_pointer, span->start, span->npages,
1166  span->nallocatable, span->nmax);
1167  span_pointer = span->nextspan;
1168  }
1169  }
1170  }
1171  }
1173  }
1174 }
1175 
1176 /*
1177  * Return the smallest size that you can successfully provide to
1178  * dsa_create_in_place.
1179  */
1180 size_t
1182 {
1183  size_t size;
1184  int pages = 0;
1185 
1186  size = MAXALIGN(sizeof(dsa_area_control)) +
1187  MAXALIGN(sizeof(FreePageManager));
1188 
1189  /* Figure out how many pages we need, including the page map... */
1190  while (((size + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE) > pages)
1191  {
1192  ++pages;
1193  size += sizeof(dsa_pointer);
1194  }
1195 
1196  return pages * FPM_PAGE_SIZE;
1197 }
1198 
1199 /*
1200  * Workhorse function for dsa_create and dsa_create_in_place.
1201  */
1202 static dsa_area *
1203 create_internal(void *place, size_t size,
1204  int tranche_id,
1205  dsm_handle control_handle,
1206  dsm_segment *control_segment)
1207 {
1208  dsa_area_control *control;
1209  dsa_area *area;
1210  dsa_segment_map *segment_map;
1211  size_t usable_pages;
1212  size_t total_pages;
1213  size_t metadata_bytes;
1214  int i;
1215 
1216  /* Sanity check on the space we have to work in. */
1217  if (size < dsa_minimum_size())
1218  elog(ERROR, "dsa_area space must be at least %zu, but %zu provided",
1219  dsa_minimum_size(), size);
1220 
1221  /* Now figure out how much space is usable */
1222  total_pages = size / FPM_PAGE_SIZE;
1223  metadata_bytes =
1224  MAXALIGN(sizeof(dsa_area_control)) +
1225  MAXALIGN(sizeof(FreePageManager)) +
1226  total_pages * sizeof(dsa_pointer);
1227  /* Add padding up to next page boundary. */
1228  if (metadata_bytes % FPM_PAGE_SIZE != 0)
1229  metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
1230  Assert(metadata_bytes <= size);
1231  usable_pages = (size - metadata_bytes) / FPM_PAGE_SIZE;
1232 
1233  /*
1234  * Initialize the dsa_area_control object located at the start of the
1235  * space.
1236  */
1237  control = (dsa_area_control *) place;
1238  memset(place, 0, sizeof(*control));
1239  control->segment_header.magic =
1240  DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0;
1243  control->segment_header.usable_pages = usable_pages;
1244  control->segment_header.freed = false;
1246  control->handle = control_handle;
1247  control->max_total_segment_size = (size_t) -1;
1248  control->total_segment_size = size;
1249  control->segment_handles[0] = control_handle;
1250  for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1252  control->refcnt = 1;
1253  control->lwlock_tranche_id = tranche_id;
1254 
1255  /*
1256  * Create the dsa_area object that this backend will use to access the
1257  * area. Other backends will need to obtain their own dsa_area object by
1258  * attaching.
1259  */
1260  area = palloc(sizeof(dsa_area));
1261  area->control = control;
1262  area->mapping_pinned = false;
1263  memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1264  area->high_segment_index = 0;
1265  area->freed_segment_counter = 0;
1266  LWLockInitialize(&control->lock, control->lwlock_tranche_id);
1267  for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1269  control->lwlock_tranche_id);
1270 
1271  /* Set up the segment map for this process's mapping. */
1272  segment_map = &area->segment_maps[0];
1273  segment_map->segment = control_segment;
1274  segment_map->mapped_address = place;
1275  segment_map->header = (dsa_segment_header *) place;
1276  segment_map->fpm = (FreePageManager *)
1277  (segment_map->mapped_address +
1278  MAXALIGN(sizeof(dsa_area_control)));
1279  segment_map->pagemap = (dsa_pointer *)
1280  (segment_map->mapped_address +
1281  MAXALIGN(sizeof(dsa_area_control)) +
1282  MAXALIGN(sizeof(FreePageManager)));
1283 
1284  /* Set up the free page map. */
1285  FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
1286  /* There can be 0 usable pages if size is dsa_minimum_size(). */
1287 
1288  if (usable_pages > 0)
1289  FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
1290  usable_pages);
1291 
1292  /* Put this segment into the appropriate bin. */
1293  control->segment_bins[contiguous_pages_to_segment_bin(usable_pages)] = 0;
1294  segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
1295 
1296  return area;
1297 }
1298 
1299 /*
1300  * Workhorse function for dsa_attach and dsa_attach_in_place.
1301  */
1302 static dsa_area *
1303 attach_internal(void *place, dsm_segment *segment, dsa_handle handle)
1304 {
1305  dsa_area_control *control;
1306  dsa_area *area;
1307  dsa_segment_map *segment_map;
1308 
1309  control = (dsa_area_control *) place;
1310  Assert(control->handle == handle);
1311  Assert(control->segment_handles[0] == handle);
1312  Assert(control->segment_header.magic ==
1313  (DSA_SEGMENT_HEADER_MAGIC ^ handle ^ 0));
1314 
1315  /* Build the backend-local area object. */
1316  area = palloc(sizeof(dsa_area));
1317  area->control = control;
1318  area->mapping_pinned = false;
1319  memset(&area->segment_maps[0], 0,
1320  sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1321  area->high_segment_index = 0;
1322 
1323  /* Set up the segment map for this process's mapping. */
1324  segment_map = &area->segment_maps[0];
1325  segment_map->segment = segment; /* NULL for in-place */
1326  segment_map->mapped_address = place;
1327  segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
1328  segment_map->fpm = (FreePageManager *)
1329  (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)));
1330  segment_map->pagemap = (dsa_pointer *)
1331  (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)) +
1332  MAXALIGN(sizeof(FreePageManager)));
1333 
1334  /* Bump the reference count. */
1336  if (control->refcnt == 0)
1337  {
1338  /* We can't attach to a DSA area that has already been destroyed. */
1339  ereport(ERROR,
1340  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1341  errmsg("could not attach to dynamic shared area")));
1342  }
1343  ++control->refcnt;
1346 
1347  return area;
1348 }
1349 
1350 /*
1351  * Add a new span to fullness class 1 of the indicated pool.
1352  */
1353 static void
1355  dsa_pointer span_pointer,
1356  dsa_area_pool *pool, dsa_pointer start, size_t npages,
1357  uint16 size_class)
1358 {
1359  dsa_area_span *span = dsa_get_address(area, span_pointer);
1360  size_t obsize = dsa_size_classes[size_class];
1361 
1362  /*
1363  * The per-pool lock must be held because we manipulate the span list for
1364  * this pool.
1365  */
1366  Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1367 
1368  /* Push this span onto the front of the span list for fullness class 1. */
1369  if (DsaPointerIsValid(pool->spans[1]))
1370  {
1371  dsa_area_span *head = (dsa_area_span *)
1372  dsa_get_address(area, pool->spans[1]);
1373 
1374  head->prevspan = span_pointer;
1375  }
1376  span->pool = DsaAreaPoolToDsaPointer(area, pool);
1377  span->nextspan = pool->spans[1];
1378  span->prevspan = InvalidDsaPointer;
1379  pool->spans[1] = span_pointer;
1380 
1381  span->start = start;
1382  span->npages = npages;
1383  span->size_class = size_class;
1384  span->ninitialized = 0;
1385  if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1386  {
1387  /*
1388  * A block-of-spans contains its own descriptor, so mark one object as
1389  * initialized and reduce the count of allocatable objects by one.
1390  * Doing this here has the side effect of also reducing nmax by one,
1391  * which is important to make sure we free this object at the correct
1392  * time.
1393  */
1394  span->ninitialized = 1;
1395  span->nallocatable = FPM_PAGE_SIZE / obsize - 1;
1396  }
1397  else if (size_class != DSA_SCLASS_SPAN_LARGE)
1398  span->nallocatable = DSA_SUPERBLOCK_SIZE / obsize;
1400  span->nmax = span->nallocatable;
1401  span->fclass = 1;
1402 }
1403 
1404 /*
1405  * Transfer the first span in one fullness class to the head of another
1406  * fullness class.
1407  */
1408 static bool
1410  dsa_area_pool *pool, int fromclass, int toclass)
1411 {
1412  dsa_pointer span_pointer;
1413  dsa_area_span *span;
1414  dsa_area_span *nextspan;
1415 
1416  /* Can't do it if source list is empty. */
1417  span_pointer = pool->spans[fromclass];
1418  if (!DsaPointerIsValid(span_pointer))
1419  return false;
1420 
1421  /* Remove span from head of source list. */
1422  span = dsa_get_address(area, span_pointer);
1423  pool->spans[fromclass] = span->nextspan;
1424  if (DsaPointerIsValid(span->nextspan))
1425  {
1426  nextspan = (dsa_area_span *)
1427  dsa_get_address(area, span->nextspan);
1428  nextspan->prevspan = InvalidDsaPointer;
1429  }
1430 
1431  /* Add span to head of target list. */
1432  span->nextspan = pool->spans[toclass];
1433  pool->spans[toclass] = span_pointer;
1434  if (DsaPointerIsValid(span->nextspan))
1435  {
1436  nextspan = (dsa_area_span *)
1437  dsa_get_address(area, span->nextspan);
1438  nextspan->prevspan = span_pointer;
1439  }
1440  span->fclass = toclass;
1441 
1442  return true;
1443 }
1444 
1445 /*
1446  * Allocate one object of the requested size class from the given area.
1447  */
1448 static inline dsa_pointer
1449 alloc_object(dsa_area *area, int size_class)
1450 {
1451  dsa_area_pool *pool = &area->control->pools[size_class];
1452  dsa_area_span *span;
1453  dsa_pointer block;
1454  dsa_pointer result;
1455  char *object;
1456  size_t size;
1457 
1458  /*
1459  * Even though ensure_active_superblock can in turn call alloc_object if
1460  * it needs to allocate a new span, that's always from a different pool,
1461  * and the order of lock acquisition is always the same, so it's OK that
1462  * we hold this lock for the duration of this function.
1463  */
1464  Assert(!LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1465  LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1466 
1467  /*
1468  * If there's no active superblock, we must successfully obtain one or
1469  * fail the request.
1470  */
1471  if (!DsaPointerIsValid(pool->spans[1]) &&
1472  !ensure_active_superblock(area, pool, size_class))
1473  {
1474  result = InvalidDsaPointer;
1475  }
1476  else
1477  {
1478  /*
1479  * There should be a block in fullness class 1 at this point, and it
1480  * should never be completely full. Thus we can either pop an object
1481  * from the free list or, failing that, initialize a new object.
1482  */
1483  Assert(DsaPointerIsValid(pool->spans[1]));
1484  span = (dsa_area_span *)
1485  dsa_get_address(area, pool->spans[1]);
1486  Assert(span->nallocatable > 0);
1487  block = span->start;
1488  Assert(size_class < DSA_NUM_SIZE_CLASSES);
1489  size = dsa_size_classes[size_class];
1490  if (span->firstfree != DSA_SPAN_NOTHING_FREE)
1491  {
1492  result = block + span->firstfree * size;
1493  object = dsa_get_address(area, result);
1494  span->firstfree = NextFreeObjectIndex(object);
1495  }
1496  else
1497  {
1498  result = block + span->ninitialized * size;
1499  ++span->ninitialized;
1500  }
1501  --span->nallocatable;
1502 
1503  /* If it's now full, move it to the highest-numbered fullness class. */
1504  if (span->nallocatable == 0)
1505  transfer_first_span(area, pool, 1, DSA_FULLNESS_CLASSES - 1);
1506  }
1507 
1508  Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1509  LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1510 
1511  return result;
1512 }
1513 
1514 /*
1515  * Ensure an active (i.e. fullness class 1) superblock, unless all existing
1516  * superblocks are completely full and no more can be allocated.
1517  *
1518  * Fullness classes K of 0..N are loosely intended to represent blocks whose
1519  * utilization percentage is at least K/N, but we only enforce this rigorously
1520  * for the highest-numbered fullness class, which always contains exactly
1521  * those blocks that are completely full. It's otherwise acceptable for a
1522  * block to be in a higher-numbered fullness class than the one to which it
1523  * logically belongs. In addition, the active block, which is always the
1524  * first block in fullness class 1, is permitted to have a higher allocation
1525  * percentage than would normally be allowable for that fullness class; we
1526  * don't move it until it's completely full, and then it goes to the
1527  * highest-numbered fullness class.
1528  *
1529  * It might seem odd that the active block is the head of fullness class 1
1530  * rather than fullness class 0, but experience with other allocators has
1531  * shown that it's usually better to allocate from a block that's moderately
1532  * full rather than one that's nearly empty. Insofar as is reasonably
1533  * possible, we want to avoid performing new allocations in a block that would
1534  * otherwise become empty soon.
1535  */
1536 static bool
1538  int size_class)
1539 {
1540  dsa_pointer span_pointer;
1541  dsa_pointer start_pointer;
1542  size_t obsize = dsa_size_classes[size_class];
1543  size_t nmax;
1544  int fclass;
1545  size_t npages = 1;
1546  size_t first_page;
1547  size_t i;
1548  dsa_segment_map *segment_map;
1549 
1550  Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1551 
1552  /*
1553  * Compute the number of objects that will fit in a block of this size
1554  * class. Span-of-spans blocks are just a single page, and the first
1555  * object isn't available for use because it describes the block-of-spans
1556  * itself.
1557  */
1558  if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1559  nmax = FPM_PAGE_SIZE / obsize - 1;
1560  else
1561  nmax = DSA_SUPERBLOCK_SIZE / obsize;
1562 
1563  /*
1564  * If fullness class 1 is empty, try to find a span to put in it by
1565  * scanning higher-numbered fullness classes (excluding the last one,
1566  * whose blocks are certain to all be completely full).
1567  */
1568  for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1569  {
1570  span_pointer = pool->spans[fclass];
1571 
1572  while (DsaPointerIsValid(span_pointer))
1573  {
1574  int tfclass;
1575  dsa_area_span *span;
1576  dsa_area_span *nextspan;
1577  dsa_area_span *prevspan;
1578  dsa_pointer next_span_pointer;
1579 
1580  span = (dsa_area_span *)
1581  dsa_get_address(area, span_pointer);
1582  next_span_pointer = span->nextspan;
1583 
1584  /* Figure out what fullness class should contain this span. */
1585  tfclass = (nmax - span->nallocatable)
1586  * (DSA_FULLNESS_CLASSES - 1) / nmax;
1587 
1588  /* Look up next span. */
1589  if (DsaPointerIsValid(span->nextspan))
1590  nextspan = (dsa_area_span *)
1591  dsa_get_address(area, span->nextspan);
1592  else
1593  nextspan = NULL;
1594 
1595  /*
1596  * If utilization has dropped enough that this now belongs in some
1597  * other fullness class, move it there.
1598  */
1599  if (tfclass < fclass)
1600  {
1601  /* Remove from the current fullness class list. */
1602  if (pool->spans[fclass] == span_pointer)
1603  {
1604  /* It was the head; remove it. */
1606  pool->spans[fclass] = span->nextspan;
1607  if (nextspan != NULL)
1608  nextspan->prevspan = InvalidDsaPointer;
1609  }
1610  else
1611  {
1612  /* It was not the head. */
1614  prevspan = (dsa_area_span *)
1615  dsa_get_address(area, span->prevspan);
1616  prevspan->nextspan = span->nextspan;
1617  }
1618  if (nextspan != NULL)
1619  nextspan->prevspan = span->prevspan;
1620 
1621  /* Push onto the head of the new fullness class list. */
1622  span->nextspan = pool->spans[tfclass];
1623  pool->spans[tfclass] = span_pointer;
1624  span->prevspan = InvalidDsaPointer;
1625  if (DsaPointerIsValid(span->nextspan))
1626  {
1627  nextspan = (dsa_area_span *)
1628  dsa_get_address(area, span->nextspan);
1629  nextspan->prevspan = span_pointer;
1630  }
1631  span->fclass = tfclass;
1632  }
1633 
1634  /* Advance to next span on list. */
1635  span_pointer = next_span_pointer;
1636  }
1637 
1638  /* Stop now if we found a suitable block. */
1639  if (DsaPointerIsValid(pool->spans[1]))
1640  return true;
1641  }
1642 
1643  /*
1644  * If there are no blocks that properly belong in fullness class 1, pick
1645  * one from some other fullness class and move it there anyway, so that we
1646  * have an allocation target. Our last choice is to transfer a block
1647  * that's almost empty (and might become completely empty soon if left
1648  * alone), but even that is better than failing, which is what we must do
1649  * if there are no blocks at all with freespace.
1650  */
1651  Assert(!DsaPointerIsValid(pool->spans[1]));
1652  for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1653  if (transfer_first_span(area, pool, fclass, 1))
1654  return true;
1655  if (!DsaPointerIsValid(pool->spans[1]) &&
1656  transfer_first_span(area, pool, 0, 1))
1657  return true;
1658 
1659  /*
1660  * We failed to find an existing span with free objects, so we need to
1661  * allocate a new superblock and construct a new span to manage it.
1662  *
1663  * First, get a dsa_area_span object to describe the new superblock
1664  * ... unless this allocation is for a dsa_area_span object, in which case
1665  * that's surely not going to work. We handle that case by storing the
1666  * span describing a block-of-spans inline.
1667  */
1668  if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1669  {
1670  span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
1671  if (!DsaPointerIsValid(span_pointer))
1672  return false;
1673  npages = DSA_PAGES_PER_SUPERBLOCK;
1674  }
1675 
1676  /* Find or create a segment and allocate the superblock. */
1677  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1678  segment_map = get_best_segment(area, npages);
1679  if (segment_map == NULL)
1680  {
1681  segment_map = make_new_segment(area, npages);
1682  if (segment_map == NULL)
1683  {
1684  LWLockRelease(DSA_AREA_LOCK(area));
1685  return false;
1686  }
1687  }
1688 
1689  /*
1690  * This shouldn't happen: get_best_segment() or make_new_segment()
1691  * promised that we can successfully allocate npages.
1692  */
1693  if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
1694  elog(FATAL,
1695  "dsa_allocate could not find %zu free pages for superblock",
1696  npages);
1697  LWLockRelease(DSA_AREA_LOCK(area));
1698 
1699  /* Compute the start of the superblock. */
1700  start_pointer =
1701  DSA_MAKE_POINTER(get_segment_index(area, segment_map),
1702  first_page * FPM_PAGE_SIZE);
1703 
1704  /*
1705  * If this is a block-of-spans, carve the descriptor right out of the
1706  * allocated space.
1707  */
1708  if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1709  {
1710  /*
1711  * We have a pointer into the segment. We need to build a dsa_pointer
1712  * from the segment index and offset into the segment.
1713  */
1714  span_pointer = start_pointer;
1715  }
1716 
1717  /* Initialize span and pagemap. */
1718  init_span(area, span_pointer, pool, start_pointer, npages, size_class);
1719  for (i = 0; i < npages; ++i)
1720  segment_map->pagemap[first_page + i] = span_pointer;
1721 
1722  return true;
1723 }
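/*
 * A worked example of the fullness-class bucketing used above (illustrative
 * figures only, assuming DSA_FULLNESS_CLASSES is 4 and FPM_PAGE_SIZE is
 * 4096, i.e. the 4KB pages and 64KB superblocks described at the top of
 * this file):
 *
 *    tfclass = (nmax - nallocatable) * (DSA_FULLNESS_CLASSES - 1) / nmax
 *
 * For a size class with 64-byte objects, nmax = DSA_SUPERBLOCK_SIZE / 64 =
 * 65536 / 64 = 1024 objects per superblock.  A span with all 1024 objects
 * still allocatable maps to class 0, one with 700 allocatable also to
 * class 0 ((1024 - 700) * 3 / 1024 = 0), one with 300 allocatable to
 * class 2, and a completely full span to class 3, which is why the scan
 * above excludes the last fullness class.
 */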
1724 
1725 /*
1726  * Return the segment map corresponding to a given segment index, mapping the
1727  * segment in if necessary. For internal segment book-keeping, this is called
1728  * with the area lock held. It is also called by dsa_free and dsa_get_address
1729  * without any locking, relying on the fact they have a known live segment
1730  * index and they always call check_for_freed_segments to ensure that any
1731  * freed segment occupying the same slot is detached first.
1732  */
1733 static dsa_segment_map *
1734 get_segment_by_index(dsa_area *area, dsa_segment_index index)
1735 {
1736  if (unlikely(area->segment_maps[index].mapped_address == NULL))
1737  {
1738  dsm_handle handle;
1739  dsm_segment *segment;
1740  dsa_segment_map *segment_map;
1741 
1742  /*
1743  * If we are reached by dsa_free or dsa_get_address, there must be at
1744  * least one object allocated in the referenced segment. Otherwise,
1745  * their caller has a double-free or access-after-free bug, which we
1746  * have no hope of detecting. So we know it's safe to access this
1747  * array slot without holding a lock; it won't change underneath us.
1748  * Furthermore, we know that we can see the latest contents of the
1749  * slot, as explained in check_for_freed_segments, which those
1750  * functions call before arriving here.
1751  */
1752  handle = area->control->segment_handles[index];
1753 
1754  /* It's an error to try to access an unused slot. */
1755  if (handle == DSM_HANDLE_INVALID)
1756  elog(ERROR,
1757  "dsa_area could not attach to a segment that has been freed");
1758 
1759  segment = dsm_attach(handle);
1760  if (segment == NULL)
1761  elog(ERROR, "dsa_area could not attach to segment");
1762  if (area->mapping_pinned)
1763  dsm_pin_mapping(segment);
1764  segment_map = &area->segment_maps[index];
1765  segment_map->segment = segment;
1766  segment_map->mapped_address = dsm_segment_address(segment);
1767  segment_map->header =
1768  (dsa_segment_header *) segment_map->mapped_address;
1769  segment_map->fpm = (FreePageManager *)
1770  (segment_map->mapped_address +
1771  MAXALIGN(sizeof(dsa_segment_header)));
1772  segment_map->pagemap = (dsa_pointer *)
1773  (segment_map->mapped_address +
1774  MAXALIGN(sizeof(dsa_segment_header)) +
1775  MAXALIGN(sizeof(FreePageManager)));
1776 
1777  /* Remember the highest index this backend has ever mapped. */
1778  if (area->high_segment_index < index)
1779  area->high_segment_index = index;
1780 
1781  Assert(segment_map->header->magic ==
1782  (DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ index));
1783  }
1784 
1785  /*
1786  * Callers of dsa_get_address() and dsa_free() don't hold the area lock,
1787  * but it's a bug in the calling code and undefined behavior if the
1788  * address is not live (ie if the segment might possibly have been freed,
1789  * they're trying to use a dangling pointer).
1790  *
1791  * For dsa.c code that holds the area lock to manipulate segment_bins
1792  * lists, it would be a bug if we ever reach a freed segment here. After
1793  * it's marked as freed, the only thing any backend should do with it is
1794  * unmap it, and it should always have done that in
1795  * check_for_freed_segments_locked() before arriving here to resolve an
1796  * index to a segment_map.
1797  *
1798  * Either way we can assert that we aren't returning a freed segment.
1799  */
1800  Assert(!area->segment_maps[index].header->freed);
1801 
1802  return &area->segment_maps[index];
1803 }
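/*
 * For illustration, the caller-side resolution of a dsa_pointer is roughly
 * the following (a sketch of what dsa_get_address(), defined earlier in
 * this file, does -- not the verbatim implementation):
 *
 *     check_for_freed_segments(area);
 *     index  = DSA_EXTRACT_SEGMENT_NUMBER(dp);
 *     offset = DSA_EXTRACT_OFFSET(dp);
 *     segment_map = get_segment_by_index(area, index);
 *     return segment_map->mapped_address + offset;
 */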
1804 
1805 /*
1806  * Return a superblock to the free page manager. If the underlying segment
1807  * has become entirely free, then return it to the operating system.
1808  *
1809  * The appropriate pool lock must be held.
1810  */
1811 static void
1812 destroy_superblock(dsa_area *area, dsa_pointer span_pointer)
1813 {
1814  dsa_area_span *span = dsa_get_address(area, span_pointer);
1815  int size_class = span->size_class;
1816  dsa_segment_map *segment_map;
1817 
1818 
1819  /* Remove it from its fullness class list. */
1820  unlink_span(area, span);
1821 
1822  /*
1823  * Note: Here we acquire the area lock while we already hold a per-pool
1824  * lock. We never hold the area lock and then take a pool lock, or we
1825  * could deadlock.
1826  */
1827  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1828  check_for_freed_segments_locked(area);
1829  segment_map =
1830  get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start));
1831  FreePageManagerPut(segment_map->fpm,
1832  DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
1833  span->npages);
1834  /* Check if the segment is now entirely free. */
1835  if (fpm_largest(segment_map->fpm) == segment_map->header->usable_pages)
1836  {
1837  dsa_segment_index index = get_segment_index(area, segment_map);
1838 
1839  /* If it's not the segment with extra control data, free it. */
1840  if (index != 0)
1841  {
1842  /*
1843  * Give it back to the OS, and allow other backends to detect that
1844  * they need to detach.
1845  */
1846  unlink_segment(area, segment_map);
1847  segment_map->header->freed = true;
1848  Assert(area->control->total_segment_size >=
1849  segment_map->header->size);
1850  area->control->total_segment_size -=
1851  segment_map->header->size;
1852  dsm_unpin_segment(dsm_segment_handle(segment_map->segment));
1853  dsm_detach(segment_map->segment);
1854  area->control->segment_handles[index] = DSM_HANDLE_INVALID;
1855  ++area->control->freed_segment_counter;
1856  segment_map->segment = NULL;
1857  segment_map->header = NULL;
1858  segment_map->mapped_address = NULL;
1859  }
1860  }
1861  LWLockRelease(DSA_AREA_LOCK(area));
1862 
1863  /*
1864  * Span-of-spans blocks store the span which describes them within the
1865  * block itself, so freeing the storage implicitly frees the descriptor
1866  * also. If this is a block of any other type, we need to separately free
1867  * the span object also. This recursive call to dsa_free will acquire the
1868  * span pool's lock. We can't deadlock because the acquisition order is
1869  * always some other pool and then the span pool.
1870  */
1871  if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1872  dsa_free(area, span_pointer);
1873 }
1874 
1875 static void
1876 unlink_span(dsa_area *area, dsa_area_span *span)
1877 {
1878  if (DsaPointerIsValid(span->nextspan))
1879  {
1880  dsa_area_span *next = dsa_get_address(area, span->nextspan);
1881 
1882  next->prevspan = span->prevspan;
1883  }
1884  if (DsaPointerIsValid(span->prevspan))
1885  {
1886  dsa_area_span *prev = dsa_get_address(area, span->prevspan);
1887 
1888  prev->nextspan = span->nextspan;
1889  }
1890  else
1891  {
1892  dsa_area_pool *pool = dsa_get_address(area, span->pool);
1893 
1894  pool->spans[span->fclass] = span->nextspan;
1895  }
1896 }
1897 
1898 static void
1899 add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
1900  dsa_pointer span_pointer,
1901  int fclass)
1902 {
1903  dsa_area_pool *pool = dsa_get_address(area, span->pool);
1904 
1905  if (DsaPointerIsValid(pool->spans[fclass]))
1906  {
1907  dsa_area_span *head = dsa_get_address(area,
1908  pool->spans[fclass]);
1909 
1910  head->prevspan = span_pointer;
1911  }
1912  span->prevspan = InvalidDsaPointer;
1913  span->nextspan = pool->spans[fclass];
1914  pool->spans[fclass] = span_pointer;
1915  span->fclass = fclass;
1916 }
1917 
1918 /*
1919  * Detach from an area that was either created or attached to by this process.
1920  */
1921 void
1922 dsa_detach(dsa_area *area)
1923 {
1924  int i;
1925 
1926  /* Detach from all segments. */
1927  for (i = 0; i <= area->high_segment_index; ++i)
1928  if (area->segment_maps[i].segment != NULL)
1929  dsm_detach(area->segment_maps[i].segment);
1930 
1931  /*
1932  * Note that 'detaching' (= detaching from DSM segments) doesn't include
1933  * 'releasing' (= adjusting the reference count). It would be nice to
1934  * combine these operations, but client code might never get around to
1935  * calling dsa_detach because of an error path, and a detach hook on any
1936  * particular segment is too late to detach other segments in the area
1937  * without risking a 'leak' warning in the non-error path.
1938  */
1939 
1940  /* Free the backend-local area object. */
1941  pfree(area);
1942 }
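/*
 * A minimal usage sketch of the lifecycle implemented by this file
 * (illustrative only: the function name and tranche name below are made up,
 * and error handling is omitted):
 */
#include "postgres.h"

#include "storage/lwlock.h"
#include "utils/dsa.h"

static void
dsa_usage_sketch(void)
{
	int			tranche_id = LWLockNewTrancheId();
	dsa_area   *area;
	dsa_pointer dp;
	char	   *mem;

	LWLockRegisterTranche(tranche_id, "dsa_usage_sketch");

	/* Create an area backed by a new DSM segment. */
	area = dsa_create(tranche_id);

	/* Allocate shared memory and convert the dsa_pointer to a local address. */
	dp = dsa_allocate(area, 64);
	mem = (char *) dsa_get_address(area, dp);
	snprintf(mem, 64, "hello");

	/* 'dp', not 'mem', is what may be handed to other backends. */

	dsa_free(area, dp);
	dsa_detach(area);
}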
1943 
1944 /*
1945  * Unlink a segment from the bin that contains it.
1946  */
1947 static void
1948 unlink_segment(dsa_area *area, dsa_segment_map *segment_map)
1949 {
1950  if (segment_map->header->prev != DSA_SEGMENT_INDEX_NONE)
1951  {
1952  dsa_segment_map *prev;
1953 
1954  prev = get_segment_by_index(area, segment_map->header->prev);
1955  prev->header->next = segment_map->header->next;
1956  }
1957  else
1958  {
1959  Assert(area->control->segment_bins[segment_map->header->bin] ==
1960  get_segment_index(area, segment_map));
1961  area->control->segment_bins[segment_map->header->bin] =
1962  segment_map->header->next;
1963  }
1964  if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
1965  {
1966  dsa_segment_map *next;
1967 
1968  next = get_segment_by_index(area, segment_map->header->next);
1969  next->header->prev = segment_map->header->prev;
1970  }
1971 }
1972 
1973 /*
1974  * Find a segment that could satisfy a request for 'npages' of contiguous
1975  * memory, or return NULL if none can be found. This may involve attaching to
1976  * segments that weren't previously attached so that we can query their free
1977  * pages map.
1978  */
1979 static dsa_segment_map *
1980 get_best_segment(dsa_area *area, size_t npages)
1981 {
1982  size_t bin;
1983 
1984  Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
1985  check_for_freed_segments_locked(area);
1986 
1987  /*
1988  * Start searching from the first bin that *might* have enough contiguous
1989  * pages.
1990  */
1991  for (bin = contiguous_pages_to_segment_bin(npages);
1992  bin < DSA_NUM_SEGMENT_BINS;
1993  ++bin)
1994  {
1995  /*
1996  * The minimum contiguous size that any segment in this bin should
1997  * have. We'll re-bin if we see segments with fewer.
1998  */
1999  size_t threshold = (size_t) 1 << (bin - 1);
2000  dsa_segment_index segment_index;
2001 
2002  /* Search this bin for a segment with enough contiguous space. */
2003  segment_index = area->control->segment_bins[bin];
2004  while (segment_index != DSA_SEGMENT_INDEX_NONE)
2005  {
2006  dsa_segment_map *segment_map;
2007  dsa_segment_index next_segment_index;
2008  size_t contiguous_pages;
2009 
2010  segment_map = get_segment_by_index(area, segment_index);
2011  next_segment_index = segment_map->header->next;
2012  contiguous_pages = fpm_largest(segment_map->fpm);
2013 
2014  /* Not enough for the request, still enough for this bin. */
2015  if (contiguous_pages >= threshold && contiguous_pages < npages)
2016  {
2017  segment_index = next_segment_index;
2018  continue;
2019  }
2020 
2021  /* Re-bin it if it's no longer in the appropriate bin. */
2022  if (contiguous_pages < threshold)
2023  {
2024  size_t new_bin;
2025 
2026  new_bin = contiguous_pages_to_segment_bin(contiguous_pages);
2027 
2028  /* Remove it from its current bin. */
2029  unlink_segment(area, segment_map);
2030 
2031  /* Push it onto the front of its new bin. */
2032  segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2033  segment_map->header->next =
2034  area->control->segment_bins[new_bin];
2035  segment_map->header->bin = new_bin;
2036  area->control->segment_bins[new_bin] = segment_index;
2037  if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2038  {
2039  dsa_segment_map *next;
2040 
2041  next = get_segment_by_index(area,
2042  segment_map->header->next);
2043  Assert(next->header->bin == new_bin);
2044  next->header->prev = segment_index;
2045  }
2046 
2047  /*
2048  * But fall through to see if it's enough to satisfy this
2049  * request anyway....
2050  */
2051  }
2052 
2053  /* Check if we are done. */
2054  if (contiguous_pages >= npages)
2055  return segment_map;
2056 
2057  /* Continue searching the same bin. */
2058  segment_index = next_segment_index;
2059  }
2060  }
2061 
2062  /* Not found. */
2063  return NULL;
2064 }
2065 
2066 /*
2067  * Create a new segment that can handle at least requested_pages. Returns
2068  * NULL if the requested total size limit or maximum allowed number of
2069  * segments would be exceeded.
2070  */
2071 static dsa_segment_map *
2072 make_new_segment(dsa_area *area, size_t requested_pages)
2073 {
2074  dsa_segment_index new_index;
2075  size_t metadata_bytes;
2076  size_t total_size;
2077  size_t total_pages;
2078  size_t usable_pages;
2079  dsa_segment_map *segment_map;
2080  dsm_segment *segment;
2081 
2082  Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2083 
2084  /* Find a segment slot that is not in use (linearly for now). */
2085  for (new_index = 1; new_index < DSA_MAX_SEGMENTS; ++new_index)
2086  {
2087  if (area->control->segment_handles[new_index] == DSM_HANDLE_INVALID)
2088  break;
2089  }
2090  if (new_index == DSA_MAX_SEGMENTS)
2091  return NULL;
2092 
2093  /*
2094  * If the total size limit is already exceeded, then we exit early and
2095  * avoid arithmetic wraparound in the unsigned expressions below.
2096  */
2097  if (area->control->total_segment_size >=
2098  area->control->max_total_segment_size)
2099  return NULL;
2100 
2101  /*
2102  * The size should be at least as big as requested, and at least big
2103  * enough to follow a geometric series that approximately doubles the
2104  * total storage each time we create a new segment. We use geometric
2105  * growth because the underlying DSM system isn't designed for large
2106  * numbers of segments (otherwise we might even consider just using one
2107  * DSM segment for each large allocation and for each superblock, and then
2108  * we wouldn't need to use FreePageManager).
2109  *
2110  * We decide on a total segment size first, so that we produce tidy
2111  * power-of-two sized segments. This is a good property to have if we
2112  * move to huge pages in the future. Then we work back to the number of
2113  * pages we can fit.
2114  */
2115  total_size = DSA_INITIAL_SEGMENT_SIZE *
2116  ((size_t) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE));
2117  total_size = Min(total_size, DSA_MAX_SEGMENT_SIZE);
2118  total_size = Min(total_size,
2119  area->control->max_total_segment_size -
2120  area->control->total_segment_size);
2121 
2122  total_pages = total_size / FPM_PAGE_SIZE;
2123  metadata_bytes =
2124  MAXALIGN(sizeof(dsa_segment_header)) +
2125  MAXALIGN(sizeof(FreePageManager)) +
2126  sizeof(dsa_pointer) * total_pages;
2127 
2128  /* Add padding up to next page boundary. */
2129  if (metadata_bytes % FPM_PAGE_SIZE != 0)
2130  metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2131  if (total_size <= metadata_bytes)
2132  return NULL;
2133  usable_pages = (total_size - metadata_bytes) / FPM_PAGE_SIZE;
2134  Assert(metadata_bytes + usable_pages * FPM_PAGE_SIZE <= total_size);
2135 
2136  /* See if that is enough... */
2137  if (requested_pages > usable_pages)
2138  {
2139  /*
2140  * We'll make an odd-sized segment, working forward from the requested
2141  * number of pages.
2142  */
2143  usable_pages = requested_pages;
2144  metadata_bytes =
2145  MAXALIGN(sizeof(dsa_segment_header)) +
2146  MAXALIGN(sizeof(FreePageManager)) +
2147  usable_pages * sizeof(dsa_pointer);
2148 
2149  /* Add padding up to next page boundary. */
2150  if (metadata_bytes % FPM_PAGE_SIZE != 0)
2151  metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2152  total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE;
2153 
2154  /* Is that too large for dsa_pointer's addressing scheme? */
2155  if (total_size > DSA_MAX_SEGMENT_SIZE)
2156  return NULL;
2157 
2158  /* Would that exceed the limit? */
2159  if (total_size > area->control->max_total_segment_size -
2160  area->control->total_segment_size)
2161  return NULL;
2162  }
2163 
2164  /* Create the segment. */
2165  segment = dsm_create(total_size, 0);
2166  if (segment == NULL)
2167  return NULL;
2168  dsm_pin_segment(segment);
2169  if (area->mapping_pinned)
2170  dsm_pin_mapping(segment);
2171 
2172  /* Store the handle in shared memory to be found by index. */
2173  area->control->segment_handles[new_index] =
2174  dsm_segment_handle(segment);
2175  /* Track the highest segment index in the history of the area. */
2176  if (area->control->high_segment_index < new_index)
2177  area->control->high_segment_index = new_index;
2178  /* Track the highest segment index this backend has ever mapped. */
2179  if (area->high_segment_index < new_index)
2180  area->high_segment_index = new_index;
2181  /* Track total size of all segments. */
2182  area->control->total_segment_size += total_size;
2183  Assert(area->control->total_segment_size <=
2184  area->control->max_total_segment_size);
2185 
2186  /* Build a segment map for this segment in this backend. */
2187  segment_map = &area->segment_maps[new_index];
2188  segment_map->segment = segment;
2189  segment_map->mapped_address = dsm_segment_address(segment);
2190  segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
2191  segment_map->fpm = (FreePageManager *)
2192  (segment_map->mapped_address +
2193  MAXALIGN(sizeof(dsa_segment_header)));
2194  segment_map->pagemap = (dsa_pointer *)
2195  (segment_map->mapped_address +
2196  MAXALIGN(sizeof(dsa_segment_header)) +
2197  MAXALIGN(sizeof(FreePageManager)));
2198 
2199  /* Set up the free page map. */
2200  FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
2201  FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
2202  usable_pages);
2203 
2204  /* Set up the segment header and put it in the appropriate bin. */
2205  segment_map->header->magic =
2206  DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ new_index;
2207  segment_map->header->usable_pages = usable_pages;
2208  segment_map->header->size = total_size;
2209  segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
2210  segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2211  segment_map->header->next =
2212  area->control->segment_bins[segment_map->header->bin];
2213  segment_map->header->freed = false;
2214  area->control->segment_bins[segment_map->header->bin] = new_index;
2215  if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2216  {
2217  dsa_segment_map *next =
2218  get_segment_by_index(area, segment_map->header->next);
2219 
2220  Assert(next->header->bin == segment_map->header->bin);
2221  next->header->prev = new_index;
2222  }
2223 
2224  return segment_map;
2225 }
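/*
 * A worked example of the metadata layout computed above, assuming 8-byte
 * dsa_pointers and 4096-byte FPM pages: a 16MB segment has 4096 pages, so
 * its pagemap needs 4096 * 8 = 32768 bytes; together with the segment
 * header and the FreePageManager, and rounded up to the next page boundary,
 * the metadata occupies a handful of pages at the start of the segment,
 * and the remainder becomes the usable_pages handed to FreePageManagerPut()
 * above.
 */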
2226 
2227 /*
2228  * Check if any segments have been freed by destroy_superblock, so we can
2229  * detach from them in this backend. This function is called by
2230  * dsa_get_address and dsa_free to make sure that a dsa_pointer they have
2231  * received can be resolved to the correct segment.
2232  *
2233  * The danger we want to defend against is that there could be an old segment
2234  * mapped into a given slot in this backend, and the dsa_pointer they have
2235  * might refer to some new segment in the same slot. So those functions must
2236  * be sure to process all instructions to detach from a freed segment that had
2237  * been generated by the time this process received the dsa_pointer, before
2238  * they call get_segment_by_index.
2239  */
2240 static void
2241 check_for_freed_segments(dsa_area *area)
2242 {
2243  size_t freed_segment_counter;
2244 
2245  /*
2246  * Any other process that has freed a segment has incremented
2247  * freed_segment_counter while holding an LWLock, and that must precede
2248  * any backend creating a new segment in the same slot while holding an
2249  * LWLock, and that must precede the creation of any dsa_pointer pointing
2250  * into the new segment which might reach us here, and the caller must
2251  * have sent the dsa_pointer to this process using appropriate memory
2252  * synchronization (some kind of locking or atomic primitive or system
2253  * call). So all we need to do on the reading side is ask for the load of
2254  * freed_segment_counter to follow the caller's load of the dsa_pointer it
2255  * has, and we can be sure to detect any segments that had been freed as
2256  * of the time that the dsa_pointer reached this process.
2257  */
2258  pg_read_barrier();
2259  freed_segment_counter = area->control->freed_segment_counter;
2260  if (unlikely(area->freed_segment_counter != freed_segment_counter))
2261  {
2262  /* Check all currently mapped segments to find what's been freed. */
2263  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
2264  check_for_freed_segments_locked(area);
2265  LWLockRelease(DSA_AREA_LOCK(area));
2266  }
2267 }
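/*
 * A concrete scenario for the check above: backend A frees the last
 * superblock in segment slot 3 (bumping freed_segment_counter), some
 * backend then creates a new segment that reuses slot 3, and backend B
 * receives a dsa_pointer into that new segment.  Because B's load of the
 * counter is ordered after its receipt of the dsa_pointer, B notices the
 * counter change, detaches its stale mapping of the old slot-3 segment,
 * and get_segment_by_index() then attaches the new one.
 */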
2268 
2269 /*
2270  * Workhorse for check_for_freed_segments(), and also used directly in paths
2271  * where the area lock is already held. This should be called after acquiring
2272  * the lock but before looking up any segment by index number, to make sure we
2273  * unmap any stale segments that might have previously had the same index as a
2274  * current segment.
2275  */
2276 static void
2277 check_for_freed_segments_locked(dsa_area *area)
2278 {
2279  size_t freed_segment_counter;
2280  int i;
2281 
2282  Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2283  freed_segment_counter = area->control->freed_segment_counter;
2284  if (unlikely(area->freed_segment_counter != freed_segment_counter))
2285  {
2286  for (i = 0; i <= area->high_segment_index; ++i)
2287  {
2288  if (area->segment_maps[i].header != NULL &&
2289  area->segment_maps[i].header->freed)
2290  {
2291  dsm_detach(area->segment_maps[i].segment);
2292  area->segment_maps[i].segment = NULL;
2293  area->segment_maps[i].header = NULL;
2294  area->segment_maps[i].mapped_address = NULL;
2295  }
2296  }
2297  area->freed_segment_counter = freed_segment_counter;
2298  }
2299 }