dsa.c
1 /*-------------------------------------------------------------------------
2  *
3  * dsa.c
4  * Dynamic shared memory areas.
5  *
6  * This module provides dynamic shared memory areas which are built on top of
7  * DSM segments. While dsm.c allows segments of memory to be
8  * created and shared between backends, it isn't designed to deal with small
9  * objects. A DSA area is a shared memory heap usually backed by one or more
10  * DSM segments which can allocate memory using dsa_allocate() and dsa_free().
11  * Alternatively, it can be created in pre-existing shared memory, including a
12  * DSM segment, and then create extra DSM segments as required. Unlike the
13  * regular system heap, it deals in pseudo-pointers which must be converted to
14  * backend-local pointers before they are dereferenced. These pseudo-pointers
15  * can however be shared with other backends, and can be used to construct
16  * shared data structures.
17  *
18  * Each DSA area manages a set of DSM segments, adding new segments as
19  * required and detaching them when they are no longer needed. Each segment
20  * contains a number of 4KB pages, a free page manager for tracking
21  * consecutive runs of free pages, and a page map for tracking the source of
22  * objects allocated on each page. Allocation requests above 8KB are handled
23  * by choosing a segment and finding consecutive free pages in its free page
24  * manager. Allocation requests for smaller sizes are handled using pools of
25  * objects of a selection of sizes. Each pool consists of a number of 16 page
26  * (64KB) superblocks allocated in the same way as large objects. Allocation
27  * of large objects and new superblocks is serialized by a single LWLock, but
28  * allocation of small objects from pre-existing superblocks uses one LWLock
29  * per pool. Currently there is one pool, and therefore one lock, per size
30  * class. Per-core pools to increase concurrency and strategies for reducing
31  * the resulting fragmentation are areas for future research. Each superblock
32  * is managed with a 'span', which tracks the superblock's freelist. Free
33  * requests are handled by looking in the page map to find which span an
34  * address was allocated from, so that small objects can be returned to the
35  * appropriate free list, and large object pages can be returned directly to
36  * the free page map. When allocating, simple heuristics for selecting
37  * segments and superblocks try to encourage occupied memory to be
38  * concentrated, increasing the likelihood that whole superblocks can become
39  * empty and be returned to the free page manager, and whole segments can
40  * become empty and be returned to the operating system.
41  *
42  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
43  * Portions Copyright (c) 1994, Regents of the University of California
44  *
45  * IDENTIFICATION
46  * src/backend/utils/mmgr/dsa.c
47  *
48  *-------------------------------------------------------------------------
49  */
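/*
 * A minimal usage sketch (not part of the original header; it assumes the
 * caller obtains a tranche id, for example via LWLockNewTrancheId(), and
 * the variable names are illustrative). One backend creates an area and
 * shares pseudo-pointers; every access converts them to local pointers:
 *
 *	int tranche_id = LWLockNewTrancheId();
 *	dsa_area *area = dsa_create(tranche_id);
 *	dsa_pointer dp = dsa_allocate(area, 100);	-- shareable pseudo-pointer
 *	char *p = (char *) dsa_get_address(area, dp);	-- backend-local address
 *	dsa_free(area, dp);
 */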
50 
51 #include "postgres.h"
52 
53 #include "port/atomics.h"
54 #include "port/pg_bitutils.h"
55 #include "storage/dsm.h"
56 #include "storage/lwlock.h"
57 #include "utils/dsa.h"
58 #include "utils/freepage.h"
59 #include "utils/memutils.h"
60 #include "utils/resowner.h"
61 
62 /*
63  * The size of the initial DSM segment that backs a dsa_area created by
64  * dsa_create. After creating some number of segments of this size we'll
65  * double this size, and so on. Larger segments may be created if necessary
66  * to satisfy large requests.
67  */
68 #define DSA_INITIAL_SEGMENT_SIZE ((size_t) (1 * 1024 * 1024))
69 
70 /*
71  * How many segments to create before we double the segment size. If this is
72  * low, then there is likely to be a lot of wasted space in the largest
73  * segment. If it is high, then we risk running out of segment slots (see
74  * dsm.c's limits on total number of segments), or limiting the total size
75  * an area can manage when using small pointers.
76  */
77 #define DSA_NUM_SEGMENTS_AT_EACH_SIZE 2
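/*
 * For example, with DSA_INITIAL_SEGMENT_SIZE of 1MB and
 * DSA_NUM_SEGMENTS_AT_EACH_SIZE of 2, successive segments are normally
 * created with sizes 1MB, 1MB, 2MB, 2MB, 4MB, 4MB, ... up to
 * DSA_MAX_SEGMENT_SIZE (larger segments may be created sooner to satisfy
 * large requests).
 */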
78 
79 /*
80  * The number of bits used to represent the offset part of a dsa_pointer.
81  * This controls the maximum size of a segment, the maximum possible
82  * allocation size and also the maximum number of segments per area.
83  */
84 #if SIZEOF_DSA_POINTER == 4
85 #define DSA_OFFSET_WIDTH 27 /* 32 segments of size up to 128MB */
86 #else
87 #define DSA_OFFSET_WIDTH 40 /* 1024 segments of size up to 1TB */
88 #endif
89 
90 /*
91  * The maximum number of DSM segments that an area can own, determined by
92  * the number of bits remaining (but capped at 1024).
93  */
94 #define DSA_MAX_SEGMENTS \
95  Min(1024, (1 << ((SIZEOF_DSA_POINTER * 8) - DSA_OFFSET_WIDTH)))
96 
97 /* The bitmask for extracting the offset from a dsa_pointer. */
98 #define DSA_OFFSET_BITMASK (((dsa_pointer) 1 << DSA_OFFSET_WIDTH) - 1)
99 
100 /* The maximum size of a DSM segment. */
101 #define DSA_MAX_SEGMENT_SIZE ((size_t) 1 << DSA_OFFSET_WIDTH)
102 
103 /* Number of pages (see FPM_PAGE_SIZE) per regular superblock. */
104 #define DSA_PAGES_PER_SUPERBLOCK 16
105 
106 /*
107  * A magic number used as a sanity check for following DSM segments belonging
108  * to a DSA area (this number will be XORed with the area handle and
109  * the segment index).
110  */
111 #define DSA_SEGMENT_HEADER_MAGIC 0x0ce26608
112 
113 /* Build a dsa_pointer given a segment number and offset. */
114 #define DSA_MAKE_POINTER(segment_number, offset) \
115  (((dsa_pointer) (segment_number) << DSA_OFFSET_WIDTH) | (offset))
116 
117 /* Extract the segment number from a dsa_pointer. */
118 #define DSA_EXTRACT_SEGMENT_NUMBER(dp) ((dp) >> DSA_OFFSET_WIDTH)
119 
120 /* Extract the offset from a dsa_pointer. */
121 #define DSA_EXTRACT_OFFSET(dp) ((dp) & DSA_OFFSET_BITMASK)
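/*
 * Worked example (64-bit dsa_pointer, DSA_OFFSET_WIDTH = 40):
 * DSA_MAKE_POINTER(3, 8192) stores segment number 3 in the high bits and
 * byte offset 8192 in the low 40 bits; DSA_EXTRACT_SEGMENT_NUMBER and
 * DSA_EXTRACT_OFFSET recover 3 and 8192 respectively.
 */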
122 
123 /* The type used for segment indexes (zero based). */
124 typedef size_t dsa_segment_index;
125 
126 /* Sentinel value for dsa_segment_index indicating 'none' or 'end'. */
127 #define DSA_SEGMENT_INDEX_NONE (~(dsa_segment_index)0)
128 
129 /*
130  * How many bins of segments do we have? The bins are used to categorize
131  * segments by their largest contiguous run of free pages.
132  */
133 #define DSA_NUM_SEGMENT_BINS 16
134 
135 /*
136  * What is the lowest bin that holds segments that *might* have n contiguous
137  * free pages? There is no point in looking in segments in lower bins; they
138  * definitely can't service a request for n free pages.
139  */
140 static inline size_t
141 contiguous_pages_to_segment_bin(size_t n)
142 {
143  size_t bin;
144 
145  if (n == 0)
146  bin = 0;
147  else
148  bin = pg_leftmost_one_pos_size_t(n) + 1;
149 
150  return Min(bin, DSA_NUM_SEGMENT_BINS - 1);
151 }
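/*
 * For example, a request for 5 contiguous pages maps to bin 3
 * (pg_leftmost_one_pos_size_t(5) == 2), which holds segments whose largest
 * free run is 4..7 pages; bins 1 and 2, whose segments have runs of at
 * most 3 pages, can be skipped entirely.
 */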
152 
153 /* Macros for access to locks. */
154 #define DSA_AREA_LOCK(area) (&area->control->lock)
155 #define DSA_SCLASS_LOCK(area, sclass) (&area->control->pools[sclass].lock)
156 
157 /*
158  * The header for an individual segment. This lives at the start of each DSM
159  * segment owned by a DSA area including the first segment (where it appears
160  * as part of the dsa_area_control struct).
161  */
162 typedef struct
163 {
164  /* Sanity check magic value. */
165  uint32 magic;
166  /* Total number of pages in this segment (excluding metadata area). */
167  size_t usable_pages;
168  /* Total size of this segment in bytes. */
169  size_t size;
170 
171  /*
172  * Index of the segment that precedes this one in the same segment bin, or
173  * DSA_SEGMENT_INDEX_NONE if this is the first one.
174  */
175  dsa_segment_index prev;
176 
177  /*
178  * Index of the segment that follows this one in the same segment bin, or
179  * DSA_SEGMENT_INDEX_NONE if this is the last one.
180  */
181  dsa_segment_index next;
182  /* The index of the bin that contains this segment. */
183  size_t bin;
184 
185  /*
186  * A flag raised to indicate that this segment is being returned to the
187  * operating system and has been unpinned.
188  */
189  bool freed;
190 } dsa_segment_header;
191 
192 /*
193  * Metadata for one superblock.
194  *
195  * For most blocks, span objects are stored out-of-line; that is, the span
196  * object is not stored within the block itself. But, as an exception, for a
197  * "span of spans", the span object is stored "inline". The allocation is
198  * always exactly one page, and the dsa_area_span object is located at
199  * the beginning of that page. The size class is DSA_SCLASS_BLOCK_OF_SPANS,
200  * and the remaining fields are used just as they would be in an ordinary
201  * block. We can't allocate spans out of ordinary superblocks because
202  * creating an ordinary superblock requires us to be able to allocate a span
203  * *first*. Doing it this way avoids that circularity.
204  */
205 typedef struct
206 {
207  dsa_pointer pool; /* Containing pool. */
208  dsa_pointer prevspan; /* Previous span. */
209  dsa_pointer nextspan; /* Next span. */
210  dsa_pointer start; /* Starting address. */
211  size_t npages; /* Length of span in pages. */
212  uint16 size_class; /* Size class. */
213  uint16 ninitialized; /* Maximum number of objects ever allocated. */
214  uint16 nallocatable; /* Number of objects currently allocatable. */
215  uint16 firstfree; /* First object on free list. */
216  uint16 nmax; /* Maximum number of objects ever possible. */
217  uint16 fclass; /* Current fullness class. */
218 } dsa_area_span;
219 
220 /*
221  * Given a pointer to an object in a span, access the index of the next free
222  * object in the same span (ie in the span's freelist) as an L-value.
223  */
224 #define NextFreeObjectIndex(object) (* (uint16 *) (object))
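/*
 * The freelist is threaded through the free objects themselves: the first
 * two bytes of each free object hold the index of the next free object in
 * the same span. A sketch of the pop operation as alloc_object performs it
 * (assuming the pool's lock is already held):
 *
 *	uint16 idx = span->firstfree;
 *	char *object = superblock + idx * size;
 *	span->firstfree = NextFreeObjectIndex(object);
 */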
225 
226 /*
227  * Small allocations are handled by dividing a single block of memory into
228  * many small objects of equal size. The possible allocation sizes are
229  * defined by the following array. Larger size classes are spaced more widely
230  * than smaller size classes. We fudge the spacing for size classes >1kB to
231  * avoid space wastage: based on the knowledge that we plan to allocate 64kB
232  * blocks, we bump the maximum object size up to the largest multiple of
233  * 8 bytes that still lets us fit the same number of objects into one block.
234  *
235  * NB: Because of this fudging, if we were ever to use differently-sized blocks
236  * for small allocations, these size classes would need to be reworked to be
237  * optimal for the new size.
238  *
239  * NB: The optimal spacing for size classes, as well as the size of the blocks
240  * out of which small objects are allocated, is not a question that has one
241  * right answer. Some allocators (such as tcmalloc) use more closely-spaced
242  * size classes than we do here, while others (like aset.c) use more
243  * widely-spaced classes. Spacing the classes more closely avoids wasting
244  * memory within individual chunks, but also means a larger number of
245  * potentially-unfilled blocks.
246  */
247 static const uint16 dsa_size_classes[] = {
248  sizeof(dsa_area_span), 0, /* special size classes */
249  8, 16, 24, 32, 40, 48, 56, 64, /* 8 classes separated by 8 bytes */
250  80, 96, 112, 128, /* 4 classes separated by 16 bytes */
251  160, 192, 224, 256, /* 4 classes separated by 32 bytes */
252  320, 384, 448, 512, /* 4 classes separated by 64 bytes */
253  640, 768, 896, 1024, /* 4 classes separated by 128 bytes */
254  1280, 1560, 1816, 2048, /* 4 classes separated by ~256 bytes */
255  2616, 3120, 3640, 4096, /* 4 classes separated by ~512 bytes */
256  5456, 6552, 7280, 8192 /* 4 classes separated by ~1024 bytes */
257 };
258 #define DSA_NUM_SIZE_CLASSES lengthof(dsa_size_classes)
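/*
 * Worked example of the >1kB fudging described above: a 64kB superblock
 * can hold 25 objects of the ~2.5kB class, since 65536 / 25 = 2621, and
 * the largest multiple of 8 not exceeding that is 2616; hence the 2616
 * entry rather than an evenly spaced 2560, with no change in the number of
 * objects per block.
 */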
259 
260 /* Special size classes. */
261 #define DSA_SCLASS_BLOCK_OF_SPANS 0
262 #define DSA_SCLASS_SPAN_LARGE 1
263 
264 /*
265  * The following lookup table is used to map the size of small objects
266  * (less than 1kB) onto the corresponding size class. To use this table,
267  * round the size of the object up to the next multiple of 8 bytes, and then
268  * index into this array.
269  */
270 static const uint8 dsa_size_class_map[] = {
271  2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13,
272  14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17,
273  18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
274  20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21,
275  22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
276  23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
277  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
278  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25
279 };
280 #define DSA_SIZE_CLASS_MAP_QUANTUM 8
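/*
 * Worked example: for a request of 100 bytes, the index is
 * ((100 + 7) / 8) - 1 = 12, dsa_size_class_map[12] == 12, and
 * dsa_size_classes[12] == 112, so the request is served from the 112-byte
 * size class.
 */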
281 
282 /*
283  * Superblocks are binned by how full they are. Generally, each fullness
284  * class corresponds to one quartile, but the block being used for
285  * allocations is always at the head of the list for fullness class 1,
286  * regardless of how full it really is.
287  */
288 #define DSA_FULLNESS_CLASSES 4
289 
290 /*
291  * A dsa_area_pool represents a set of objects of a given size class.
292  *
293  * Perhaps there should be multiple pools for the same size class for
294  * contention avoidance, but for now there is just one!
295  */
296 typedef struct
297 {
298  /* A lock protecting access to this pool. */
299  LWLock lock;
300  /* A set of linked lists of spans, arranged by fullness. */
301  dsa_pointer spans[DSA_FULLNESS_CLASSES];
302  /* Should we pad this out to a cacheline boundary? */
303 } dsa_area_pool;
304 
305 /*
306  * The control block for an area. This lives in shared memory, at the start of
307  * the first DSM segment controlled by this area.
308  */
309 typedef struct
310 {
311  /* The segment header for the first segment. */
312  dsa_segment_header segment_header;
313  /* The handle for this area. */
314  dsa_handle handle;
315  /* The handles of the segments owned by this area. */
316  dsm_handle segment_handles[DSA_MAX_SEGMENTS];
317  /* Lists of segments, binned by maximum contiguous run of free pages. */
318  dsa_segment_index segment_bins[DSA_NUM_SEGMENT_BINS];
319  /* The object pools for each size class. */
320  dsa_area_pool pools[DSA_NUM_SIZE_CLASSES];
321  /* The total size of all active segments. */
322  size_t total_segment_size;
323  /* The maximum total size of backing storage we are allowed. */
324  size_t max_total_segment_size;
325  /* Highest used segment index in the history of this area. */
326  dsa_segment_index high_segment_index;
327  /* The reference count for this area. */
328  int refcnt;
329  /* A flag indicating that this area has been pinned. */
330  bool pinned;
331  /* The number of times that segments have been freed. */
332  size_t freed_segment_counter;
333  /* The LWLock tranche ID. */
334  int lwlock_tranche_id;
335  /* The general lock (protects everything except object pools). */
336  LWLock lock;
337 } dsa_area_control;
338 
339 /* Given a pointer to a pool, find a dsa_pointer. */
340 #define DsaAreaPoolToDsaPointer(area, p) \
341  DSA_MAKE_POINTER(0, (char *) p - (char *) area->control)
342 
343 /*
344  * A dsa_segment_map is stored within the backend-private memory of each
345  * individual backend. It holds the base address of the segment within that
346  * backend, plus the addresses of key objects within the segment. Those
347  * could instead be derived from the base address but it's handy to have them
348  * around.
349  */
350 typedef struct
351 {
352  dsm_segment *segment; /* DSM segment */
353  char *mapped_address; /* Address at which segment is mapped */
354  dsa_segment_header *header; /* Header (same as mapped_address) */
355  FreePageManager *fpm; /* Free page manager within segment. */
356  dsa_pointer *pagemap; /* Page map within segment. */
357 } dsa_segment_map;
358 
359 /*
360  * Per-backend state for a storage area. Backends obtain one of these by
361  * creating an area or attaching to an existing one using a handle. Each
362  * process that needs to use an area uses its own object to track where the
363  * segments are mapped.
364  */
365 struct dsa_area
366 {
367  /* Pointer to the control object in shared memory. */
368  dsa_area_control *control;
369 
370  /*
371  * All the mappings are owned by this. The dsa_area itself is not
372  * directly tracked by the ResourceOwner, but the effect is the same. NULL
373  * if the attachment has session lifespan, i.e. if dsa_pin_mapping() has
374  * been called.
375  */
376  ResourceOwner resowner;
377 
378  /*
379  * This backend's array of segment maps, ordered by segment index
380  * corresponding to control->segment_handles. Some of the area's segments
381  * may not be mapped in this backend yet, and some slots may have been
382  * freed and need to be detached; these operations happen on demand.
383  */
384  dsa_segment_map segment_maps[DSA_MAX_SEGMENTS];
385 
386  /* The highest segment index this backend has ever mapped. */
387  dsa_segment_index high_segment_index;
388 
389  /* The last observed freed_segment_counter. */
390  size_t freed_segment_counter;
391 };
392 
393 #define DSA_SPAN_NOTHING_FREE ((uint16) -1)
394 #define DSA_SUPERBLOCK_SIZE (DSA_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE)
395 
396 /* Given a pointer to a segment_map, obtain a segment index number. */
397 #define get_segment_index(area, segment_map_ptr) \
398  (segment_map_ptr - &area->segment_maps[0])
399 
400 static void init_span(dsa_area *area, dsa_pointer span_pointer,
401  dsa_area_pool *pool, dsa_pointer start, size_t npages,
402  uint16 size_class);
403 static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool,
404  int fromclass, int toclass);
405 static inline dsa_pointer alloc_object(dsa_area *area, int size_class);
406 static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
407  int size_class);
408 static dsa_segment_map *get_segment_by_index(dsa_area *area,
409  dsa_segment_index index);
410 static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer);
411 static void unlink_span(dsa_area *area, dsa_area_span *span);
412 static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
413  dsa_pointer span_pointer, int fclass);
414 static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map);
415 static dsa_segment_map *get_best_segment(dsa_area *area, size_t npages);
416 static dsa_segment_map *make_new_segment(dsa_area *area, size_t requested_pages);
417 static dsa_area *create_internal(void *place, size_t size,
418  int tranche_id,
419  dsm_handle control_handle,
420  dsm_segment *control_segment);
421 static dsa_area *attach_internal(void *place, dsm_segment *segment,
422  dsa_handle handle);
423 static void check_for_freed_segments(dsa_area *area);
424 static void check_for_freed_segments_locked(dsa_area *area);
425 static void rebin_segment(dsa_area *area, dsa_segment_map *segment_map);
426 
427 /*
428  * Create a new shared area in a new DSM segment. Further DSM segments will
429  * be allocated as required to extend the available space.
430  *
431  * We can't allocate a LWLock tranche_id within this function, because tranche
432  * IDs are a scarce resource; there are only 64k available, using low numbers
433  * when possible matters, and we have no provision for recycling them. So,
434  * we require the caller to provide one.
435  */
436 dsa_area *
437 dsa_create(int tranche_id)
438 {
439  dsm_segment *segment;
440  dsa_area *area;
441 
442  /*
443  * Create the DSM segment that will hold the shared control object and the
444  * first segment of usable space.
445  */
446  segment = dsm_create(DSA_INITIAL_SEGMENT_SIZE, 0);
447 
448  /*
449  * All segments backing this area are pinned, so that DSA can explicitly
450  * control their lifetime (otherwise a newly created segment belonging to
451  * this area might be freed when the only backend that happens to have it
452  * mapped in ends, corrupting the area).
453  */
454  dsm_pin_segment(segment);
455 
456  /* Create a new DSA area with the control object in this segment. */
457  area = create_internal(dsm_segment_address(segment),
458  DSA_INITIAL_SEGMENT_SIZE,
459  tranche_id,
460  dsm_segment_handle(segment), segment);
461 
462  /* Clean up when the control segment detaches. */
463  on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
464  PointerGetDatum(dsm_segment_address(segment)));
465 
466  return area;
467 }
468 
469 /*
470  * Create a new shared area in an existing shared memory space, which may be
471  * either DSM or Postmaster-initialized memory. DSM segments will be
472  * allocated as required to extend the available space, though that can be
473  * prevented with dsa_set_size_limit(area, size) using the same size provided
474  * to dsa_create_in_place.
475  *
476  * Areas created in-place must eventually be released by the backend that
477  * created them and all backends that attach to them. This can be done
478  * explicitly with dsa_release_in_place, or, in the special case that 'place'
479  * happens to be in a pre-existing DSM segment, by passing in a pointer to the
480  * segment so that a detach hook can be registered with the containing DSM
481  * segment.
482  *
483  * See dsa_create() for a note about the tranche arguments.
484  */
485 dsa_area *
486 dsa_create_in_place(void *place, size_t size,
487  int tranche_id, dsm_segment *segment)
488 {
489  dsa_area *area;
490 
491  area = create_internal(place, size, tranche_id,
492  DSM_HANDLE_INVALID, NULL);
493 
494  /*
495  * Clean up when the control segment detaches, if a containing DSM segment
496  * was provided.
497  */
498  if (segment != NULL)
499  on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
500  PointerGetDatum(place));
501 
502  return area;
503 }
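/*
 * A sketch of in-place creation in traditional (postmaster-initialized)
 * shared memory; the reservation size "some_budget" is hypothetical, and
 * the size limit stops the area from creating additional DSM segments:
 *
 *	size_t sz = Max(dsa_minimum_size(), some_budget);
 *	void *place = ShmemAlloc(sz);
 *	dsa_area *area = dsa_create_in_place(place, sz, tranche_id, NULL);
 *	dsa_set_size_limit(area, sz);
 */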
504 
505 /*
506  * Obtain a handle that can be passed to other processes so that they can
507  * attach to the given area. Cannot be called for areas created with
508  * dsa_create_in_place.
509  */
510 dsa_handle
511 dsa_get_handle(dsa_area *area)
512 {
513  Assert(area->control->handle != DSA_HANDLE_INVALID);
514  return area->control->handle;
515 }
516 
517 /*
518  * Attach to an area given a handle generated (possibly in another process) by
519  * dsa_get_handle. The area must have been created with dsa_create (not
520  * dsa_create_in_place).
521  */
522 dsa_area *
523 dsa_attach(dsa_handle handle)
524 {
525  dsm_segment *segment;
526  dsa_area *area;
527 
528  /*
529  * An area handle is really a DSM segment handle for the first segment, so
530  * we go ahead and attach to that.
531  */
532  segment = dsm_attach(handle);
533  if (segment == NULL)
534  ereport(ERROR,
535  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
536  errmsg("could not attach to dynamic shared area")));
537 
538  area = attach_internal(dsm_segment_address(segment), segment, handle);
539 
540  /* Clean up when the control segment detaches. */
541  on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
542  PointerGetDatum(dsm_segment_address(segment)));
543 
544  return area;
545 }
546 
547 /*
548  * Attach to an area that was created with dsa_create_in_place. The caller
549  * must somehow know the location in memory that was used when the area was
550  * created, though it may be mapped at a different virtual address in this
551  * process.
552  *
553  * See dsa_create_in_place for note about releasing in-place areas, and the
554  * optional 'segment' argument which can be provided to allow automatic
555  * release if the containing memory happens to be a DSM segment.
556  */
557 dsa_area *
558 dsa_attach_in_place(void *place, dsm_segment *segment)
559 {
560  dsa_area *area;
561 
562  area = attach_internal(place, NULL, DSA_HANDLE_INVALID);
563 
564  /*
565  * Clean up when the control segment detaches, if a containing DSM segment
566  * was provided.
567  */
568  if (segment != NULL)
569  on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
570  PointerGetDatum(place));
571 
572  return area;
573 }
574 
575 /*
576  * Release a DSA area that was produced by dsa_create_in_place or
577  * dsa_attach_in_place. The 'segment' argument is ignored but provides an
578  * interface suitable for on_dsm_detach, for the convenience of users who want
579  * to create a DSA segment inside an existing DSM segment and have it
580  * automatically released when the containing DSM segment is detached.
581  * 'place' should be the address of the place where the area was created.
582  *
583  * This callback is automatically registered for the DSM segment containing
584  * the control object of in-place areas when a segment is provided to
585  * dsa_create_in_place or dsa_attach_in_place, and also for all areas created
586  * with dsa_create.
587  */
588 void
589 dsa_on_dsm_detach_release_in_place(dsm_segment *segment, Datum place)
590 {
591  dsa_release_in_place(DatumGetPointer(place));
592 }
593 
594 /*
595  * Release a DSA area that was produced by dsa_create_in_place or
596  * dsa_attach_in_place. The 'code' argument is ignored but provides an
597  * interface suitable for on_shmem_exit or before_shmem_exit, for the
598  * convenience of users who want to create a DSA segment inside shared memory
599  * other than a DSM segment and have it automatically release at backend exit.
600  * 'place' should be the address of the place where the area was created.
601  */
602 void
603 dsa_on_shmem_exit_release_in_place(int code, Datum place)
604 {
605  dsa_release_in_place(DatumGetPointer(place));
606 }
607 
608 /*
609  * Release a DSA area that was produced by dsa_create_in_place or
610  * dsa_attach_in_place. It is preferable to use one of the 'dsa_on_XXX'
611  * callbacks so that this is managed automatically, because failure to release
612  * an area created in-place leaks its segments permanently.
613  *
614  * This is also called automatically for areas produced by dsa_create or
615  * dsa_attach as an implementation detail.
616  */
617 void
618 dsa_release_in_place(void *place)
619 {
620  dsa_area_control *control = (dsa_area_control *) place;
621  int i;
622 
623  LWLockAcquire(&control->lock, LW_EXCLUSIVE);
624  Assert(control->segment_header.magic ==
625  (DSA_SEGMENT_HEADER_MAGIC ^ control->handle ^ 0));
626  Assert(control->refcnt > 0);
627  if (--control->refcnt == 0)
628  {
629  for (i = 0; i <= control->high_segment_index; ++i)
630  {
631  dsm_handle handle;
632 
633  handle = control->segment_handles[i];
634  if (handle != DSM_HANDLE_INVALID)
635  dsm_unpin_segment(handle);
636  }
637  }
638  LWLockRelease(&control->lock);
639 }
640 
641 /*
642  * Keep a DSA area attached until end of session or explicit detach.
643  *
644  * By default, areas are owned by the current resource owner, which means they
645  * are detached automatically when that scope ends.
646  */
647 void
648 dsa_pin_mapping(dsa_area *area)
649 {
650  int i;
651 
652  if (area->resowner != NULL)
653  {
654  area->resowner = NULL;
655 
656  for (i = 0; i <= area->high_segment_index; ++i)
657  if (area->segment_maps[i].segment != NULL)
658  dsm_pin_mapping(area->segment_maps[i].segment);
659  }
660 }
661 
662 /*
663  * Allocate memory in this storage area. The return value is a dsa_pointer
664  * that can be passed to other processes, and converted to a local pointer
665  * with dsa_get_address. 'flags' is a bitmap which should be constructed
666  * from the following values:
667  *
668  * DSA_ALLOC_HUGE allows allocations >= 1GB. Otherwise, such allocations
669  * will result in an ERROR.
670  *
671  * DSA_ALLOC_NO_OOM causes this function to return InvalidDsaPointer when
672  * no memory is available or a size limit established by dsa_set_size_limit
673  * would be exceeded. Otherwise, such allocations will result in an ERROR.
674  *
675  * DSA_ALLOC_ZERO causes the allocated memory to be zeroed. Otherwise, the
676  * contents of newly-allocated memory are indeterminate.
677  *
678  * These flags correspond to similarly named flags used by
679  * MemoryContextAllocExtended(). See also the macros dsa_allocate and
680  * dsa_allocate0 which expand to a call to this function with commonly used
681  * flags.
682  */
683 dsa_pointer
684 dsa_allocate_extended(dsa_area *area, size_t size, int flags)
685 {
686  uint16 size_class;
687  dsa_pointer start_pointer;
688  dsa_segment_map *segment_map;
689  dsa_pointer result;
690 
691  Assert(size > 0);
692 
693  /* Sanity check on huge individual allocation size. */
694  if (((flags & DSA_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) ||
695  ((flags & DSA_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size)))
696  elog(ERROR, "invalid DSA memory alloc request size %zu", size);
697 
698  /*
699  * If bigger than the largest size class, just grab a run of pages from
700  * the free page manager, instead of allocating an object from a pool.
701  * There will still be a span, but it's a special class of span that
702  * manages this whole allocation and simply gives all pages back to the
703  * free page manager when dsa_free is called.
704  */
705  if (size > dsa_size_classes[lengthof(dsa_size_classes) - 1])
706  {
707  size_t npages = fpm_size_to_pages(size);
708  size_t first_page;
709  dsa_pointer span_pointer;
710  dsa_area_pool *pool = &area->control->pools[DSA_SCLASS_SPAN_LARGE];
711 
712  /* Obtain a span object. */
713  span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
714  if (!DsaPointerIsValid(span_pointer))
715  {
716  /* Raise error unless asked not to. */
717  if ((flags & DSA_ALLOC_NO_OOM) == 0)
718  ereport(ERROR,
719  (errcode(ERRCODE_OUT_OF_MEMORY),
720  errmsg("out of memory"),
721  errdetail("Failed on DSA request of size %zu.",
722  size)));
723  return InvalidDsaPointer;
724  }
725 
726  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
727 
728  /* Find a segment from which to allocate. */
729  segment_map = get_best_segment(area, npages);
730  if (segment_map == NULL)
731  segment_map = make_new_segment(area, npages);
732  if (segment_map == NULL)
733  {
734  /* Can't make any more segments: game over. */
735  LWLockRelease(DSA_AREA_LOCK(area));
736  dsa_free(area, span_pointer);
737 
738  /* Raise error unless asked not to. */
739  if ((flags & DSA_ALLOC_NO_OOM) == 0)
740  ereport(ERROR,
741  (errcode(ERRCODE_OUT_OF_MEMORY),
742  errmsg("out of memory"),
743  errdetail("Failed on DSA request of size %zu.",
744  size)));
745  return InvalidDsaPointer;
746  }
747 
748  /*
749  * Ask the free page manager for a run of pages. This should always
750  * succeed, since both get_best_segment and make_new_segment should
751  * only return a non-NULL pointer if it actually contains enough
752  * contiguous freespace. If it does fail, something in our backend
753  * private state is out of whack, so use FATAL to kill the process.
754  */
755  if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
756  elog(FATAL,
757  "dsa_allocate could not find %zu free pages", npages);
758  LWLockRelease(DSA_AREA_LOCK(area));
759 
760  start_pointer = DSA_MAKE_POINTER(get_segment_index(area, segment_map),
761  first_page * FPM_PAGE_SIZE);
762 
763  /* Initialize span and pagemap. */
764  LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
765  LW_EXCLUSIVE);
766  init_span(area, span_pointer, pool, start_pointer, npages,
767  DSA_SCLASS_SPAN_LARGE);
768  segment_map->pagemap[first_page] = span_pointer;
769  LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
770 
771  /* Zero-initialize the memory if requested. */
772  if ((flags & DSA_ALLOC_ZERO) != 0)
773  memset(dsa_get_address(area, start_pointer), 0, size);
774 
775  return start_pointer;
776  }
777 
778  /* Map allocation to a size class. */
779  if (size < lengthof(dsa_size_class_map) * DSA_SIZE_CLASS_MAP_QUANTUM)
780  {
781  int mapidx;
782 
783  /* For smaller sizes we have a lookup table... */
784  mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) /
785  DSA_SIZE_CLASS_MAP_QUANTUM) - 1;
786  size_class = dsa_size_class_map[mapidx];
787  }
788  else
789  {
790  uint16 min;
791  uint16 max;
792 
793  /* ... and for the rest we search by binary chop. */
794  min = dsa_size_class_map[lengthof(dsa_size_class_map) - 1];
795  max = lengthof(dsa_size_classes) - 1;
796 
797  while (min < max)
798  {
799  uint16 mid = (min + max) / 2;
800  uint16 class_size = dsa_size_classes[mid];
801 
802  if (class_size < size)
803  min = mid + 1;
804  else
805  max = mid;
806  }
807 
808  size_class = min;
809  }
810  Assert(size <= dsa_size_classes[size_class]);
811  Assert(size_class == 0 || size > dsa_size_classes[size_class - 1]);
812 
813  /* Attempt to allocate an object from the appropriate pool. */
814  result = alloc_object(area, size_class);
815 
816  /* Check for failure to allocate. */
817  if (!DsaPointerIsValid(result))
818  {
819  /* Raise error unless asked not to. */
820  if ((flags & DSA_ALLOC_NO_OOM) == 0)
821  ereport(ERROR,
822  (errcode(ERRCODE_OUT_OF_MEMORY),
823  errmsg("out of memory"),
824  errdetail("Failed on DSA request of size %zu.", size)));
825  return InvalidDsaPointer;
826  }
827 
828  /* Zero-initialize the memory if requested. */
829  if ((flags & DSA_ALLOC_ZERO) != 0)
830  memset(dsa_get_address(area, result), 0, size);
831 
832  return result;
833 }
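/*
 * Caller-side sketch of the flags (illustrative only): request zeroed
 * memory but tolerate allocation failure, then test the result:
 *
 *	dsa_pointer dp = dsa_allocate_extended(area, size,
 *	                                       DSA_ALLOC_NO_OOM | DSA_ALLOC_ZERO);
 *	if (!DsaPointerIsValid(dp))
 *	    ... fall back or report out of memory ...
 */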
834 
835 /*
836  * Free memory obtained with dsa_allocate.
837  */
838 void
839 dsa_free(dsa_area *area, dsa_pointer dp)
840 {
841  dsa_segment_map *segment_map;
842  int pageno;
843  dsa_pointer span_pointer;
844  dsa_area_span *span;
845  char *superblock;
846  char *object;
847  size_t size;
848  int size_class;
849 
850  /* Make sure we don't have a stale segment in the slot 'dp' refers to. */
851  check_for_freed_segments(area);
852 
853  /* Locate the object, span and pool. */
854  segment_map = get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(dp));
855  pageno = DSA_EXTRACT_OFFSET(dp) / FPM_PAGE_SIZE;
856  span_pointer = segment_map->pagemap[pageno];
857  span = dsa_get_address(area, span_pointer);
858  superblock = dsa_get_address(area, span->start);
859  object = dsa_get_address(area, dp);
860  size_class = span->size_class;
861  size = dsa_size_classes[size_class];
862 
863  /*
864  * Special case for large objects that live in a special span: we return
865  * those pages directly to the free page manager and free the span.
866  */
867  if (span->size_class == DSA_SCLASS_SPAN_LARGE)
868  {
869 
870 #ifdef CLOBBER_FREED_MEMORY
871  memset(object, 0x7f, span->npages * FPM_PAGE_SIZE);
872 #endif
873 
874  /* Give pages back to free page manager. */
875  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
876  FreePageManagerPut(segment_map->fpm,
877  DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
878  span->npages);
879 
880  /* Move segment to appropriate bin if necessary. */
881  rebin_segment(area, segment_map);
882  LWLockRelease(DSA_AREA_LOCK(area));
883 
884  /* Unlink span. */
885  LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
886  LW_EXCLUSIVE);
887  unlink_span(area, span);
888  LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
889  /* Free the span object so it can be reused. */
890  dsa_free(area, span_pointer);
891  return;
892  }
893 
894 #ifdef CLOBBER_FREED_MEMORY
895  memset(object, 0x7f, size);
896 #endif
897 
898  LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
899 
900  /* Put the object on the span's freelist. */
901  Assert(object >= superblock);
902  Assert(object < superblock + DSA_SUPERBLOCK_SIZE);
903  Assert((object - superblock) % size == 0);
904  NextFreeObjectIndex(object) = span->firstfree;
905  span->firstfree = (object - superblock) / size;
906  ++span->nallocatable;
907 
908  /*
909  * See if the span needs to be moved to a different fullness class, or be
910  * freed so its pages can be given back to the segment.
911  */
912  if (span->nallocatable == 1 && span->fclass == DSA_FULLNESS_CLASSES - 1)
913  {
914  /*
915  * The block was completely full and is located in the
916  * highest-numbered fullness class, which is never scanned for free
917  * chunks. We must move it to the next-lower fullness class.
918  */
919  unlink_span(area, span);
920  add_span_to_fullness_class(area, span, span_pointer,
921  DSA_FULLNESS_CLASSES - 2);
922 
923  /*
924  * If this is the only span, and there is no active span, then we
925  * should probably move this span to fullness class 1. (Otherwise if
926  * you allocate exactly all the objects in the only span, it moves to
927  * class 3, then you free them all, it moves to 2, and then is given
928  * back, leaving no active span).
929  */
930  }
931  else if (span->nallocatable == span->nmax &&
932  (span->fclass != 1 || span->prevspan != InvalidDsaPointer))
933  {
934  /*
935  * This entire block is free, and it's not the active block for this
936  * size class. Return the memory to the free page manager. We don't
937  * do this for the active block to prevent hysteresis: if we
938  * repeatedly allocate and free the only chunk in the active block, it
939  * will be very inefficient if we deallocate and reallocate the block
940  * every time.
941  */
942  destroy_superblock(area, span_pointer);
943  }
944 
945  LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
946 }
947 
948 /*
949  * Obtain a backend-local address for a dsa_pointer. 'dp' must point to
950  * memory allocated by the given area (possibly in another process) that
951  * hasn't yet been freed. This may cause a segment to be mapped into the
952  * current process if required, and may cause freed segments to be unmapped.
953  */
954 void *
955 dsa_get_address(dsa_area *area, dsa_pointer dp)
956 {
957  dsa_segment_index index;
958  size_t offset;
959 
960  /* Convert InvalidDsaPointer to NULL. */
961  if (!DsaPointerIsValid(dp))
962  return NULL;
963 
964  /* Process any requests to detach from freed segments. */
965  check_for_freed_segments(area);
966 
967  /* Break the dsa_pointer into its components. */
968  index = DSA_EXTRACT_SEGMENT_NUMBER(dp);
969  offset = DSA_EXTRACT_OFFSET(dp);
970  Assert(index < DSA_MAX_SEGMENTS);
971 
972  /* Check if we need to cause this segment to be mapped in. */
973  if (unlikely(area->segment_maps[index].mapped_address == NULL))
974  {
975  /* Call for effect (we don't need the result). */
976  get_segment_by_index(area, index);
977  }
978 
979  return area->segment_maps[index].mapped_address + offset;
980 }
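/*
 * Sketch of cross-process sharing (the structs shown are hypothetical):
 * because a dsa_pointer is position independent, it can be stored in
 * shared memory by one backend and converted separately in each backend,
 * even though the segments may be mapped at different addresses:
 *
 *	typedef struct { dsa_pointer head; } SharedList;	-- lives in shmem
 *
 *	-- backend A
 *	shared->head = dsa_allocate(area, sizeof(Node));
 *	-- backend B, attached to the same area
 *	Node *n = (Node *) dsa_get_address(area_b, shared->head);
 */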
981 
982 /*
983  * Pin this area, so that it will continue to exist even if all backends
984  * detach from it. In that case, the area can still be reattached to if a
985  * handle has been recorded somewhere.
986  */
987 void
988 dsa_pin(dsa_area *area)
989 {
990  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
991  if (area->control->pinned)
992  {
994  elog(ERROR, "dsa_area already pinned");
995  }
996  area->control->pinned = true;
997  ++area->control->refcnt;
998  LWLockRelease(DSA_AREA_LOCK(area));
999 }
1000 
1001 /*
1002  * Undo the effects of dsa_pin, so that the given area can be freed when no
1003  * backends are attached to it. May be called only if dsa_pin has been
1004  * called.
1005  */
1006 void
1007 dsa_unpin(dsa_area *area)
1008 {
1009  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1010  Assert(area->control->refcnt > 1);
1011  if (!area->control->pinned)
1012  {
1014  elog(ERROR, "dsa_area not pinned");
1015  }
1016  area->control->pinned = false;
1017  --area->control->refcnt;
1018  LWLockRelease(DSA_AREA_LOCK(area));
1019 }
1020 
1021 /*
1022  * Set the total size limit for this area. This limit is checked whenever new
1023  * segments need to be allocated from the operating system. If the new size
1024  * limit is already exceeded, this has no immediate effect.
1025  *
1026  * Note that the total virtual memory usage may be temporarily larger than
1027  * this limit when segments have been freed, but not yet detached by all
1028  * backends that have attached to them.
1029  */
1030 void
1031 dsa_set_size_limit(dsa_area *area, size_t limit)
1032 {
1033  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1034  area->control->max_total_segment_size = limit;
1035  LWLockRelease(DSA_AREA_LOCK(area));
1036 }
1037 
1038 /* Return the total size of all active segments */
1039 size_t
1040 dsa_get_total_size(dsa_area *area)
1041 {
1042  size_t size;
1043 
1044  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1045  size = area->control->total_segment_size;
1046  LWLockRelease(DSA_AREA_LOCK(area));
1047 
1048  return size;
1049 }
1050 
1051 /*
1052  * Aggressively free all spare memory in the hope of returning DSM segments to
1053  * the operating system.
1054  */
1055 void
1056 dsa_trim(dsa_area *area)
1057 {
1058  int size_class;
1059 
1060  /*
1061  * Trim in reverse pool order so we get to the spans-of-spans last, just
1062  * in case any become entirely free while processing all the other pools.
1063  */
1064  for (size_class = DSA_NUM_SIZE_CLASSES - 1; size_class >= 0; --size_class)
1065  {
1066  dsa_area_pool *pool = &area->control->pools[size_class];
1067  dsa_pointer span_pointer;
1068 
1069  if (size_class == DSA_SCLASS_SPAN_LARGE)
1070  {
1071  /* Large object frees give back segments aggressively already. */
1072  continue;
1073  }
1074 
1075  /*
1076  * Search fullness class 1 only. That is where we expect to find an
1077  * entirely empty superblock (entirely empty superblocks in other
1078  * fullness classes are returned to the free page map by dsa_free).
1079  */
1080  LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1081  span_pointer = pool->spans[1];
1082  while (DsaPointerIsValid(span_pointer))
1083  {
1084  dsa_area_span *span = dsa_get_address(area, span_pointer);
1085  dsa_pointer next = span->nextspan;
1086 
1087  if (span->nallocatable == span->nmax)
1088  destroy_superblock(area, span_pointer);
1089 
1090  span_pointer = next;
1091  }
1092  LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1093  }
1094 }
1095 
1096 /*
1097  * Print out debugging information about the internal state of the shared
1098  * memory area.
1099  */
1100 void
1101 dsa_dump(dsa_area *area)
1102 {
1103  size_t i,
1104  j;
1105 
1106  /*
1107  * Note: This gives an inconsistent snapshot as it acquires and releases
1108  * individual locks as it goes...
1109  */
1110 
1111  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1112  check_for_freed_segments_locked(area);
1113  fprintf(stderr, "dsa_area handle %x:\n", area->control->handle);
1114  fprintf(stderr, " max_total_segment_size: %zu\n",
1115  area->control->max_total_segment_size);
1116  fprintf(stderr, " total_segment_size: %zu\n",
1117  area->control->total_segment_size);
1118  fprintf(stderr, " refcnt: %d\n", area->control->refcnt);
1119  fprintf(stderr, " pinned: %c\n", area->control->pinned ? 't' : 'f');
1120  fprintf(stderr, " segment bins:\n");
1121  for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1122  {
1123  if (area->control->segment_bins[i] != DSA_SEGMENT_INDEX_NONE)
1124  {
1125  dsa_segment_index segment_index;
1126 
1127  if (i == 0)
1128  fprintf(stderr,
1129  " segment bin %zu (no contiguous free pages):\n", i);
1130  else
1131  fprintf(stderr,
1132  " segment bin %zu (at least %d contiguous pages free):\n",
1133  i, 1 << (i - 1));
1134  segment_index = area->control->segment_bins[i];
1135  while (segment_index != DSA_SEGMENT_INDEX_NONE)
1136  {
1137  dsa_segment_map *segment_map;
1138 
1139  segment_map =
1140  get_segment_by_index(area, segment_index);
1141 
1142  fprintf(stderr,
1143  " segment index %zu, usable_pages = %zu, "
1144  "contiguous_pages = %zu, mapped at %p\n",
1145  segment_index,
1146  segment_map->header->usable_pages,
1147  fpm_largest(segment_map->fpm),
1148  segment_map->mapped_address);
1149  segment_index = segment_map->header->next;
1150  }
1151  }
1152  }
1153  LWLockRelease(DSA_AREA_LOCK(area));
1154 
1155  fprintf(stderr, " pools:\n");
1156  for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1157  {
1158  bool found = false;
1159 
1160  LWLockAcquire(DSA_SCLASS_LOCK(area, i), LW_EXCLUSIVE);
1161  for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1162  if (DsaPointerIsValid(area->control->pools[i].spans[j]))
1163  found = true;
1164  if (found)
1165  {
1166  if (i == DSA_SCLASS_BLOCK_OF_SPANS)
1167  fprintf(stderr, " pool for blocks of span objects:\n");
1168  else if (i == DSA_SCLASS_SPAN_LARGE)
1169  fprintf(stderr, " pool for large object spans:\n");
1170  else
1171  fprintf(stderr,
1172  " pool for size class %zu (object size %hu bytes):\n",
1173  i, dsa_size_classes[i]);
1174  for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1175  {
1176  if (!DsaPointerIsValid(area->control->pools[i].spans[j]))
1177  fprintf(stderr, " fullness class %zu is empty\n", j);
1178  else
1179  {
1180  dsa_pointer span_pointer = area->control->pools[i].spans[j];
1181 
1182  fprintf(stderr, " fullness class %zu:\n", j);
1183  while (DsaPointerIsValid(span_pointer))
1184  {
1185  dsa_area_span *span;
1186 
1187  span = dsa_get_address(area, span_pointer);
1188  fprintf(stderr,
1189  " span descriptor at "
1190  DSA_POINTER_FORMAT ", superblock at "
1192  ", pages = %zu, objects free = %hu/%hu\n",
1193  span_pointer, span->start, span->npages,
1194  span->nallocatable, span->nmax);
1195  span_pointer = span->nextspan;
1196  }
1197  }
1198  }
1199  }
1200  LWLockRelease(DSA_SCLASS_LOCK(area, i));
1201  }
1202 }
1203 
1204 /*
1205  * Return the smallest size that you can successfully provide to
1206  * dsa_create_in_place.
1207  */
1208 size_t
1209 dsa_minimum_size(void)
1210 {
1211  size_t size;
1212  int pages = 0;
1213 
1214  size = MAXALIGN(sizeof(dsa_area_control)) +
1215  MAXALIGN(sizeof(FreePageManager));
1216 
1217  /* Figure out how many pages we need, including the page map... */
1218  while (((size + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE) > pages)
1219  {
1220  ++pages;
1221  size += sizeof(dsa_pointer);
1222  }
1223 
1224  return pages * FPM_PAGE_SIZE;
1225 }
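/*
 * Worked example of the sizing loop above, with hypothetical numbers:
 * suppose the two MAXALIGNed structs total 9000 bytes and FPM_PAGE_SIZE is
 * 4096. Each metadata page also needs a sizeof(dsa_pointer) page-map slot,
 * so the loop grows size to 9000 + 3 * 8 = 9024 bytes, which fits in 3
 * pages, and 3 * FPM_PAGE_SIZE = 12288 is returned.
 */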
1226 
1227 /*
1228  * Workhorse function for dsa_create and dsa_create_in_place.
1229  */
1230 static dsa_area *
1231 create_internal(void *place, size_t size,
1232  int tranche_id,
1233  dsm_handle control_handle,
1234  dsm_segment *control_segment)
1235 {
1236  dsa_area_control *control;
1237  dsa_area *area;
1238  dsa_segment_map *segment_map;
1239  size_t usable_pages;
1240  size_t total_pages;
1241  size_t metadata_bytes;
1242  int i;
1243 
1244  /* Sanity check on the space we have to work in. */
1245  if (size < dsa_minimum_size())
1246  elog(ERROR, "dsa_area space must be at least %zu, but %zu provided",
1247  dsa_minimum_size(), size);
1248 
1249  /* Now figure out how much space is usable */
1250  total_pages = size / FPM_PAGE_SIZE;
1251  metadata_bytes =
1252  MAXALIGN(sizeof(dsa_area_control)) +
1253  MAXALIGN(sizeof(FreePageManager)) +
1254  total_pages * sizeof(dsa_pointer);
1255  /* Add padding up to next page boundary. */
1256  if (metadata_bytes % FPM_PAGE_SIZE != 0)
1257  metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
1258  Assert(metadata_bytes <= size);
1259  usable_pages = (size - metadata_bytes) / FPM_PAGE_SIZE;
1260 
1261  /*
1262  * Initialize the dsa_area_control object located at the start of the
1263  * space.
1264  */
1265  control = (dsa_area_control *) place;
1266  memset(place, 0, sizeof(*control));
1267  control->segment_header.magic =
1268  DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0;
1269  control->segment_header.next = DSA_SEGMENT_INDEX_NONE;
1270  control->segment_header.prev = DSA_SEGMENT_INDEX_NONE;
1271  control->segment_header.usable_pages = usable_pages;
1272  control->segment_header.freed = false;
1273  control->segment_header.size = size;
1274  control->handle = control_handle;
1275  control->max_total_segment_size = (size_t) -1;
1276  control->total_segment_size = size;
1277  control->segment_handles[0] = control_handle;
1278  for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1279  control->segment_bins[i] = DSA_SEGMENT_INDEX_NONE;
1280  control->refcnt = 1;
1281  control->lwlock_tranche_id = tranche_id;
1282 
1283  /*
1284  * Create the dsa_area object that this backend will use to access the
1285  * area. Other backends will need to obtain their own dsa_area object by
1286  * attaching.
1287  */
1288  area = palloc(sizeof(dsa_area));
1289  area->control = control;
1290  area->resowner = CurrentResourceOwner;
1291  memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1292  area->high_segment_index = 0;
1293  area->freed_segment_counter = 0;
1294  LWLockInitialize(&control->lock, control->lwlock_tranche_id);
1295  for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1296  LWLockInitialize(DSA_SCLASS_LOCK(area, i),
1297  control->lwlock_tranche_id);
1298 
1299  /* Set up the segment map for this process's mapping. */
1300  segment_map = &area->segment_maps[0];
1301  segment_map->segment = control_segment;
1302  segment_map->mapped_address = place;
1303  segment_map->header = (dsa_segment_header *) place;
1304  segment_map->fpm = (FreePageManager *)
1305  (segment_map->mapped_address +
1306  MAXALIGN(sizeof(dsa_area_control)));
1307  segment_map->pagemap = (dsa_pointer *)
1308  (segment_map->mapped_address +
1309  MAXALIGN(sizeof(dsa_area_control)) +
1310  MAXALIGN(sizeof(FreePageManager)));
1311 
1312  /* Set up the free page map. */
1313  FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
1314  /* There can be 0 usable pages if size is dsa_minimum_size(). */
1315 
1316  if (usable_pages > 0)
1317  FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
1318  usable_pages);
1319 
1320  /* Put this segment into the appropriate bin. */
1321  control->segment_bins[contiguous_pages_to_segment_bin(usable_pages)] = 0;
1322  segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
1323 
1324  return area;
1325 }
1326 
1327 /*
1328  * Workhorse function for dsa_attach and dsa_attach_in_place.
1329  */
1330 static dsa_area *
1331 attach_internal(void *place, dsm_segment *segment, dsa_handle handle)
1332 {
1333  dsa_area_control *control;
1334  dsa_area *area;
1335  dsa_segment_map *segment_map;
1336 
1337  control = (dsa_area_control *) place;
1338  Assert(control->handle == handle);
1339  Assert(control->segment_handles[0] == handle);
1340  Assert(control->segment_header.magic ==
1341  (DSA_SEGMENT_HEADER_MAGIC ^ handle ^ 0));
1342 
1343  /* Build the backend-local area object. */
1344  area = palloc(sizeof(dsa_area));
1345  area->control = control;
1346  area->resowner = CurrentResourceOwner;
1347  memset(&area->segment_maps[0], 0,
1348  sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1349  area->high_segment_index = 0;
1350 
1351  /* Set up the segment map for this process's mapping. */
1352  segment_map = &area->segment_maps[0];
1353  segment_map->segment = segment; /* NULL for in-place */
1354  segment_map->mapped_address = place;
1355  segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
1356  segment_map->fpm = (FreePageManager *)
1357  (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)));
1358  segment_map->pagemap = (dsa_pointer *)
1359  (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)) +
1360  MAXALIGN(sizeof(FreePageManager)));
1361 
1362  /* Bump the reference count. */
1363  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1364  if (control->refcnt == 0)
1365  {
1366  /* We can't attach to a DSA area that has already been destroyed. */
1367  ereport(ERROR,
1368  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1369  errmsg("could not attach to dynamic shared area")));
1370  }
1371  ++control->refcnt;
1372  area->freed_segment_counter = area->control->freed_segment_counter;
1373  LWLockRelease(DSA_AREA_LOCK(area));
1374 
1375  return area;
1376 }
1377 
1378 /*
1379  * Add a new span to fullness class 1 of the indicated pool.
1380  */
1381 static void
1382 init_span(dsa_area *area,
1383  dsa_pointer span_pointer,
1384  dsa_area_pool *pool, dsa_pointer start, size_t npages,
1385  uint16 size_class)
1386 {
1387  dsa_area_span *span = dsa_get_address(area, span_pointer);
1388  size_t obsize = dsa_size_classes[size_class];
1389 
1390  /*
1391  * The per-pool lock must be held because we manipulate the span list for
1392  * this pool.
1393  */
1394  Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1395 
1396  /* Push this span onto the front of the span list for fullness class 1. */
1397  if (DsaPointerIsValid(pool->spans[1]))
1398  {
1399  dsa_area_span *head = (dsa_area_span *)
1400  dsa_get_address(area, pool->spans[1]);
1401 
1402  head->prevspan = span_pointer;
1403  }
1404  span->pool = DsaAreaPoolToDsaPointer(area, pool);
1405  span->nextspan = pool->spans[1];
1406  span->prevspan = InvalidDsaPointer;
1407  pool->spans[1] = span_pointer;
1408 
1409  span->start = start;
1410  span->npages = npages;
1411  span->size_class = size_class;
1412  span->ninitialized = 0;
1413  if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1414  {
1415  /*
1416  * A block-of-spans contains its own descriptor, so mark one object as
1417  * initialized and reduce the count of allocatable objects by one.
1418  * Doing this here has the side effect of also reducing nmax by one,
1419  * which is important to make sure we free this object at the correct
1420  * time.
1421  */
1422  span->ninitialized = 1;
1423  span->nallocatable = FPM_PAGE_SIZE / obsize - 1;
1424  }
1425  else if (size_class != DSA_SCLASS_SPAN_LARGE)
1426  span->nallocatable = DSA_SUPERBLOCK_SIZE / obsize;
1427  span->firstfree = DSA_SPAN_NOTHING_FREE;
1428  span->nmax = span->nallocatable;
1429  span->fclass = 1;
1430 }
1431 
1432 /*
1433  * Transfer the first span in one fullness class to the head of another
1434  * fullness class.
1435  */
1436 static bool
1437 transfer_first_span(dsa_area *area,
1438  dsa_area_pool *pool, int fromclass, int toclass)
1439 {
1440  dsa_pointer span_pointer;
1441  dsa_area_span *span;
1442  dsa_area_span *nextspan;
1443 
1444  /* Can't do it if source list is empty. */
1445  span_pointer = pool->spans[fromclass];
1446  if (!DsaPointerIsValid(span_pointer))
1447  return false;
1448 
1449  /* Remove span from head of source list. */
1450  span = dsa_get_address(area, span_pointer);
1451  pool->spans[fromclass] = span->nextspan;
1452  if (DsaPointerIsValid(span->nextspan))
1453  {
1454  nextspan = (dsa_area_span *)
1455  dsa_get_address(area, span->nextspan);
1456  nextspan->prevspan = InvalidDsaPointer;
1457  }
1458 
1459  /* Add span to head of target list. */
1460  span->nextspan = pool->spans[toclass];
1461  pool->spans[toclass] = span_pointer;
1462  if (DsaPointerIsValid(span->nextspan))
1463  {
1464  nextspan = (dsa_area_span *)
1465  dsa_get_address(area, span->nextspan);
1466  nextspan->prevspan = span_pointer;
1467  }
1468  span->fclass = toclass;
1469 
1470  return true;
1471 }
1472 
1473 /*
1474  * Allocate one object of the requested size class from the given area.
1475  */
1476 static inline dsa_pointer
1477 alloc_object(dsa_area *area, int size_class)
1478 {
1479  dsa_area_pool *pool = &area->control->pools[size_class];
1480  dsa_area_span *span;
1481  dsa_pointer block;
1482  dsa_pointer result;
1483  char *object;
1484  size_t size;
1485 
1486  /*
1487  * Even though ensure_active_superblock can in turn call alloc_object if
1488  * it needs to allocate a new span, that's always from a different pool,
1489  * and the order of lock acquisition is always the same, so it's OK that
1490  * we hold this lock for the duration of this function.
1491  */
1492  Assert(!LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1493  LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1494 
1495  /*
1496  * If there's no active superblock, we must successfully obtain one or
1497  * fail the request.
1498  */
1499  if (!DsaPointerIsValid(pool->spans[1]) &&
1500  !ensure_active_superblock(area, pool, size_class))
1501  {
1502  result = InvalidDsaPointer;
1503  }
1504  else
1505  {
1506  /*
1507  * There should be a block in fullness class 1 at this point, and it
1508  * should never be completely full. Thus we can either pop an object
1509  * from the free list or, failing that, initialize a new object.
1510  */
1511  Assert(DsaPointerIsValid(pool->spans[1]));
1512  span = (dsa_area_span *)
1513  dsa_get_address(area, pool->spans[1]);
1514  Assert(span->nallocatable > 0);
1515  block = span->start;
1516  Assert(size_class < DSA_NUM_SIZE_CLASSES);
1517  size = dsa_size_classes[size_class];
1518  if (span->firstfree != DSA_SPAN_NOTHING_FREE)
1519  {
1520  result = block + span->firstfree * size;
1521  object = dsa_get_address(area, result);
1522  span->firstfree = NextFreeObjectIndex(object);
1523  }
1524  else
1525  {
1526  result = block + span->ninitialized * size;
1527  ++span->ninitialized;
1528  }
1529  --span->nallocatable;
1530 
1531  /* If it's now full, move it to the highest-numbered fullness class. */
1532  if (span->nallocatable == 0)
1533  transfer_first_span(area, pool, 1, DSA_FULLNESS_CLASSES - 1);
1534  }
1535 
1536  Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1537  LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1538 
1539  return result;
1540 }
1541 
1542 /*
1543  * Ensure an active (i.e. fullness class 1) superblock, unless all existing
1544  * superblocks are completely full and no more can be allocated.
1545  *
1546  * Fullness classes K of 0..N are loosely intended to represent blocks whose
1547  * utilization percentage is at least K/N, but we only enforce this rigorously
1548  * for the highest-numbered fullness class, which always contains exactly
1549  * those blocks that are completely full. It's otherwise acceptable for a
1550  * block to be in a higher-numbered fullness class than the one to which it
1551  * logically belongs. In addition, the active block, which is always the
1552  * first block in fullness class 1, is permitted to have a higher allocation
1553  * percentage than would normally be allowable for that fullness class; we
1554  * don't move it until it's completely full, and then it goes to the
1555  * highest-numbered fullness class.
1556  *
1557  * It might seem odd that the active block is the head of fullness class 1
1558  * rather than fullness class 0, but experience with other allocators has
1559  * shown that it's usually better to allocate from a block that's moderately
1560  * full rather than one that's nearly empty. Insofar as is reasonably
1561  * possible, we want to avoid performing new allocations in a block that would
1562  * otherwise become empty soon.
1563  */
1564 static bool
1565 ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
1566  int size_class)
1567 {
1568  dsa_pointer span_pointer;
1569  dsa_pointer start_pointer;
1570  size_t obsize = dsa_size_classes[size_class];
1571  size_t nmax;
1572  int fclass;
1573  size_t npages = 1;
1574  size_t first_page;
1575  size_t i;
1576  dsa_segment_map *segment_map;
1577 
1578  Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1579 
1580  /*
1581  * Compute the number of objects that will fit in a block of this size
1582  * class. Span-of-spans blocks are just a single page, and the first
1583  * object isn't available for use because it describes the block-of-spans
1584  * itself.
1585  */
1586  if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1587  nmax = FPM_PAGE_SIZE / obsize - 1;
1588  else
1589  nmax = DSA_SUPERBLOCK_SIZE / obsize;
1590 
1591  /*
1592  * If fullness class 1 is empty, try to find a span to put in it by
1593  * scanning higher-numbered fullness classes (excluding the last one,
1594  * whose blocks are certain to all be completely full).
1595  */
1596  for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1597  {
1598  span_pointer = pool->spans[fclass];
1599 
1600  while (DsaPointerIsValid(span_pointer))
1601  {
1602  int tfclass;
1603  dsa_area_span *span;
1604  dsa_area_span *nextspan;
1605  dsa_area_span *prevspan;
1606  dsa_pointer next_span_pointer;
1607 
1608  span = (dsa_area_span *)
1609  dsa_get_address(area, span_pointer);
1610  next_span_pointer = span->nextspan;
1611 
1612  /* Figure out what fullness class should contain this span. */
1613  tfclass = (nmax - span->nallocatable)
1614  * (DSA_FULLNESS_CLASSES - 1) / nmax;
1615 
1616  /* Look up next span. */
1617  if (DsaPointerIsValid(span->nextspan))
1618  nextspan = (dsa_area_span *)
1619  dsa_get_address(area, span->nextspan);
1620  else
1621  nextspan = NULL;
1622 
1623  /*
1624  * If utilization has dropped enough that this now belongs in some
1625  * other fullness class, move it there.
1626  */
1627  if (tfclass < fclass)
1628  {
1629  /* Remove from the current fullness class list. */
1630  if (pool->spans[fclass] == span_pointer)
1631  {
1632  /* It was the head; remove it. */
1633  Assert(!DsaPointerIsValid(span->prevspan));
1634  pool->spans[fclass] = span->nextspan;
1635  if (nextspan != NULL)
1636  nextspan->prevspan = InvalidDsaPointer;
1637  }
1638  else
1639  {
1640  /* It was not the head. */
1641  Assert(DsaPointerIsValid(span->prevspan));
1642  prevspan = (dsa_area_span *)
1643  dsa_get_address(area, span->prevspan);
1644  prevspan->nextspan = span->nextspan;
1645  }
1646  if (nextspan != NULL)
1647  nextspan->prevspan = span->prevspan;
1648 
1649  /* Push onto the head of the new fullness class list. */
1650  span->nextspan = pool->spans[tfclass];
1651  pool->spans[tfclass] = span_pointer;
1652  span->prevspan = InvalidDsaPointer;
1653  if (DsaPointerIsValid(span->nextspan))
1654  {
1655  nextspan = (dsa_area_span *)
1656  dsa_get_address(area, span->nextspan);
1657  nextspan->prevspan = span_pointer;
1658  }
1659  span->fclass = tfclass;
1660  }
1661 
1662  /* Advance to next span on list. */
1663  span_pointer = next_span_pointer;
1664  }
1665 
1666  /* Stop now if we found a suitable block. */
1667  if (DsaPointerIsValid(pool->spans[1]))
1668  return true;
1669  }
1670 
1671  /*
1672  * If there are no blocks that properly belong in fullness class 1, pick
1673  * one from some other fullness class and move it there anyway, so that we
1674  * have an allocation target. Our last choice is to transfer a block
1675  * that's almost empty (and might become completely empty soon if left
1676  * alone), but even that is better than failing, which is what we must do
1677  * if there are no blocks at all with free space.
1678  */
1679  Assert(!DsaPointerIsValid(pool->spans[1]));
1680  for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1681  if (transfer_first_span(area, pool, fclass, 1))
1682  return true;
1683  if (!DsaPointerIsValid(pool->spans[1]) &&
1684  transfer_first_span(area, pool, 0, 1))
1685  return true;
1686 
1687  /*
1688  * We failed to find an existing span with free objects, so we need to
1689  * allocate a new superblock and construct a new span to manage it.
1690  *
1691  * First, get a dsa_area_span object to describe the new superblock
1692  * ... unless this allocation is for a dsa_area_span object, in which case
1693  * that's surely not going to work. We handle that case by storing the
1694  * span describing a block-of-spans inline.
1695  */
1696  if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1697  {
1698  span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
1699  if (!DsaPointerIsValid(span_pointer))
1700  return false;
1701  npages = DSA_PAGES_PER_SUPERBLOCK;
1702  }
1703 
1704  /* Find or create a segment and allocate the superblock. */
1705  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1706  segment_map = get_best_segment(area, npages);
1707  if (segment_map == NULL)
1708  {
1709  segment_map = make_new_segment(area, npages);
1710  if (segment_map == NULL)
1711  {
1712  LWLockRelease(DSA_AREA_LOCK(area));
1713  return false;
1714  }
1715  }
1716 
1717  /*
1718  * This shouldn't happen: get_best_segment() or make_new_segment()
1719  * promised that we can successfully allocate npages.
1720  */
1721  if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
1722  elog(FATAL,
1723  "dsa_allocate could not find %zu free pages for superblock",
1724  npages);
1725  LWLockRelease(DSA_AREA_LOCK(area));
1726 
1727  /* Compute the start of the superblock. */
1728  start_pointer =
1729  DSA_MAKE_POINTER(get_segment_index(area, segment_map),
1730  first_page * FPM_PAGE_SIZE);
1731 
1732  /*
1733  * If this is a block-of-spans, carve the descriptor right out of the
1734  * allocated space.
1735  */
1736  if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1737  {
1738  /*
1739  * We have a pointer into the segment. We need to build a dsa_pointer
1740  * from the segment index and offset into the segment.
1741  */
1742  span_pointer = start_pointer;
1743  }
1744 
1745  /* Initialize span and pagemap. */
1746  init_span(area, span_pointer, pool, start_pointer, npages, size_class);
1747  for (i = 0; i < npages; ++i)
1748  segment_map->pagemap[first_page + i] = span_pointer;
1749 
1750  return true;
1751 }
1752 
1753 /*
1754  * Return the segment map corresponding to a given segment index, mapping the
1755  * segment in if necessary. For internal segment book-keeping, this is called
1756  * with the area lock held. It is also called by dsa_free and dsa_get_address
1757  * without any locking, relying on the fact they have a known live segment
1758  * index and they always call check_for_freed_segments to ensure that any
1759  * freed segment occupying the same slot is detached first.
1760  */
1761 static dsa_segment_map *
1762 get_segment_by_index(dsa_area *area, dsa_segment_index index)
1763 {
1764  if (unlikely(area->segment_maps[index].mapped_address == NULL))
1765  {
1766  dsm_handle handle;
1767  dsm_segment *segment;
1768  dsa_segment_map *segment_map;
1769  ResourceOwner oldowner;
1770 
1771  /*
1772  * If we are reached by dsa_free or dsa_get_address, there must be at
1773  * least one object allocated in the referenced segment. Otherwise,
1774  * their caller has a double-free or access-after-free bug, which we
1775  * have no hope of detecting. So we know it's safe to access this
1776  * array slot without holding a lock; it won't change underneath us.
1777  * Furthermore, we know that we can see the latest contents of the
1778  * slot, as explained in check_for_freed_segments, which those
1779  * functions call before arriving here.
1780  */
1781  handle = area->control->segment_handles[index];
1782 
1783  /* It's an error to try to access an unused slot. */
1784  if (handle == DSM_HANDLE_INVALID)
1785  elog(ERROR,
1786  "dsa_area could not attach to a segment that has been freed");
1787 
1788  oldowner = CurrentResourceOwner;
1789  CurrentResourceOwner = area->resowner;
1790  segment = dsm_attach(handle);
1791  CurrentResourceOwner = oldowner;
1792  if (segment == NULL)
1793  elog(ERROR, "dsa_area could not attach to segment");
1794  segment_map = &area->segment_maps[index];
1795  segment_map->segment = segment;
1796  segment_map->mapped_address = dsm_segment_address(segment);
1797  segment_map->header =
1798  (dsa_segment_header *) segment_map->mapped_address;
1799  segment_map->fpm = (FreePageManager *)
1800  (segment_map->mapped_address +
1801  MAXALIGN(sizeof(dsa_segment_header)));
1802  segment_map->pagemap = (dsa_pointer *)
1803  (segment_map->mapped_address +
1804  MAXALIGN(sizeof(dsa_segment_header)) +
1805  MAXALIGN(sizeof(FreePageManager)));
1806 
1807  /* Remember the highest index this backend has ever mapped. */
1808  if (area->high_segment_index < index)
1809  area->high_segment_index = index;
1810 
1811  Assert(segment_map->header->magic ==
1812  (DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ index));
1813  }
1814 
1815  /*
1816  * Callers of dsa_get_address() and dsa_free() don't hold the area lock,
1817  * but it's a bug in the calling code and undefined behavior if the
1818  * address is not live (i.e. if the segment might possibly have been freed,
1819  * they're trying to use a dangling pointer).
1820  *
1821  * For dsa.c code that holds the area lock to manipulate segment_bins
1822  * lists, it would be a bug if we ever reach a freed segment here. After
1823  * it's marked as freed, the only thing any backend should do with it is
1824  * unmap it, and it should always have done that in
1825  * check_for_freed_segments_locked() before arriving here to resolve an
1826  * index to a segment_map.
1827  *
1828  * Either way we can assert that we aren't returning a freed segment.
1829  */
1830  Assert(!area->segment_maps[index].header->freed);
1831 
1832  return &area->segment_maps[index];
1833 }
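 /*
  * A minimal sketch of the calling pattern described above, loosely modeled
  * on dsa_get_address() (simplified: the real function also handles
  * InvalidDsaPointer and asserts that the offset is within the segment):
  *
  *     check_for_freed_segments(area);
  *     index = DSA_EXTRACT_SEGMENT_NUMBER(dp);
  *     segment_map = get_segment_by_index(area, index);
  *     return segment_map->mapped_address + DSA_EXTRACT_OFFSET(dp);
  */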
1834 
1835 /*
1836  * Return a superblock to the free page manager. If the underlying segment
1837  * has become entirely free, then return it to the operating system.
1838  *
1839  * The appropriate pool lock must be held.
1840  */
1841 static void
1842 destroy_superblock(dsa_area *area, dsa_pointer span_pointer)
1843 {
1844  dsa_area_span *span = dsa_get_address(area, span_pointer);
1845  int size_class = span->size_class;
1846  dsa_segment_map *segment_map;
1847 
1848 
1849  /* Remove it from its fullness class list. */
1850  unlink_span(area, span);
1851 
1852  /*
1853  * Note: Here we acquire the area lock while we already hold a per-pool
1854  * lock. We never hold the area lock and then take a pool lock, or we
1855  * could deadlock.
1856  */
1857  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1858  check_for_freed_segments_locked(area);
1859  segment_map =
1860  get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start));
1861  FreePageManagerPut(segment_map->fpm,
1862  DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
1863  span->npages);
1864  /* Check if the segment is now entirely free. */
1865  if (fpm_largest(segment_map->fpm) == segment_map->header->usable_pages)
1866  {
1867  dsa_segment_index index = get_segment_index(area, segment_map);
1868 
1869  /* If it's not the segment with extra control data, free it. */
1870  if (index != 0)
1871  {
1872  /*
1873  * Give it back to the OS, and allow other backends to detect that
1874  * they need to detach.
1875  */
1876  unlink_segment(area, segment_map);
1877  segment_map->header->freed = true;
1878  Assert(area->control->total_segment_size >=
1879  segment_map->header->size);
1880  area->control->total_segment_size -=
1881  segment_map->header->size;
1882  dsm_unpin_segment(dsm_segment_handle(segment_map->segment));
1883  dsm_detach(segment_map->segment);
1884  area->control->segment_handles[index] = DSM_HANDLE_INVALID;
1885  ++area->control->freed_segment_counter;
1886  segment_map->segment = NULL;
1887  segment_map->header = NULL;
1888  segment_map->mapped_address = NULL;
1889  }
1890  }
1891 
1892  /* Move segment to appropriate bin if necessary. */
1893  if (segment_map->header != NULL)
1894  rebin_segment(area, segment_map);
1895 
1896  LWLockRelease(DSA_AREA_LOCK(area));
1897 
1898  /*
1899  * Span-of-spans blocks store the span which describes them within the
1900  * block itself, so freeing the storage implicitly frees the descriptor
1901  * also. If this is a block of any other type, we need to separately free
1902  * the span object also. This recursive call to dsa_free will acquire the
1903  * span pool's lock. We can't deadlock because the acquisition order is
1904  * always some other pool and then the span pool.
1905  */
1906  if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1907  dsa_free(area, span_pointer);
1908 }
1909 
1910 static void
1911 unlink_span(dsa_area *area, dsa_area_span *span)
1912 {
1913  if (DsaPointerIsValid(span->nextspan))
1914  {
1915  dsa_area_span *next = dsa_get_address(area, span->nextspan);
1916 
1917  next->prevspan = span->prevspan;
1918  }
1919  if (DsaPointerIsValid(span->prevspan))
1920  {
1921  dsa_area_span *prev = dsa_get_address(area, span->prevspan);
1922 
1923  prev->nextspan = span->nextspan;
1924  }
1925  else
1926  {
1927  dsa_area_pool *pool = dsa_get_address(area, span->pool);
1928 
1929  pool->spans[span->fclass] = span->nextspan;
1930  }
1931 }
1932 
1933 static void
1934 add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
1935  dsa_pointer span_pointer,
1936  int fclass)
1937 {
1938  dsa_area_pool *pool = dsa_get_address(area, span->pool);
1939 
1940  if (DsaPointerIsValid(pool->spans[fclass]))
1941  {
1942  dsa_area_span *head = dsa_get_address(area,
1943  pool->spans[fclass]);
1944 
1945  head->prevspan = span_pointer;
1946  }
1947  span->prevspan = InvalidDsaPointer;
1948  span->nextspan = pool->spans[fclass];
1949  pool->spans[fclass] = span_pointer;
1950  span->fclass = fclass;
1951 }
1952 
1953 /*
1954  * Detach from an area that was either created or attached to by this process.
1955  */
1956 void
1957 dsa_detach(dsa_area *area)
1958 {
1959  int i;
1960 
1961  /* Detach from all segments. */
1962  for (i = 0; i <= area->high_segment_index; ++i)
1963  if (area->segment_maps[i].segment != NULL)
1964  dsm_detach(area->segment_maps[i].segment);
1965 
1966  /*
1967  * Note that 'detaching' (= detaching from DSM segments) doesn't include
1968  * 'releasing' (= adjusting the reference count). It would be nice to
1969  * combine these operations, but client code might never get around to
1970  * calling dsa_detach because of an error path, and a detach hook on any
1971  * particular segment is too late to detach other segments in the area
1972  * without risking a 'leak' warning in the non-error path.
1973  */
1974 
1975  /* Free the backend-local area object. */
1976  pfree(area);
1977 }
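 /*
  * A minimal usage sketch of the lifecycle that ends in dsa_detach(),
  * assuming a caller-supplied tranche_id and omitting error handling;
  * dsa_allocate() is the conventional wrapper around
  * dsa_allocate_extended():
  *
  *     dsa_area   *area = dsa_create(tranche_id);
  *     dsa_pointer dp = dsa_allocate(area, 42);
  *     char       *p = dsa_get_address(area, dp);
  *
  *     ... use p locally, share dp with other backends ...
  *
  *     dsa_free(area, dp);
  *     dsa_detach(area);    (unmaps segments; the reference count is
  *                           adjusted by the release functions, not here)
  */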
1978 
1979 /*
1980  * Unlink a segment from the bin that contains it.
1981  */
1982 static void
1983 unlink_segment(dsa_area *area, dsa_segment_map *segment_map)
1984 {
1985  if (segment_map->header->prev != DSA_SEGMENT_INDEX_NONE)
1986  {
1987  dsa_segment_map *prev;
1988 
1989  prev = get_segment_by_index(area, segment_map->header->prev);
1990  prev->header->next = segment_map->header->next;
1991  }
1992  else
1993  {
1994  Assert(area->control->segment_bins[segment_map->header->bin] ==
1995  get_segment_index(area, segment_map));
1996  area->control->segment_bins[segment_map->header->bin] =
1997  segment_map->header->next;
1998  }
1999  if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2000  {
2001  dsa_segment_map *next;
2002 
2003  next = get_segment_by_index(area, segment_map->header->next);
2004  next->header->prev = segment_map->header->prev;
2005  }
2006 }
2007 
2008 /*
2009  * Find a segment that could satisfy a request for 'npages' of contiguous
2010  * memory, or return NULL if none can be found. This may involve attaching to
2011  * segments that weren't previously attached so that we can query their free
2012  * pages map.
2013  */
2014 static dsa_segment_map *
2015 get_best_segment(dsa_area *area, size_t npages)
2016 {
2017  size_t bin;
2018 
2019  Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2020  check_for_freed_segments_locked(area);
2021 
2022  /*
2023  * Start searching from the first bin that *might* have enough contiguous
2024  * pages.
2025  */
2026  for (bin = contiguous_pages_to_segment_bin(npages);
2027  bin < DSA_NUM_SEGMENT_BINS;
2028  ++bin)
2029  {
2030  /*
2031  * The minimum number of contiguous free pages that any segment in this
2032  * bin should have. We'll re-bin any segment found to have fewer.
2033  */
2034  size_t threshold = (size_t) 1 << (bin - 1);
2035  dsa_segment_index segment_index;
2036 
2037  /* Search this bin for a segment with enough contiguous space. */
2038  segment_index = area->control->segment_bins[bin];
2039  while (segment_index != DSA_SEGMENT_INDEX_NONE)
2040  {
2041  dsa_segment_map *segment_map;
2042  dsa_segment_index next_segment_index;
2043  size_t contiguous_pages;
2044 
2045  segment_map = get_segment_by_index(area, segment_index);
2046  next_segment_index = segment_map->header->next;
2047  contiguous_pages = fpm_largest(segment_map->fpm);
2048 
2049  /* Not enough for the request, still enough for this bin. */
2050  if (contiguous_pages >= threshold && contiguous_pages < npages)
2051  {
2052  segment_index = next_segment_index;
2053  continue;
2054  }
2055 
2056  /* Re-bin it if it's no longer in the appropriate bin. */
2057  if (contiguous_pages < threshold)
2058  {
2059  rebin_segment(area, segment_map);
2060 
2061  /*
2062  * But fall through to see if it's enough to satisfy this
2063  * request anyway....
2064  */
2065  }
2066 
2067  /* Check if we are done. */
2068  if (contiguous_pages >= npages)
2069  return segment_map;
2070 
2071  /* Continue searching the same bin. */
2072  segment_index = next_segment_index;
2073  }
2074  }
2075 
2076  /* Not found. */
2077  return NULL;
2078 }
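 /*
  * A worked example of the bin arithmetic above: a request for npages = 5
  * starts at bin contiguous_pages_to_segment_bin(5) = 3, whose members
  * should have at least threshold = 1 << (3 - 1) = 4 contiguous free pages.
  * A segment found there whose fpm_largest() is 6 satisfies the request; one
  * whose largest run has shrunk to 2 pages is moved down to bin 2 by
  * rebin_segment(), and the search continues.
  */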
2079 
2080 /*
2081  * Create a new segment that can handle at least requested_pages. Returns
2082  * NULL if the requested total size limit or maximum allowed number of
2083  * segments would be exceeded.
2084  */
2085 static dsa_segment_map *
2086 make_new_segment(dsa_area *area, size_t requested_pages)
2087 {
2088  dsa_segment_index new_index;
2089  size_t metadata_bytes;
2090  size_t total_size;
2091  size_t total_pages;
2092  size_t usable_pages;
2093  dsa_segment_map *segment_map;
2094  dsm_segment *segment;
2095  ResourceOwner oldowner;
2096 
2097  Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2098 
2099  /* Find a segment slot that is not in use (linearly for now). */
2100  for (new_index = 1; new_index < DSA_MAX_SEGMENTS; ++new_index)
2101  {
2102  if (area->control->segment_handles[new_index] == DSM_HANDLE_INVALID)
2103  break;
2104  }
2105  if (new_index == DSA_MAX_SEGMENTS)
2106  return NULL;
2107 
2108  /*
2109  * If the total size limit is already exceeded, then we exit early and
2110  * avoid arithmetic wraparound in the unsigned expressions below.
2111  */
2112  if (area->control->total_segment_size >=
2113  area->control->max_total_segment_size)
2114  return NULL;
2115 
2116  /*
2117  * The size should be at least as big as requested, and at least big
2118  * enough to follow a geometric series that approximately doubles the
2119  * total storage each time we create a new segment. We use geometric
2120  * growth because the underlying DSM system isn't designed for large
2121  * numbers of segments (otherwise we might even consider just using one
2122  * DSM segment for each large allocation and for each superblock, and then
2123  * we wouldn't need to use FreePageManager).
2124  *
2125  * We decide on a total segment size first, so that we produce tidy
2126  * power-of-two sized segments. This is a good property to have if we
2127  * move to huge pages in the future. Then we work back to the number of
2128  * pages we can fit.
2129  */
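 /*
  * For example, assuming DSA_INITIAL_SEGMENT_SIZE is 1MB and
  * DSA_NUM_SEGMENTS_AT_EACH_SIZE is 2, successive segments would be sized
  * 1MB, 1MB, 2MB, 2MB, 4MB, 4MB, ..., capped by DSA_MAX_SEGMENT_SIZE and by
  * the area's remaining size budget, so the total area size roughly doubles
  * for every two new segments.
  */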
2130  total_size = DSA_INITIAL_SEGMENT_SIZE *
2131  ((size_t) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE));
2132  total_size = Min(total_size, DSA_MAX_SEGMENT_SIZE);
2133  total_size = Min(total_size,
2134  area->control->max_total_segment_size -
2135  area->control->total_segment_size);
2136 
2137  total_pages = total_size / FPM_PAGE_SIZE;
2138  metadata_bytes =
2139  MAXALIGN(sizeof(dsa_segment_header)) +
2140  MAXALIGN(sizeof(FreePageManager)) +
2141  sizeof(dsa_pointer) * total_pages;
2142 
2143  /* Add padding up to next page boundary. */
2144  if (metadata_bytes % FPM_PAGE_SIZE != 0)
2145  metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2146  if (total_size <= metadata_bytes)
2147  return NULL;
2148  usable_pages = (total_size - metadata_bytes) / FPM_PAGE_SIZE;
2149  Assert(metadata_bytes + usable_pages * FPM_PAGE_SIZE <= total_size);
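 /*
  * Illustrative arithmetic, assuming a 1MB segment, 4096-byte FPM pages and
  * 8-byte dsa_pointers: total_pages = 1048576 / 4096 = 256, so the page map
  * needs 256 * 8 = 2048 bytes; the segment header and FreePageManager fit
  * within the remaining padding, so metadata_bytes rounds up to a single
  * 4096-byte page and usable_pages = 255.
  */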
2150 
2151  /* See if that is enough... */
2152  if (requested_pages > usable_pages)
2153  {
2154  /*
2155  * We'll make an odd-sized segment, working forward from the requested
2156  * number of pages.
2157  */
2158  usable_pages = requested_pages;
2159  metadata_bytes =
2160  MAXALIGN(sizeof(dsa_segment_header)) +
2161  MAXALIGN(sizeof(FreePageManager)) +
2162  usable_pages * sizeof(dsa_pointer);
2163 
2164  /* Add padding up to next page boundary. */
2165  if (metadata_bytes % FPM_PAGE_SIZE != 0)
2166  metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2167  total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE;
2168 
2169  /* Is that too large for dsa_pointer's addressing scheme? */
2170  if (total_size > DSA_MAX_SEGMENT_SIZE)
2171  return NULL;
2172 
2173  /* Would that exceed the limit? */
2174  if (total_size > area->control->max_total_segment_size -
2175  area->control->total_segment_size)
2176  return NULL;
2177  }
2178 
2179  /* Create the segment. */
2180  oldowner = CurrentResourceOwner;
2181  CurrentResourceOwner = area->resowner;
2182  segment = dsm_create(total_size, 0);
2183  CurrentResourceOwner = oldowner;
2184  if (segment == NULL)
2185  return NULL;
2186  dsm_pin_segment(segment);
2187 
2188  /* Store the handle in shared memory to be found by index. */
2189  area->control->segment_handles[new_index] =
2190  dsm_segment_handle(segment);
2191  /* Track the highest segment index in the history of the area. */
2192  if (area->control->high_segment_index < new_index)
2193  area->control->high_segment_index = new_index;
2194  /* Track the highest segment index this backend has ever mapped. */
2195  if (area->high_segment_index < new_index)
2196  area->high_segment_index = new_index;
2197  /* Track total size of all segments. */
2198  area->control->total_segment_size += total_size;
2199  Assert(area->control->total_segment_size <=
2200  area->control->max_total_segment_size);
2201 
2202  /* Build a segment map for this segment in this backend. */
2203  segment_map = &area->segment_maps[new_index];
2204  segment_map->segment = segment;
2205  segment_map->mapped_address = dsm_segment_address(segment);
2206  segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
2207  segment_map->fpm = (FreePageManager *)
2208  (segment_map->mapped_address +
2209  MAXALIGN(sizeof(dsa_segment_header)));
2210  segment_map->pagemap = (dsa_pointer *)
2211  (segment_map->mapped_address +
2212  MAXALIGN(sizeof(dsa_segment_header)) +
2213  MAXALIGN(sizeof(FreePageManager)));
2214 
2215  /* Set up the free page map. */
2216  FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
2217  FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
2218  usable_pages);
2219 
2220  /* Set up the segment header and put it in the appropriate bin. */
2221  segment_map->header->magic =
2222  DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ new_index;
2223  segment_map->header->usable_pages = usable_pages;
2224  segment_map->header->size = total_size;
2225  segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
2226  segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2227  segment_map->header->next =
2228  area->control->segment_bins[segment_map->header->bin];
2229  segment_map->header->freed = false;
2230  area->control->segment_bins[segment_map->header->bin] = new_index;
2231  if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2232  {
2233  dsa_segment_map *next =
2234  get_segment_by_index(area, segment_map->header->next);
2235 
2236  Assert(next->header->bin == segment_map->header->bin);
2237  next->header->prev = new_index;
2238  }
2239 
2240  return segment_map;
2241 }
2242 
2243 /*
2244  * Check if any segments have been freed by destroy_superblock, so we can
2245  * detach from them in this backend. This function is called by
2246  * dsa_get_address and dsa_free to make sure that a dsa_pointer they have
2247  * received can be resolved to the correct segment.
2248  *
2249  * The danger we want to defend against is that there could be an old segment
2250  * mapped into a given slot in this backend, and the dsa_pointer they have
2251  * might refer to some new segment in the same slot. So those functions must
2252  * be sure to process, before they call get_segment_by_index, every
2253  * detach-from-freed-segment instruction that had been generated by the time
2254  * this process received the dsa_pointer.
2255  */
2256 static void
2257 check_for_freed_segments(dsa_area *area)
2258 {
2259  size_t freed_segment_counter;
2260 
2261  /*
2262  * Any other process that has freed a segment has incremented
2263  * freed_segment_counter while holding an LWLock, and that must precede
2264  * any backend creating a new segment in the same slot while holding an
2265  * LWLock, and that must precede the creation of any dsa_pointer pointing
2266  * into the new segment which might reach us here, and the caller must
2267  * have sent the dsa_pointer to this process using appropriate memory
2268  * synchronization (some kind of locking or atomic primitive or system
2269  * call). So all we need to do on the reading side is ask for the load of
2270  * freed_segment_counter to follow the caller's load of the dsa_pointer it
2271  * has, and we can be sure to detect any segments that had been freed as
2272  * of the time that the dsa_pointer reached this process.
2273  */
2274  pg_read_barrier();
2275  freed_segment_counter = area->control->freed_segment_counter;
2276  if (unlikely(area->freed_segment_counter != freed_segment_counter))
2277  {
2278  /* Check all currently mapped segments to find what's been freed. */
2279  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
2280  check_for_freed_segments_locked(area);
2281  LWLockRelease(DSA_AREA_LOCK(area));
2282  }
2283 }
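 /*
  * The argument above as a concrete timeline: backend A frees segment S in
  * slot 3 and increments freed_segment_counter under the area lock; backend
  * B later reuses slot 3 for a new segment S' and only then can create a
  * dsa_pointer into S'.  Any backend that legitimately receives that
  * dsa_pointer therefore also observes the incremented counter once it has
  * executed pg_read_barrier(), and so unmaps its stale mapping of S before
  * resolving slot 3.
  */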
2284 
2285 /*
2286  * Workhorse for check_for_freed_segments(), and also used directly in paths
2287  * where the area lock is already held. This should be called after acquiring
2288  * the lock but before looking up any segment by index number, to make sure we
2289  * unmap any stale segments that might have previously had the same index as a
2290  * current segment.
2291  */
2292 static void
2293 check_for_freed_segments_locked(dsa_area *area)
2294 {
2295  size_t freed_segment_counter;
2296  int i;
2297 
2298  Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2299  freed_segment_counter = area->control->freed_segment_counter;
2300  if (unlikely(area->freed_segment_counter != freed_segment_counter))
2301  {
2302  for (i = 0; i <= area->high_segment_index; ++i)
2303  {
2304  if (area->segment_maps[i].header != NULL &&
2305  area->segment_maps[i].header->freed)
2306  {
2307  dsm_detach(area->segment_maps[i].segment);
2308  area->segment_maps[i].segment = NULL;
2309  area->segment_maps[i].header = NULL;
2310  area->segment_maps[i].mapped_address = NULL;
2311  }
2312  }
2313  area->freed_segment_counter = freed_segment_counter;
2314  }
2315 }
2316 
2317 /*
2318  * Re-bin segment if it's no longer in the appropriate bin.
2319  */
2320 static void
2321 rebin_segment(dsa_area *area, dsa_segment_map *segment_map)
2322 {
2323  size_t new_bin;
2324  dsa_segment_index segment_index;
2325 
2326  new_bin = contiguous_pages_to_segment_bin(fpm_largest(segment_map->fpm));
2327  if (segment_map->header->bin == new_bin)
2328  return;
2329 
2330  /* Remove it from its current bin. */
2331  unlink_segment(area, segment_map);
2332 
2333  /* Push it onto the front of its new bin. */
2334  segment_index = get_segment_index(area, segment_map);
2335  segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2336  segment_map->header->next = area->control->segment_bins[new_bin];
2337  segment_map->header->bin = new_bin;
2338  area->control->segment_bins[new_bin] = segment_index;
2339  if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2340  {
2341  dsa_segment_map *next;
2342 
2343  next = get_segment_by_index(area, segment_map->header->next);
2344  Assert(next->header->bin == new_bin);
2345  next->header->prev = segment_index;
2346  }
2347 }