PostgreSQL Source Code  git master
dsa.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * dsa.c
4  * Dynamic shared memory areas.
5  *
6  * This module provides dynamic shared memory areas which are built on top of
7  * DSM segments. While dsm.c allows segments of memory of shared memory to be
8  * created and shared between backends, it isn't designed to deal with small
9  * objects. A DSA area is a shared memory heap usually backed by one or more
10  * DSM segments which can allocate memory using dsa_allocate() and dsa_free().
11  * Alternatively, it can be created in pre-existing shared memory, including a
12  * DSM segment, and then create extra DSM segments as required. Unlike the
13  * regular system heap, it deals in pseudo-pointers which must be converted to
14  * backend-local pointers before they are dereferenced. These pseudo-pointers
15  * can however be shared with other backends, and can be used to construct
16  * shared data structures.
17  *
18  * Each DSA area manages a set of DSM segments, adding new segments as
19  * required and detaching them when they are no longer needed. Each segment
20  * contains a number of 4KB pages, a free page manager for tracking
21  * consecutive runs of free pages, and a page map for tracking the source of
22  * objects allocated on each page. Allocation requests above 8KB are handled
23  * by choosing a segment and finding consecutive free pages in its free page
24  * manager. Allocation requests for smaller sizes are handled using pools of
25  * objects of a selection of sizes. Each pool consists of a number of 16 page
26  * (64KB) superblocks allocated in the same way as large objects. Allocation
27  * of large objects and new superblocks is serialized by a single LWLock, but
28  * allocation of small objects from pre-existing superblocks uses one LWLock
29  * per pool. Currently there is one pool, and therefore one lock, per size
30  * class. Per-core pools to increase concurrency and strategies for reducing
31  * the resulting fragmentation are areas for future research. Each superblock
32  * is managed with a 'span', which tracks the superblock's freelist. Free
33  * requests are handled by looking in the page map to find which span an
34  * address was allocated from, so that small objects can be returned to the
35  * appropriate free list, and large object pages can be returned directly to
36  * the free page map. When allocating, simple heuristics for selecting
37  * segments and superblocks try to encourage occupied memory to be
38  * concentrated, increasing the likelihood that whole superblocks can become
39  * empty and be returned to the free page manager, and whole segments can
40  * become empty and be returned to the operating system.
41  *
42  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
43  * Portions Copyright (c) 1994, Regents of the University of California
44  *
45  * IDENTIFICATION
46  * src/backend/utils/mmgr/dsa.c
47  *
48  *-------------------------------------------------------------------------
49  */
50 
51 #include "postgres.h"
52 
53 #include "port/atomics.h"
54 #include "port/pg_bitutils.h"
55 #include "storage/dsm.h"
56 #include "storage/lwlock.h"
57 #include "utils/dsa.h"
58 #include "utils/freepage.h"
59 #include "utils/memutils.h"
60 #include "utils/resowner.h"
61 
62 /*
63  * How many segments to create before we double the segment size. If this is
64  * low, then there is likely to be a lot of wasted space in the largest
65  * segment. If it is high, then we risk running out of segment slots (see
66  * dsm.c's limits on total number of segments), or limiting the total size
67  * an area can manage when using small pointers.
68  */
69 #define DSA_NUM_SEGMENTS_AT_EACH_SIZE 2
70 
71 /*
72  * The maximum number of DSM segments that an area can own, determined by
73  * the number of bits remaining (but capped at 1024).
74  */
75 #define DSA_MAX_SEGMENTS \
76  Min(1024, (1 << ((SIZEOF_DSA_POINTER * 8) - DSA_OFFSET_WIDTH)))
77 
78 /* The bitmask for extracting the offset from a dsa_pointer. */
79 #define DSA_OFFSET_BITMASK (((dsa_pointer) 1 << DSA_OFFSET_WIDTH) - 1)
80 
81 /* Number of pages (see FPM_PAGE_SIZE) per regular superblock. */
82 #define DSA_PAGES_PER_SUPERBLOCK 16
83 
84 /*
85  * A magic number used as a sanity check for following DSM segments belonging
86  * to a DSA area (this number will be XORed with the area handle and
87  * the segment index).
88  */
89 #define DSA_SEGMENT_HEADER_MAGIC 0x0ce26608
90 
91 /* Build a dsa_pointer given a segment number and offset. */
92 #define DSA_MAKE_POINTER(segment_number, offset) \
93  (((dsa_pointer) (segment_number) << DSA_OFFSET_WIDTH) | (offset))
94 
95 /* Extract the segment number from a dsa_pointer. */
96 #define DSA_EXTRACT_SEGMENT_NUMBER(dp) ((dp) >> DSA_OFFSET_WIDTH)
97 
98 /* Extract the offset from a dsa_pointer. */
99 #define DSA_EXTRACT_OFFSET(dp) ((dp) & DSA_OFFSET_BITMASK)
100 
101 /* The type used for index segment indexes (zero based). */
102 typedef size_t dsa_segment_index;
103 
104 /* Sentinel value for dsa_segment_index indicating 'none' or 'end'. */
105 #define DSA_SEGMENT_INDEX_NONE (~(dsa_segment_index)0)
106 
107 /*
108  * How many bins of segments do we have? The bins are used to categorize
109  * segments by their largest contiguous run of free pages.
110  */
111 #define DSA_NUM_SEGMENT_BINS 16
112 
113 /*
114  * What is the lowest bin that holds segments that *might* have n contiguous
115  * free pages? There is no point in looking in segments in lower bins; they
116  * definitely can't service a request for n free pages.
117  */
118 static inline size_t
120 {
121  size_t bin;
122 
123  if (n == 0)
124  bin = 0;
125  else
126  bin = pg_leftmost_one_pos_size_t(n) + 1;
127 
128  return Min(bin, DSA_NUM_SEGMENT_BINS - 1);
129 }
130 
131 /* Macros for access to locks. */
132 #define DSA_AREA_LOCK(area) (&area->control->lock)
133 #define DSA_SCLASS_LOCK(area, sclass) (&area->control->pools[sclass].lock)
134 
135 /*
136  * The header for an individual segment. This lives at the start of each DSM
137  * segment owned by a DSA area including the first segment (where it appears
138  * as part of the dsa_area_control struct).
139  */
140 typedef struct
141 {
142  /* Sanity check magic value. */
144  /* Total number of pages in this segment (excluding metadata area). */
145  size_t usable_pages;
146  /* Total size of this segment in bytes. */
147  size_t size;
148 
149  /*
150  * Index of the segment that precedes this one in the same segment bin, or
151  * DSA_SEGMENT_INDEX_NONE if this is the first one.
152  */
154 
155  /*
156  * Index of the segment that follows this one in the same segment bin, or
157  * DSA_SEGMENT_INDEX_NONE if this is the last one.
158  */
160  /* The index of the bin that contains this segment. */
161  size_t bin;
162 
163  /*
164  * A flag raised to indicate that this segment is being returned to the
165  * operating system and has been unpinned.
166  */
167  bool freed;
169 
170 /*
171  * Metadata for one superblock.
172  *
173  * For most blocks, span objects are stored out-of-line; that is, the span
174  * object is not stored within the block itself. But, as an exception, for a
175  * "span of spans", the span object is stored "inline". The allocation is
176  * always exactly one page, and the dsa_area_span object is located at
177  * the beginning of that page. The size class is DSA_SCLASS_BLOCK_OF_SPANS,
178  * and the remaining fields are used just as they would be in an ordinary
179  * block. We can't allocate spans out of ordinary superblocks because
180  * creating an ordinary superblock requires us to be able to allocate a span
181  * *first*. Doing it this way avoids that circularity.
182  */
183 typedef struct
184 {
185  dsa_pointer pool; /* Containing pool. */
186  dsa_pointer prevspan; /* Previous span. */
187  dsa_pointer nextspan; /* Next span. */
188  dsa_pointer start; /* Starting address. */
189  size_t npages; /* Length of span in pages. */
190  uint16 size_class; /* Size class. */
191  uint16 ninitialized; /* Maximum number of objects ever allocated. */
192  uint16 nallocatable; /* Number of objects currently allocatable. */
193  uint16 firstfree; /* First object on free list. */
194  uint16 nmax; /* Maximum number of objects ever possible. */
195  uint16 fclass; /* Current fullness class. */
196 } dsa_area_span;
197 
198 /*
199  * Given a pointer to an object in a span, access the index of the next free
200  * object in the same span (ie in the span's freelist) as an L-value.
201  */
202 #define NextFreeObjectIndex(object) (* (uint16 *) (object))
203 
204 /*
205  * Small allocations are handled by dividing a single block of memory into
206  * many small objects of equal size. The possible allocation sizes are
207  * defined by the following array. Larger size classes are spaced more widely
208  * than smaller size classes. We fudge the spacing for size classes >1kB to
209  * avoid space wastage: based on the knowledge that we plan to allocate 64kB
210  * blocks, we bump the maximum object size up to the largest multiple of
211  * 8 bytes that still lets us fit the same number of objects into one block.
212  *
213  * NB: Because of this fudging, if we were ever to use differently-sized blocks
214  * for small allocations, these size classes would need to be reworked to be
215  * optimal for the new size.
216  *
217  * NB: The optimal spacing for size classes, as well as the size of the blocks
218  * out of which small objects are allocated, is not a question that has one
219  * right answer. Some allocators (such as tcmalloc) use more closely-spaced
220  * size classes than we do here, while others (like aset.c) use more
221  * widely-spaced classes. Spacing the classes more closely avoids wasting
222  * memory within individual chunks, but also means a larger number of
223  * potentially-unfilled blocks.
224  */
225 static const uint16 dsa_size_classes[] = {
226  sizeof(dsa_area_span), 0, /* special size classes */
227  8, 16, 24, 32, 40, 48, 56, 64, /* 8 classes separated by 8 bytes */
228  80, 96, 112, 128, /* 4 classes separated by 16 bytes */
229  160, 192, 224, 256, /* 4 classes separated by 32 bytes */
230  320, 384, 448, 512, /* 4 classes separated by 64 bytes */
231  640, 768, 896, 1024, /* 4 classes separated by 128 bytes */
232  1280, 1560, 1816, 2048, /* 4 classes separated by ~256 bytes */
233  2616, 3120, 3640, 4096, /* 4 classes separated by ~512 bytes */
234  5456, 6552, 7280, 8192 /* 4 classes separated by ~1024 bytes */
235 };
236 #define DSA_NUM_SIZE_CLASSES lengthof(dsa_size_classes)
237 
238 /* Special size classes. */
239 #define DSA_SCLASS_BLOCK_OF_SPANS 0
240 #define DSA_SCLASS_SPAN_LARGE 1
241 
242 /*
243  * The following lookup table is used to map the size of small objects
244  * (less than 1kB) onto the corresponding size class. To use this table,
245  * round the size of the object up to the next multiple of 8 bytes, and then
246  * index into this array.
247  */
248 static const uint8 dsa_size_class_map[] = {
249  2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13,
250  14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17,
251  18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
252  20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21,
253  22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
254  23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
255  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
256  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25
257 };
258 #define DSA_SIZE_CLASS_MAP_QUANTUM 8
259 
260 /*
261  * Superblocks are binned by how full they are. Generally, each fullness
262  * class corresponds to one quartile, but the block being used for
263  * allocations is always at the head of the list for fullness class 1,
264  * regardless of how full it really is.
265  */
266 #define DSA_FULLNESS_CLASSES 4
267 
268 /*
269  * A dsa_area_pool represents a set of objects of a given size class.
270  *
271  * Perhaps there should be multiple pools for the same size class for
272  * contention avoidance, but for now there is just one!
273  */
274 typedef struct
275 {
276  /* A lock protecting access to this pool. */
278  /* A set of linked lists of spans, arranged by fullness. */
280  /* Should we pad this out to a cacheline boundary? */
281 } dsa_area_pool;
282 
283 /*
284  * The control block for an area. This lives in shared memory, at the start of
285  * the first DSM segment controlled by this area.
286  */
287 typedef struct
288 {
289  /* The segment header for the first segment. */
291  /* The handle for this area. */
293  /* The handles of the segments owned by this area. */
294  dsm_handle segment_handles[DSA_MAX_SEGMENTS];
295  /* Lists of segments, binned by maximum contiguous run of free pages. */
297  /* The object pools for each size class. */
299  /* initial allocation segment size */
301  /* maximum allocation segment size */
303  /* The total size of all active segments. */
305  /* The maximum total size of backing storage we are allowed. */
307  /* Highest used segment index in the history of this area. */
309  /* The reference count for this area. */
310  int refcnt;
311  /* A flag indicating that this area has been pinned. */
312  bool pinned;
313  /* The number of times that segments have been freed. */
315  /* The LWLock tranche ID. */
317  /* The general lock (protects everything except object pools). */
320 
321 /* Given a pointer to a pool, find a dsa_pointer. */
322 #define DsaAreaPoolToDsaPointer(area, p) \
323  DSA_MAKE_POINTER(0, (char *) p - (char *) area->control)
324 
325 /*
326  * A dsa_segment_map is stored within the backend-private memory of each
327  * individual backend. It holds the base address of the segment within that
328  * backend, plus the addresses of key objects within the segment. Those
329  * could instead be derived from the base address but it's handy to have them
330  * around.
331  */
332 typedef struct
333 {
334  dsm_segment *segment; /* DSM segment */
335  char *mapped_address; /* Address at which segment is mapped */
336  dsa_segment_header *header; /* Header (same as mapped_address) */
337  FreePageManager *fpm; /* Free page manager within segment. */
338  dsa_pointer *pagemap; /* Page map within segment. */
340 
341 /*
342  * Per-backend state for a storage area. Backends obtain one of these by
343  * creating an area or attaching to an existing one using a handle. Each
344  * process that needs to use an area uses its own object to track where the
345  * segments are mapped.
346  */
347 struct dsa_area
348 {
349  /* Pointer to the control object in shared memory. */
351 
352  /*
353  * All the mappings are owned by this. The dsa_area itself is not
354  * directly tracked by the ResourceOwner, but the effect is the same. NULL
355  * if the attachment has session lifespan, i.e if dsa_pin_mapping() has
356  * been called.
357  */
359 
360  /*
361  * This backend's array of segment maps, ordered by segment index
362  * corresponding to control->segment_handles. Some of the area's segments
363  * may not be mapped in this backend yet, and some slots may have been
364  * freed and need to be detached; these operations happen on demand.
365  */
367 
368  /* The highest segment index this backend has ever mapped. */
370 
371  /* The last observed freed_segment_counter. */
373 };
374 
375 #define DSA_SPAN_NOTHING_FREE ((uint16) -1)
376 #define DSA_SUPERBLOCK_SIZE (DSA_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE)
377 
378 /* Given a pointer to a segment_map, obtain a segment index number. */
379 #define get_segment_index(area, segment_map_ptr) \
380  (segment_map_ptr - &area->segment_maps[0])
381 
382 static void init_span(dsa_area *area, dsa_pointer span_pointer,
383  dsa_area_pool *pool, dsa_pointer start, size_t npages,
384  uint16 size_class);
385 static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool,
386  int fromclass, int toclass);
387 static inline dsa_pointer alloc_object(dsa_area *area, int size_class);
388 static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
389  int size_class);
392 static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer);
393 static void unlink_span(dsa_area *area, dsa_area_span *span);
394 static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
395  dsa_pointer span_pointer, int fclass);
396 static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map);
397 static dsa_segment_map *get_best_segment(dsa_area *area, size_t npages);
398 static dsa_segment_map *make_new_segment(dsa_area *area, size_t requested_pages);
399 static dsa_area *create_internal(void *place, size_t size,
400  int tranche_id,
401  dsm_handle control_handle,
402  dsm_segment *control_segment,
403  size_t init_segment_size,
404  size_t max_segment_size);
405 static dsa_area *attach_internal(void *place, dsm_segment *segment,
406  dsa_handle handle);
407 static void check_for_freed_segments(dsa_area *area);
408 static void check_for_freed_segments_locked(dsa_area *area);
409 static void rebin_segment(dsa_area *area, dsa_segment_map *segment_map);
410 
411 /*
412  * Create a new shared area in a new DSM segment. Further DSM segments will
413  * be allocated as required to extend the available space.
414  *
415  * We can't allocate a LWLock tranche_id within this function, because tranche
416  * IDs are a scarce resource; there are only 64k available, using low numbers
417  * when possible matters, and we have no provision for recycling them. So,
418  * we require the caller to provide one.
419  */
420 dsa_area *
421 dsa_create_ext(int tranche_id, size_t init_segment_size, size_t max_segment_size)
422 {
423  dsm_segment *segment;
424  dsa_area *area;
425 
426  /*
427  * Create the DSM segment that will hold the shared control object and the
428  * first segment of usable space.
429  */
430  segment = dsm_create(init_segment_size, 0);
431 
432  /*
433  * All segments backing this area are pinned, so that DSA can explicitly
434  * control their lifetime (otherwise a newly created segment belonging to
435  * this area might be freed when the only backend that happens to have it
436  * mapped in ends, corrupting the area).
437  */
438  dsm_pin_segment(segment);
439 
440  /* Create a new DSA area with the control object in this segment. */
441  area = create_internal(dsm_segment_address(segment),
442  init_segment_size,
443  tranche_id,
444  dsm_segment_handle(segment), segment,
445  init_segment_size, max_segment_size);
446 
447  /* Clean up when the control segment detaches. */
450 
451  return area;
452 }
453 
454 /*
455  * Create a new shared area in an existing shared memory space, which may be
456  * either DSM or Postmaster-initialized memory. DSM segments will be
457  * allocated as required to extend the available space, though that can be
458  * prevented with dsa_set_size_limit(area, size) using the same size provided
459  * to dsa_create_in_place.
460  *
461  * Areas created in-place must eventually be released by the backend that
462  * created them and all backends that attach to them. This can be done
463  * explicitly with dsa_release_in_place, or, in the special case that 'place'
464  * happens to be in a pre-existing DSM segment, by passing in a pointer to the
465  * segment so that a detach hook can be registered with the containing DSM
466  * segment.
467  *
468  * See dsa_create() for a note about the tranche arguments.
469  */
470 dsa_area *
471 dsa_create_in_place_ext(void *place, size_t size,
472  int tranche_id, dsm_segment *segment,
473  size_t init_segment_size, size_t max_segment_size)
474 {
475  dsa_area *area;
476 
477  area = create_internal(place, size, tranche_id,
478  DSM_HANDLE_INVALID, NULL,
479  init_segment_size, max_segment_size);
480 
481  /*
482  * Clean up when the control segment detaches, if a containing DSM segment
483  * was provided.
484  */
485  if (segment != NULL)
487  PointerGetDatum(place));
488 
489  return area;
490 }
491 
492 /*
493  * Obtain a handle that can be passed to other processes so that they can
494  * attach to the given area. Cannot be called for areas created with
495  * dsa_create_in_place.
496  */
499 {
501  return area->control->handle;
502 }
503 
504 /*
505  * Attach to an area given a handle generated (possibly in another process) by
506  * dsa_get_handle. The area must have been created with dsa_create (not
507  * dsa_create_in_place).
508  */
509 dsa_area *
511 {
512  dsm_segment *segment;
513  dsa_area *area;
514 
515  /*
516  * An area handle is really a DSM segment handle for the first segment, so
517  * we go ahead and attach to that.
518  */
519  segment = dsm_attach(handle);
520  if (segment == NULL)
521  ereport(ERROR,
522  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
523  errmsg("could not attach to dynamic shared area")));
524 
525  area = attach_internal(dsm_segment_address(segment), segment, handle);
526 
527  /* Clean up when the control segment detaches. */
530 
531  return area;
532 }
533 
534 /*
535  * Attach to an area that was created with dsa_create_in_place. The caller
536  * must somehow know the location in memory that was used when the area was
537  * created, though it may be mapped at a different virtual address in this
538  * process.
539  *
540  * See dsa_create_in_place for note about releasing in-place areas, and the
541  * optional 'segment' argument which can be provided to allow automatic
542  * release if the containing memory happens to be a DSM segment.
543  */
544 dsa_area *
545 dsa_attach_in_place(void *place, dsm_segment *segment)
546 {
547  dsa_area *area;
548 
549  area = attach_internal(place, NULL, DSA_HANDLE_INVALID);
550 
551  /*
552  * Clean up when the control segment detaches, if a containing DSM segment
553  * was provided.
554  */
555  if (segment != NULL)
557  PointerGetDatum(place));
558 
559  return area;
560 }
561 
562 /*
563  * Release a DSA area that was produced by dsa_create_in_place or
564  * dsa_attach_in_place. The 'segment' argument is ignored but provides an
565  * interface suitable for on_dsm_detach, for the convenience of users who want
566  * to create a DSA segment inside an existing DSM segment and have it
567  * automatically released when the containing DSM segment is detached.
568  * 'place' should be the address of the place where the area was created.
569  *
570  * This callback is automatically registered for the DSM segment containing
571  * the control object of in-place areas when a segment is provided to
572  * dsa_create_in_place or dsa_attach_in_place, and also for all areas created
573  * with dsa_create.
574  */
575 void
577 {
579 }
580 
581 /*
582  * Release a DSA area that was produced by dsa_create_in_place or
583  * dsa_attach_in_place. The 'code' argument is ignored but provides an
584  * interface suitable for on_shmem_exit or before_shmem_exit, for the
585  * convenience of users who want to create a DSA segment inside shared memory
586  * other than a DSM segment and have it automatically release at backend exit.
587  * 'place' should be the address of the place where the area was created.
588  */
589 void
591 {
593 }
594 
595 /*
596  * Release a DSA area that was produced by dsa_create_in_place or
597  * dsa_attach_in_place. It is preferable to use one of the 'dsa_on_XXX'
598  * callbacks so that this is managed automatically, because failure to release
599  * an area created in-place leaks its segments permanently.
600  *
601  * This is also called automatically for areas produced by dsa_create or
602  * dsa_attach as an implementation detail.
603  */
604 void
606 {
607  dsa_area_control *control = (dsa_area_control *) place;
608  int i;
609 
610  LWLockAcquire(&control->lock, LW_EXCLUSIVE);
611  Assert(control->segment_header.magic ==
612  (DSA_SEGMENT_HEADER_MAGIC ^ control->handle ^ 0));
613  Assert(control->refcnt > 0);
614  if (--control->refcnt == 0)
615  {
616  for (i = 0; i <= control->high_segment_index; ++i)
617  {
618  dsm_handle handle;
619 
620  handle = control->segment_handles[i];
621  if (handle != DSM_HANDLE_INVALID)
622  dsm_unpin_segment(handle);
623  }
624  }
625  LWLockRelease(&control->lock);
626 }
627 
628 /*
629  * Keep a DSA area attached until end of session or explicit detach.
630  *
631  * By default, areas are owned by the current resource owner, which means they
632  * are detached automatically when that scope ends.
633  */
634 void
636 {
637  int i;
638 
639  if (area->resowner != NULL)
640  {
641  area->resowner = NULL;
642 
643  for (i = 0; i <= area->high_segment_index; ++i)
644  if (area->segment_maps[i].segment != NULL)
646  }
647 }
648 
649 /*
650  * Allocate memory in this storage area. The return value is a dsa_pointer
651  * that can be passed to other processes, and converted to a local pointer
652  * with dsa_get_address. 'flags' is a bitmap which should be constructed
653  * from the following values:
654  *
655  * DSA_ALLOC_HUGE allows allocations >= 1GB. Otherwise, such allocations
656  * will result in an ERROR.
657  *
658  * DSA_ALLOC_NO_OOM causes this function to return InvalidDsaPointer when
659  * no memory is available or a size limit established by dsa_set_size_limit
660  * would be exceeded. Otherwise, such allocations will result in an ERROR.
661  *
662  * DSA_ALLOC_ZERO causes the allocated memory to be zeroed. Otherwise, the
663  * contents of newly-allocated memory are indeterminate.
664  *
665  * These flags correspond to similarly named flags used by
666  * MemoryContextAllocExtended(). See also the macros dsa_allocate and
667  * dsa_allocate0 which expand to a call to this function with commonly used
668  * flags.
669  */
671 dsa_allocate_extended(dsa_area *area, size_t size, int flags)
672 {
673  uint16 size_class;
674  dsa_pointer start_pointer;
675  dsa_segment_map *segment_map;
676  dsa_pointer result;
677 
678  Assert(size > 0);
679 
680  /* Sanity check on huge individual allocation size. */
681  if (((flags & DSA_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) ||
682  ((flags & DSA_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size)))
683  elog(ERROR, "invalid DSA memory alloc request size %zu", size);
684 
685  /*
686  * If bigger than the largest size class, just grab a run of pages from
687  * the free page manager, instead of allocating an object from a pool.
688  * There will still be a span, but it's a special class of span that
689  * manages this whole allocation and simply gives all pages back to the
690  * free page manager when dsa_free is called.
691  */
693  {
694  size_t npages = fpm_size_to_pages(size);
695  size_t first_page;
696  dsa_pointer span_pointer;
698 
699  /* Obtain a span object. */
700  span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
701  if (!DsaPointerIsValid(span_pointer))
702  {
703  /* Raise error unless asked not to. */
704  if ((flags & DSA_ALLOC_NO_OOM) == 0)
705  ereport(ERROR,
706  (errcode(ERRCODE_OUT_OF_MEMORY),
707  errmsg("out of memory"),
708  errdetail("Failed on DSA request of size %zu.",
709  size)));
710  return InvalidDsaPointer;
711  }
712 
714 
715  /* Find a segment from which to allocate. */
716  segment_map = get_best_segment(area, npages);
717  if (segment_map == NULL)
718  segment_map = make_new_segment(area, npages);
719  if (segment_map == NULL)
720  {
721  /* Can't make any more segments: game over. */
723  dsa_free(area, span_pointer);
724 
725  /* Raise error unless asked not to. */
726  if ((flags & DSA_ALLOC_NO_OOM) == 0)
727  ereport(ERROR,
728  (errcode(ERRCODE_OUT_OF_MEMORY),
729  errmsg("out of memory"),
730  errdetail("Failed on DSA request of size %zu.",
731  size)));
732  return InvalidDsaPointer;
733  }
734 
735  /*
736  * Ask the free page manager for a run of pages. This should always
737  * succeed, since both get_best_segment and make_new_segment should
738  * only return a non-NULL pointer if it actually contains enough
739  * contiguous freespace. If it does fail, something in our backend
740  * private state is out of whack, so use FATAL to kill the process.
741  */
742  if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
743  elog(FATAL,
744  "dsa_allocate could not find %zu free pages", npages);
746 
747  start_pointer = DSA_MAKE_POINTER(get_segment_index(area, segment_map),
748  first_page * FPM_PAGE_SIZE);
749 
750  /* Initialize span and pagemap. */
752  LW_EXCLUSIVE);
753  init_span(area, span_pointer, pool, start_pointer, npages,
755  segment_map->pagemap[first_page] = span_pointer;
757 
758  /* Zero-initialize the memory if requested. */
759  if ((flags & DSA_ALLOC_ZERO) != 0)
760  memset(dsa_get_address(area, start_pointer), 0, size);
761 
762  return start_pointer;
763  }
764 
765  /* Map allocation to a size class. */
767  {
768  int mapidx;
769 
770  /* For smaller sizes we have a lookup table... */
771  mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) /
773  size_class = dsa_size_class_map[mapidx];
774  }
775  else
776  {
777  uint16 min;
778  uint16 max;
779 
780  /* ... and for the rest we search by binary chop. */
782  max = lengthof(dsa_size_classes) - 1;
783 
784  while (min < max)
785  {
786  uint16 mid = (min + max) / 2;
787  uint16 class_size = dsa_size_classes[mid];
788 
789  if (class_size < size)
790  min = mid + 1;
791  else
792  max = mid;
793  }
794 
795  size_class = min;
796  }
797  Assert(size <= dsa_size_classes[size_class]);
798  Assert(size_class == 0 || size > dsa_size_classes[size_class - 1]);
799 
800  /* Attempt to allocate an object from the appropriate pool. */
801  result = alloc_object(area, size_class);
802 
803  /* Check for failure to allocate. */
804  if (!DsaPointerIsValid(result))
805  {
806  /* Raise error unless asked not to. */
807  if ((flags & DSA_ALLOC_NO_OOM) == 0)
808  ereport(ERROR,
809  (errcode(ERRCODE_OUT_OF_MEMORY),
810  errmsg("out of memory"),
811  errdetail("Failed on DSA request of size %zu.", size)));
812  return InvalidDsaPointer;
813  }
814 
815  /* Zero-initialize the memory if requested. */
816  if ((flags & DSA_ALLOC_ZERO) != 0)
817  memset(dsa_get_address(area, result), 0, size);
818 
819  return result;
820 }
821 
822 /*
823  * Free memory obtained with dsa_allocate.
824  */
825 void
827 {
828  dsa_segment_map *segment_map;
829  int pageno;
830  dsa_pointer span_pointer;
831  dsa_area_span *span;
832  char *superblock;
833  char *object;
834  size_t size;
835  int size_class;
836 
837  /* Make sure we don't have a stale segment in the slot 'dp' refers to. */
839 
840  /* Locate the object, span and pool. */
841  segment_map = get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(dp));
842  pageno = DSA_EXTRACT_OFFSET(dp) / FPM_PAGE_SIZE;
843  span_pointer = segment_map->pagemap[pageno];
844  span = dsa_get_address(area, span_pointer);
845  superblock = dsa_get_address(area, span->start);
846  object = dsa_get_address(area, dp);
847  size_class = span->size_class;
848  size = dsa_size_classes[size_class];
849 
850  /*
851  * Special case for large objects that live in a special span: we return
852  * those pages directly to the free page manager and free the span.
853  */
854  if (span->size_class == DSA_SCLASS_SPAN_LARGE)
855  {
856 
857 #ifdef CLOBBER_FREED_MEMORY
858  memset(object, 0x7f, span->npages * FPM_PAGE_SIZE);
859 #endif
860 
861  /* Give pages back to free page manager. */
863  FreePageManagerPut(segment_map->fpm,
865  span->npages);
866 
867  /* Move segment to appropriate bin if necessary. */
868  rebin_segment(area, segment_map);
870 
871  /* Unlink span. */
873  LW_EXCLUSIVE);
874  unlink_span(area, span);
876  /* Free the span object so it can be reused. */
877  dsa_free(area, span_pointer);
878  return;
879  }
880 
881 #ifdef CLOBBER_FREED_MEMORY
882  memset(object, 0x7f, size);
883 #endif
884 
885  LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
886 
887  /* Put the object on the span's freelist. */
888  Assert(object >= superblock);
889  Assert(object < superblock + DSA_SUPERBLOCK_SIZE);
890  Assert((object - superblock) % size == 0);
891  NextFreeObjectIndex(object) = span->firstfree;
892  span->firstfree = (object - superblock) / size;
893  ++span->nallocatable;
894 
895  /*
896  * See if the span needs to moved to a different fullness class, or be
897  * freed so its pages can be given back to the segment.
898  */
899  if (span->nallocatable == 1 && span->fclass == DSA_FULLNESS_CLASSES - 1)
900  {
901  /*
902  * The block was completely full and is located in the
903  * highest-numbered fullness class, which is never scanned for free
904  * chunks. We must move it to the next-lower fullness class.
905  */
906  unlink_span(area, span);
907  add_span_to_fullness_class(area, span, span_pointer,
909 
910  /*
911  * If this is the only span, and there is no active span, then we
912  * should probably move this span to fullness class 1. (Otherwise if
913  * you allocate exactly all the objects in the only span, it moves to
914  * class 3, then you free them all, it moves to 2, and then is given
915  * back, leaving no active span).
916  */
917  }
918  else if (span->nallocatable == span->nmax &&
919  (span->fclass != 1 || span->prevspan != InvalidDsaPointer))
920  {
921  /*
922  * This entire block is free, and it's not the active block for this
923  * size class. Return the memory to the free page manager. We don't
924  * do this for the active block to prevent hysteresis: if we
925  * repeatedly allocate and free the only chunk in the active block, it
926  * will be very inefficient if we deallocate and reallocate the block
927  * every time.
928  */
929  destroy_superblock(area, span_pointer);
930  }
931 
932  LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
933 }
934 
935 /*
936  * Obtain a backend-local address for a dsa_pointer. 'dp' must point to
937  * memory allocated by the given area (possibly in another process) that
938  * hasn't yet been freed. This may cause a segment to be mapped into the
939  * current process if required, and may cause freed segments to be unmapped.
940  */
941 void *
943 {
945  size_t offset;
946 
947  /* Convert InvalidDsaPointer to NULL. */
948  if (!DsaPointerIsValid(dp))
949  return NULL;
950 
951  /* Process any requests to detach from freed segments. */
953 
954  /* Break the dsa_pointer into its components. */
956  offset = DSA_EXTRACT_OFFSET(dp);
958 
959  /* Check if we need to cause this segment to be mapped in. */
960  if (unlikely(area->segment_maps[index].mapped_address == NULL))
961  {
962  /* Call for effect (we don't need the result). */
964  }
965 
966  return area->segment_maps[index].mapped_address + offset;
967 }
968 
969 /*
970  * Pin this area, so that it will continue to exist even if all backends
971  * detach from it. In that case, the area can still be reattached to if a
972  * handle has been recorded somewhere.
973  */
974 void
976 {
978  if (area->control->pinned)
979  {
981  elog(ERROR, "dsa_area already pinned");
982  }
983  area->control->pinned = true;
984  ++area->control->refcnt;
986 }
987 
988 /*
989  * Undo the effects of dsa_pin, so that the given area can be freed when no
990  * backends are attached to it. May be called only if dsa_pin has been
991  * called.
992  */
993 void
995 {
997  Assert(area->control->refcnt > 1);
998  if (!area->control->pinned)
999  {
1001  elog(ERROR, "dsa_area not pinned");
1002  }
1003  area->control->pinned = false;
1004  --area->control->refcnt;
1006 }
1007 
1008 /*
1009  * Set the total size limit for this area. This limit is checked whenever new
1010  * segments need to be allocated from the operating system. If the new size
1011  * limit is already exceeded, this has no immediate effect.
1012  *
1013  * Note that the total virtual memory usage may be temporarily larger than
1014  * this limit when segments have been freed, but not yet detached by all
1015  * backends that have attached to them.
1016  */
1017 void
1018 dsa_set_size_limit(dsa_area *area, size_t limit)
1019 {
1021  area->control->max_total_segment_size = limit;
1023 }
1024 
1025 /* Return the total size of all active segments */
1026 size_t
1028 {
1029  size_t size;
1030 
1032  size = area->control->total_segment_size;
1034 
1035  return size;
1036 }
1037 
1038 /*
1039  * Aggressively free all spare memory in the hope of returning DSM segments to
1040  * the operating system.
1041  */
1042 void
1044 {
1045  int size_class;
1046 
1047  /*
1048  * Trim in reverse pool order so we get to the spans-of-spans last, just
1049  * in case any become entirely free while processing all the other pools.
1050  */
1051  for (size_class = DSA_NUM_SIZE_CLASSES - 1; size_class >= 0; --size_class)
1052  {
1053  dsa_area_pool *pool = &area->control->pools[size_class];
1054  dsa_pointer span_pointer;
1055 
1056  if (size_class == DSA_SCLASS_SPAN_LARGE)
1057  {
1058  /* Large object frees give back segments aggressively already. */
1059  continue;
1060  }
1061 
1062  /*
1063  * Search fullness class 1 only. That is where we expect to find an
1064  * entirely empty superblock (entirely empty superblocks in other
1065  * fullness classes are returned to the free page map by dsa_free).
1066  */
1067  LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1068  span_pointer = pool->spans[1];
1069  while (DsaPointerIsValid(span_pointer))
1070  {
1071  dsa_area_span *span = dsa_get_address(area, span_pointer);
1072  dsa_pointer next = span->nextspan;
1073 
1074  if (span->nallocatable == span->nmax)
1075  destroy_superblock(area, span_pointer);
1076 
1077  span_pointer = next;
1078  }
1079  LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1080  }
1081 }
1082 
1083 /*
1084  * Print out debugging information about the internal state of the shared
1085  * memory area.
1086  */
1087 void
1089 {
1090  size_t i,
1091  j;
1092 
1093  /*
1094  * Note: This gives an inconsistent snapshot as it acquires and releases
1095  * individual locks as it goes...
1096  */
1097 
1100  fprintf(stderr, "dsa_area handle %x:\n", area->control->handle);
1101  fprintf(stderr, " max_total_segment_size: %zu\n",
1103  fprintf(stderr, " total_segment_size: %zu\n",
1104  area->control->total_segment_size);
1105  fprintf(stderr, " refcnt: %d\n", area->control->refcnt);
1106  fprintf(stderr, " pinned: %c\n", area->control->pinned ? 't' : 'f');
1107  fprintf(stderr, " segment bins:\n");
1108  for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1109  {
1111  {
1112  dsa_segment_index segment_index;
1113 
1114  if (i == 0)
1115  fprintf(stderr,
1116  " segment bin %zu (no contiguous free pages):\n", i);
1117  else
1118  fprintf(stderr,
1119  " segment bin %zu (at least %d contiguous pages free):\n",
1120  i, 1 << (i - 1));
1121  segment_index = area->control->segment_bins[i];
1122  while (segment_index != DSA_SEGMENT_INDEX_NONE)
1123  {
1124  dsa_segment_map *segment_map;
1125 
1126  segment_map =
1127  get_segment_by_index(area, segment_index);
1128 
1129  fprintf(stderr,
1130  " segment index %zu, usable_pages = %zu, "
1131  "contiguous_pages = %zu, mapped at %p\n",
1132  segment_index,
1133  segment_map->header->usable_pages,
1134  fpm_largest(segment_map->fpm),
1135  segment_map->mapped_address);
1136  segment_index = segment_map->header->next;
1137  }
1138  }
1139  }
1141 
1142  fprintf(stderr, " pools:\n");
1143  for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1144  {
1145  bool found = false;
1146 
1148  for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1149  if (DsaPointerIsValid(area->control->pools[i].spans[j]))
1150  found = true;
1151  if (found)
1152  {
1154  fprintf(stderr, " pool for blocks of span objects:\n");
1155  else if (i == DSA_SCLASS_SPAN_LARGE)
1156  fprintf(stderr, " pool for large object spans:\n");
1157  else
1158  fprintf(stderr,
1159  " pool for size class %zu (object size %hu bytes):\n",
1160  i, dsa_size_classes[i]);
1161  for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1162  {
1163  if (!DsaPointerIsValid(area->control->pools[i].spans[j]))
1164  fprintf(stderr, " fullness class %zu is empty\n", j);
1165  else
1166  {
1167  dsa_pointer span_pointer = area->control->pools[i].spans[j];
1168 
1169  fprintf(stderr, " fullness class %zu:\n", j);
1170  while (DsaPointerIsValid(span_pointer))
1171  {
1172  dsa_area_span *span;
1173 
1174  span = dsa_get_address(area, span_pointer);
1175  fprintf(stderr,
1176  " span descriptor at "
1177  DSA_POINTER_FORMAT ", superblock at "
1179  ", pages = %zu, objects free = %hu/%hu\n",
1180  span_pointer, span->start, span->npages,
1181  span->nallocatable, span->nmax);
1182  span_pointer = span->nextspan;
1183  }
1184  }
1185  }
1186  }
1188  }
1189 }
1190 
1191 /*
1192  * Return the smallest size that you can successfully provide to
1193  * dsa_create_in_place.
1194  */
1195 size_t
1197 {
1198  size_t size;
1199  int pages = 0;
1200 
1201  size = MAXALIGN(sizeof(dsa_area_control)) +
1202  MAXALIGN(sizeof(FreePageManager));
1203 
1204  /* Figure out how many pages we need, including the page map... */
1205  while (((size + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE) > pages)
1206  {
1207  ++pages;
1208  size += sizeof(dsa_pointer);
1209  }
1210 
1211  return pages * FPM_PAGE_SIZE;
1212 }
1213 
1214 /*
1215  * Workhorse function for dsa_create and dsa_create_in_place.
1216  */
1217 static dsa_area *
1218 create_internal(void *place, size_t size,
1219  int tranche_id,
1220  dsm_handle control_handle,
1221  dsm_segment *control_segment,
1222  size_t init_segment_size, size_t max_segment_size)
1223 {
1224  dsa_area_control *control;
1225  dsa_area *area;
1226  dsa_segment_map *segment_map;
1227  size_t usable_pages;
1228  size_t total_pages;
1229  size_t metadata_bytes;
1230  int i;
1231 
1232  /* Check the initial and maximum block sizes */
1233  Assert(init_segment_size >= DSA_MIN_SEGMENT_SIZE);
1234  Assert(max_segment_size >= init_segment_size);
1235  Assert(max_segment_size <= DSA_MAX_SEGMENT_SIZE);
1236 
1237  /* Sanity check on the space we have to work in. */
1238  if (size < dsa_minimum_size())
1239  elog(ERROR, "dsa_area space must be at least %zu, but %zu provided",
1240  dsa_minimum_size(), size);
1241 
1242  /* Now figure out how much space is usable */
1243  total_pages = size / FPM_PAGE_SIZE;
1244  metadata_bytes =
1245  MAXALIGN(sizeof(dsa_area_control)) +
1246  MAXALIGN(sizeof(FreePageManager)) +
1247  total_pages * sizeof(dsa_pointer);
1248  /* Add padding up to next page boundary. */
1249  if (metadata_bytes % FPM_PAGE_SIZE != 0)
1250  metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
1251  Assert(metadata_bytes <= size);
1252  usable_pages = (size - metadata_bytes) / FPM_PAGE_SIZE;
1253 
1254  /*
1255  * Initialize the dsa_area_control object located at the start of the
1256  * space.
1257  */
1258  control = (dsa_area_control *) place;
1259  memset(place, 0, sizeof(*control));
1260  control->segment_header.magic =
1261  DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0;
1264  control->segment_header.usable_pages = usable_pages;
1265  control->segment_header.freed = false;
1266  control->segment_header.size = size;
1267  control->handle = control_handle;
1268  control->init_segment_size = init_segment_size;
1269  control->max_segment_size = max_segment_size;
1270  control->max_total_segment_size = (size_t) -1;
1271  control->total_segment_size = size;
1272  control->segment_handles[0] = control_handle;
1273  for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1275  control->refcnt = 1;
1276  control->lwlock_tranche_id = tranche_id;
1277 
1278  /*
1279  * Create the dsa_area object that this backend will use to access the
1280  * area. Other backends will need to obtain their own dsa_area object by
1281  * attaching.
1282  */
1283  area = palloc(sizeof(dsa_area));
1284  area->control = control;
1286  memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1287  area->high_segment_index = 0;
1288  area->freed_segment_counter = 0;
1289  LWLockInitialize(&control->lock, control->lwlock_tranche_id);
1290  for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1292  control->lwlock_tranche_id);
1293 
1294  /* Set up the segment map for this process's mapping. */
1295  segment_map = &area->segment_maps[0];
1296  segment_map->segment = control_segment;
1297  segment_map->mapped_address = place;
1298  segment_map->header = (dsa_segment_header *) place;
1299  segment_map->fpm = (FreePageManager *)
1300  (segment_map->mapped_address +
1301  MAXALIGN(sizeof(dsa_area_control)));
1302  segment_map->pagemap = (dsa_pointer *)
1303  (segment_map->mapped_address +
1304  MAXALIGN(sizeof(dsa_area_control)) +
1305  MAXALIGN(sizeof(FreePageManager)));
1306 
1307  /* Set up the free page map. */
1308  FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
1309  /* There can be 0 usable pages if size is dsa_minimum_size(). */
1310 
1311  if (usable_pages > 0)
1312  FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
1313  usable_pages);
1314 
1315  /* Put this segment into the appropriate bin. */
1316  control->segment_bins[contiguous_pages_to_segment_bin(usable_pages)] = 0;
1317  segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
1318 
1319  return area;
1320 }
1321 
1322 /*
1323  * Workhorse function for dsa_attach and dsa_attach_in_place.
1324  */
1325 static dsa_area *
1326 attach_internal(void *place, dsm_segment *segment, dsa_handle handle)
1327 {
1328  dsa_area_control *control;
1329  dsa_area *area;
1330  dsa_segment_map *segment_map;
1331 
1332  control = (dsa_area_control *) place;
1333  Assert(control->handle == handle);
1334  Assert(control->segment_handles[0] == handle);
1335  Assert(control->segment_header.magic ==
1336  (DSA_SEGMENT_HEADER_MAGIC ^ handle ^ 0));
1337 
1338  /* Build the backend-local area object. */
1339  area = palloc(sizeof(dsa_area));
1340  area->control = control;
1342  memset(&area->segment_maps[0], 0,
1343  sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1344  area->high_segment_index = 0;
1345 
1346  /* Set up the segment map for this process's mapping. */
1347  segment_map = &area->segment_maps[0];
1348  segment_map->segment = segment; /* NULL for in-place */
1349  segment_map->mapped_address = place;
1350  segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
1351  segment_map->fpm = (FreePageManager *)
1352  (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)));
1353  segment_map->pagemap = (dsa_pointer *)
1354  (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)) +
1355  MAXALIGN(sizeof(FreePageManager)));
1356 
1357  /* Bump the reference count. */
1359  if (control->refcnt == 0)
1360  {
1361  /* We can't attach to a DSA area that has already been destroyed. */
1362  ereport(ERROR,
1363  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1364  errmsg("could not attach to dynamic shared area")));
1365  }
1366  ++control->refcnt;
1369 
1370  return area;
1371 }
1372 
1373 /*
1374  * Add a new span to fullness class 1 of the indicated pool.
1375  */
1376 static void
1378  dsa_pointer span_pointer,
1379  dsa_area_pool *pool, dsa_pointer start, size_t npages,
1380  uint16 size_class)
1381 {
1382  dsa_area_span *span = dsa_get_address(area, span_pointer);
1383  size_t obsize = dsa_size_classes[size_class];
1384 
1385  /*
1386  * The per-pool lock must be held because we manipulate the span list for
1387  * this pool.
1388  */
1389  Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1390 
1391  /* Push this span onto the front of the span list for fullness class 1. */
1392  if (DsaPointerIsValid(pool->spans[1]))
1393  {
1394  dsa_area_span *head = (dsa_area_span *)
1395  dsa_get_address(area, pool->spans[1]);
1396 
1397  head->prevspan = span_pointer;
1398  }
1399  span->pool = DsaAreaPoolToDsaPointer(area, pool);
1400  span->nextspan = pool->spans[1];
1401  span->prevspan = InvalidDsaPointer;
1402  pool->spans[1] = span_pointer;
1403 
1404  span->start = start;
1405  span->npages = npages;
1406  span->size_class = size_class;
1407  span->ninitialized = 0;
1408  if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1409  {
1410  /*
1411  * A block-of-spans contains its own descriptor, so mark one object as
1412  * initialized and reduce the count of allocatable objects by one.
1413  * Doing this here has the side effect of also reducing nmax by one,
1414  * which is important to make sure we free this object at the correct
1415  * time.
1416  */
1417  span->ninitialized = 1;
1418  span->nallocatable = FPM_PAGE_SIZE / obsize - 1;
1419  }
1420  else if (size_class != DSA_SCLASS_SPAN_LARGE)
1421  span->nallocatable = DSA_SUPERBLOCK_SIZE / obsize;
1423  span->nmax = span->nallocatable;
1424  span->fclass = 1;
1425 }
1426 
1427 /*
1428  * Transfer the first span in one fullness class to the head of another
1429  * fullness class.
1430  */
1431 static bool
1433  dsa_area_pool *pool, int fromclass, int toclass)
1434 {
1435  dsa_pointer span_pointer;
1436  dsa_area_span *span;
1437  dsa_area_span *nextspan;
1438 
1439  /* Can't do it if source list is empty. */
1440  span_pointer = pool->spans[fromclass];
1441  if (!DsaPointerIsValid(span_pointer))
1442  return false;
1443 
1444  /* Remove span from head of source list. */
1445  span = dsa_get_address(area, span_pointer);
1446  pool->spans[fromclass] = span->nextspan;
1447  if (DsaPointerIsValid(span->nextspan))
1448  {
1449  nextspan = (dsa_area_span *)
1450  dsa_get_address(area, span->nextspan);
1451  nextspan->prevspan = InvalidDsaPointer;
1452  }
1453 
1454  /* Add span to head of target list. */
1455  span->nextspan = pool->spans[toclass];
1456  pool->spans[toclass] = span_pointer;
1457  if (DsaPointerIsValid(span->nextspan))
1458  {
1459  nextspan = (dsa_area_span *)
1460  dsa_get_address(area, span->nextspan);
1461  nextspan->prevspan = span_pointer;
1462  }
1463  span->fclass = toclass;
1464 
1465  return true;
1466 }
1467 
1468 /*
1469  * Allocate one object of the requested size class from the given area.
1470  */
1471 static inline dsa_pointer
1472 alloc_object(dsa_area *area, int size_class)
1473 {
1474  dsa_area_pool *pool = &area->control->pools[size_class];
1475  dsa_area_span *span;
1476  dsa_pointer block;
1477  dsa_pointer result;
1478  char *object;
1479  size_t size;
1480 
1481  /*
1482  * Even though ensure_active_superblock can in turn call alloc_object if
1483  * it needs to allocate a new span, that's always from a different pool,
1484  * and the order of lock acquisition is always the same, so it's OK that
1485  * we hold this lock for the duration of this function.
1486  */
1487  Assert(!LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1488  LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1489 
1490  /*
1491  * If there's no active superblock, we must successfully obtain one or
1492  * fail the request.
1493  */
1494  if (!DsaPointerIsValid(pool->spans[1]) &&
1495  !ensure_active_superblock(area, pool, size_class))
1496  {
1497  result = InvalidDsaPointer;
1498  }
1499  else
1500  {
1501  /*
1502  * There should be a block in fullness class 1 at this point, and it
1503  * should never be completely full. Thus we can either pop an object
1504  * from the free list or, failing that, initialize a new object.
1505  */
1506  Assert(DsaPointerIsValid(pool->spans[1]));
1507  span = (dsa_area_span *)
1508  dsa_get_address(area, pool->spans[1]);
1509  Assert(span->nallocatable > 0);
1510  block = span->start;
1511  Assert(size_class < DSA_NUM_SIZE_CLASSES);
1512  size = dsa_size_classes[size_class];
1513  if (span->firstfree != DSA_SPAN_NOTHING_FREE)
1514  {
1515  result = block + span->firstfree * size;
1516  object = dsa_get_address(area, result);
1517  span->firstfree = NextFreeObjectIndex(object);
1518  }
1519  else
1520  {
1521  result = block + span->ninitialized * size;
1522  ++span->ninitialized;
1523  }
1524  --span->nallocatable;
1525 
1526  /* If it's now full, move it to the highest-numbered fullness class. */
1527  if (span->nallocatable == 0)
1528  transfer_first_span(area, pool, 1, DSA_FULLNESS_CLASSES - 1);
1529  }
1530 
1531  Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1532  LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1533 
1534  return result;
1535 }
1536 
1537 /*
1538  * Ensure an active (i.e. fullness class 1) superblock, unless all existing
1539  * superblocks are completely full and no more can be allocated.
1540  *
1541  * Fullness classes K of 0..N are loosely intended to represent blocks whose
1542  * utilization percentage is at least K/N, but we only enforce this rigorously
1543  * for the highest-numbered fullness class, which always contains exactly
1544  * those blocks that are completely full. It's otherwise acceptable for a
1545  * block to be in a higher-numbered fullness class than the one to which it
1546  * logically belongs. In addition, the active block, which is always the
1547  * first block in fullness class 1, is permitted to have a higher allocation
1548  * percentage than would normally be allowable for that fullness class; we
1549  * don't move it until it's completely full, and then it goes to the
1550  * highest-numbered fullness class.
1551  *
1552  * It might seem odd that the active block is the head of fullness class 1
1553  * rather than fullness class 0, but experience with other allocators has
1554  * shown that it's usually better to allocate from a block that's moderately
1555  * full rather than one that's nearly empty. Insofar as is reasonably
1556  * possible, we want to avoid performing new allocations in a block that would
1557  * otherwise become empty soon.
1558  */
1559 static bool
1561  int size_class)
1562 {
1563  dsa_pointer span_pointer;
1564  dsa_pointer start_pointer;
1565  size_t obsize = dsa_size_classes[size_class];
1566  size_t nmax;
1567  int fclass;
1568  size_t npages = 1;
1569  size_t first_page;
1570  size_t i;
1571  dsa_segment_map *segment_map;
1572 
1573  Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1574 
1575  /*
1576  * Compute the number of objects that will fit in a block of this size
1577  * class. Span-of-spans blocks are just a single page, and the first
1578  * object isn't available for use because it describes the block-of-spans
1579  * itself.
1580  */
1581  if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1582  nmax = FPM_PAGE_SIZE / obsize - 1;
1583  else
1584  nmax = DSA_SUPERBLOCK_SIZE / obsize;
1585 
1586  /*
1587  * If fullness class 1 is empty, try to find a span to put in it by
1588  * scanning higher-numbered fullness classes (excluding the last one,
1589  * whose blocks are certain to all be completely full).
1590  */
1591  for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1592  {
1593  span_pointer = pool->spans[fclass];
1594 
1595  while (DsaPointerIsValid(span_pointer))
1596  {
1597  int tfclass;
1598  dsa_area_span *span;
1599  dsa_area_span *nextspan;
1600  dsa_area_span *prevspan;
1601  dsa_pointer next_span_pointer;
1602 
1603  span = (dsa_area_span *)
1604  dsa_get_address(area, span_pointer);
1605  next_span_pointer = span->nextspan;
1606 
1607  /* Figure out what fullness class should contain this span. */
1608  tfclass = (nmax - span->nallocatable)
1609  * (DSA_FULLNESS_CLASSES - 1) / nmax;
1610 
1611  /* Look up next span. */
1612  if (DsaPointerIsValid(span->nextspan))
1613  nextspan = (dsa_area_span *)
1614  dsa_get_address(area, span->nextspan);
1615  else
1616  nextspan = NULL;
1617 
1618  /*
1619  * If utilization has dropped enough that this now belongs in some
1620  * other fullness class, move it there.
1621  */
1622  if (tfclass < fclass)
1623  {
1624  /* Remove from the current fullness class list. */
1625  if (pool->spans[fclass] == span_pointer)
1626  {
1627  /* It was the head; remove it. */
1629  pool->spans[fclass] = span->nextspan;
1630  if (nextspan != NULL)
1631  nextspan->prevspan = InvalidDsaPointer;
1632  }
1633  else
1634  {
1635  /* It was not the head. */
1637  prevspan = (dsa_area_span *)
1638  dsa_get_address(area, span->prevspan);
1639  prevspan->nextspan = span->nextspan;
1640  }
1641  if (nextspan != NULL)
1642  nextspan->prevspan = span->prevspan;
1643 
1644  /* Push onto the head of the new fullness class list. */
1645  span->nextspan = pool->spans[tfclass];
1646  pool->spans[tfclass] = span_pointer;
1647  span->prevspan = InvalidDsaPointer;
1648  if (DsaPointerIsValid(span->nextspan))
1649  {
1650  nextspan = (dsa_area_span *)
1651  dsa_get_address(area, span->nextspan);
1652  nextspan->prevspan = span_pointer;
1653  }
1654  span->fclass = tfclass;
1655  }
1656 
1657  /* Advance to next span on list. */
1658  span_pointer = next_span_pointer;
1659  }
1660 
1661  /* Stop now if we found a suitable block. */
1662  if (DsaPointerIsValid(pool->spans[1]))
1663  return true;
1664  }
1665 
1666  /*
1667  * If there are no blocks that properly belong in fullness class 1, pick
1668  * one from some other fullness class and move it there anyway, so that we
1669  * have an allocation target. Our last choice is to transfer a block
1670  * that's almost empty (and might become completely empty soon if left
1671  * alone), but even that is better than failing, which is what we must do
1672  * if there are no blocks at all with freespace.
1673  */
1674  Assert(!DsaPointerIsValid(pool->spans[1]));
1675  for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1676  if (transfer_first_span(area, pool, fclass, 1))
1677  return true;
1678  if (!DsaPointerIsValid(pool->spans[1]) &&
1679  transfer_first_span(area, pool, 0, 1))
1680  return true;
1681 
1682  /*
1683  * We failed to find an existing span with free objects, so we need to
1684  * allocate a new superblock and construct a new span to manage it.
1685  *
1686  * First, get a dsa_area_span object to describe the new superblock block
1687  * ... unless this allocation is for a dsa_area_span object, in which case
1688  * that's surely not going to work. We handle that case by storing the
1689  * span describing a block-of-spans inline.
1690  */
1691  if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1692  {
1693  span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
1694  if (!DsaPointerIsValid(span_pointer))
1695  return false;
1696  npages = DSA_PAGES_PER_SUPERBLOCK;
1697  }
1698 
1699  /* Find or create a segment and allocate the superblock. */
1701  segment_map = get_best_segment(area, npages);
1702  if (segment_map == NULL)
1703  {
1704  segment_map = make_new_segment(area, npages);
1705  if (segment_map == NULL)
1706  {
1708  return false;
1709  }
1710  }
1711 
1712  /*
1713  * This shouldn't happen: get_best_segment() or make_new_segment()
1714  * promised that we can successfully allocate npages.
1715  */
1716  if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
1717  elog(FATAL,
1718  "dsa_allocate could not find %zu free pages for superblock",
1719  npages);
1721 
1722  /* Compute the start of the superblock. */
1723  start_pointer =
1724  DSA_MAKE_POINTER(get_segment_index(area, segment_map),
1725  first_page * FPM_PAGE_SIZE);
1726 
1727  /*
1728  * If this is a block-of-spans, carve the descriptor right out of the
1729  * allocated space.
1730  */
1731  if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1732  {
1733  /*
1734  * We have a pointer into the segment. We need to build a dsa_pointer
1735  * from the segment index and offset into the segment.
1736  */
1737  span_pointer = start_pointer;
1738  }
1739 
1740  /* Initialize span and pagemap. */
1741  init_span(area, span_pointer, pool, start_pointer, npages, size_class);
1742  for (i = 0; i < npages; ++i)
1743  segment_map->pagemap[first_page + i] = span_pointer;
1744 
1745  return true;
1746 }
1747 
1748 /*
1749  * Return the segment map corresponding to a given segment index, mapping the
1750  * segment in if necessary. For internal segment book-keeping, this is called
1751  * with the area lock held. It is also called by dsa_free and dsa_get_address
1752  * without any locking, relying on the fact they have a known live segment
1753  * index and they always call check_for_freed_segments to ensures that any
1754  * freed segment occupying the same slot is detached first.
1755  */
1756 static dsa_segment_map *
1758 {
1759  if (unlikely(area->segment_maps[index].mapped_address == NULL))
1760  {
1761  dsm_handle handle;
1762  dsm_segment *segment;
1763  dsa_segment_map *segment_map;
1764  ResourceOwner oldowner;
1765 
1766  /*
1767  * If we are reached by dsa_free or dsa_get_address, there must be at
1768  * least one object allocated in the referenced segment. Otherwise,
1769  * their caller has a double-free or access-after-free bug, which we
1770  * have no hope of detecting. So we know it's safe to access this
1771  * array slot without holding a lock; it won't change underneath us.
1772  * Furthermore, we know that we can see the latest contents of the
1773  * slot, as explained in check_for_freed_segments, which those
1774  * functions call before arriving here.
1775  */
1776  handle = area->control->segment_handles[index];
1777 
1778  /* It's an error to try to access an unused slot. */
1779  if (handle == DSM_HANDLE_INVALID)
1780  elog(ERROR,
1781  "dsa_area could not attach to a segment that has been freed");
1782 
1783  oldowner = CurrentResourceOwner;
1785  segment = dsm_attach(handle);
1786  CurrentResourceOwner = oldowner;
1787  if (segment == NULL)
1788  elog(ERROR, "dsa_area could not attach to segment");
1789  segment_map = &area->segment_maps[index];
1790  segment_map->segment = segment;
1791  segment_map->mapped_address = dsm_segment_address(segment);
1792  segment_map->header =
1793  (dsa_segment_header *) segment_map->mapped_address;
1794  segment_map->fpm = (FreePageManager *)
1795  (segment_map->mapped_address +
1796  MAXALIGN(sizeof(dsa_segment_header)));
1797  segment_map->pagemap = (dsa_pointer *)
1798  (segment_map->mapped_address +
1799  MAXALIGN(sizeof(dsa_segment_header)) +
1800  MAXALIGN(sizeof(FreePageManager)));
1801 
1802  /* Remember the highest index this backend has ever mapped. */
1803  if (area->high_segment_index < index)
1804  area->high_segment_index = index;
1805 
1806  Assert(segment_map->header->magic ==
1808  }
1809 
1810  /*
1811  * Callers of dsa_get_address() and dsa_free() don't hold the area lock,
1812  * but it's a bug in the calling code and undefined behavior if the
1813  * address is not live (ie if the segment might possibly have been freed,
1814  * they're trying to use a dangling pointer).
1815  *
1816  * For dsa.c code that holds the area lock to manipulate segment_bins
1817  * lists, it would be a bug if we ever reach a freed segment here. After
1818  * it's marked as freed, the only thing any backend should do with it is
1819  * unmap it, and it should always have done that in
1820  * check_for_freed_segments_locked() before arriving here to resolve an
1821  * index to a segment_map.
1822  *
1823  * Either way we can assert that we aren't returning a freed segment.
1824  */
1825  Assert(!area->segment_maps[index].header->freed);
1826 
1827  return &area->segment_maps[index];
1828 }
1829 
1830 /*
1831  * Return a superblock to the free page manager. If the underlying segment
1832  * has become entirely free, then return it to the operating system.
1833  *
1834  * The appropriate pool lock must be held.
1835  */
1836 static void
1838 {
1839  dsa_area_span *span = dsa_get_address(area, span_pointer);
1840  int size_class = span->size_class;
1841  dsa_segment_map *segment_map;
1842 
1843 
1844  /* Remove it from its fullness class list. */
1845  unlink_span(area, span);
1846 
1847  /*
1848  * Note: Here we acquire the area lock while we already hold a per-pool
1849  * lock. We never hold the area lock and then take a pool lock, or we
1850  * could deadlock.
1851  */
1854  segment_map =
1856  FreePageManagerPut(segment_map->fpm,
1858  span->npages);
1859  /* Check if the segment is now entirely free. */
1860  if (fpm_largest(segment_map->fpm) == segment_map->header->usable_pages)
1861  {
1862  dsa_segment_index index = get_segment_index(area, segment_map);
1863 
1864  /* If it's not the segment with extra control data, free it. */
1865  if (index != 0)
1866  {
1867  /*
1868  * Give it back to the OS, and allow other backends to detect that
1869  * they need to detach.
1870  */
1871  unlink_segment(area, segment_map);
1872  segment_map->header->freed = true;
1874  segment_map->header->size);
1875  area->control->total_segment_size -=
1876  segment_map->header->size;
1878  dsm_detach(segment_map->segment);
1880  ++area->control->freed_segment_counter;
1881  segment_map->segment = NULL;
1882  segment_map->header = NULL;
1883  segment_map->mapped_address = NULL;
1884  }
1885  }
1886 
1887  /* Move segment to appropriate bin if necessary. */
1888  if (segment_map->header != NULL)
1889  rebin_segment(area, segment_map);
1890 
1892 
1893  /*
1894  * Span-of-spans blocks store the span which describes them within the
1895  * block itself, so freeing the storage implicitly frees the descriptor
1896  * also. If this is a block of any other type, we need to separately free
1897  * the span object also. This recursive call to dsa_free will acquire the
1898  * span pool's lock. We can't deadlock because the acquisition order is
1899  * always some other pool and then the span pool.
1900  */
1901  if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1902  dsa_free(area, span_pointer);
1903 }
1904 
1905 static void
1907 {
1908  if (DsaPointerIsValid(span->nextspan))
1909  {
1910  dsa_area_span *next = dsa_get_address(area, span->nextspan);
1911 
1912  next->prevspan = span->prevspan;
1913  }
1914  if (DsaPointerIsValid(span->prevspan))
1915  {
1916  dsa_area_span *prev = dsa_get_address(area, span->prevspan);
1917 
1918  prev->nextspan = span->nextspan;
1919  }
1920  else
1921  {
1922  dsa_area_pool *pool = dsa_get_address(area, span->pool);
1923 
1924  pool->spans[span->fclass] = span->nextspan;
1925  }
1926 }
1927 
1928 static void
1930  dsa_pointer span_pointer,
1931  int fclass)
1932 {
1933  dsa_area_pool *pool = dsa_get_address(area, span->pool);
1934 
1935  if (DsaPointerIsValid(pool->spans[fclass]))
1936  {
1937  dsa_area_span *head = dsa_get_address(area,
1938  pool->spans[fclass]);
1939 
1940  head->prevspan = span_pointer;
1941  }
1942  span->prevspan = InvalidDsaPointer;
1943  span->nextspan = pool->spans[fclass];
1944  pool->spans[fclass] = span_pointer;
1945  span->fclass = fclass;
1946 }
1947 
1948 /*
1949  * Detach from an area that was either created or attached to by this process.
1950  */
1951 void
1953 {
1954  int i;
1955 
1956  /* Detach from all segments. */
1957  for (i = 0; i <= area->high_segment_index; ++i)
1958  if (area->segment_maps[i].segment != NULL)
1959  dsm_detach(area->segment_maps[i].segment);
1960 
1961  /*
1962  * Note that 'detaching' (= detaching from DSM segments) doesn't include
1963  * 'releasing' (= adjusting the reference count). It would be nice to
1964  * combine these operations, but client code might never get around to
1965  * calling dsa_detach because of an error path, and a detach hook on any
1966  * particular segment is too late to detach other segments in the area
1967  * without risking a 'leak' warning in the non-error path.
1968  */
1969 
1970  /* Free the backend-local area object. */
1971  pfree(area);
1972 }
1973 
1974 /*
1975  * Unlink a segment from the bin that contains it.
1976  */
1977 static void
1979 {
1980  if (segment_map->header->prev != DSA_SEGMENT_INDEX_NONE)
1981  {
1982  dsa_segment_map *prev;
1983 
1984  prev = get_segment_by_index(area, segment_map->header->prev);
1985  prev->header->next = segment_map->header->next;
1986  }
1987  else
1988  {
1989  Assert(area->control->segment_bins[segment_map->header->bin] ==
1990  get_segment_index(area, segment_map));
1991  area->control->segment_bins[segment_map->header->bin] =
1992  segment_map->header->next;
1993  }
1994  if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
1995  {
1997 
1998  next = get_segment_by_index(area, segment_map->header->next);
1999  next->header->prev = segment_map->header->prev;
2000  }
2001 }
2002 
2003 /*
2004  * Find a segment that could satisfy a request for 'npages' of contiguous
2005  * memory, or return NULL if none can be found. This may involve attaching to
2006  * segments that weren't previously attached so that we can query their free
2007  * pages map.
2008  */
2009 static dsa_segment_map *
2010 get_best_segment(dsa_area *area, size_t npages)
2011 {
2012  size_t bin;
2013 
2016 
2017  /*
2018  * Start searching from the first bin that *might* have enough contiguous
2019  * pages.
2020  */
2021  for (bin = contiguous_pages_to_segment_bin(npages);
2022  bin < DSA_NUM_SEGMENT_BINS;
2023  ++bin)
2024  {
2025  /*
2026  * The minimum contiguous size that any segment in this bin should
2027  * have. We'll re-bin if we see segments with fewer.
2028  */
2029  size_t threshold = (size_t) 1 << (bin - 1);
2030  dsa_segment_index segment_index;
2031 
2032  /* Search this bin for a segment with enough contiguous space. */
2033  segment_index = area->control->segment_bins[bin];
2034  while (segment_index != DSA_SEGMENT_INDEX_NONE)
2035  {
2036  dsa_segment_map *segment_map;
2037  dsa_segment_index next_segment_index;
2038  size_t contiguous_pages;
2039 
2040  segment_map = get_segment_by_index(area, segment_index);
2041  next_segment_index = segment_map->header->next;
2042  contiguous_pages = fpm_largest(segment_map->fpm);
2043 
2044  /* Not enough for the request, still enough for this bin. */
2045  if (contiguous_pages >= threshold && contiguous_pages < npages)
2046  {
2047  segment_index = next_segment_index;
2048  continue;
2049  }
2050 
2051  /* Re-bin it if it's no longer in the appropriate bin. */
2052  if (contiguous_pages < threshold)
2053  {
2054  rebin_segment(area, segment_map);
2055 
2056  /*
2057  * But fall through to see if it's enough to satisfy this
2058  * request anyway....
2059  */
2060  }
2061 
2062  /* Check if we are done. */
2063  if (contiguous_pages >= npages)
2064  return segment_map;
2065 
2066  /* Continue searching the same bin. */
2067  segment_index = next_segment_index;
2068  }
2069  }
2070 
2071  /* Not found. */
2072  return NULL;
2073 }
2074 
2075 /*
2076  * Create a new segment that can handle at least requested_pages. Returns
2077  * NULL if the requested total size limit or maximum allowed number of
2078  * segments would be exceeded.
2079  */
2080 static dsa_segment_map *
2081 make_new_segment(dsa_area *area, size_t requested_pages)
2082 {
2083  dsa_segment_index new_index;
2084  size_t metadata_bytes;
2085  size_t total_size;
2086  size_t total_pages;
2087  size_t usable_pages;
2088  dsa_segment_map *segment_map;
2089  dsm_segment *segment;
2090  ResourceOwner oldowner;
2091 
2093 
2094  /* Find a segment slot that is not in use (linearly for now). */
2095  for (new_index = 1; new_index < DSA_MAX_SEGMENTS; ++new_index)
2096  {
2097  if (area->control->segment_handles[new_index] == DSM_HANDLE_INVALID)
2098  break;
2099  }
2100  if (new_index == DSA_MAX_SEGMENTS)
2101  return NULL;
2102 
2103  /*
2104  * If the total size limit is already exceeded, then we exit early and
2105  * avoid arithmetic wraparound in the unsigned expressions below.
2106  */
2107  if (area->control->total_segment_size >=
2109  return NULL;
2110 
2111  /*
2112  * The size should be at least as big as requested, and at least big
2113  * enough to follow a geometric series that approximately doubles the
2114  * total storage each time we create a new segment. We use geometric
2115  * growth because the underlying DSM system isn't designed for large
2116  * numbers of segments (otherwise we might even consider just using one
2117  * DSM segment for each large allocation and for each superblock, and then
2118  * we wouldn't need to use FreePageManager).
2119  *
2120  * We decide on a total segment size first, so that we produce tidy
2121  * power-of-two sized segments. This is a good property to have if we
2122  * move to huge pages in the future. Then we work back to the number of
2123  * pages we can fit.
2124  */
2126  ((size_t) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE));
2130  area->control->total_segment_size);
2131 
2132  total_pages = total_size / FPM_PAGE_SIZE;
2133  metadata_bytes =
2134  MAXALIGN(sizeof(dsa_segment_header)) +
2135  MAXALIGN(sizeof(FreePageManager)) +
2136  sizeof(dsa_pointer) * total_pages;
2137 
2138  /* Add padding up to next page boundary. */
2139  if (metadata_bytes % FPM_PAGE_SIZE != 0)
2140  metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2141  if (total_size <= metadata_bytes)
2142  return NULL;
2143  usable_pages = (total_size - metadata_bytes) / FPM_PAGE_SIZE;
2144  Assert(metadata_bytes + usable_pages * FPM_PAGE_SIZE <= total_size);
2145 
2146  /* See if that is enough... */
2147  if (requested_pages > usable_pages)
2148  {
2149  /*
2150  * We'll make an odd-sized segment, working forward from the requested
2151  * number of pages.
2152  */
2153  usable_pages = requested_pages;
2154  metadata_bytes =
2155  MAXALIGN(sizeof(dsa_segment_header)) +
2156  MAXALIGN(sizeof(FreePageManager)) +
2157  usable_pages * sizeof(dsa_pointer);
2158 
2159  /* Add padding up to next page boundary. */
2160  if (metadata_bytes % FPM_PAGE_SIZE != 0)
2161  metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2162  total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE;
2163 
2164  /* Is that too large for dsa_pointer's addressing scheme? */
2166  return NULL;
2167 
2168  /* Would that exceed the limit? */
2170  area->control->total_segment_size)
2171  return NULL;
2172  }
2173 
2174  /* Create the segment. */
2175  oldowner = CurrentResourceOwner;
2177  segment = dsm_create(total_size, 0);
2178  CurrentResourceOwner = oldowner;
2179  if (segment == NULL)
2180  return NULL;
2181  dsm_pin_segment(segment);
2182 
2183  /* Store the handle in shared memory to be found by index. */
2184  area->control->segment_handles[new_index] =
2185  dsm_segment_handle(segment);
2186  /* Track the highest segment index in the history of the area. */
2187  if (area->control->high_segment_index < new_index)
2188  area->control->high_segment_index = new_index;
2189  /* Track the highest segment index this backend has ever mapped. */
2190  if (area->high_segment_index < new_index)
2191  area->high_segment_index = new_index;
2192  /* Track total size of all segments. */
2196 
2197  /* Build a segment map for this segment in this backend. */
2198  segment_map = &area->segment_maps[new_index];
2199  segment_map->segment = segment;
2200  segment_map->mapped_address = dsm_segment_address(segment);
2201  segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
2202  segment_map->fpm = (FreePageManager *)
2203  (segment_map->mapped_address +
2204  MAXALIGN(sizeof(dsa_segment_header)));
2205  segment_map->pagemap = (dsa_pointer *)
2206  (segment_map->mapped_address +
2207  MAXALIGN(sizeof(dsa_segment_header)) +
2208  MAXALIGN(sizeof(FreePageManager)));
2209 
2210  /* Set up the free page map. */
2211  FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
2212  FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
2213  usable_pages);
2214 
2215  /* Set up the segment header and put it in the appropriate bin. */
2216  segment_map->header->magic =
2217  DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ new_index;
2218  segment_map->header->usable_pages = usable_pages;
2219  segment_map->header->size = total_size;
2220  segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
2221  segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2222  segment_map->header->next =
2223  area->control->segment_bins[segment_map->header->bin];
2224  segment_map->header->freed = false;
2225  area->control->segment_bins[segment_map->header->bin] = new_index;
2226  if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2227  {
2229  get_segment_by_index(area, segment_map->header->next);
2230 
2231  Assert(next->header->bin == segment_map->header->bin);
2232  next->header->prev = new_index;
2233  }
2234 
2235  return segment_map;
2236 }
2237 
2238 /*
2239  * Check if any segments have been freed by destroy_superblock, so we can
2240  * detach from them in this backend. This function is called by
2241  * dsa_get_address and dsa_free to make sure that a dsa_pointer they have
2242  * received can be resolved to the correct segment.
2243  *
2244  * The danger we want to defend against is that there could be an old segment
2245  * mapped into a given slot in this backend, and the dsa_pointer they have
2246  * might refer to some new segment in the same slot. So those functions must
2247  * be sure to process all instructions to detach from a freed segment that had
2248  * been generated by the time this process received the dsa_pointer, before
2249  * they call get_segment_by_index.
2250  */
2251 static void
2253 {
2254  size_t freed_segment_counter;
2255 
2256  /*
2257  * Any other process that has freed a segment has incremented
2258  * freed_segment_counter while holding an LWLock, and that must precede
2259  * any backend creating a new segment in the same slot while holding an
2260  * LWLock, and that must precede the creation of any dsa_pointer pointing
2261  * into the new segment which might reach us here, and the caller must
2262  * have sent the dsa_pointer to this process using appropriate memory
2263  * synchronization (some kind of locking or atomic primitive or system
2264  * call). So all we need to do on the reading side is ask for the load of
2265  * freed_segment_counter to follow the caller's load of the dsa_pointer it
2266  * has, and we can be sure to detect any segments that had been freed as
2267  * of the time that the dsa_pointer reached this process.
2268  */
2269  pg_read_barrier();
2270  freed_segment_counter = area->control->freed_segment_counter;
2271  if (unlikely(area->freed_segment_counter != freed_segment_counter))
2272  {
2273  /* Check all currently mapped segments to find what's been freed. */
2277  }
2278 }
2279 
2280 /*
2281  * Workhorse for check_for_freed_segments(), and also used directly in path
2282  * where the area lock is already held. This should be called after acquiring
2283  * the lock but before looking up any segment by index number, to make sure we
2284  * unmap any stale segments that might have previously had the same index as a
2285  * current segment.
2286  */
2287 static void
2289 {
2290  size_t freed_segment_counter;
2291  int i;
2292 
2294  freed_segment_counter = area->control->freed_segment_counter;
2295  if (unlikely(area->freed_segment_counter != freed_segment_counter))
2296  {
2297  for (i = 0; i <= area->high_segment_index; ++i)
2298  {
2299  if (area->segment_maps[i].header != NULL &&
2300  area->segment_maps[i].header->freed)
2301  {
2302  dsm_detach(area->segment_maps[i].segment);
2303  area->segment_maps[i].segment = NULL;
2304  area->segment_maps[i].header = NULL;
2305  area->segment_maps[i].mapped_address = NULL;
2306  }
2307  }
2308  area->freed_segment_counter = freed_segment_counter;
2309  }
2310 }
2311 
2312 /*
2313  * Re-bin segment if it's no longer in the appropriate bin.
2314  */
2315 static void
2317 {
2318  size_t new_bin;
2319  dsa_segment_index segment_index;
2320 
2321  new_bin = contiguous_pages_to_segment_bin(fpm_largest(segment_map->fpm));
2322  if (segment_map->header->bin == new_bin)
2323  return;
2324 
2325  /* Remove it from its current bin. */
2326  unlink_segment(area, segment_map);
2327 
2328  /* Push it onto the front of its new bin. */
2329  segment_index = get_segment_index(area, segment_map);
2330  segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2331  segment_map->header->next = area->control->segment_bins[new_bin];
2332  segment_map->header->bin = new_bin;
2333  area->control->segment_bins[new_bin] = segment_index;
2334  if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2335  {
2337 
2338  next = get_segment_by_index(area, segment_map->header->next);
2339  Assert(next->header->bin == new_bin);
2340  next->header->prev = segment_index;
2341  }
2342 }
#define pg_read_barrier()
Definition: atomics.h:156
static int32 next
Definition: blutils.c:221
unsigned short uint16
Definition: c.h:505
unsigned int uint32
Definition: c.h:506
#define Min(x, y)
Definition: c.h:1004
#define MAXALIGN(LEN)
Definition: c.h:811
#define Assert(condition)
Definition: c.h:858
#define unlikely(x)
Definition: c.h:311
#define lengthof(array)
Definition: c.h:788
unsigned char uint8
Definition: c.h:504
static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map)
Definition: dsa.c:1978
static void check_for_freed_segments(dsa_area *area)
Definition: dsa.c:2252
static const uint16 dsa_size_classes[]
Definition: dsa.c:225
#define DSA_EXTRACT_SEGMENT_NUMBER(dp)
Definition: dsa.c:96
#define DSA_AREA_LOCK(area)
Definition: dsa.c:132
#define DSA_NUM_SEGMENTS_AT_EACH_SIZE
Definition: dsa.c:69
dsa_area * dsa_create_in_place_ext(void *place, size_t size, int tranche_id, dsm_segment *segment, size_t init_segment_size, size_t max_segment_size)
Definition: dsa.c:471
static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span, dsa_pointer span_pointer, int fclass)
Definition: dsa.c:1929
static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool, int size_class)
Definition: dsa.c:1560
#define DSA_SEGMENT_INDEX_NONE
Definition: dsa.c:105
static dsa_area * create_internal(void *place, size_t size, int tranche_id, dsm_handle control_handle, dsm_segment *control_segment, size_t init_segment_size, size_t max_segment_size)
Definition: dsa.c:1218
#define DSA_SEGMENT_HEADER_MAGIC
Definition: dsa.c:89
void dsa_trim(dsa_area *area)
Definition: dsa.c:1043
#define DSA_SPAN_NOTHING_FREE
Definition: dsa.c:375
#define DSA_MAKE_POINTER(segment_number, offset)
Definition: dsa.c:92
#define get_segment_index(area, segment_map_ptr)
Definition: dsa.c:379
void dsa_on_shmem_exit_release_in_place(int code, Datum place)
Definition: dsa.c:590
void dsa_on_dsm_detach_release_in_place(dsm_segment *segment, Datum place)
Definition: dsa.c:576
static dsa_pointer alloc_object(dsa_area *area, int size_class)
Definition: dsa.c:1472
#define DSA_PAGES_PER_SUPERBLOCK
Definition: dsa.c:82
#define DSA_SIZE_CLASS_MAP_QUANTUM
Definition: dsa.c:258
static size_t contiguous_pages_to_segment_bin(size_t n)
Definition: dsa.c:119
static const uint8 dsa_size_class_map[]
Definition: dsa.c:248
dsa_pointer dsa_allocate_extended(dsa_area *area, size_t size, int flags)
Definition: dsa.c:671
static dsa_segment_map * make_new_segment(dsa_area *area, size_t requested_pages)
Definition: dsa.c:2081
size_t dsa_get_total_size(dsa_area *area)
Definition: dsa.c:1027
dsa_area * dsa_attach(dsa_handle handle)
Definition: dsa.c:510
#define DSA_SUPERBLOCK_SIZE
Definition: dsa.c:376
#define DsaAreaPoolToDsaPointer(area, p)
Definition: dsa.c:322
void * dsa_get_address(dsa_area *area, dsa_pointer dp)
Definition: dsa.c:942
static void check_for_freed_segments_locked(dsa_area *area)
Definition: dsa.c:2288
#define DSA_EXTRACT_OFFSET(dp)
Definition: dsa.c:99
static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer)
Definition: dsa.c:1837
#define DSA_MAX_SEGMENTS
Definition: dsa.c:75
size_t dsa_segment_index
Definition: dsa.c:102
static void rebin_segment(dsa_area *area, dsa_segment_map *segment_map)
Definition: dsa.c:2316
#define DSA_SCLASS_LOCK(area, sclass)
Definition: dsa.c:133
void dsa_release_in_place(void *place)
Definition: dsa.c:605
static dsa_segment_map * get_segment_by_index(dsa_area *area, dsa_segment_index index)
Definition: dsa.c:1757
void dsa_set_size_limit(dsa_area *area, size_t limit)
Definition: dsa.c:1018
#define DSA_SCLASS_BLOCK_OF_SPANS
Definition: dsa.c:239
static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool, int fromclass, int toclass)
Definition: dsa.c:1432
static void unlink_span(dsa_area *area, dsa_area_span *span)
Definition: dsa.c:1906
#define DSA_SCLASS_SPAN_LARGE
Definition: dsa.c:240
#define DSA_NUM_SIZE_CLASSES
Definition: dsa.c:236
void dsa_unpin(dsa_area *area)
Definition: dsa.c:994
void dsa_pin_mapping(dsa_area *area)
Definition: dsa.c:635
static dsa_area * attach_internal(void *place, dsm_segment *segment, dsa_handle handle)
Definition: dsa.c:1326
#define NextFreeObjectIndex(object)
Definition: dsa.c:202
void dsa_dump(dsa_area *area)
Definition: dsa.c:1088
dsa_handle dsa_get_handle(dsa_area *area)
Definition: dsa.c:498
static void init_span(dsa_area *area, dsa_pointer span_pointer, dsa_area_pool *pool, dsa_pointer start, size_t npages, uint16 size_class)
Definition: dsa.c:1377
void dsa_detach(dsa_area *area)
Definition: dsa.c:1952
static dsa_segment_map * get_best_segment(dsa_area *area, size_t npages)
Definition: dsa.c:2010
dsa_area * dsa_attach_in_place(void *place, dsm_segment *segment)
Definition: dsa.c:545
#define DSA_FULLNESS_CLASSES
Definition: dsa.c:266
void dsa_free(dsa_area *area, dsa_pointer dp)
Definition: dsa.c:826
#define DSA_NUM_SEGMENT_BINS
Definition: dsa.c:111
dsa_area * dsa_create_ext(int tranche_id, size_t init_segment_size, size_t max_segment_size)
Definition: dsa.c:421
size_t dsa_minimum_size(void)
Definition: dsa.c:1196
void dsa_pin(dsa_area *area)
Definition: dsa.c:975
uint64 dsa_pointer
Definition: dsa.h:62
#define DSA_POINTER_FORMAT
Definition: dsa.h:69
#define DSA_MIN_SEGMENT_SIZE
Definition: dsa.h:100
dsm_handle dsa_handle
Definition: dsa.h:136
#define InvalidDsaPointer
Definition: dsa.h:78
#define DSA_ALLOC_NO_OOM
Definition: dsa.h:74
#define DSA_HANDLE_INVALID
Definition: dsa.h:139
#define DsaPointerIsValid(x)
Definition: dsa.h:106
#define DSA_MAX_SEGMENT_SIZE
Definition: dsa.h:103
#define DSA_ALLOC_HUGE
Definition: dsa.h:73
#define DSA_ALLOC_ZERO
Definition: dsa.h:75
dsm_handle dsm_segment_handle(dsm_segment *seg)
Definition: dsm.c:1123
void * dsm_segment_address(dsm_segment *seg)
Definition: dsm.c:1095
void dsm_detach(dsm_segment *seg)
Definition: dsm.c:803
void on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function, Datum arg)
Definition: dsm.c:1132
dsm_segment * dsm_attach(dsm_handle h)
Definition: dsm.c:665
dsm_segment * dsm_create(Size size, int flags)
Definition: dsm.c:516
void dsm_pin_mapping(dsm_segment *seg)
Definition: dsm.c:915
void dsm_unpin_segment(dsm_handle handle)
Definition: dsm.c:988
void dsm_pin_segment(dsm_segment *seg)
Definition: dsm.c:955
uint32 dsm_handle
Definition: dsm_impl.h:55
#define DSM_HANDLE_INVALID
Definition: dsm_impl.h:58
int errdetail(const char *fmt,...)
Definition: elog.c:1203
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define FATAL
Definition: elog.h:41
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
bool FreePageManagerGet(FreePageManager *fpm, Size npages, Size *first_page)
Definition: freepage.c:210
void FreePageManagerPut(FreePageManager *fpm, Size first_page, Size npages)
Definition: freepage.c:379
void FreePageManagerInitialize(FreePageManager *fpm, char *base)
Definition: freepage.c:183
#define fpm_largest(fpm)
Definition: freepage.h:88
#define fpm_size_to_pages(sz)
Definition: freepage.h:74
#define FPM_PAGE_SIZE
Definition: freepage.h:30
return str start
int j
Definition: isn.c:74
int i
Definition: isn.c:73
bool LWLockHeldByMe(LWLock *lock)
Definition: lwlock.c:1893
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1168
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1781
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition: lwlock.c:707
@ LW_EXCLUSIVE
Definition: lwlock.h:114
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc(Size size)
Definition: mcxt.c:1317
#define AllocHugeSizeIsValid(size)
Definition: memutils.h:49
#define AllocSizeIsValid(size)
Definition: memutils.h:42
#define pg_leftmost_one_pos_size_t
Definition: pg_bitutils.h:416
static int64 total_size
Definition: pg_checksums.c:63
#define fprintf
Definition: port.h:242
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:322
uintptr_t Datum
Definition: postgres.h:64
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:312
ResourceOwner CurrentResourceOwner
Definition: resowner.c:165
static pg_noinline void Size size
Definition: slab.c:607
Definition: lwlock.h:42
dsa_segment_header segment_header
Definition: dsa.c:290
size_t init_segment_size
Definition: dsa.c:300
size_t total_segment_size
Definition: dsa.c:304
int lwlock_tranche_id
Definition: dsa.c:316
size_t max_segment_size
Definition: dsa.c:302
dsa_segment_index high_segment_index
Definition: dsa.c:308
bool pinned
Definition: dsa.c:312
size_t max_total_segment_size
Definition: dsa.c:306
dsa_segment_index segment_bins[DSA_NUM_SEGMENT_BINS]
Definition: dsa.c:296
dsa_area_pool pools[DSA_NUM_SIZE_CLASSES]
Definition: dsa.c:298
size_t freed_segment_counter
Definition: dsa.c:314
int refcnt
Definition: dsa.c:310
LWLock lock
Definition: dsa.c:318
dsa_handle handle
Definition: dsa.c:292
dsm_handle segment_handles[DSA_MAX_SEGMENTS]
Definition: dsa.c:294
dsa_pointer spans[DSA_FULLNESS_CLASSES]
Definition: dsa.c:279
LWLock lock
Definition: dsa.c:277
dsa_pointer nextspan
Definition: dsa.c:187
uint16 fclass
Definition: dsa.c:195
dsa_pointer start
Definition: dsa.c:188
uint16 nallocatable
Definition: dsa.c:192
dsa_pointer prevspan
Definition: dsa.c:186
uint16 size_class
Definition: dsa.c:190
uint16 nmax
Definition: dsa.c:194
uint16 ninitialized
Definition: dsa.c:191
uint16 firstfree
Definition: dsa.c:193
dsa_pointer pool
Definition: dsa.c:185
size_t npages
Definition: dsa.c:189
Definition: dsa.c:348
dsa_segment_map segment_maps[DSA_MAX_SEGMENTS]
Definition: dsa.c:366
dsa_segment_index high_segment_index
Definition: dsa.c:369
size_t freed_segment_counter
Definition: dsa.c:372
dsa_area_control * control
Definition: dsa.c:350
ResourceOwner resowner
Definition: dsa.c:358
uint32 magic
Definition: dsa.c:143
size_t size
Definition: dsa.c:147
dsa_segment_index next
Definition: dsa.c:159
dsa_segment_index prev
Definition: dsa.c:153
size_t usable_pages
Definition: dsa.c:145
size_t bin
Definition: dsa.c:161
dsa_segment_header * header
Definition: dsa.c:336
FreePageManager * fpm
Definition: dsa.c:337
dsm_segment * segment
Definition: dsa.c:334
dsa_pointer * pagemap
Definition: dsa.c:338
char * mapped_address
Definition: dsa.c:335
Definition: type.h:95