dsa.c
1 /*-------------------------------------------------------------------------
2  *
3  * dsa.c
4  * Dynamic shared memory areas.
5  *
6  * This module provides dynamic shared memory areas which are built on top of
7  * DSM segments. While dsm.c allows segments of shared memory to be
8  * created and shared between backends, it isn't designed to deal with small
9  * objects. A DSA area is a shared memory heap usually backed by one or more
10  * DSM segments which can allocate memory using dsa_allocate() and dsa_free().
11  * Alternatively, it can be created in pre-existing shared memory, including a
12  * DSM segment, and then create extra DSM segments as required. Unlike the
13  * regular system heap, it deals in pseudo-pointers which must be converted to
14  * backend-local pointers before they are dereferenced. These pseudo-pointers
15  * can however be shared with other backends, and can be used to construct
16  * shared data structures.
17  *
18  * Each DSA area manages a set of DSM segments, adding new segments as
19  * required and detaching them when they are no longer needed. Each segment
20  * contains a number of 4KB pages, a free page manager for tracking
21  * consecutive runs of free pages, and a page map for tracking the source of
22  * objects allocated on each page. Allocation requests above 8KB are handled
23  * by choosing a segment and finding consecutive free pages in its free page
24  * manager. Allocation requests for smaller sizes are handled using pools of
25  * objects of a selection of sizes. Each pool consists of a number of 16 page
26  * (64KB) superblocks allocated in the same way as large objects. Allocation
27  * of large objects and new superblocks is serialized by a single LWLock, but
28  * allocation of small objects from pre-existing superblocks uses one LWLock
29  * per pool. Currently there is one pool, and therefore one lock, per size
30  * class. Per-core pools to increase concurrency and strategies for reducing
31  * the resulting fragmentation are areas for future research. Each superblock
32  * is managed with a 'span', which tracks the superblock's freelist. Free
33  * requests are handled by looking in the page map to find which span an
34  * address was allocated from, so that small objects can be returned to the
35  * appropriate free list, and large object pages can be returned directly to
36  * the free page map. When allocating, simple heuristics for selecting
37  * segments and superblocks try to encourage occupied memory to be
38  * concentrated, increasing the likelihood that whole superblocks can become
39  * empty and be returned to the free page manager, and whole segments can
40  * become empty and be returned to the operating system.
41  *
42  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
43  * Portions Copyright (c) 1994, Regents of the University of California
44  *
45  * IDENTIFICATION
46  * src/backend/utils/mmgr/dsa.c
47  *
48  *-------------------------------------------------------------------------
49  */
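/*
 * Typical use, as a sketch (not part of this module's code; "my_tranche_id"
 * is a caller-supplied LWLock tranche ID, and error handling is elided):
 *
 *		dsa_area   *area = dsa_create(my_tranche_id);
 *		dsa_pointer dp = dsa_allocate(area, sizeof(int));
 *		int		   *p = (int *) dsa_get_address(area, dp);
 *
 *		*p = 42;	(write through the backend-local pointer)
 *		(publish dp, not p, to other backends by whatever IPC you like;
 *		 they resolve it with their own dsa_get_address(area, dp) call)
 *		dsa_free(area, dp);
 *		dsa_detach(area);
 *
 * Another process attaches with dsa_attach(dsa_get_handle(area)) before
 * calling dsa_get_address on pointers it receives.
 */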
50 
51 #include "postgres.h"
52 
53 #include "port/atomics.h"
54 #include "storage/dsm.h"
55 #include "storage/ipc.h"
56 #include "storage/lwlock.h"
57 #include "storage/shmem.h"
58 #include "utils/dsa.h"
59 #include "utils/freepage.h"
60 #include "utils/memutils.h"
61 
62 /*
63  * The size of the initial DSM segment that backs a dsa_area created by
64  * dsa_create. After creating some number of segments of this size we'll
65  * double this size, and so on. Larger segments may be created if necessary
66  * to satisfy large requests.
67  */
68 #define DSA_INITIAL_SEGMENT_SIZE ((Size) (1 * 1024 * 1024))
69 
70 /*
71  * How many segments to create before we double the segment size. If this is
72  * low, then there is likely to be a lot of wasted space in the largest
73  * segment. If it is high, then we risk running out of segment slots (see
74  * dsm.c's limits on total number of segments), or limiting the total size
75  * an area can manage when using small pointers.
76  */
77 #define DSA_NUM_SEGMENTS_AT_EACH_SIZE 4
78 
79 /*
80  * The number of bits used to represent the offset part of a dsa_pointer.
81  * This controls the maximum size of a segment, the maximum possible
82  * allocation size and also the maximum number of segments per area.
83  */
84 #if SIZEOF_DSA_POINTER == 4
85 #define DSA_OFFSET_WIDTH 27 /* 32 segments of size up to 128MB */
86 #else
87 #define DSA_OFFSET_WIDTH 40 /* 1024 segments of size up to 1TB */
88 #endif
89 
90 /*
91  * The maximum number of DSM segments that an area can own, determined by
92  * the number of bits remaining (but capped at 1024).
93  */
94 #define DSA_MAX_SEGMENTS \
95  Min(1024, (1 << ((SIZEOF_DSA_POINTER * 8) - DSA_OFFSET_WIDTH)))
96 
97 /* The bitmask for extracting the offset from a dsa_pointer. */
98 #define DSA_OFFSET_BITMASK (((dsa_pointer) 1 << DSA_OFFSET_WIDTH) - 1)
99 
100 /* The maximum size of a DSM segment. */
101 #define DSA_MAX_SEGMENT_SIZE ((Size) 1 << DSA_OFFSET_WIDTH)
102 
103 /* Number of pages (see FPM_PAGE_SIZE) per regular superblock. */
104 #define DSA_PAGES_PER_SUPERBLOCK 16
105 
106 /*
107  * A magic number used as a sanity check for following DSM segments belonging
108  * to a DSA area (this number will be XORed with the area handle and
109  * the segment index).
110  */
111 #define DSA_SEGMENT_HEADER_MAGIC 0x0ce26608
112 
113 /* Build a dsa_pointer given a segment number and offset. */
114 #define DSA_MAKE_POINTER(segment_number, offset) \
115  (((dsa_pointer) (segment_number) << DSA_OFFSET_WIDTH) | (offset))
116 
117 /* Extract the segment number from a dsa_pointer. */
118 #define DSA_EXTRACT_SEGMENT_NUMBER(dp) ((dp) >> DSA_OFFSET_WIDTH)
119 
120 /* Extract the offset from a dsa_pointer. */
121 #define DSA_EXTRACT_OFFSET(dp) ((dp) & DSA_OFFSET_BITMASK)
122 
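/*
 * Worked example (illustrative): with the 64-bit layout above
 * (DSA_OFFSET_WIDTH == 40), an object at byte offset 8192 in segment 3 is
 * encoded and decoded like this:
 *
 *		dsa_pointer dp = DSA_MAKE_POINTER(3, 8192);	 i.e. (3 << 40) | 8192
 *		Assert(DSA_EXTRACT_SEGMENT_NUMBER(dp) == 3);
 *		Assert(DSA_EXTRACT_OFFSET(dp) == 8192);
 *
 * The offset must be smaller than DSA_MAX_SEGMENT_SIZE, or it would bleed
 * into the segment-number bits.
 */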
123 /* The type used for segment indexes (zero based). */
124 typedef Size dsa_segment_index;
125 
126 /* Sentinel value for dsa_segment_index indicating 'none' or 'end'. */
127 #define DSA_SEGMENT_INDEX_NONE (~(dsa_segment_index)0)
128 
129 /*
130  * How many bins of segments do we have? The bins are used to categorize
131  * segments by their largest contiguous run of free pages.
132  */
133 #define DSA_NUM_SEGMENT_BINS 16
134 
135 /*
136  * What is the lowest bin that holds segments that *might* have n contiguous
137  * free pages? There is no point in looking in segments in lower bins; they
138  * definitely can't service a request for n free pages.
139  */
140 #define contiguous_pages_to_segment_bin(n) Min(fls(n), DSA_NUM_SEGMENT_BINS - 1)
141 
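/*
 * Worked example of the binning arithmetic: fls(15) == 4 and fls(16) == 5,
 * so a segment whose largest run of free pages is 15 sits in bin 4 (runs of
 * 8..15 pages), while a request for 16 contiguous pages starts its search at
 * contiguous_pages_to_segment_bin(16) == 5; bins 0..4 cannot satisfy it.
 * Bin 0 (fls(0) == 0) holds segments with no free pages at all, and bin 15
 * collects everything whose largest run is 2^14 pages or more.
 */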
142 /* Macros for access to locks. */
143 #define DSA_AREA_LOCK(area) (&area->control->lock)
144 #define DSA_SCLASS_LOCK(area, sclass) (&area->control->pools[sclass].lock)
145 
146 /*
147  * The header for an individual segment. This lives at the start of each DSM
148  * segment owned by a DSA area including the first segment (where it appears
149  * as part of the dsa_area_control struct).
150  */
151 typedef struct
152 {
153  /* Sanity check magic value. */
154  uint32 magic;
155  /* Total number of pages in this segment (excluding metadata area). */
156  Size usable_pages;
157  /* Total size of this segment in bytes. */
158  Size size;
159 
160  /*
161  * Index of the segment that precedes this one in the same segment bin, or
162  * DSA_SEGMENT_INDEX_NONE if this is the first one.
163  */
164  dsa_segment_index prev;
165 
166  /*
167  * Index of the segment that follows this one in the same segment bin, or
168  * DSA_SEGMENT_INDEX_NONE if this is the last one.
169  */
170  dsa_segment_index next;
171  /* The index of the bin that contains this segment. */
172  Size bin;
173 
174  /*
175  * A flag raised to indicate that this segment is being returned to the
176  * operating system and has been unpinned.
177  */
178  bool freed;
179 } dsa_segment_header;
180 
181 /*
182  * Metadata for one superblock.
183  *
184  * For most blocks, span objects are stored out-of-line; that is, the span
185  * object is not stored within the block itself. But, as an exception, for a
186  * "span of spans", the span object is stored "inline". The allocation is
187  * always exactly one page, and the dsa_area_span object is located at
188  * the beginning of that page. The size class is DSA_SCLASS_BLOCK_OF_SPANS,
189  * and the remaining fields are used just as they would be in an ordinary
190  * block. We can't allocate spans out of ordinary superblocks because
191  * creating an ordinary superblock requires us to be able to allocate a span
192  * *first*. Doing it this way avoids that circularity.
193  */
194 typedef struct
195 {
196  dsa_pointer pool; /* Containing pool. */
197  dsa_pointer prevspan; /* Previous span. */
198  dsa_pointer nextspan; /* Next span. */
199  dsa_pointer start; /* Starting address. */
200  Size npages; /* Length of span in pages. */
201  uint16 size_class; /* Size class. */
202  uint16 ninitialized; /* Maximum number of objects ever allocated. */
203  uint16 nallocatable; /* Number of objects currently allocatable. */
204  uint16 firstfree; /* First object on free list. */
205  uint16 nmax; /* Maximum number of objects ever possible. */
206  uint16 fclass; /* Current fullness class. */
207 } dsa_area_span;
208 
209 /*
210  * Given a pointer to an object in a span, access the index of the next free
211  * object in the same span (ie in the span's freelist) as an L-value.
212  */
213 #define NextFreeObjectIndex(object) (* (uint16 *) (object))
214 
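/*
 * Sketch of the embedded freelist this macro implements (see alloc_object
 * and dsa_free for the real code): each free object's first two bytes hold
 * the index of the next free object in the same span.
 *
 *		pop:	object = superblock + span->firstfree * size;
 *				span->firstfree = NextFreeObjectIndex(object);
 *
 *		push:	NextFreeObjectIndex(object) = span->firstfree;
 *				span->firstfree = (object - superblock) / size;
 *
 * This is one reason the smallest real size class below is 8 bytes: every
 * object must be able to hold at least a uint16 link while it is free.
 */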
215 /*
216  * Small allocations are handled by dividing a single block of memory into
217  * many small objects of equal size. The possible allocation sizes are
218  * defined by the following array. Larger size classes are spaced more widely
219  * than smaller size classes. We fudge the spacing for size classes >1kB to
220  * avoid space wastage: based on the knowledge that we plan to allocate 64kB
221  * blocks, we bump the maximum object size up to the largest multiple of
222  * 8 bytes that still lets us fit the same number of objects into one block.
223  *
224  * NB: Because of this fudging, if we were ever to use differently-sized blocks
225  * for small allocations, these size classes would need to be reworked to be
226  * optimal for the new size.
227  *
228  * NB: The optimal spacing for size classes, as well as the size of the blocks
229  * out of which small objects are allocated, is not a question that has one
230  * right answer. Some allocators (such as tcmalloc) use more closely-spaced
231  * size classes than we do here, while others (like aset.c) use more
232  * widely-spaced classes. Spacing the classes more closely avoids wasting
233  * memory within individual chunks, but also means a larger number of
234  * potentially-unfilled blocks.
235  */
236 static const uint16 dsa_size_classes[] = {
237  sizeof(dsa_area_span), 0, /* special size classes */
238  8, 16, 24, 32, 40, 48, 56, 64, /* 8 classes separated by 8 bytes */
239  80, 96, 112, 128, /* 4 classes separated by 16 bytes */
240  160, 192, 224, 256, /* 4 classes separated by 32 bytes */
241  320, 384, 448, 512, /* 4 classes separated by 64 bytes */
242  640, 768, 896, 1024, /* 4 classes separated by 128 bytes */
243  1280, 1560, 1816, 2048, /* 4 classes separated by ~256 bytes */
244  2616, 3120, 3640, 4096, /* 4 classes separated by ~512 bytes */
245  5456, 6552, 7280, 8192 /* 4 classes separated by ~1024 bytes */
246 };
247 #define DSA_NUM_SIZE_CLASSES lengthof(dsa_size_classes)
248 
249 /* Special size classes. */
250 #define DSA_SCLASS_BLOCK_OF_SPANS 0
251 #define DSA_SCLASS_SPAN_LARGE 1
252 
253 /*
254  * The following lookup table is used to map the size of small objects
255  * (less than 1kB) onto the corresponding size class. To use this table,
256  * round the size of the object up to the next multiple of 8 bytes, and then
257  * index into this array.
258  */
259 static char dsa_size_class_map[] = {
260  2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13,
261  14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17,
262  18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
263  20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21,
264  22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
265  23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
266  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
267  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25
268 };
269 #define DSA_SIZE_CLASS_MAP_QUANTUM 8
270 
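/*
 * Worked example: a 12-byte request computes map index
 * ((12 + DSA_SIZE_CLASS_MAP_QUANTUM - 1) / DSA_SIZE_CLASS_MAP_QUANTUM) - 1
 * == 1, and dsa_size_class_map[1] == 3, so it is served from
 * dsa_size_classes[3], the 16-byte class.  The table covers requests up to
 * lengthof(dsa_size_class_map) * DSA_SIZE_CLASS_MAP_QUANTUM == 1024 bytes;
 * dsa_allocate_extended maps larger small requests with a binary search of
 * dsa_size_classes instead.
 */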
271 /*
272  * Superblocks are binned by how full they are. Generally, each fullness
273  * class corresponds to one quartile, but the block being used for
274  * allocations is always at the head of the list for fullness class 1,
275  * regardless of how full it really is.
276  */
277 #define DSA_FULLNESS_CLASSES 4
278 
279 /*
280  * A dsa_area_pool represents a set of objects of a given size class.
281  *
282  * Perhaps there should be multiple pools for the same size class for
283  * contention avoidance, but for now there is just one!
284  */
285 typedef struct
286 {
287  /* A lock protecting access to this pool. */
288  LWLock lock;
289  /* A set of linked lists of spans, arranged by fullness. */
290  dsa_pointer spans[DSA_FULLNESS_CLASSES];
291  /* Should we pad this out to a cacheline boundary? */
292 } dsa_area_pool;
293 
294 /*
295  * The control block for an area. This lives in shared memory, at the start of
296  * the first DSM segment controlled by this area.
297  */
298 typedef struct
299 {
300  /* The segment header for the first segment. */
301  dsa_segment_header segment_header;
302  /* The handle for this area. */
303  dsa_handle handle;
304  /* The handles of the segments owned by this area. */
305  dsm_handle segment_handles[DSA_MAX_SEGMENTS];
306  /* Lists of segments, binned by maximum contiguous run of free pages. */
307  dsa_segment_index segment_bins[DSA_NUM_SEGMENT_BINS];
308  /* The object pools for each size class. */
309  dsa_area_pool pools[DSA_NUM_SIZE_CLASSES];
310  /* The total size of all active segments. */
311  Size total_segment_size;
312  /* The maximum total size of backing storage we are allowed. */
313  Size max_total_segment_size;
314  /* Highest used segment index in the history of this area. */
315  dsa_segment_index high_segment_index;
316  /* The reference count for this area. */
317  int refcnt;
318  /* A flag indicating that this area has been pinned. */
319  bool pinned;
320  /* The number of times that segments have been freed. */
321  Size freed_segment_counter;
322  /* The LWLock tranche ID. */
323  int lwlock_tranche_id;
324  /* The general lock (protects everything except object pools). */
325  LWLock lock;
326 } dsa_area_control;
327 
328 /* Given a pointer to a pool, find a dsa_pointer. */
329 #define DsaAreaPoolToDsaPointer(area, p) \
330  DSA_MAKE_POINTER(0, (char *) p - (char *) area->control)
331 
332 /*
333  * A dsa_segment_map is stored within the backend-private memory of each
334  * individual backend. It holds the base address of the segment within that
335  * backend, plus the addresses of key objects within the segment. Those
336  * could instead be derived from the base address but it's handy to have them
337  * around.
338  */
339 typedef struct
340 {
341  dsm_segment *segment; /* DSM segment */
342  char *mapped_address; /* Address at which segment is mapped */
343  dsa_segment_header *header; /* Header (same as mapped_address) */
344  FreePageManager *fpm; /* Free page manager within segment. */
345  dsa_pointer *pagemap; /* Page map within segment. */
346 } dsa_segment_map;
347 
348 /*
349  * Per-backend state for a storage area. Backends obtain one of these by
350  * creating an area or attaching to an existing one using a handle. Each
351  * process that needs to use an area uses its own object to track where the
352  * segments are mapped.
353  */
354 struct dsa_area
355 {
356  /* Pointer to the control object in shared memory. */
357  dsa_area_control *control;
358 
359  /* Has the mapping been pinned? */
360  bool mapping_pinned;
361 
362  /*
363  * This backend's array of segment maps, ordered by segment index
364  * corresponding to control->segment_handles. Some of the area's segments
365  * may not be mapped in this backend yet, and some slots may have been
366  * freed and need to be detached; these operations happen on demand.
367  */
368  dsa_segment_map segment_maps[DSA_MAX_SEGMENTS];
369 
370  /* The highest segment index this backend has ever mapped. */
371  dsa_segment_index high_segment_index;
372 
373  /* The last observed freed_segment_counter. */
374  Size freed_segment_counter;
375 };
376 
377 #define DSA_SPAN_NOTHING_FREE ((uint16) -1)
378 #define DSA_SUPERBLOCK_SIZE (DSA_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE)
379 
380 /* Given a pointer to a segment_map, obtain a segment index number. */
381 #define get_segment_index(area, segment_map_ptr) \
382  (segment_map_ptr - &area->segment_maps[0])
383 
384 static void init_span(dsa_area *area, dsa_pointer span_pointer,
385  dsa_area_pool *pool, dsa_pointer start, Size npages,
386  uint16 size_class);
387 static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool,
388  int fromclass, int toclass);
389 static inline dsa_pointer alloc_object(dsa_area *area, int size_class);
390 static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
391  int size_class);
392 static dsa_segment_map *get_segment_by_index(dsa_area *area,
393  dsa_segment_index index);
394 static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer);
395 static void unlink_span(dsa_area *area, dsa_area_span *span);
396 static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
397  dsa_pointer span_pointer, int fclass);
398 static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map);
399 static dsa_segment_map *get_best_segment(dsa_area *area, Size npages);
400 static dsa_segment_map *make_new_segment(dsa_area *area, Size requested_pages);
401 static dsa_area *create_internal(void *place, size_t size,
402  int tranche_id,
403  dsm_handle control_handle,
404  dsm_segment *control_segment);
405 static dsa_area *attach_internal(void *place, dsm_segment *segment,
406  dsa_handle handle);
407 static void check_for_freed_segments(dsa_area *area);
408 
409 /*
410  * Create a new shared area in a new DSM segment. Further DSM segments will
411  * be allocated as required to extend the available space.
412  *
413  * We can't allocate a LWLock tranche_id within this function, because tranche
414  * IDs are a scarce resource; there are only 64k available, using low numbers
415  * when possible matters, and we have no provision for recycling them. So,
416  * we require the caller to provide one.
417  */
418 dsa_area *
419 dsa_create(int tranche_id)
420 {
421  dsm_segment *segment;
422  dsa_area *area;
423 
424  /*
425  * Create the DSM segment that will hold the shared control object and the
426  * first segment of usable space.
427  */
428  segment = dsm_create(DSA_INITIAL_SEGMENT_SIZE, 0);
429 
430  /*
431  * All segments backing this area are pinned, so that DSA can explicitly
432  * control their lifetime (otherwise a newly created segment belonging to
433  * this area might be freed when the only backend that happens to have it
434  * mapped in ends, corrupting the area).
435  */
436  dsm_pin_segment(segment);
437 
438  /* Create a new DSA area with the control object in this segment. */
439  area = create_internal(dsm_segment_address(segment),
440  DSA_INITIAL_SEGMENT_SIZE,
441  tranche_id,
442  dsm_segment_handle(segment), segment);
443 
444  /* Clean up when the control segment detaches. */
445  on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
446  PointerGetDatum(dsm_segment_address(segment)));
447 
448  return area;
449 }
450 
451 /*
452  * Create a new shared area in an existing shared memory space, which may be
453  * either DSM or Postmaster-initialized memory. DSM segments will be
454  * allocated as required to extend the available space, though that can be
455  * prevented with dsa_set_size_limit(area, size) using the same size provided
456  * to dsa_create_in_place.
457  *
458  * Areas created in-place must eventually be released by the backend that
459  * created them and all backends that attach to them. This can be done
460  * explicitly with dsa_release_in_place, or, in the special case that 'place'
461  * happens to be in a pre-existing DSM segment, by passing in a pointer to the
462  * segment so that a detach hook can be registered with the containing DSM
463  * segment.
464  *
465  * See dsa_create() for a note about the tranche arguments.
466  */
467 dsa_area *
468 dsa_create_in_place(void *place, size_t size,
469  int tranche_id, dsm_segment *segment)
470 {
471  dsa_area *area;
472 
473  area = create_internal(place, size, tranche_id,
474  DSM_HANDLE_INVALID, NULL);
475 
476  /*
477  * Clean up when the control segment detaches, if a containing DSM segment
478  * was provided.
479  */
480  if (segment != NULL)
481  on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
482  PointerGetDatum(place));
483 
484  return area;
485 }
486 
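/*
 * A sketch of in-place creation inside a pre-existing DSM segment (names
 * are illustrative and error handling is elided):
 *
 *		dsm_segment *seg = dsm_create(area_size, 0);
 *		void	   *place = dsm_segment_address(seg);
 *		dsa_area   *area = dsa_create_in_place(place, area_size,
 *											   my_tranche_id, seg);
 *		dsa_set_size_limit(area, area_size);	(optional: no extra segments)
 *
 * Because 'seg' was passed in, dsa_on_dsm_detach_release_in_place will run
 * when the containing segment detaches, releasing the area automatically.
 */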
487 /*
488  * Obtain a handle that can be passed to other processes so that they can
489  * attach to the given area. Cannot be called for areas created with
490  * dsa_create_in_place.
491  */
492 dsa_handle
493 dsa_get_handle(dsa_area *area)
494 {
495  Assert(area->control->handle != DSM_HANDLE_INVALID);
496  return area->control->handle;
497 }
498 
499 /*
500  * Attach to an area given a handle generated (possibly in another process) by
501  * dsa_get_area_handle. The area must have been created with dsa_create (not
502  * dsa_create_in_place).
503  */
504 dsa_area *
505 dsa_attach(dsa_handle handle)
506 {
507  dsm_segment *segment;
508  dsa_area *area;
509 
510  /*
511  * An area handle is really a DSM segment handle for the first segment, so
512  * we go ahead and attach to that.
513  */
514  segment = dsm_attach(handle);
515  if (segment == NULL)
516  ereport(ERROR,
517  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
518  errmsg("could not attach to dsa_handle")));
519 
520  area = attach_internal(dsm_segment_address(segment), segment, handle);
521 
522  /* Clean up when the control segment detaches. */
523  on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
524  PointerGetDatum(dsm_segment_address(segment)));
525 
526  return area;
527 }
528 
529 /*
530  * Attach to an area that was created with dsa_create_in_place. The caller
531  * must somehow know the location in memory that was used when the area was
532  * created, though it may be mapped at a different virtual address in this
533  * process.
534  *
535  * See dsa_create_in_place for note about releasing in-place areas, and the
536  * optional 'segment' argument which can be provided to allow automatic
537  * release if the containing memory happens to be a DSM segment.
538  */
539 dsa_area *
540 dsa_attach_in_place(void *place, dsm_segment *segment)
541 {
542  dsa_area *area;
543 
544  area = attach_internal(place, NULL, DSM_HANDLE_INVALID);
545 
546  /*
547  * Clean up when the control segment detaches, if a containing DSM segment
548  * was provided.
549  */
550  if (segment != NULL)
551  on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
552  PointerGetDatum(place));
553 
554  return area;
555 }
556 
557 /*
558  * Release a DSA area that was produced by dsa_create_in_place or
559  * dsa_attach_in_place. The 'segment' argument is ignored but provides an
560  * interface suitable for on_dsm_detach, for the convenience of users who want
561  * to create a DSA segment inside an existing DSM segment and have it
562  * automatically released when the containing DSM segment is detached.
563  * 'place' should be the address of the place where the area was created.
564  *
565  * This callback is automatically registered for the DSM segment containing
566  * the control object of in-place areas when a segment is provided to
567  * dsa_create_in_place or dsa_attach_in_place, and also for all areas created
568  * with dsa_create.
569  */
570 void
571 dsa_on_dsm_detach_release_in_place(dsm_segment *segment, Datum place)
572 {
573  dsa_release_in_place(DatumGetPointer(place));
574 }
575 
576 /*
577  * Release a DSA area that was produced by dsa_create_in_place or
578  * dsa_attach_in_place. The 'code' argument is ignored but provides an
579  * interface suitable for on_shmem_exit or before_shmem_exit, for the
580  * convenience of users who want to create a DSA segment inside shared memory
581  * other than a DSM segment and have it automatically release at backend exit.
582  * 'place' should be the address of the place where the area was created.
583  */
584 void
585 dsa_on_shmem_exit_release_in_place(int code, Datum place)
586 {
587  dsa_release_in_place(DatumGetPointer(place));
588 }
589 
590 /*
591  * Release a DSA area that was produced by dsa_create_in_place or
592  * dsa_attach_in_place. It is preferable to use one of the 'dsa_on_XXX'
593  * callbacks so that this is managed automatically, because failure to release
594  * an area created in-place leaks its segments permanently.
595  *
596  * This is also called automatically for areas produced by dsa_create or
597  * dsa_attach as an implementation detail.
598  */
599 void
600 dsa_release_in_place(void *place)
601 {
602  dsa_area_control *control = (dsa_area_control *) place;
603  int i;
604 
605  LWLockAcquire(&control->lock, LW_EXCLUSIVE);
606  Assert(control->segment_header.magic ==
607  (DSA_SEGMENT_HEADER_MAGIC ^ control->handle ^ 0));
608  Assert(control->refcnt > 0);
609  if (--control->refcnt == 0)
610  {
611  for (i = 0; i <= control->high_segment_index; ++i)
612  {
613  dsm_handle handle;
614 
615  handle = control->segment_handles[i];
616  if (handle != DSM_HANDLE_INVALID)
617  dsm_unpin_segment(handle);
618  }
619  }
620  LWLockRelease(&control->lock);
621 }
622 
623 /*
624  * Keep a DSA area attached until end of session or explicit detach.
625  *
626  * By default, areas are owned by the current resource owner, which means they
627  * are detached automatically when that scope ends.
628  */
629 void
630 dsa_pin_mapping(dsa_area *area)
631 {
632  int i;
633 
634  Assert(!area->mapping_pinned);
635  area->mapping_pinned = true;
636 
637  for (i = 0; i <= area->high_segment_index; ++i)
638  if (area->segment_maps[i].segment != NULL)
639  dsm_pin_mapping(area->segment_maps[i].segment);
640 }
641 
642 /*
643  * Allocate memory in this storage area. The return value is a dsa_pointer
644  * that can be passed to other processes, and converted to a local pointer
645  * with dsa_get_address. 'flags' is a bitmap which should be constructed
646  * from the following values:
647  *
648  * DSA_ALLOC_HUGE allows allocations >= 1GB. Otherwise, such allocations
649  * will result in an ERROR.
650  *
651  * DSA_ALLOC_NO_OOM causes this function to return InvalidDsaPointer when
652  * no memory is available or a size limit established by dsa_set_size_limit
653  * would be exceeded. Otherwise, such allocations will result in an ERROR.
654  *
655  * DSA_ALLOC_ZERO causes the allocated memory to be zeroed. Otherwise, the
656  * contents of newly-allocated memory are indeterminate.
657  *
658  * These flags correspond to similarly named flags used by
659  * MemoryContextAllocExtended(). See also the macros dsa_allocate and
660  * dsa_allocate0 which expand to a call to this function with commonly used
661  * flags.
662  */
663 dsa_pointer
664 dsa_allocate_extended(dsa_area *area, Size size, int flags)
665 {
666  uint16 size_class;
667  dsa_pointer start_pointer;
668  dsa_segment_map *segment_map;
669  dsa_pointer result;
670 
671  Assert(size > 0);
672 
673  /* Sanity check on huge individual allocation size. */
674  if (((flags & DSA_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) ||
675  ((flags & DSA_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size)))
676  elog(ERROR, "invalid DSA memory alloc request size %zu", size);
677 
678  /*
679  * If bigger than the largest size class, just grab a run of pages from
680  * the free page manager, instead of allocating an object from a pool.
681  * There will still be a span, but it's a special class of span that
682  * manages this whole allocation and simply gives all pages back to the
683  * free page manager when dsa_free is called.
684  */
685  if (size > dsa_size_classes[lengthof(dsa_size_classes) - 1])
686  {
687  Size npages = fpm_size_to_pages(size);
688  Size first_page;
689  dsa_pointer span_pointer;
690  dsa_area_pool *pool = &area->control->pools[DSA_SCLASS_SPAN_LARGE];
691 
692  /* Obtain a span object. */
693  span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
694  if (!DsaPointerIsValid(span_pointer))
695  return InvalidDsaPointer;
696 
697  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
698 
699  /* Find a segment from which to allocate. */
700  segment_map = get_best_segment(area, npages);
701  if (segment_map == NULL)
702  segment_map = make_new_segment(area, npages);
703  if (segment_map == NULL)
704  {
705  /* Can't make any more segments: game over. */
706  LWLockRelease(DSA_AREA_LOCK(area));
707  dsa_free(area, span_pointer);
708 
709  /* Raise error unless asked not to. */
710  if ((flags & DSA_ALLOC_NO_OOM) == 0)
711  ereport(ERROR,
712  (errcode(ERRCODE_OUT_OF_MEMORY),
713  errmsg("out of memory"),
714  errdetail("Failed on DSA request of size %zu.",
715  size)));
716  return InvalidDsaPointer;
717  }
718 
719  /*
720  * Ask the free page manager for a run of pages. This should always
721  * succeed, since both get_best_segment and make_new_segment should
722  * only return a non-NULL pointer if it actually contains enough
723  * contiguous freespace. If it does fail, something in our backend
724  * private state is out of whack, so use FATAL to kill the process.
725  */
726  if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
727  elog(FATAL,
728  "dsa_allocate could not find %zu free pages", npages);
729  LWLockRelease(DSA_AREA_LOCK(area));
730 
731  start_pointer = DSA_MAKE_POINTER(get_segment_index(area, segment_map),
732  first_page * FPM_PAGE_SIZE);
733 
734  /* Initialize span and pagemap. */
735  LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
736  LW_EXCLUSIVE);
737  init_span(area, span_pointer, pool, start_pointer, npages,
738  DSA_SCLASS_SPAN_LARGE);
739  segment_map->pagemap[first_page] = span_pointer;
740  LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
741 
742  /* Zero-initialize the memory if requested. */
743  if ((flags & DSA_ALLOC_ZERO) != 0)
744  memset(dsa_get_address(area, start_pointer), 0, size);
745 
746  return start_pointer;
747  }
748 
749  /* Map allocation to a size class. */
750  if (size < lengthof(dsa_size_class_map) * DSA_SIZE_CLASS_MAP_QUANTUM)
751  {
752  int mapidx;
753 
754  /* For smaller sizes we have a lookup table... */
755  mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) /
756  DSA_SIZE_CLASS_MAP_QUANTUM) - 1;
757  size_class = dsa_size_class_map[mapidx];
758  }
759  else
760  {
761  uint16 min;
762  uint16 max;
763 
764  /* ... and for the rest we search by binary chop. */
765  min = dsa_size_class_map[lengthof(dsa_size_class_map) - 1];
766  max = lengthof(dsa_size_classes) - 1;
767 
768  while (min < max)
769  {
770  uint16 mid = (min + max) / 2;
771  uint16 class_size = dsa_size_classes[mid];
772 
773  if (class_size < size)
774  min = mid + 1;
775  else
776  max = mid;
777  }
778 
779  size_class = min;
780  }
781  Assert(size <= dsa_size_classes[size_class]);
782  Assert(size_class == 0 || size > dsa_size_classes[size_class - 1]);
783 
784  /* Attempt to allocate an object from the appropriate pool. */
785  result = alloc_object(area, size_class);
786 
787  /* Check for failure to allocate. */
788  if (!DsaPointerIsValid(result))
789  {
790  /* Raise error unless asked not to. */
791  if ((flags & DSA_ALLOC_NO_OOM) == 0)
792  {
793  ereport(ERROR,
794  (errcode(ERRCODE_OUT_OF_MEMORY),
795  errmsg("out of memory"),
796  errdetail("Failed on DSA request of size %zu.", size)));
797  }
798  return InvalidDsaPointer;
799  }
800 
801  /* Zero-initialize the memory if requested. */
802  if ((flags & DSA_ALLOC_ZERO) != 0)
803  memset(dsa_get_address(area, result), 0, size);
804 
805  return result;
806 }
807 
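/*
 * Flag combinations in brief (a sketch; dsa_allocate and dsa_allocate0 are
 * the convenience macros over this function mentioned above):
 *
 *		dp = dsa_allocate(area, size);					 ERROR on failure
 *		dp = dsa_allocate0(area, size);					 also zeroes memory
 *		dp = dsa_allocate_extended(area, size,
 *								   DSA_ALLOC_NO_OOM);	 InvalidDsaPointer on
 *														 failure
 *		dp = dsa_allocate_extended(area, size,
 *								   DSA_ALLOC_HUGE);		 allow >= 1GB requests
 */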
808 /*
809  * Free memory obtained with dsa_allocate.
810  */
811 void
812 dsa_free(dsa_area *area, dsa_pointer dp)
813 {
814  dsa_segment_map *segment_map;
815  int pageno;
816  dsa_pointer span_pointer;
817  dsa_area_span *span;
818  char *superblock;
819  char *object;
820  Size size;
821  int size_class;
822 
823  /* Make sure we don't have a stale segment in the slot 'dp' refers to. */
824  check_for_freed_segments(area);
825 
826  /* Locate the object, span and pool. */
827  segment_map = get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(dp));
828  pageno = DSA_EXTRACT_OFFSET(dp) / FPM_PAGE_SIZE;
829  span_pointer = segment_map->pagemap[pageno];
830  span = dsa_get_address(area, span_pointer);
831  superblock = dsa_get_address(area, span->start);
832  object = dsa_get_address(area, dp);
833  size_class = span->size_class;
834  size = dsa_size_classes[size_class];
835 
836  /*
837  * Special case for large objects that live in a special span: we return
838  * those pages directly to the free page manager and free the span.
839  */
840  if (span->size_class == DSA_SCLASS_SPAN_LARGE)
841  {
842 
843 #ifdef CLOBBER_FREED_MEMORY
844  memset(object, 0x7f, span->npages * FPM_PAGE_SIZE);
845 #endif
846 
847  /* Give pages back to free page manager. */
848  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
849  FreePageManagerPut(segment_map->fpm,
850  DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
851  span->npages);
852  LWLockRelease(DSA_AREA_LOCK(area));
853  /* Unlink span. */
854  LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
855  LW_EXCLUSIVE);
856  unlink_span(area, span);
857  LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
858  /* Free the span object so it can be reused. */
859  dsa_free(area, span_pointer);
860  return;
861  }
862 
863 #ifdef CLOBBER_FREED_MEMORY
864  memset(object, 0x7f, size);
865 #endif
866 
867  LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
868 
869  /* Put the object on the span's freelist. */
870  Assert(object >= superblock);
871  Assert(object < superblock + DSA_SUPERBLOCK_SIZE);
872  Assert((object - superblock) % size == 0);
873  NextFreeObjectIndex(object) = span->firstfree;
874  span->firstfree = (object - superblock) / size;
875  ++span->nallocatable;
876 
877  /*
878  * See if the span needs to be moved to a different fullness class, or be
879  * freed so its pages can be given back to the segment.
880  */
881  if (span->nallocatable == 1 && span->fclass == DSA_FULLNESS_CLASSES - 1)
882  {
883  /*
884  * The block was completely full and is located in the
885  * highest-numbered fullness class, which is never scanned for free
886  * chunks. We must move it to the next-lower fullness class.
887  */
888  unlink_span(area, span);
889  add_span_to_fullness_class(area, span, span_pointer,
890  DSA_FULLNESS_CLASSES - 2);
891 
892  /*
893  * If this is the only span, and there is no active span, then we
894  * should probably move this span to fullness class 1. (Otherwise if
895  * you allocate exactly all the objects in the only span, it moves to
896  * class 3, then you free them all, it moves to 2, and then is given
897  * back, leaving no active span).
898  */
899  }
900  else if (span->nallocatable == span->nmax &&
901  (span->fclass != 1 || span->prevspan != InvalidDsaPointer))
902  {
903  /*
904  * This entire block is free, and it's not the active block for this
905  * size class. Return the memory to the free page manager. We don't
906  * do this for the active block to prevent hysteresis: if we
907  * repeatedly allocate and free the only chunk in the active block, it
908  * will be very inefficient if we deallocate and reallocate the block
909  * every time.
910  */
911  destroy_superblock(area, span_pointer);
912  }
913 
914  LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
915 }
916 
917 /*
918  * Obtain a backend-local address for a dsa_pointer. 'dp' must point to
919  * memory allocated by the given area (possibly in another process) that
920  * hasn't yet been freed. This may cause a segment to be mapped into the
921  * current process if required, and may cause freed segments to be unmapped.
922  */
923 void *
924 dsa_get_address(dsa_area *area, dsa_pointer dp)
925 {
926  dsa_segment_index index;
927  Size offset;
928 
929  /* Convert InvalidDsaPointer to NULL. */
930  if (!DsaPointerIsValid(dp))
931  return NULL;
932 
933  /* Process any requests to detach from freed segments. */
934  check_for_freed_segments(area);
935 
936  /* Break the dsa_pointer into its components. */
937  index = DSA_EXTRACT_SEGMENT_NUMBER(dp);
938  offset = DSA_EXTRACT_OFFSET(dp);
939  Assert(index < DSA_MAX_SEGMENTS);
940 
941  /* Check if we need to cause this segment to be mapped in. */
942  if (unlikely(area->segment_maps[index].mapped_address == NULL))
943  {
944  /* Call for effect (we don't need the result). */
945  get_segment_by_index(area, index);
946  }
947 
948  return area->segment_maps[index].mapped_address + offset;
949 }
950 
951 /*
952  * Pin this area, so that it will continue to exist even if all backends
953  * detach from it. In that case, the area can still be reattached to if a
954  * handle has been recorded somewhere.
955  */
956 void
957 dsa_pin(dsa_area *area)
958 {
959  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
960  if (area->control->pinned)
961  {
962  LWLockRelease(DSA_AREA_LOCK(area));
963  elog(ERROR, "dsa_area already pinned");
964  }
965  area->control->pinned = true;
966  ++area->control->refcnt;
967  LWLockRelease(DSA_AREA_LOCK(area));
968 }
969 
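/*
 * For example (sketch): one backend can make an area outlive all current
 * attachments by pinning it and publishing its handle in some agreed-upon
 * shared location ("shared_state" here is hypothetical):
 *
 *		dsa_pin(area);
 *		shared_state->area_handle = dsa_get_handle(area);
 *
 *		... later, in any other backend ...
 *		dsa_area   *area = dsa_attach(shared_state->area_handle);
 */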
970 /*
971  * Undo the effects of dsa_pin, so that the given area can be freed when no
972  * backends are attached to it. May be called only if dsa_pin has been
973  * called.
974  */
975 void
976 dsa_unpin(dsa_area *area)
977 {
978  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
979  Assert(area->control->refcnt > 1);
980  if (!area->control->pinned)
981  {
982  LWLockRelease(DSA_AREA_LOCK(area));
983  elog(ERROR, "dsa_area not pinned");
984  }
985  area->control->pinned = false;
986  --area->control->refcnt;
987  LWLockRelease(DSA_AREA_LOCK(area));
988 }
989 
990 /*
991  * Set the total size limit for this area. This limit is checked whenever new
992  * segments need to be allocated from the operating system. If the new size
993  * limit is already exceeded, this has no immediate effect.
994  *
995  * Note that the total virtual memory usage may be temporarily larger than
996  * this limit when segments have been freed, but not yet detached by all
997  * backends that have attached to them.
998  */
999 void
1000 dsa_set_size_limit(dsa_area *area, Size limit)
1001 {
1002  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1003  area->control->max_total_segment_size = limit;
1004  LWLockRelease(DSA_AREA_LOCK(area));
1005 }
1006 
1007 /*
1008  * Aggressively free all spare memory in the hope of returning DSM segments to
1009  * the operating system.
1010  */
1011 void
1012 dsa_trim(dsa_area *area)
1013 {
1014  int size_class;
1015 
1016  /*
1017  * Trim in reverse pool order so we get to the spans-of-spans last, just
1018  * in case any become entirely free while processing all the other pools.
1019  */
1020  for (size_class = DSA_NUM_SIZE_CLASSES - 1; size_class >= 0; --size_class)
1021  {
1022  dsa_area_pool *pool = &area->control->pools[size_class];
1023  dsa_pointer span_pointer;
1024 
1025  if (size_class == DSA_SCLASS_SPAN_LARGE)
1026  {
1027  /* Large object frees give back segments aggressively already. */
1028  continue;
1029  }
1030 
1031  /*
1032  * Search fullness class 1 only. That is where we expect to find an
1033  * entirely empty superblock (entirely empty superblocks in other
1034  * fullness classes are returned to the free page map by dsa_free).
1035  */
1036  LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1037  span_pointer = pool->spans[1];
1038  while (DsaPointerIsValid(span_pointer))
1039  {
1040  dsa_area_span *span = dsa_get_address(area, span_pointer);
1041  dsa_pointer next = span->nextspan;
1042 
1043  if (span->nallocatable == span->nmax)
1044  destroy_superblock(area, span_pointer);
1045 
1046  span_pointer = next;
1047  }
1048  LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1049  }
1050 }
1051 
1052 /*
1053  * Print out debugging information about the internal state of the shared
1054  * memory area.
1055  */
1056 void
1057 dsa_dump(dsa_area *area)
1058 {
1059  Size i,
1060  j;
1061 
1062  /*
1063  * Note: This gives an inconsistent snapshot as it acquires and releases
1064  * individual locks as it goes...
1065  */
1066 
1067  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1068  fprintf(stderr, "dsa_area handle %x:\n", area->control->handle);
1069  fprintf(stderr, " max_total_segment_size: %zu\n",
1070  area->control->max_total_segment_size);
1071  fprintf(stderr, " total_segment_size: %zu\n",
1072  area->control->total_segment_size);
1073  fprintf(stderr, " refcnt: %d\n", area->control->refcnt);
1074  fprintf(stderr, " pinned: %c\n", area->control->pinned ? 't' : 'f');
1075  fprintf(stderr, " segment bins:\n");
1076  for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1077  {
1078  if (area->control->segment_bins[i] != DSA_SEGMENT_INDEX_NONE)
1079  {
1080  dsa_segment_index segment_index;
1081 
1082  fprintf(stderr,
1083  " segment bin %zu (at least %d contiguous pages free):\n",
1084  i, 1 << (i - 1));
1085  segment_index = area->control->segment_bins[i];
1086  while (segment_index != DSA_SEGMENT_INDEX_NONE)
1087  {
1088  dsa_segment_map *segment_map;
1089 
1090  segment_map =
1091  get_segment_by_index(area, segment_index);
1092 
1093  fprintf(stderr,
1094  " segment index %zu, usable_pages = %zu, "
1095  "contiguous_pages = %zu, mapped at %p\n",
1096  segment_index,
1097  segment_map->header->usable_pages,
1098  fpm_largest(segment_map->fpm),
1099  segment_map->mapped_address);
1100  segment_index = segment_map->header->next;
1101  }
1102  }
1103  }
1104  LWLockRelease(DSA_AREA_LOCK(area));
1105 
1106  fprintf(stderr, " pools:\n");
1107  for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1108  {
1109  bool found = false;
1110 
1111  LWLockAcquire(DSA_SCLASS_LOCK(area, i), LW_EXCLUSIVE);
1112  for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1113  if (DsaPointerIsValid(area->control->pools[i].spans[j]))
1114  found = true;
1115  if (found)
1116  {
1117  if (i == DSA_SCLASS_BLOCK_OF_SPANS)
1118  fprintf(stderr, " pool for blocks of span objects:\n");
1119  else if (i == DSA_SCLASS_SPAN_LARGE)
1120  fprintf(stderr, " pool for large object spans:\n");
1121  else
1122  fprintf(stderr,
1123  " pool for size class %zu (object size %hu bytes):\n",
1124  i, dsa_size_classes[i]);
1125  for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1126  {
1127  if (!DsaPointerIsValid(area->control->pools[i].spans[j]))
1128  fprintf(stderr, " fullness class %zu is empty\n", j);
1129  else
1130  {
1131  dsa_pointer span_pointer = area->control->pools[i].spans[j];
1132 
1133  fprintf(stderr, " fullness class %zu:\n", j);
1134  while (DsaPointerIsValid(span_pointer))
1135  {
1136  dsa_area_span *span;
1137 
1138  span = dsa_get_address(area, span_pointer);
1139  fprintf(stderr,
1140  " span descriptor at "
1141  DSA_POINTER_FORMAT ", superblock at "
1142  DSA_POINTER_FORMAT
1143  ", pages = %zu, objects free = %hu/%hu\n",
1144  span_pointer, span->start, span->npages,
1145  span->nallocatable, span->nmax);
1146  span_pointer = span->nextspan;
1147  }
1148  }
1149  }
1150  }
1151  LWLockRelease(DSA_SCLASS_LOCK(area, i));
1152  }
1153 }
1154 
1155 /*
1156  * Return the smallest size that you can successfully provide to
1157  * dsa_create_in_place.
1158  */
1159 Size
1160 dsa_minimum_size(void)
1161 {
1162  Size size;
1163  int pages = 0;
1164 
1165  size = MAXALIGN(sizeof(dsa_area_control)) +
1166  MAXALIGN(sizeof(FreePageManager));
1167 
1168  /* Figure out how many pages we need, including the page map... */
1169  while (((size + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE) > pages)
1170  {
1171  ++pages;
1172  size += sizeof(dsa_pointer);
1173  }
1174 
1175  return pages * FPM_PAGE_SIZE;
1176 }
1177 
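/*
 * For example (sketch): carving an area out of postmaster-initialized
 * shared memory instead of DSM, released automatically at backend exit
 * (names are illustrative; error handling is elided):
 *
 *		Size		size = dsa_minimum_size();
 *		void	   *place = ShmemAlloc(size);
 *		dsa_area   *area = dsa_create_in_place(place, size,
 *											   my_tranche_id, NULL);
 *		on_shmem_exit(dsa_on_shmem_exit_release_in_place,
 *					  PointerGetDatum(place));
 *
 * Note that an area of exactly the minimum size has no usable pages of its
 * own, so every allocation would force creation of a new DSM segment.
 */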
1178 /*
1179  * Workhorse function for dsa_create and dsa_create_in_place.
1180  */
1181 static dsa_area *
1182 create_internal(void *place, size_t size,
1183  int tranche_id,
1184  dsm_handle control_handle,
1185  dsm_segment *control_segment)
1186 {
1187  dsa_area_control *control;
1188  dsa_area *area;
1189  dsa_segment_map *segment_map;
1190  Size usable_pages;
1191  Size total_pages;
1192  Size metadata_bytes;
1193  int i;
1194 
1195  /* Sanity check on the space we have to work in. */
1196  if (size < dsa_minimum_size())
1197  elog(ERROR, "dsa_area space must be at least %zu, but %zu provided",
1198  dsa_minimum_size(), size);
1199 
1200  /* Now figure out how much space is usable */
1201  total_pages = size / FPM_PAGE_SIZE;
1202  metadata_bytes =
1203  MAXALIGN(sizeof(dsa_area_control)) +
1204  MAXALIGN(sizeof(FreePageManager)) +
1205  total_pages * sizeof(dsa_pointer);
1206  /* Add padding up to next page boundary. */
1207  if (metadata_bytes % FPM_PAGE_SIZE != 0)
1208  metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
1209  Assert(metadata_bytes <= size);
1210  usable_pages = (size - metadata_bytes) / FPM_PAGE_SIZE;
1211 
1212  /*
1213  * Initialize the dsa_area_control object located at the start of the
1214  * space.
1215  */
1216  control = (dsa_area_control *) place;
1217  control->segment_header.magic =
1218  DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0;
1219  control->segment_header.next = DSA_SEGMENT_INDEX_NONE;
1220  control->segment_header.prev = DSA_SEGMENT_INDEX_NONE;
1221  control->segment_header.usable_pages = usable_pages;
1222  control->segment_header.freed = false;
1223  control->segment_header.size = size;
1224  control->handle = control_handle;
1225  control->max_total_segment_size = (Size) -1;
1226  control->total_segment_size = size;
1227  memset(&control->segment_handles[0], 0,
1228  sizeof(dsm_handle) * DSA_MAX_SEGMENTS);
1229  control->segment_handles[0] = control_handle;
1230  for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1231  control->segment_bins[i] = DSA_SEGMENT_INDEX_NONE;
1232  control->high_segment_index = 0;
1233  control->refcnt = 1;
1234  control->freed_segment_counter = 0;
1235  control->lwlock_tranche_id = tranche_id;
1236 
1237  /*
1238  * Create the dsa_area object that this backend will use to access the
1239  * area. Other backends will need to obtain their own dsa_area object by
1240  * attaching.
1241  */
1242  area = palloc(sizeof(dsa_area));
1243  area->control = control;
1244  area->mapping_pinned = false;
1245  memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1246  area->high_segment_index = 0;
1247  LWLockInitialize(&control->lock, control->lwlock_tranche_id);
1248  for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1249  LWLockInitialize(DSA_SCLASS_LOCK(area, i),
1250  control->lwlock_tranche_id);
1251 
1252  /* Set up the segment map for this process's mapping. */
1253  segment_map = &area->segment_maps[0];
1254  segment_map->segment = control_segment;
1255  segment_map->mapped_address = place;
1256  segment_map->header = (dsa_segment_header *) place;
1257  segment_map->fpm = (FreePageManager *)
1258  (segment_map->mapped_address +
1259  MAXALIGN(sizeof(dsa_area_control)));
1260  segment_map->pagemap = (dsa_pointer *)
1261  (segment_map->mapped_address +
1262  MAXALIGN(sizeof(dsa_area_control)) +
1263  MAXALIGN(sizeof(FreePageManager)));
1264 
1265  /* Set up the free page map. */
1266  FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
1267  /* There can be 0 usable pages if size is dsa_minimum_size(). */
1268 
1269  if (usable_pages > 0)
1270  FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
1271  usable_pages);
1272 
1273  /* Put this segment into the appropriate bin. */
1274  control->segment_bins[contiguous_pages_to_segment_bin(usable_pages)] = 0;
1275  segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
1276 
1277  return area;
1278 }
1279 
1280 /*
1281  * Workhorse function for dsa_attach and dsa_attach_in_place.
1282  */
1283 static dsa_area *
1284 attach_internal(void *place, dsm_segment *segment, dsa_handle handle)
1285 {
1286  dsa_area_control *control;
1287  dsa_area *area;
1288  dsa_segment_map *segment_map;
1289 
1290  control = (dsa_area_control *) place;
1291  Assert(control->handle == handle);
1292  Assert(control->segment_handles[0] == handle);
1293  Assert(control->segment_header.magic ==
1294  (DSA_SEGMENT_HEADER_MAGIC ^ handle ^ 0));
1295 
1296  /* Build the backend-local area object. */
1297  area = palloc(sizeof(dsa_area));
1298  area->control = control;
1299  area->mapping_pinned = false;
1300  memset(&area->segment_maps[0], 0,
1301  sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1302  area->high_segment_index = 0;
1303 
1304  /* Set up the segment map for this process's mapping. */
1305  segment_map = &area->segment_maps[0];
1306  segment_map->segment = segment; /* NULL for in-place */
1307  segment_map->mapped_address = place;
1308  segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
1309  segment_map->fpm = (FreePageManager *)
1310  (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)));
1311  segment_map->pagemap = (dsa_pointer *)
1312  (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)) +
1313  MAXALIGN(sizeof(FreePageManager)));
1314 
1315  /* Bump the reference count. */
1316  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1317  ++control->refcnt;
1318  LWLockRelease(DSA_AREA_LOCK(area));
1319 
1320  return area;
1321 }
1322 
1323 /*
1324  * Add a new span to fullness class 1 of the indicated pool.
1325  */
1326 static void
1327 init_span(dsa_area *area,
1328  dsa_pointer span_pointer,
1329  dsa_area_pool *pool, dsa_pointer start, Size npages,
1330  uint16 size_class)
1331 {
1332  dsa_area_span *span = dsa_get_address(area, span_pointer);
1333  Size obsize = dsa_size_classes[size_class];
1334 
1335  /*
1336  * The per-pool lock must be held because we manipulate the span list for
1337  * this pool.
1338  */
1339  Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1340 
1341  /* Push this span onto the front of the span list for fullness class 1. */
1342  if (DsaPointerIsValid(pool->spans[1]))
1343  {
1344  dsa_area_span *head = (dsa_area_span *)
1345  dsa_get_address(area, pool->spans[1]);
1346 
1347  head->prevspan = span_pointer;
1348  }
1349  span->pool = DsaAreaPoolToDsaPointer(area, pool);
1350  span->nextspan = pool->spans[1];
1351  span->prevspan = InvalidDsaPointer;
1352  pool->spans[1] = span_pointer;
1353 
1354  span->start = start;
1355  span->npages = npages;
1356  span->size_class = size_class;
1357  span->ninitialized = 0;
1358  if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1359  {
1360  /*
1361  * A block-of-spans contains its own descriptor, so mark one object as
1362  * initialized and reduce the count of allocatable objects by one.
1363  * Doing this here has the side effect of also reducing nmax by one,
1364  * which is important to make sure we free this object at the correct
1365  * time.
1366  */
1367  span->ninitialized = 1;
1368  span->nallocatable = FPM_PAGE_SIZE / obsize - 1;
1369  }
1370  else if (size_class != DSA_SCLASS_SPAN_LARGE)
1371  span->nallocatable = DSA_SUPERBLOCK_SIZE / obsize;
1372  span->firstfree = DSA_SPAN_NOTHING_FREE;
1373  span->nmax = span->nallocatable;
1374  span->fclass = 1;
1375 }
1376 
1377 /*
1378  * Transfer the first span in one fullness class to the head of another
1379  * fullness class.
1380  */
1381 static bool
1382 transfer_first_span(dsa_area *area,
1383  dsa_area_pool *pool, int fromclass, int toclass)
1384 {
1385  dsa_pointer span_pointer;
1386  dsa_area_span *span;
1387  dsa_area_span *nextspan;
1388 
1389  /* Can't do it if source list is empty. */
1390  span_pointer = pool->spans[fromclass];
1391  if (!DsaPointerIsValid(span_pointer))
1392  return false;
1393 
1394  /* Remove span from head of source list. */
1395  span = dsa_get_address(area, span_pointer);
1396  pool->spans[fromclass] = span->nextspan;
1397  if (DsaPointerIsValid(span->nextspan))
1398  {
1399  nextspan = (dsa_area_span *)
1400  dsa_get_address(area, span->nextspan);
1401  nextspan->prevspan = InvalidDsaPointer;
1402  }
1403 
1404  /* Add span to head of target list. */
1405  span->nextspan = pool->spans[toclass];
1406  pool->spans[toclass] = span_pointer;
1407  if (DsaPointerIsValid(span->nextspan))
1408  {
1409  nextspan = (dsa_area_span *)
1410  dsa_get_address(area, span->nextspan);
1411  nextspan->prevspan = span_pointer;
1412  }
1413  span->fclass = toclass;
1414 
1415  return true;
1416 }
1417 
1418 /*
1419  * Allocate one object of the requested size class from the given area.
1420  */
1421 static inline dsa_pointer
1422 alloc_object(dsa_area *area, int size_class)
1423 {
1424  dsa_area_pool *pool = &area->control->pools[size_class];
1425  dsa_area_span *span;
1426  dsa_pointer block;
1427  dsa_pointer result;
1428  char *object;
1429  Size size;
1430 
1431  /*
1432  * Even though ensure_active_superblock can in turn call alloc_object if
1433  * it needs to allocate a new span, that's always from a different pool,
1434  * and the order of lock acquisition is always the same, so it's OK that
1435  * we hold this lock for the duration of this function.
1436  */
1437  Assert(!LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1438  LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1439 
1440  /*
1441  * If there's no active superblock, we must successfully obtain one or
1442  * fail the request.
1443  */
1444  if (!DsaPointerIsValid(pool->spans[1]) &&
1445  !ensure_active_superblock(area, pool, size_class))
1446  {
1447  result = InvalidDsaPointer;
1448  }
1449  else
1450  {
1451  /*
1452  * There should be a block in fullness class 1 at this point, and it
1453  * should never be completely full. Thus we can either pop an object
1454  * from the free list or, failing that, initialize a new object.
1455  */
1456  Assert(DsaPointerIsValid(pool->spans[1]));
1457  span = (dsa_area_span *)
1458  dsa_get_address(area, pool->spans[1]);
1459  Assert(span->nallocatable > 0);
1460  block = span->start;
1461  Assert(size_class < DSA_NUM_SIZE_CLASSES);
1462  size = dsa_size_classes[size_class];
1463  if (span->firstfree != DSA_SPAN_NOTHING_FREE)
1464  {
1465  result = block + span->firstfree * size;
1466  object = dsa_get_address(area, result);
1467  span->firstfree = NextFreeObjectIndex(object);
1468  }
1469  else
1470  {
1471  result = block + span->ninitialized * size;
1472  ++span->ninitialized;
1473  }
1474  --span->nallocatable;
1475 
1476  /* If it's now full, move it to the highest-numbered fullness class. */
1477  if (span->nallocatable == 0)
1478  transfer_first_span(area, pool, 1, DSA_FULLNESS_CLASSES - 1);
1479  }
1480 
1481  Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1482  LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1483 
1484  return result;
1485 }
1486 
1487 /*
1488  * Ensure an active (i.e. fullness class 1) superblock, unless all existing
1489  * superblocks are completely full and no more can be allocated.
1490  *
1491  * Fullness classes K of 0..N are loosely intended to represent blocks whose
1492  * utilization percentage is at least K/N, but we only enforce this rigorously
1493  * for the highest-numbered fullness class, which always contains exactly
1494  * those blocks that are completely full. It's otherwise acceptable for a
1495  * block to be in a higher-numbered fullness class than the one to which it
1496  * logically belongs. In addition, the active block, which is always the
1497  * first block in fullness class 1, is permitted to have a higher allocation
1498  * percentage than would normally be allowable for that fullness class; we
1499  * don't move it until it's completely full, and then it goes to the
1500  * highest-numbered fullness class.
1501  *
1502  * It might seem odd that the active block is the head of fullness class 1
1503  * rather than fullness class 0, but experience with other allocators has
1504  * shown that it's usually better to allocate from a block that's moderately
1505  * full rather than one that's nearly empty. Insofar as is reasonably
1506  * possible, we want to avoid performing new allocations in a block that would
1507  * otherwise become empty soon.
1508  */
1509 static bool
1510 ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
1511  int size_class)
1512 {
1513  dsa_pointer span_pointer;
1514  dsa_pointer start_pointer;
1515  Size obsize = dsa_size_classes[size_class];
1516  Size nmax;
1517  int fclass;
1518  Size npages = 1;
1519  Size first_page;
1520  Size i;
1521  dsa_segment_map *segment_map;
1522 
1523  Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1524 
1525  /*
1526  * Compute the number of objects that will fit in a block of this size
1527  * class. Span-of-spans blocks are just a single page, and the first
1528  * object isn't available for use because it describes the block-of-spans
1529  * itself.
1530  */
1531  if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1532  nmax = FPM_PAGE_SIZE / obsize - 1;
1533  else
1534  nmax = DSA_SUPERBLOCK_SIZE / obsize;
1535 
1536  /*
1537  * If fullness class 1 is empty, try to find a span to put in it by
1538  * scanning higher-numbered fullness classes (excluding the last one,
1539  * whose blocks are certain to all be completely full).
1540  */
1541  for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1542  {
1543  span_pointer = pool->spans[fclass];
1544 
1545  while (DsaPointerIsValid(span_pointer))
1546  {
1547  int tfclass;
1548  dsa_area_span *span;
1549  dsa_area_span *nextspan;
1550  dsa_area_span *prevspan;
1551  dsa_pointer next_span_pointer;
1552 
1553  span = (dsa_area_span *)
1554  dsa_get_address(area, span_pointer);
1555  next_span_pointer = span->nextspan;
1556 
1557  /* Figure out what fullness class should contain this span. */
1558  tfclass = (nmax - span->nallocatable)
1559  * (DSA_FULLNESS_CLASSES - 1) / nmax;
1560 
1561  /* Look up next span. */
1562  if (DsaPointerIsValid(span->nextspan))
1563  nextspan = (dsa_area_span *)
1564  dsa_get_address(area, span->nextspan);
1565  else
1566  nextspan = NULL;
1567 
1568  /*
1569  * If utilization has dropped enough that this now belongs in some
1570  * other fullness class, move it there.
1571  */
1572  if (tfclass < fclass)
1573  {
1574  /* Remove from the current fullness class list. */
1575  if (pool->spans[fclass] == span_pointer)
1576  {
1577  /* It was the head; remove it. */
1578  Assert(!DsaPointerIsValid(span->prevspan));
1579  pool->spans[fclass] = span->nextspan;
1580  if (nextspan != NULL)
1581  nextspan->prevspan = InvalidDsaPointer;
1582  }
1583  else
1584  {
1585  /* It was not the head. */
1586  Assert(DsaPointerIsValid(span->prevspan));
1587  prevspan = (dsa_area_span *)
1588  dsa_get_address(area, span->prevspan);
1589  prevspan->nextspan = span->nextspan;
1590  }
1591  if (nextspan != NULL)
1592  nextspan->prevspan = span->prevspan;
1593 
1594  /* Push onto the head of the new fullness class list. */
1595  span->nextspan = pool->spans[tfclass];
1596  pool->spans[tfclass] = span_pointer;
1597  span->prevspan = InvalidDsaPointer;
1598  if (DsaPointerIsValid(span->nextspan))
1599  {
1600  nextspan = (dsa_area_span *)
1601  dsa_get_address(area, span->nextspan);
1602  nextspan->prevspan = span_pointer;
1603  }
1604  span->fclass = tfclass;
1605  }
1606 
1607  /* Advance to next span on list. */
1608  span_pointer = next_span_pointer;
1609  }
1610 
1611  /* Stop now if we found a suitable block. */
1612  if (DsaPointerIsValid(pool->spans[1]))
1613  return true;
1614  }
1615 
1616  /*
1617  * If there are no blocks that properly belong in fullness class 1, pick
1618  * one from some other fullness class and move it there anyway, so that we
1619  * have an allocation target. Our last choice is to transfer a block
1620  * that's almost empty (and might become completely empty soon if left
1621  * alone), but even that is better than failing, which is what we must do
1622  * if there are no blocks at all with freespace.
1623  */
1624  Assert(!DsaPointerIsValid(pool->spans[1]));
1625  for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1626  if (transfer_first_span(area, pool, fclass, 1))
1627  return true;
1628  if (!DsaPointerIsValid(pool->spans[1]) &&
1629  transfer_first_span(area, pool, 0, 1))
1630  return true;
1631 
1632  /*
1633  * We failed to find an existing span with free objects, so we need to
1634  * allocate a new superblock and construct a new span to manage it.
1635  *
 1636  * First, get a dsa_area_span object to describe the new superblock
1637  * ... unless this allocation is for a dsa_area_span object, in which case
1638  * that's surely not going to work. We handle that case by storing the
1639  * span describing a block-of-spans inline.
1640  */
1641  if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1642  {
1643  span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
1644  if (!DsaPointerIsValid(span_pointer))
1645  return false;
1646  npages = DSA_PAGES_PER_SUPERBLOCK;
1647  }
1648 
 1649  /* Find or create a segment and allocate the superblock. */
 1650  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
 1651  segment_map = get_best_segment(area, npages);
1652  if (segment_map == NULL)
1653  {
1654  segment_map = make_new_segment(area, npages);
1655  if (segment_map == NULL)
 1656  {
 1657  LWLockRelease(DSA_AREA_LOCK(area));
 1658  return false;
1659  }
1660  }
1661  if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
 1662  {
 1663  LWLockRelease(DSA_AREA_LOCK(area));
 1664  if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1665  dsa_free(area, span_pointer);
1666  return false;
 1667  }
 1668  LWLockRelease(DSA_AREA_LOCK(area));
 1669 
1670  /* Compute the start of the superblock. */
1671  start_pointer =
1672  DSA_MAKE_POINTER(get_segment_index(area, segment_map),
1673  first_page * FPM_PAGE_SIZE);
1674 
1675  /*
1676  * If this is a block-of-spans, carve the descriptor right out of the
1677  * allocated space.
1678  */
1679  if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1680  {
1681  /*
1682  * We have a pointer into the segment. We need to build a dsa_pointer
1683  * from the segment index and offset into the segment.
1684  */
1685  span_pointer = start_pointer;
1686  }
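 /*
  * Either way, span_pointer now encodes a (segment index, byte offset)
  * pair, the packing produced by DSA_MAKE_POINTER above;
  * DSA_EXTRACT_SEGMENT_NUMBER and DSA_EXTRACT_OFFSET recover the two
  * halves when the pointer is later resolved or freed.
  */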
1687 
1688  /* Initialize span and pagemap. */
1689  init_span(area, span_pointer, pool, start_pointer, npages, size_class);
1690  for (i = 0; i < npages; ++i)
1691  segment_map->pagemap[first_page + i] = span_pointer;
1692 
1693  return true;
1694 }
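 /*
  * For context, the caller's fast path looks roughly like this (a sketch
  * of alloc_object, not the literal code): take DSA_SCLASS_LOCK for the
  * size class, and if pool->spans[1] is invalid call
  * ensure_active_superblock before popping the next free object off the
  * active span.
  */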
1695 
1696 /*
1697  * Return the segment map corresponding to a given segment index, mapping the
1698  * segment in if necessary. For internal segment book-keeping, this is called
1699  * with the area lock held. It is also called by dsa_free and dsa_get_address
 1700  * without any locking, relying on the fact that they have a known live segment
 1701  * index and that they always call check_for_freed_segments to ensure that any
1702  * freed segment occupying the same slot is detached first.
1703  */
 1704 static dsa_segment_map *
 1705 get_segment_by_index(dsa_area *area, dsa_segment_index index)
 1706 {
1707  if (unlikely(area->segment_maps[index].mapped_address == NULL))
1708  {
1709  dsm_handle handle;
1710  dsm_segment *segment;
1711  dsa_segment_map *segment_map;
1712 
1713  /*
1714  * If we are reached by dsa_free or dsa_get_address, there must be at
1715  * least one object allocated in the referenced segment. Otherwise,
1716  * their caller has a double-free or access-after-free bug, which we
1717  * have no hope of detecting. So we know it's safe to access this
1718  * array slot without holding a lock; it won't change underneath us.
1719  * Furthermore, we know that we can see the latest contents of the
1720  * slot, as explained in check_for_freed_segments, which those
1721  * functions call before arriving here.
1722  */
1723  handle = area->control->segment_handles[index];
1724 
1725  /* It's an error to try to access an unused slot. */
1726  if (handle == DSM_HANDLE_INVALID)
1727  elog(ERROR,
1728  "dsa_area could not attach to a segment that has been freed");
1729 
1730  segment = dsm_attach(handle);
1731  if (segment == NULL)
1732  elog(ERROR, "dsa_area could not attach to segment");
1733  if (area->mapping_pinned)
1734  dsm_pin_mapping(segment);
1735  segment_map = &area->segment_maps[index];
1736  segment_map->segment = segment;
1737  segment_map->mapped_address = dsm_segment_address(segment);
1738  segment_map->header =
1739  (dsa_segment_header *) segment_map->mapped_address;
1740  segment_map->fpm = (FreePageManager *)
1741  (segment_map->mapped_address +
1742  MAXALIGN(sizeof(dsa_segment_header)));
1743  segment_map->pagemap = (dsa_pointer *)
1744  (segment_map->mapped_address +
1745  MAXALIGN(sizeof(dsa_segment_header)) +
1746  MAXALIGN(sizeof(FreePageManager)));
1747 
1748  /* Remember the highest index this backend has ever mapped. */
1749  if (area->high_segment_index < index)
1750  area->high_segment_index = index;
1751 
1752  Assert(segment_map->header->magic ==
1753  (DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ index));
1754  }
1755 
1756  return &area->segment_maps[index];
1757 }
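 /*
  * A minimal sketch of how dsa_get_address uses this function to resolve
  * a valid dsa_pointer dp:
  *
  *     check_for_freed_segments(area);
  *     index = DSA_EXTRACT_SEGMENT_NUMBER(dp);
  *     if (unlikely(area->segment_maps[index].mapped_address == NULL))
  *         get_segment_by_index(area, index);
  *     return area->segment_maps[index].mapped_address +
  *         DSA_EXTRACT_OFFSET(dp);
  */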
1758 
1759 /*
1760  * Return a superblock to the free page manager. If the underlying segment
1761  * has become entirely free, then return it to the operating system.
1762  *
1763  * The appropriate pool lock must be held.
1764  */
 1765 static void
 1766 destroy_superblock(dsa_area *area, dsa_pointer span_pointer)
 1767 {
1768  dsa_area_span *span = dsa_get_address(area, span_pointer);
1769  int size_class = span->size_class;
1770  dsa_segment_map *segment_map;
1771 
 1772  segment_map =
 1773  get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start));
 1774 
1775  /* Remove it from its fullness class list. */
1776  unlink_span(area, span);
1777 
1778  /*
1779  * Note: Here we acquire the area lock while we already hold a per-pool
1780  * lock. We never hold the area lock and then take a pool lock, or we
1781  * could deadlock.
 1782  */
 1783  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
 1784  FreePageManagerPut(segment_map->fpm,
 1785  DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
 1786  span->npages);
1787  /* Check if the segment is now entirely free. */
1788  if (fpm_largest(segment_map->fpm) == segment_map->header->usable_pages)
1789  {
1790  dsa_segment_index index = get_segment_index(area, segment_map);
1791 
1792  /* If it's not the segment with extra control data, free it. */
1793  if (index != 0)
1794  {
1795  /*
1796  * Give it back to the OS, and allow other backends to detect that
1797  * they need to detach.
1798  */
1799  unlink_segment(area, segment_map);
 1800  segment_map->header->freed = true;
 1801  Assert(area->control->total_segment_size >=
 1802  segment_map->header->size);
1803  area->control->total_segment_size -=
 1804  segment_map->header->size;
 1805  dsm_unpin_segment(dsm_segment_handle(segment_map->segment));
 1806  dsm_detach(segment_map->segment);
1807  area->control->segment_handles[index] = DSM_HANDLE_INVALID;
1808  ++area->control->freed_segment_counter;
1809  segment_map->segment = NULL;
1810  segment_map->header = NULL;
1811  segment_map->mapped_address = NULL;
1812  }
 1813  }
 1814  LWLockRelease(DSA_AREA_LOCK(area));
 1815 
1816  /*
1817  * Span-of-spans blocks store the span which describes them within the
1818  * block itself, so freeing the storage implicitly frees the descriptor
1819  * also. If this is a block of any other type, we need to separately free
1820  * the span object also. This recursive call to dsa_free will acquire the
1821  * span pool's lock. We can't deadlock because the acquisition order is
1822  * always some other pool and then the span pool.
1823  */
1824  if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1825  dsa_free(area, span_pointer);
1826 }
1827 
 1828 static void
 1829 unlink_span(dsa_area *area, dsa_area_span *span)
 1830 {
1831  if (DsaPointerIsValid(span->nextspan))
1832  {
1833  dsa_area_span *next = dsa_get_address(area, span->nextspan);
1834 
1835  next->prevspan = span->prevspan;
1836  }
1837  if (DsaPointerIsValid(span->prevspan))
1838  {
1839  dsa_area_span *prev = dsa_get_address(area, span->prevspan);
1840 
1841  prev->nextspan = span->nextspan;
1842  }
1843  else
1844  {
1845  dsa_area_pool *pool = dsa_get_address(area, span->pool);
1846 
1847  pool->spans[span->fclass] = span->nextspan;
1848  }
1849 }
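 /*
  * Note the asymmetry above: each fullness class is a doubly-linked list
  * whose head pointer lives in the pool, so unlinking a span with no
  * prevspan means updating pool->spans[span->fclass] rather than a
  * neighbouring span.
  */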
1850 
 1851 static void
 1852 add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
 1853  dsa_pointer span_pointer,
1854  int fclass)
1855 {
1856  dsa_area_pool *pool = dsa_get_address(area, span->pool);
1857 
1858  if (DsaPointerIsValid(pool->spans[fclass]))
1859  {
1860  dsa_area_span *head = dsa_get_address(area,
1861  pool->spans[fclass]);
1862 
1863  head->prevspan = span_pointer;
1864  }
1865  span->prevspan = InvalidDsaPointer;
1866  span->nextspan = pool->spans[fclass];
1867  pool->spans[fclass] = span_pointer;
1868  span->fclass = fclass;
1869 }
1870 
1871 /*
1872  * Detach from an area that was either created or attached to by this process.
1873  */
 1874 void
 1875 dsa_detach(dsa_area *area)
 1876 {
1877  int i;
1878 
1879  /* Detach from all segments. */
1880  for (i = 0; i <= area->high_segment_index; ++i)
1881  if (area->segment_maps[i].segment != NULL)
1882  dsm_detach(area->segment_maps[i].segment);
1883 
1884  /*
1885  * Note that 'detaching' (= detaching from DSM segments) doesn't include
1886  * 'releasing' (= adjusting the reference count). It would be nice to
1887  * combine these operations, but client code might never get around to
1888  * calling dsa_detach because of an error path, and a detach hook on any
1889  * particular segment is too late to detach other segments in the area
1890  * without risking a 'leak' warning in the non-error path.
1891  */
1892 
1893  /* Free the backend-local area object. */
1894  pfree(area);
1895 }
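 /*
  * Typical lifecycle, for illustration (my_tranche_id is a placeholder):
  *
  *     dsa_area   *area = dsa_create(my_tranche_id);
  *     dsa_pointer dp = dsa_allocate(area, 42);
  *     char       *p = dsa_get_address(area, dp);
  *     ...
  *     dsa_free(area, dp);
  *     dsa_detach(area);
  *
  * Another backend would attach with dsa_attach(dsa_get_handle(area)),
  * resolve the same dp with its own dsa_get_address call, and likewise
  * finish with dsa_detach.
  */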
1896 
1897 /*
1898  * Unlink a segment from the bin that contains it.
1899  */
 1900 static void
 1901 unlink_segment(dsa_area *area, dsa_segment_map *segment_map)
 1902 {
1903  if (segment_map->header->prev != DSA_SEGMENT_INDEX_NONE)
1904  {
1905  dsa_segment_map *prev;
1906 
1907  prev = get_segment_by_index(area, segment_map->header->prev);
1908  prev->header->next = segment_map->header->next;
1909  }
1910  else
1911  {
1912  Assert(area->control->segment_bins[segment_map->header->bin] ==
1913  get_segment_index(area, segment_map));
1914  area->control->segment_bins[segment_map->header->bin] =
1915  segment_map->header->next;
1916  }
1917  if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
 1918  {
 1919  dsa_segment_map *next;
 1920 
1921  next = get_segment_by_index(area, segment_map->header->next);
1922  next->header->prev = segment_map->header->prev;
1923  }
1924 }
1925 
1926 /*
1927  * Find a segment that could satisfy a request for 'npages' of contiguous
1928  * memory, or return NULL if none can be found. This may involve attaching to
1929  * segments that weren't previously attached so that we can query their free
1930  * pages map.
1931  */
 1932 static dsa_segment_map *
 1933 get_best_segment(dsa_area *area, Size npages)
 1934 {
 1935  Size bin;
 1936 
 1937  Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
 1938 
1938 
1939  /*
1940  * Start searching from the first bin that *might* have enough contiguous
1941  * pages.
1942  */
1943  for (bin = contiguous_pages_to_segment_bin(npages);
1944  bin < DSA_NUM_SEGMENT_BINS;
1945  ++bin)
1946  {
1947  /*
1948  * The minimum contiguous size that any segment in this bin should
1949  * have. We'll re-bin if we see segments with fewer.
1950  */
1951  Size threshold = (Size) 1 << (bin - 1);
1952  dsa_segment_index segment_index;
1953 
1954  /* Search this bin for a segment with enough contiguous space. */
1955  segment_index = area->control->segment_bins[bin];
1956  while (segment_index != DSA_SEGMENT_INDEX_NONE)
1957  {
1958  dsa_segment_map *segment_map;
1959  dsa_segment_index next_segment_index;
1960  Size contiguous_pages;
1961 
1962  segment_map = get_segment_by_index(area, segment_index);
1963  next_segment_index = segment_map->header->next;
1964  contiguous_pages = fpm_largest(segment_map->fpm);
1965 
1966  /* Not enough for the request, still enough for this bin. */
1967  if (contiguous_pages >= threshold && contiguous_pages < npages)
1968  {
1969  segment_index = next_segment_index;
1970  continue;
1971  }
1972 
1973  /* Re-bin it if it's no longer in the appropriate bin. */
1974  if (contiguous_pages < threshold)
1975  {
1976  Size new_bin;
1977 
1978  new_bin = contiguous_pages_to_segment_bin(contiguous_pages);
1979 
1980  /* Remove it from its current bin. */
1981  unlink_segment(area, segment_map);
1982 
1983  /* Push it onto the front of its new bin. */
1984  segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
1985  segment_map->header->next =
1986  area->control->segment_bins[new_bin];
1987  segment_map->header->bin = new_bin;
1988  area->control->segment_bins[new_bin] = segment_index;
1989  if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
 1990  {
 1991  dsa_segment_map *next;
 1992 
1993  next = get_segment_by_index(area,
1994  segment_map->header->next);
1995  Assert(next->header->bin == new_bin);
1996  next->header->prev = segment_index;
1997  }
1998 
1999  /*
2000  * But fall through to see if it's enough to satisfy this
2001  * request anyway....
2002  */
2003  }
2004 
2005  /* Check if we are done. */
2006  if (contiguous_pages >= npages)
2007  return segment_map;
2008 
2009  /* Continue searching the same bin. */
2010  segment_index = next_segment_index;
2011  }
2012  }
2013 
2014  /* Not found. */
2015  return NULL;
2016 }
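 /*
  * Binning example: with the power-of-two thresholds above, a segment
  * whose largest free run is 300 pages sits in the bin covering
  * [256, 512); a request for 300 pages therefore starts scanning at that
  * bin. If the scan finds a segment there whose largest run has since
  * shrunk to 200 pages, it is pushed down to the [128, 256) bin as a side
  * effect of the search before the scan moves on.
  */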
2017 
2018 /*
2019  * Create a new segment that can handle at least requested_pages. Returns
2020  * NULL if the requested total size limit or maximum allowed number of
2021  * segments would be exceeded.
2022  */
2023 static dsa_segment_map *
2024 make_new_segment(dsa_area *area, Size requested_pages)
2025 {
2026  dsa_segment_index new_index;
2027  Size metadata_bytes;
2028  Size total_size;
2029  Size total_pages;
2030  Size usable_pages;
2031  dsa_segment_map *segment_map;
2032  dsm_segment *segment;
 2033 
 2034  Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
 2035 
2036  /* Find a segment slot that is not in use (linearly for now). */
2037  for (new_index = 1; new_index < DSA_MAX_SEGMENTS; ++new_index)
2038  {
2039  if (area->control->segment_handles[new_index] == DSM_HANDLE_INVALID)
2040  break;
2041  }
2042  if (new_index == DSA_MAX_SEGMENTS)
2043  return NULL;
2044 
2045  /*
2046  * If the total size limit is already exceeded, then we exit early and
2047  * avoid arithmetic wraparound in the unsigned expressions below.
2048  */
 2049  if (area->control->total_segment_size >=
 2050  area->control->max_total_segment_size)
 2051  return NULL;
2052 
2053  /*
2054  * The size should be at least as big as requested, and at least big
2055  * enough to follow a geometric series that approximately doubles the
2056  * total storage each time we create a new segment. We use geometric
2057  * growth because the underlying DSM system isn't designed for large
2058  * numbers of segments (otherwise we might even consider just using one
2059  * DSM segment for each large allocation and for each superblock, and then
2060  * we wouldn't need to use FreePageManager).
2061  *
2062  * We decide on a total segment size first, so that we produce tidy
2063  * power-of-two sized segments. This is a good property to have if we
2064  * move to huge pages in the future. Then we work back to the number of
2065  * pages we can fit.
2066  */
2067  total_size = DSA_INITIAL_SEGMENT_SIZE *
2068  ((Size) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE));
2069  total_size = Min(total_size, DSA_MAX_SEGMENT_SIZE);
 2070  total_size = Min(total_size,
 2071  area->control->max_total_segment_size -
 2072  area->control->total_segment_size);
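 /*
  * For example, if DSA_INITIAL_SEGMENT_SIZE is 1MB and
  * DSA_NUM_SEGMENTS_AT_EACH_SIZE is 4, slots 1-3 get 1MB segments, slots
  * 4-7 get 2MB, slots 8-11 get 4MB, and so on, capped by
  * DSA_MAX_SEGMENT_SIZE and by the area's remaining size budget; total
  * space thus roughly doubles every four segments created.
  */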
2073 
2074  total_pages = total_size / FPM_PAGE_SIZE;
2075  metadata_bytes =
2076  MAXALIGN(sizeof(dsa_segment_header)) +
2077  MAXALIGN(sizeof(FreePageManager)) +
2078  sizeof(dsa_pointer) * total_pages;
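 /*
  * Concretely: a 1MB segment with 4KB pages has total_pages = 256, so the
  * page map contributes 256 * sizeof(dsa_pointer) bytes on top of the two
  * MAXALIGN'd headers; the padding below then rounds the metadata up to a
  * whole number of pages, and everything after it becomes usable_pages.
  */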
2079 
2080  /* Add padding up to next page boundary. */
2081  if (metadata_bytes % FPM_PAGE_SIZE != 0)
2082  metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2083  if (total_size <= metadata_bytes)
2084  return NULL;
2085  usable_pages = (total_size - metadata_bytes) / FPM_PAGE_SIZE;
2086  Assert(metadata_bytes + usable_pages * FPM_PAGE_SIZE <= total_size);
2087 
2088  /* See if that is enough... */
2089  if (requested_pages > usable_pages)
2090  {
2091  /*
2092  * We'll make an odd-sized segment, working forward from the requested
2093  * number of pages.
2094  */
2095  usable_pages = requested_pages;
2096  metadata_bytes =
2097  MAXALIGN(sizeof(dsa_segment_header)) +
2098  MAXALIGN(sizeof(FreePageManager)) +
2099  usable_pages * sizeof(dsa_pointer);
2100 
2101  /* Add padding up to next page boundary. */
2102  if (metadata_bytes % FPM_PAGE_SIZE != 0)
2103  metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2104  total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE;
2105 
2106  /* Is that too large for dsa_pointer's addressing scheme? */
2107  if (total_size > DSA_MAX_SEGMENT_SIZE)
2108  return NULL;
2109 
2110  /* Would that exceed the limit? */
2111  if (total_size > area->control->max_total_segment_size -
2112  area->control->total_segment_size)
2113  return NULL;
2114  }
2115 
2116  /* Create the segment. */
2117  segment = dsm_create(total_size, 0);
2118  if (segment == NULL)
2119  return NULL;
2120  dsm_pin_segment(segment);
2121  if (area->mapping_pinned)
2122  dsm_pin_mapping(segment);
2123 
2124  /* Store the handle in shared memory to be found by index. */
2125  area->control->segment_handles[new_index] =
2126  dsm_segment_handle(segment);
2127  /* Track the highest segment index in the history of the area. */
2128  if (area->control->high_segment_index < new_index)
2129  area->control->high_segment_index = new_index;
2130  /* Track the highest segment index this backend has ever mapped. */
2131  if (area->high_segment_index < new_index)
2132  area->high_segment_index = new_index;
2133  /* Track total size of all segments. */
 2134  area->control->total_segment_size += total_size;
 2135  Assert(area->control->total_segment_size <=
 2136  area->control->max_total_segment_size);
 2137 
2138  /* Build a segment map for this segment in this backend. */
2139  segment_map = &area->segment_maps[new_index];
2140  segment_map->segment = segment;
2141  segment_map->mapped_address = dsm_segment_address(segment);
2142  segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
2143  segment_map->fpm = (FreePageManager *)
2144  (segment_map->mapped_address +
2145  MAXALIGN(sizeof(dsa_segment_header)));
2146  segment_map->pagemap = (dsa_pointer *)
2147  (segment_map->mapped_address +
2148  MAXALIGN(sizeof(dsa_segment_header)) +
2149  MAXALIGN(sizeof(FreePageManager)));
2150 
2151  /* Set up the free page map. */
2152  FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
2153  FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
2154  usable_pages);
2155 
2156  /* Set up the segment header and put it in the appropriate bin. */
2157  segment_map->header->magic =
2158  DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ new_index;
2159  segment_map->header->usable_pages = usable_pages;
2160  segment_map->header->size = total_size;
2161  segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
2162  segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2163  segment_map->header->next =
2164  area->control->segment_bins[segment_map->header->bin];
2165  segment_map->header->freed = false;
2166  area->control->segment_bins[segment_map->header->bin] = new_index;
2167  if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
 2168  {
 2169  dsa_segment_map *next =
 2170  get_segment_by_index(area, segment_map->header->next);
2171 
2172  Assert(next->header->bin == segment_map->header->bin);
2173  next->header->prev = new_index;
2174  }
2175 
2176  return segment_map;
2177 }
2178 
2179 /*
2180  * Check if any segments have been freed by destroy_superblock, so we can
2181  * detach from them in this backend. This function is called by
2182  * dsa_get_address and dsa_free to make sure that a dsa_pointer they have
2183  * received can be resolved to the correct segment.
2184  *
2185  * The danger we want to defend against is that there could be an old segment
2186  * mapped into a given slot in this backend, and the dsa_pointer they have
2187  * might refer to some new segment in the same slot. So those functions must
2188  * be sure to process all instructions to detach from a freed segment that had
2189  * been generated by the time this process received the dsa_pointer, before
2190  * they call get_segment_by_index.
2191  */
 2192 static void
 2193 check_for_freed_segments(dsa_area *area)
 2194 {
2195  Size freed_segment_counter;
2196 
2197  /*
2198  * Any other process that has freed a segment has incremented
 2199  * freed_segment_counter while holding an LWLock, and that must precede any
2200  * backend creating a new segment in the same slot while holding an
2201  * LWLock, and that must precede the creation of any dsa_pointer pointing
2202  * into the new segment which might reach us here, and the caller must
2203  * have sent the dsa_pointer to this process using appropriate memory
2204  * synchronization (some kind of locking or atomic primitive or system
2205  * call). So all we need to do on the reading side is ask for the load of
2206  * freed_segment_counter to follow the caller's load of the dsa_pointer it
2207  * has, and we can be sure to detect any segments that had been freed as
2208  * of the time that the dsa_pointer reached this process.
2209  */
2210  pg_read_barrier();
2211  freed_segment_counter = area->control->freed_segment_counter;
2212  if (unlikely(area->freed_segment_counter != freed_segment_counter))
2213  {
2214  int i;
2215 
 2216  /* Check all currently mapped segments to find what's been freed. */
 2217  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
 2218  for (i = 0; i <= area->high_segment_index; ++i)
2219  {
2220  if (area->segment_maps[i].header != NULL &&
2221  area->segment_maps[i].header->freed)
2222  {
2223  dsm_detach(area->segment_maps[i].segment);
2224  area->segment_maps[i].segment = NULL;
2225  area->segment_maps[i].header = NULL;
2226  area->segment_maps[i].mapped_address = NULL;
2227  }
 2228  }
 2229  LWLockRelease(DSA_AREA_LOCK(area));
 2230  area->freed_segment_counter = freed_segment_counter;
2231  }
2232 }
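 /*
  * In shorthand, the happens-before chain defended against above is:
  * increment of freed_segment_counter (under an LWLock) -> reuse of the
  * slot for a new segment (under an LWLock) -> creation and transmission
  * of a dsa_pointer into that segment. Reading the counter after
  * pg_read_barrier() therefore cannot miss a free that preceded any
  * dsa_pointer this backend has legitimately received.
  */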