/*-------------------------------------------------------------------------
 *
 * dsa.c
 *	  Dynamic shared memory areas.
 *
 * This module provides dynamic shared memory areas which are built on top of
 * DSM segments. While dsm.c allows segments of shared memory to be created
 * and shared between backends, it isn't designed to deal with small objects.
 * A DSA area is a shared memory heap usually backed by one or more DSM
 * segments which can allocate memory using dsa_allocate() and dsa_free().
 * Alternatively, it can be created in pre-existing shared memory, including a
 * DSM segment, and then create extra DSM segments as required. Unlike the
 * regular system heap, it deals in pseudo-pointers which must be converted to
 * backend-local pointers before they are dereferenced. These pseudo-pointers
 * can however be shared with other backends, and can be used to construct
 * shared data structures.
 *
 * Each DSA area manages a set of DSM segments, adding new segments as
 * required and detaching them when they are no longer needed. Each segment
 * contains a number of 4KB pages, a free page manager for tracking
 * consecutive runs of free pages, and a page map for tracking the source of
 * objects allocated on each page. Allocation requests above 8KB are handled
 * by choosing a segment and finding consecutive free pages in its free page
 * manager. Allocation requests for smaller sizes are handled using pools of
 * objects of a selection of sizes. Each pool consists of a number of 16 page
 * (64KB) superblocks allocated in the same way as large objects. Allocation
 * of large objects and new superblocks is serialized by a single LWLock, but
 * allocation of small objects from pre-existing superblocks uses one LWLock
 * per pool. Currently there is one pool, and therefore one lock, per size
 * class. Per-core pools to increase concurrency and strategies for reducing
 * the resulting fragmentation are areas for future research. Each superblock
 * is managed with a 'span', which tracks the superblock's freelist. Free
 * requests are handled by looking in the page map to find which span an
 * address was allocated from, so that small objects can be returned to the
 * appropriate free list, and large object pages can be returned directly to
 * the free page map. When allocating, simple heuristics for selecting
 * segments and superblocks try to encourage occupied memory to be
 * concentrated, increasing the likelihood that whole superblocks can become
 * empty and be returned to the free page manager, and whole segments can
 * become empty and be returned to the operating system.
 *
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/utils/mmgr/dsa.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "port/atomics.h"
#include "storage/dsm.h"
#include "storage/ipc.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "utils/dsa.h"
#include "utils/freepage.h"
#include "utils/memutils.h"

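/*
 * Editorial usage sketch (not part of the original file): a minimal
 * round-trip through the public API, assuming the caller obtained
 * "tranche_id" from LWLockNewTrancheId(). Guarded by a hypothetical
 * DSA_USAGE_EXAMPLE symbol so it is normally compiled out.
 */
#ifdef DSA_USAGE_EXAMPLE
static void
dsa_usage_example(int tranche_id)
{
    dsa_area   *area = dsa_create(tranche_id);
    dsa_pointer dp;
    char       *p;

    dp = dsa_allocate(area, 64);                /* shareable pseudo-pointer */
    p = (char *) dsa_get_address(area, dp);     /* backend-local pointer */
    memset(p, 0, 64);
    dsa_free(area, dp);
    dsa_detach(area);
}
#endif
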
/*
 * The size of the initial DSM segment that backs a dsa_area created by
 * dsa_create. After creating some number of segments of this size we'll
 * double this size, and so on. Larger segments may be created if necessary
 * to satisfy large requests.
 */
#define DSA_INITIAL_SEGMENT_SIZE ((Size) (1 * 1024 * 1024))

/*
 * How many segments to create before we double the segment size. If this is
 * low, then there is likely to be a lot of wasted space in the largest
 * segment. If it is high, then we risk running out of segment slots (see
 * dsm.c's limits on total number of segments), or limiting the total size
 * an area can manage when using small pointers.
 */
#define DSA_NUM_SEGMENTS_AT_EACH_SIZE 4

/*
 * The number of bits used to represent the offset part of a dsa_pointer.
 * This controls the maximum size of a segment, the maximum possible
 * allocation size and also the maximum number of segments per area.
 */
#if SIZEOF_DSA_POINTER == 4
#define DSA_OFFSET_WIDTH 27		/* 32 segments of size up to 128MB */
#else
#define DSA_OFFSET_WIDTH 40		/* 1024 segments of size up to 1TB */
#endif

/*
 * The maximum number of DSM segments that an area can own, determined by
 * the number of bits remaining (but capped at 1024).
 */
#define DSA_MAX_SEGMENTS \
    Min(1024, (1 << ((SIZEOF_DSA_POINTER * 8) - DSA_OFFSET_WIDTH)))

/* The bitmask for extracting the offset from a dsa_pointer. */
#define DSA_OFFSET_BITMASK (((dsa_pointer) 1 << DSA_OFFSET_WIDTH) - 1)

/* The maximum size of a DSM segment. */
#define DSA_MAX_SEGMENT_SIZE ((Size) 1 << DSA_OFFSET_WIDTH)

/* Number of pages (see FPM_PAGE_SIZE) per regular superblock. */
#define DSA_PAGES_PER_SUPERBLOCK 16

/*
 * A magic number used as a sanity check for following DSM segments belonging
 * to a DSA area (this number will be XORed with the area handle and
 * the segment index).
 */
#define DSA_SEGMENT_HEADER_MAGIC 0x0ce26608

/* Build a dsa_pointer given a segment number and offset. */
#define DSA_MAKE_POINTER(segment_number, offset) \
    (((dsa_pointer) (segment_number) << DSA_OFFSET_WIDTH) | (offset))

/* Extract the segment number from a dsa_pointer. */
#define DSA_EXTRACT_SEGMENT_NUMBER(dp) ((dp) >> DSA_OFFSET_WIDTH)

/* Extract the offset from a dsa_pointer. */
#define DSA_EXTRACT_OFFSET(dp) ((dp) & DSA_OFFSET_BITMASK)

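/*
 * Editorial example: with the 64-bit layout above (DSA_OFFSET_WIDTH = 40),
 * DSA_MAKE_POINTER(3, 0x2000) yields ((dsa_pointer) 3 << 40) | 0x2000, from
 * which DSA_EXTRACT_SEGMENT_NUMBER() recovers 3 and DSA_EXTRACT_OFFSET()
 * recovers 0x2000.
 */
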
/* The type used for segment indexes (zero based). */
typedef Size dsa_segment_index;

/* Sentinel value for dsa_segment_index indicating 'none' or 'end'. */
#define DSA_SEGMENT_INDEX_NONE (~(dsa_segment_index)0)

/*
 * How many bins of segments do we have? The bins are used to categorize
 * segments by their largest contiguous run of free pages.
 */
#define DSA_NUM_SEGMENT_BINS 16

/*
 * What is the lowest bin that holds segments that *might* have n contiguous
 * free pages? There is no point in looking in segments in lower bins; they
 * definitely can't service a request for n free pages.
 */
#define contiguous_pages_to_segment_bin(n) Min(fls(n), DSA_NUM_SEGMENT_BINS - 1)
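
/*
 * Editorial example: fls() returns the 1-based position of the most
 * significant set bit, so a segment whose largest free run is 5 pages
 * (fls(5) == 3) lives in bin 3, which covers runs of 4-7 pages; a request
 * for 5 contiguous pages need only search bins 3 and above.
 */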

/* Macros for access to locks. */
#define DSA_AREA_LOCK(area) (&area->control->lock)
#define DSA_SCLASS_LOCK(area, sclass) (&area->control->pools[sclass].lock)

/*
 * The header for an individual segment. This lives at the start of each DSM
 * segment owned by a DSA area including the first segment (where it appears
 * as part of the dsa_area_control struct).
 */
typedef struct
{
    /* Sanity check magic value. */
    uint32      magic;
    /* Total number of pages in this segment (excluding metadata area). */
    Size        usable_pages;
    /* Total size of this segment in bytes. */
    Size        size;

    /*
     * Index of the segment that precedes this one in the same segment bin, or
     * DSA_SEGMENT_INDEX_NONE if this is the first one.
     */
    dsa_segment_index prev;

    /*
     * Index of the segment that follows this one in the same segment bin, or
     * DSA_SEGMENT_INDEX_NONE if this is the last one.
     */
    dsa_segment_index next;
    /* The index of the bin that contains this segment. */
    Size        bin;

    /*
     * A flag raised to indicate that this segment is being returned to the
     * operating system and has been unpinned.
     */
    bool        freed;
} dsa_segment_header;

/*
 * Metadata for one superblock.
 *
 * For most blocks, span objects are stored out-of-line; that is, the span
 * object is not stored within the block itself. But, as an exception, for a
 * "span of spans", the span object is stored "inline". The allocation is
 * always exactly one page, and the dsa_area_span object is located at
 * the beginning of that page. The size class is DSA_SCLASS_BLOCK_OF_SPANS,
 * and the remaining fields are used just as they would be in an ordinary
 * block. We can't allocate spans out of ordinary superblocks because
 * creating an ordinary superblock requires us to be able to allocate a span
 * *first*. Doing it this way avoids that circularity.
 */
typedef struct
{
    dsa_pointer pool;           /* Containing pool. */
    dsa_pointer prevspan;       /* Previous span. */
    dsa_pointer nextspan;       /* Next span. */
    dsa_pointer start;          /* Starting address. */
    Size        npages;         /* Length of span in pages. */
    uint16      size_class;     /* Size class. */
    uint16      ninitialized;   /* Maximum number of objects ever allocated. */
    uint16      nallocatable;   /* Number of objects currently allocatable. */
    uint16      firstfree;      /* First object on free list. */
    uint16      nmax;           /* Maximum number of objects ever possible. */
    uint16      fclass;         /* Current fullness class. */
} dsa_area_span;

/*
 * Given a pointer to an object in a span, access the index of the next free
 * object in the same span (ie in the span's freelist) as an L-value.
 */
#define NextFreeObjectIndex(object) (* (uint16 *) (object))
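
/*
 * Editorial example: the freelist is intrusive; each free object's first two
 * bytes hold the index of the next free object in the same span. Pushing
 * object 7 onto a freelist whose head is currently 3 is just:
 *
 *     NextFreeObjectIndex(object7) = span->firstfree;   -- 7 now links to 3
 *     span->firstfree = 7;
 */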

/*
 * Small allocations are handled by dividing a single block of memory into
 * many small objects of equal size. The possible allocation sizes are
 * defined by the following array. Larger size classes are spaced more widely
 * than smaller size classes. We fudge the spacing for size classes >1kB to
 * avoid space wastage: based on the knowledge that we plan to allocate 64kB
 * blocks, we bump the maximum object size up to the largest multiple of
 * 8 bytes that still lets us fit the same number of objects into one block.
 *
 * NB: Because of this fudging, if we were ever to use differently-sized blocks
 * for small allocations, these size classes would need to be reworked to be
 * optimal for the new size.
 *
 * NB: The optimal spacing for size classes, as well as the size of the blocks
 * out of which small objects are allocated, is not a question that has one
 * right answer. Some allocators (such as tcmalloc) use more closely-spaced
 * size classes than we do here, while others (like aset.c) use more
 * widely-spaced classes. Spacing the classes more closely avoids wasting
 * memory within individual chunks, but also means a larger number of
 * potentially-unfilled blocks.
 */
static const uint16 dsa_size_classes[] = {
    sizeof(dsa_area_span), 0,   /* special size classes */
    8, 16, 24, 32, 40, 48, 56, 64,  /* 8 classes separated by 8 bytes */
    80, 96, 112, 128,           /* 4 classes separated by 16 bytes */
    160, 192, 224, 256,         /* 4 classes separated by 32 bytes */
    320, 384, 448, 512,         /* 4 classes separated by 64 bytes */
    640, 768, 896, 1024,        /* 4 classes separated by 128 bytes */
    1280, 1560, 1816, 2048,     /* 4 classes separated by ~256 bytes */
    2616, 3120, 3640, 4096,     /* 4 classes separated by ~512 bytes */
    5456, 6552, 7280, 8192      /* 4 classes separated by ~1024 bytes */
};
#define DSA_NUM_SIZE_CLASSES lengthof(dsa_size_classes)

/* Special size classes. */
#define DSA_SCLASS_BLOCK_OF_SPANS 0
#define DSA_SCLASS_SPAN_LARGE 1

/*
 * The following lookup table is used to map the size of small objects
 * (less than 1kB) onto the corresponding size class. To use this table,
 * round the size of the object up to the next multiple of 8 bytes, and then
 * index into this array.
 */
static char dsa_size_class_map[] = {
    2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13,
    14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17,
    18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
    20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21,
    22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
    23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25
};
#define DSA_SIZE_CLASS_MAP_QUANTUM 8
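
/*
 * Editorial example: a 100-byte request rounds up to 104 bytes, so
 * mapidx = (104 / DSA_SIZE_CLASS_MAP_QUANTUM) - 1 = 12;
 * dsa_size_class_map[12] == 12 and dsa_size_classes[12] == 112, so the
 * request is served from the 112-byte size class.
 */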

/*
 * Superblocks are binned by how full they are. Generally, each fullness
 * class corresponds to one quartile, but the block being used for
 * allocations is always at the head of the list for fullness class 1,
 * regardless of how full it really is.
 */
#define DSA_FULLNESS_CLASSES 4

/*
 * A dsa_area_pool represents a set of objects of a given size class.
 *
 * Perhaps there should be multiple pools for the same size class for
 * contention avoidance, but for now there is just one!
 */
typedef struct
{
    /* A lock protecting access to this pool. */
    LWLock      lock;
    /* A set of linked lists of spans, arranged by fullness. */
    dsa_pointer spans[DSA_FULLNESS_CLASSES];
    /* Should we pad this out to a cacheline boundary? */
} dsa_area_pool;

/*
 * The control block for an area. This lives in shared memory, at the start of
 * the first DSM segment controlled by this area.
 */
typedef struct
{
    /* The segment header for the first segment. */
    dsa_segment_header segment_header;
    /* The handle for this area. */
    dsa_handle  handle;
    /* The handles of the segments owned by this area. */
    dsm_handle  segment_handles[DSA_MAX_SEGMENTS];
    /* Lists of segments, binned by maximum contiguous run of free pages. */
    dsa_segment_index segment_bins[DSA_NUM_SEGMENT_BINS];
    /* The object pools for each size class. */
    dsa_area_pool pools[DSA_NUM_SIZE_CLASSES];
    /* The total size of all active segments. */
    Size        total_segment_size;
    /* The maximum total size of backing storage we are allowed. */
    Size        max_total_segment_size;
    /* Highest used segment index in the history of this area. */
    dsa_segment_index high_segment_index;
    /* The reference count for this area. */
    int         refcnt;
    /* A flag indicating that this area has been pinned. */
    bool        pinned;
    /* The number of times that segments have been freed. */
    Size        freed_segment_counter;
    /* The LWLock tranche ID. */
    int         lwlock_tranche_id;
    /* The general lock (protects everything except object pools). */
    LWLock      lock;
} dsa_area_control;

/* Given a pointer to a pool, find a dsa_pointer. */
#define DsaAreaPoolToDsaPointer(area, p) \
    DSA_MAKE_POINTER(0, (char *) p - (char *) area->control)

/*
 * A dsa_segment_map is stored within the backend-private memory of each
 * individual backend. It holds the base address of the segment within that
 * backend, plus the addresses of key objects within the segment. Those
 * could instead be derived from the base address but it's handy to have them
 * around.
 */
typedef struct
{
    dsm_segment *segment;       /* DSM segment */
    char       *mapped_address; /* Address at which segment is mapped */
    dsa_segment_header *header; /* Header (same as mapped_address) */
    FreePageManager *fpm;       /* Free page manager within segment. */
    dsa_pointer *pagemap;       /* Page map within segment. */
} dsa_segment_map;

/*
 * Per-backend state for a storage area. Backends obtain one of these by
 * creating an area or attaching to an existing one using a handle. Each
 * process that needs to use an area uses its own object to track where the
 * segments are mapped.
 */
struct dsa_area
{
    /* Pointer to the control object in shared memory. */
    dsa_area_control *control;

    /* Has the mapping been pinned? */
    bool        mapping_pinned;

    /*
     * This backend's array of segment maps, ordered by segment index
     * corresponding to control->segment_handles. Some of the area's segments
     * may not be mapped in this backend yet, and some slots may have been
     * freed and need to be detached; these operations happen on demand.
     */
    dsa_segment_map segment_maps[DSA_MAX_SEGMENTS];

    /* The highest segment index this backend has ever mapped. */
    dsa_segment_index high_segment_index;

    /* The last observed freed_segment_counter. */
    Size        freed_segment_counter;
};

#define DSA_SPAN_NOTHING_FREE ((uint16) -1)
#define DSA_SUPERBLOCK_SIZE (DSA_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE)

/* Given a pointer to a segment_map, obtain a segment index number. */
#define get_segment_index(area, segment_map_ptr) \
    (segment_map_ptr - &area->segment_maps[0])

static void init_span(dsa_area *area, dsa_pointer span_pointer,
          dsa_area_pool *pool, dsa_pointer start, Size npages,
          uint16 size_class);
static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool,
                    int fromclass, int toclass);
static inline dsa_pointer alloc_object(dsa_area *area, int size_class);
static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
                         int size_class);
static dsa_segment_map *get_segment_by_index(dsa_area *area,
                     dsa_segment_index index);
static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer);
static void unlink_span(dsa_area *area, dsa_area_span *span);
static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
                           dsa_pointer span_pointer, int fclass);
static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map);
static dsa_segment_map *get_best_segment(dsa_area *area, Size npages);
static dsa_segment_map *make_new_segment(dsa_area *area, Size requested_pages);
static dsa_area *create_internal(void *place, size_t size,
                int tranche_id,
                dsm_handle control_handle,
                dsm_segment *control_segment);
static dsa_area *attach_internal(void *place, dsm_segment *segment,
                dsa_handle handle);
static void check_for_freed_segments(dsa_area *area);

/*
 * Create a new shared area in a new DSM segment. Further DSM segments will
 * be allocated as required to extend the available space.
 *
 * We can't allocate a LWLock tranche_id within this function, because tranche
 * IDs are a scarce resource; there are only 64k available, using low numbers
 * when possible matters, and we have no provision for recycling them. So,
 * we require the caller to provide one.
 */
dsa_area *
dsa_create(int tranche_id)
{
    dsm_segment *segment;
    dsa_area   *area;

    /*
     * Create the DSM segment that will hold the shared control object and the
     * first segment of usable space.
     */
    segment = dsm_create(DSA_INITIAL_SEGMENT_SIZE, 0);

    /*
     * All segments backing this area are pinned, so that DSA can explicitly
     * control their lifetime (otherwise a newly created segment belonging to
     * this area might be freed when the only backend that happens to have it
     * mapped in ends, corrupting the area).
     */
    dsm_pin_segment(segment);

    /* Create a new DSA area with the control object in this segment. */
    area = create_internal(dsm_segment_address(segment),
                           DSA_INITIAL_SEGMENT_SIZE,
                           tranche_id,
                           dsm_segment_handle(segment), segment);

    /* Clean up when the control segment detaches. */
    on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
                  PointerGetDatum(dsm_segment_address(segment)));

    return area;
}

/*
 * Create a new shared area in an existing shared memory space, which may be
 * either DSM or Postmaster-initialized memory. DSM segments will be
 * allocated as required to extend the available space, though that can be
 * prevented with dsa_set_size_limit(area, size) using the same size provided
 * to dsa_create_in_place.
 *
 * Areas created in-place must eventually be released by the backend that
 * created them and all backends that attach to them. This can be done
 * explicitly with dsa_release_in_place, or, in the special case that 'place'
 * happens to be in a pre-existing DSM segment, by passing in a pointer to the
 * segment so that a detach hook can be registered with the containing DSM
 * segment.
 *
 * See dsa_create() for a note about the tranche arguments.
 */
dsa_area *
dsa_create_in_place(void *place, size_t size,
                    int tranche_id, dsm_segment *segment)
{
    dsa_area   *area;

    area = create_internal(place, size, tranche_id,
                           DSM_HANDLE_INVALID, NULL);

    /*
     * Clean up when the control segment detaches, if a containing DSM segment
     * was provided.
     */
    if (segment != NULL)
        on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
                      PointerGetDatum(place));

    return area;
}
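
/*
 * Editorial sketch (not part of the original file): creating an area inside
 * postmaster-initialized shared memory and capping it at that fixed size.
 * The 64kB figure is an arbitrary assumption for illustration; guarded by a
 * hypothetical DSA_USAGE_EXAMPLE symbol so it is normally compiled out.
 */
#ifdef DSA_USAGE_EXAMPLE
static dsa_area *
dsa_in_place_example(int tranche_id)
{
    Size        size = Max((Size) 64 * 1024, dsa_minimum_size());
    void       *place = ShmemAlloc(size);
    dsa_area   *area;

    area = dsa_create_in_place(place, size, tranche_id, NULL);
    dsa_set_size_limit(area, size);     /* never create extra DSM segments */
    return area;
}
#endif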

/*
 * Obtain a handle that can be passed to other processes so that they can
 * attach to the given area. Cannot be called for areas created with
 * dsa_create_in_place.
 */
dsa_handle
dsa_get_handle(dsa_area *area)
{
    Assert(area->control->handle != DSM_HANDLE_INVALID);
    return area->control->handle;
}

/*
 * Attach to an area given a handle generated (possibly in another process) by
 * dsa_get_handle. The area must have been created with dsa_create (not
 * dsa_create_in_place).
 */
dsa_area *
dsa_attach(dsa_handle handle)
{
    dsm_segment *segment;
    dsa_area   *area;

    /*
     * An area handle is really a DSM segment handle for the first segment, so
     * we go ahead and attach to that.
     */
    segment = dsm_attach(handle);
    if (segment == NULL)
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("could not attach to dynamic shared area")));

    area = attach_internal(dsm_segment_address(segment), segment, handle);

    /* Clean up when the control segment detaches. */
    on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
                  PointerGetDatum(dsm_segment_address(segment)));

    return area;
}
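
/*
 * Editorial sketch (not part of the original file): attaching from another
 * backend, given a dsa_handle published by the creator via dsa_get_handle().
 * Guarded by a hypothetical DSA_USAGE_EXAMPLE symbol.
 */
#ifdef DSA_USAGE_EXAMPLE
static dsa_area *
dsa_attach_example(dsa_handle handle)
{
    dsa_area   *area = dsa_attach(handle);

    /* Keep the mapping until session end, not just the current scope. */
    dsa_pin_mapping(area);
    return area;
}
#endif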

/*
 * Attach to an area that was created with dsa_create_in_place. The caller
 * must somehow know the location in memory that was used when the area was
 * created, though it may be mapped at a different virtual address in this
 * process.
 *
 * See dsa_create_in_place for note about releasing in-place areas, and the
 * optional 'segment' argument which can be provided to allow automatic
 * release if the containing memory happens to be a DSM segment.
 */
dsa_area *
dsa_attach_in_place(void *place, dsm_segment *segment)
{
    dsa_area   *area;

    area = attach_internal(place, NULL, DSM_HANDLE_INVALID);

    /*
     * Clean up when the control segment detaches, if a containing DSM segment
     * was provided.
     */
    if (segment != NULL)
        on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
                      PointerGetDatum(place));

    return area;
}

/*
 * Release a DSA area that was produced by dsa_create_in_place or
 * dsa_attach_in_place. The 'segment' argument is ignored but provides an
 * interface suitable for on_dsm_detach, for the convenience of users who want
 * to create a DSA segment inside an existing DSM segment and have it
 * automatically released when the containing DSM segment is detached.
 * 'place' should be the address of the place where the area was created.
 *
 * This callback is automatically registered for the DSM segment containing
 * the control object of in-place areas when a segment is provided to
 * dsa_create_in_place or dsa_attach_in_place, and also for all areas created
 * with dsa_create.
 */
void
dsa_on_dsm_detach_release_in_place(dsm_segment *segment, Datum place)
{
    dsa_release_in_place(DatumGetPointer(place));
}

/*
 * Release a DSA area that was produced by dsa_create_in_place or
 * dsa_attach_in_place. The 'code' argument is ignored but provides an
 * interface suitable for on_shmem_exit or before_shmem_exit, for the
 * convenience of users who want to create a DSA segment inside shared memory
 * other than a DSM segment and have it automatically release at backend exit.
 * 'place' should be the address of the place where the area was created.
 */
void
dsa_on_shmem_exit_release_in_place(int code, Datum place)
{
    dsa_release_in_place(DatumGetPointer(place));
}

/*
 * Release a DSA area that was produced by dsa_create_in_place or
 * dsa_attach_in_place. It is preferable to use one of the 'dsa_on_XXX'
 * callbacks so that this is managed automatically, because failure to release
 * an area created in-place leaks its segments permanently.
 *
 * This is also called automatically for areas produced by dsa_create or
 * dsa_attach as an implementation detail.
 */
void
dsa_release_in_place(void *place)
{
    dsa_area_control *control = (dsa_area_control *) place;
    int         i;

    LWLockAcquire(&control->lock, LW_EXCLUSIVE);
    Assert(control->segment_header.magic ==
           (DSA_SEGMENT_HEADER_MAGIC ^ control->handle ^ 0));
    Assert(control->refcnt > 0);
    if (--control->refcnt == 0)
    {
        for (i = 0; i <= control->high_segment_index; ++i)
        {
            dsm_handle  handle;

            handle = control->segment_handles[i];
            if (handle != DSM_HANDLE_INVALID)
                dsm_unpin_segment(handle);
        }
    }
    LWLockRelease(&control->lock);
}

/*
 * Keep a DSA area attached until end of session or explicit detach.
 *
 * By default, areas are owned by the current resource owner, which means they
 * are detached automatically when that scope ends.
 */
void
dsa_pin_mapping(dsa_area *area)
{
    int         i;

    Assert(!area->mapping_pinned);
    area->mapping_pinned = true;

    for (i = 0; i <= area->high_segment_index; ++i)
        if (area->segment_maps[i].segment != NULL)
            dsm_pin_mapping(area->segment_maps[i].segment);
}

/*
 * Allocate memory in this storage area. The return value is a dsa_pointer
 * that can be passed to other processes, and converted to a local pointer
 * with dsa_get_address. 'flags' is a bitmap which should be constructed
 * from the following values:
 *
 * DSA_ALLOC_HUGE allows allocations >= 1GB. Otherwise, such allocations
 * will result in an ERROR.
 *
 * DSA_ALLOC_NO_OOM causes this function to return InvalidDsaPointer when
 * no memory is available or a size limit established by dsa_set_size_limit
 * would be exceeded. Otherwise, such allocations will result in an ERROR.
 *
 * DSA_ALLOC_ZERO causes the allocated memory to be zeroed. Otherwise, the
 * contents of newly-allocated memory are indeterminate.
 *
 * These flags correspond to similarly named flags used by
 * MemoryContextAllocExtended(). See also the macros dsa_allocate and
 * dsa_allocate0 which expand to a call to this function with commonly used
 * flags.
 */
dsa_pointer
dsa_allocate_extended(dsa_area *area, Size size, int flags)
{
    uint16      size_class;
    dsa_pointer start_pointer;
    dsa_segment_map *segment_map;
    dsa_pointer result;

    Assert(size > 0);

    /* Sanity check on huge individual allocation size. */
    if (((flags & DSA_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) ||
        ((flags & DSA_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size)))
        elog(ERROR, "invalid DSA memory alloc request size %zu", size);

    /*
     * If bigger than the largest size class, just grab a run of pages from
     * the free page manager, instead of allocating an object from a pool.
     * There will still be a span, but it's a special class of span that
     * manages this whole allocation and simply gives all pages back to the
     * free page manager when dsa_free is called.
     */
    if (size > dsa_size_classes[lengthof(dsa_size_classes) - 1])
    {
        Size        npages = fpm_size_to_pages(size);
        Size        first_page;
        dsa_pointer span_pointer;
        dsa_area_pool *pool = &area->control->pools[DSA_SCLASS_SPAN_LARGE];

        /* Obtain a span object. */
        span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
        if (!DsaPointerIsValid(span_pointer))
            return InvalidDsaPointer;

        LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);

        /* Find a segment from which to allocate. */
        segment_map = get_best_segment(area, npages);
        if (segment_map == NULL)
            segment_map = make_new_segment(area, npages);
        if (segment_map == NULL)
        {
            /* Can't make any more segments: game over. */
            LWLockRelease(DSA_AREA_LOCK(area));
            dsa_free(area, span_pointer);

            /* Raise error unless asked not to. */
            if ((flags & DSA_ALLOC_NO_OOM) == 0)
                ereport(ERROR,
                        (errcode(ERRCODE_OUT_OF_MEMORY),
                         errmsg("out of memory"),
                         errdetail("Failed on DSA request of size %zu.",
                                   size)));
            return InvalidDsaPointer;
        }

        /*
         * Ask the free page manager for a run of pages. This should always
         * succeed, since both get_best_segment and make_new_segment should
         * only return a non-NULL pointer if it actually contains enough
         * contiguous freespace. If it does fail, something in our backend
         * private state is out of whack, so use FATAL to kill the process.
         */
        if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
            elog(FATAL,
                 "dsa_allocate could not find %zu free pages", npages);
        LWLockRelease(DSA_AREA_LOCK(area));

        start_pointer = DSA_MAKE_POINTER(get_segment_index(area, segment_map),
                                         first_page * FPM_PAGE_SIZE);

        /* Initialize span and pagemap. */
        LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
                      LW_EXCLUSIVE);
        init_span(area, span_pointer, pool, start_pointer, npages,
                  DSA_SCLASS_SPAN_LARGE);
        segment_map->pagemap[first_page] = span_pointer;
        LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));

        /* Zero-initialize the memory if requested. */
        if ((flags & DSA_ALLOC_ZERO) != 0)
            memset(dsa_get_address(area, start_pointer), 0, size);

        return start_pointer;
    }

    /* Map allocation to a size class. */
    if (size < lengthof(dsa_size_class_map) * DSA_SIZE_CLASS_MAP_QUANTUM)
    {
        int         mapidx;

        /* For smaller sizes we have a lookup table... */
        mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) /
                  DSA_SIZE_CLASS_MAP_QUANTUM) - 1;
        size_class = dsa_size_class_map[mapidx];
    }
    else
    {
        uint16      min;
        uint16      max;

        /* ... and for the rest we search by binary chop. */
        min = dsa_size_class_map[lengthof(dsa_size_class_map) - 1];
        max = lengthof(dsa_size_classes) - 1;

        while (min < max)
        {
            uint16      mid = (min + max) / 2;
            uint16      class_size = dsa_size_classes[mid];

            if (class_size < size)
                min = mid + 1;
            else
                max = mid;
        }

        size_class = min;
    }
    Assert(size <= dsa_size_classes[size_class]);
    Assert(size_class == 0 || size > dsa_size_classes[size_class - 1]);

    /* Attempt to allocate an object from the appropriate pool. */
    result = alloc_object(area, size_class);

    /* Check for failure to allocate. */
    if (!DsaPointerIsValid(result))
    {
        /* Raise error unless asked not to. */
        if ((flags & DSA_ALLOC_NO_OOM) == 0)
        {
            ereport(ERROR,
                    (errcode(ERRCODE_OUT_OF_MEMORY),
                     errmsg("out of memory"),
                     errdetail("Failed on DSA request of size %zu.", size)));
        }
        return InvalidDsaPointer;
    }

    /* Zero-initialize the memory if requested. */
    if ((flags & DSA_ALLOC_ZERO) != 0)
        memset(dsa_get_address(area, result), 0, size);

    return result;
}
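
/*
 * Editorial example (not part of the original file): a request >= 1GB must
 * pass DSA_ALLOC_HUGE, and DSA_ALLOC_NO_OOM turns allocation failure into an
 * InvalidDsaPointer result instead of an ERROR. Guarded by a hypothetical
 * DSA_USAGE_EXAMPLE symbol.
 */
#ifdef DSA_USAGE_EXAMPLE
static dsa_pointer
dsa_try_huge_allocate(dsa_area *area, Size size)
{
    dsa_pointer dp = dsa_allocate_extended(area, size,
                                           DSA_ALLOC_HUGE | DSA_ALLOC_NO_OOM);

    if (!DsaPointerIsValid(dp))
        elog(LOG, "DSA allocation of %zu bytes failed", size);
    return dp;
}
#endif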

/*
 * Free memory obtained with dsa_allocate.
 */
void
dsa_free(dsa_area *area, dsa_pointer dp)
{
    dsa_segment_map *segment_map;
    int         pageno;
    dsa_pointer span_pointer;
    dsa_area_span *span;
    char       *superblock;
    char       *object;
    Size        size;
    int         size_class;

    /* Make sure we don't have a stale segment in the slot 'dp' refers to. */
    check_for_freed_segments(area);

    /* Locate the object, span and pool. */
    segment_map = get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(dp));
    pageno = DSA_EXTRACT_OFFSET(dp) / FPM_PAGE_SIZE;
    span_pointer = segment_map->pagemap[pageno];
    span = dsa_get_address(area, span_pointer);
    superblock = dsa_get_address(area, span->start);
    object = dsa_get_address(area, dp);
    size_class = span->size_class;
    size = dsa_size_classes[size_class];

    /*
     * Special case for large objects that live in a special span: we return
     * those pages directly to the free page manager and free the span.
     */
    if (span->size_class == DSA_SCLASS_SPAN_LARGE)
    {

#ifdef CLOBBER_FREED_MEMORY
        memset(object, 0x7f, span->npages * FPM_PAGE_SIZE);
#endif

        /* Give pages back to free page manager. */
        LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
        FreePageManagerPut(segment_map->fpm,
                           DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
                           span->npages);
        LWLockRelease(DSA_AREA_LOCK(area));
        /* Unlink span. */
        LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
                      LW_EXCLUSIVE);
        unlink_span(area, span);
        LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
        /* Free the span object so it can be reused. */
        dsa_free(area, span_pointer);
        return;
    }

#ifdef CLOBBER_FREED_MEMORY
    memset(object, 0x7f, size);
#endif

    LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);

    /* Put the object on the span's freelist. */
    Assert(object >= superblock);
    Assert(object < superblock + DSA_SUPERBLOCK_SIZE);
    Assert((object - superblock) % size == 0);
    NextFreeObjectIndex(object) = span->firstfree;
    span->firstfree = (object - superblock) / size;
    ++span->nallocatable;

    /*
     * See if the span needs to be moved to a different fullness class, or be
     * freed so its pages can be given back to the segment.
     */
    if (span->nallocatable == 1 && span->fclass == DSA_FULLNESS_CLASSES - 1)
    {
        /*
         * The block was completely full and is located in the
         * highest-numbered fullness class, which is never scanned for free
         * chunks. We must move it to the next-lower fullness class.
         */
        unlink_span(area, span);
        add_span_to_fullness_class(area, span, span_pointer,
                                   DSA_FULLNESS_CLASSES - 2);

        /*
         * If this is the only span, and there is no active span, then we
         * should probably move this span to fullness class 1. (Otherwise if
         * you allocate exactly all the objects in the only span, it moves to
         * class 3, then you free them all, it moves to 2, and then is given
         * back, leaving no active span).
         */
    }
    else if (span->nallocatable == span->nmax &&
             (span->fclass != 1 || span->prevspan != InvalidDsaPointer))
    {
        /*
         * This entire block is free, and it's not the active block for this
         * size class. Return the memory to the free page manager. We don't
         * do this for the active block to prevent hysteresis: if we
         * repeatedly allocate and free the only chunk in the active block, it
         * will be very inefficient if we deallocate and reallocate the block
         * every time.
         */
        destroy_superblock(area, span_pointer);
    }

    LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
}

/*
 * Obtain a backend-local address for a dsa_pointer. 'dp' must point to
 * memory allocated by the given area (possibly in another process) that
 * hasn't yet been freed. This may cause a segment to be mapped into the
 * current process if required, and may cause freed segments to be unmapped.
 */
void *
dsa_get_address(dsa_area *area, dsa_pointer dp)
{
    dsa_segment_index index;
    Size        offset;

    /* Convert InvalidDsaPointer to NULL. */
    if (!DsaPointerIsValid(dp))
        return NULL;

    /* Process any requests to detach from freed segments. */
    check_for_freed_segments(area);

    /* Break the dsa_pointer into its components. */
    index = DSA_EXTRACT_SEGMENT_NUMBER(dp);
    offset = DSA_EXTRACT_OFFSET(dp);
    Assert(index < DSA_MAX_SEGMENTS);

    /* Check if we need to cause this segment to be mapped in. */
    if (unlikely(area->segment_maps[index].mapped_address == NULL))
    {
        /* Call for effect (we don't need the result). */
        get_segment_by_index(area, index);
    }

    return area->segment_maps[index].mapped_address + offset;
}

/*
 * Pin this area, so that it will continue to exist even if all backends
 * detach from it. In that case, the area can still be reattached to if a
 * handle has been recorded somewhere.
 */
void
dsa_pin(dsa_area *area)
{
    LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
    if (area->control->pinned)
    {
        LWLockRelease(DSA_AREA_LOCK(area));
        elog(ERROR, "dsa_area already pinned");
    }
    area->control->pinned = true;
    ++area->control->refcnt;
    LWLockRelease(DSA_AREA_LOCK(area));
}

/*
 * Undo the effects of dsa_pin, so that the given area can be freed when no
 * backends are attached to it. May be called only if dsa_pin has been
 * called.
 */
void
dsa_unpin(dsa_area *area)
{
    LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
    Assert(area->control->refcnt > 1);
    if (!area->control->pinned)
    {
        LWLockRelease(DSA_AREA_LOCK(area));
        elog(ERROR, "dsa_area not pinned");
    }
    area->control->pinned = false;
    --area->control->refcnt;
    LWLockRelease(DSA_AREA_LOCK(area));
}

/*
 * Set the total size limit for this area. This limit is checked whenever new
 * segments need to be allocated from the operating system. If the new size
 * limit is already exceeded, this has no immediate effect.
 *
 * Note that the total virtual memory usage may be temporarily larger than
 * this limit when segments have been freed, but not yet detached by all
 * backends that have attached to them.
 */
void
dsa_set_size_limit(dsa_area *area, Size limit)
{
    LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
    area->control->max_total_segment_size = limit;
    LWLockRelease(DSA_AREA_LOCK(area));
}

/*
 * Aggressively free all spare memory in the hope of returning DSM segments to
 * the operating system.
 */
void
dsa_trim(dsa_area *area)
{
    int         size_class;

    /*
     * Trim in reverse pool order so we get to the spans-of-spans last, just
     * in case any become entirely free while processing all the other pools.
     */
    for (size_class = DSA_NUM_SIZE_CLASSES - 1; size_class >= 0; --size_class)
    {
        dsa_area_pool *pool = &area->control->pools[size_class];
        dsa_pointer span_pointer;

        if (size_class == DSA_SCLASS_SPAN_LARGE)
        {
            /* Large object frees give back segments aggressively already. */
            continue;
        }

        /*
         * Search fullness class 1 only. That is where we expect to find an
         * entirely empty superblock (entirely empty superblocks in other
         * fullness classes are returned to the free page map by dsa_free).
         */
        LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
        span_pointer = pool->spans[1];
        while (DsaPointerIsValid(span_pointer))
        {
            dsa_area_span *span = dsa_get_address(area, span_pointer);
            dsa_pointer next = span->nextspan;

            if (span->nallocatable == span->nmax)
                destroy_superblock(area, span_pointer);

            span_pointer = next;
        }
        LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
    }
}

/*
 * Print out debugging information about the internal state of the shared
 * memory area.
 */
void
dsa_dump(dsa_area *area)
{
    Size        i,
                j;

    /*
     * Note: This gives an inconsistent snapshot as it acquires and releases
     * individual locks as it goes...
     */

    LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
    fprintf(stderr, "dsa_area handle %x:\n", area->control->handle);
    fprintf(stderr, "  max_total_segment_size: %zu\n",
            area->control->max_total_segment_size);
    fprintf(stderr, "  total_segment_size: %zu\n",
            area->control->total_segment_size);
    fprintf(stderr, "  refcnt: %d\n", area->control->refcnt);
    fprintf(stderr, "  pinned: %c\n", area->control->pinned ? 't' : 'f');
    fprintf(stderr, "  segment bins:\n");
    for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
    {
        if (area->control->segment_bins[i] != DSA_SEGMENT_INDEX_NONE)
        {
            dsa_segment_index segment_index;

            fprintf(stderr,
                    "    segment bin %zu (at least %d contiguous pages free):\n",
                    i, 1 << (i - 1));
            segment_index = area->control->segment_bins[i];
            while (segment_index != DSA_SEGMENT_INDEX_NONE)
            {
                dsa_segment_map *segment_map;

                segment_map =
                    get_segment_by_index(area, segment_index);

                fprintf(stderr,
                        "      segment index %zu, usable_pages = %zu, "
                        "contiguous_pages = %zu, mapped at %p\n",
                        segment_index,
                        segment_map->header->usable_pages,
                        fpm_largest(segment_map->fpm),
                        segment_map->mapped_address);
                segment_index = segment_map->header->next;
            }
        }
    }
    LWLockRelease(DSA_AREA_LOCK(area));

    fprintf(stderr, "  pools:\n");
    for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
    {
        bool        found = false;

        LWLockAcquire(DSA_SCLASS_LOCK(area, i), LW_EXCLUSIVE);
        for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
            if (DsaPointerIsValid(area->control->pools[i].spans[j]))
                found = true;
        if (found)
        {
            if (i == DSA_SCLASS_BLOCK_OF_SPANS)
                fprintf(stderr, "    pool for blocks of span objects:\n");
            else if (i == DSA_SCLASS_SPAN_LARGE)
                fprintf(stderr, "    pool for large object spans:\n");
            else
                fprintf(stderr,
                        "    pool for size class %zu (object size %hu bytes):\n",
                        i, dsa_size_classes[i]);
            for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
            {
                if (!DsaPointerIsValid(area->control->pools[i].spans[j]))
                    fprintf(stderr, "      fullness class %zu is empty\n", j);
                else
                {
                    dsa_pointer span_pointer = area->control->pools[i].spans[j];

                    fprintf(stderr, "      fullness class %zu:\n", j);
                    while (DsaPointerIsValid(span_pointer))
                    {
                        dsa_area_span *span;

                        span = dsa_get_address(area, span_pointer);
                        fprintf(stderr,
                                "        span descriptor at "
                                DSA_POINTER_FORMAT ", superblock at "
                                DSA_POINTER_FORMAT
                                ", pages = %zu, objects free = %hu/%hu\n",
                                span_pointer, span->start, span->npages,
                                span->nallocatable, span->nmax);
                        span_pointer = span->nextspan;
                    }
                }
            }
        }
        LWLockRelease(DSA_SCLASS_LOCK(area, i));
    }
}

/*
 * Return the smallest size that you can successfully provide to
 * dsa_create_in_place.
 */
Size
dsa_minimum_size(void)
{
    Size        size;
    int         pages = 0;

    size = MAXALIGN(sizeof(dsa_area_control)) +
        MAXALIGN(sizeof(FreePageManager));

    /* Figure out how many pages we need, including the page map... */
    while (((size + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE) > pages)
    {
        ++pages;
        size += sizeof(dsa_pointer);
    }

    return pages * FPM_PAGE_SIZE;
}

/*
 * Workhorse function for dsa_create and dsa_create_in_place.
 */
static dsa_area *
create_internal(void *place, size_t size,
                int tranche_id,
                dsm_handle control_handle,
                dsm_segment *control_segment)
{
    dsa_area_control *control;
    dsa_area   *area;
    dsa_segment_map *segment_map;
    Size        usable_pages;
    Size        total_pages;
    Size        metadata_bytes;
    int         i;

    /* Sanity check on the space we have to work in. */
    if (size < dsa_minimum_size())
        elog(ERROR, "dsa_area space must be at least %zu, but %zu provided",
             dsa_minimum_size(), size);

    /* Now figure out how much space is usable */
    total_pages = size / FPM_PAGE_SIZE;
    metadata_bytes =
        MAXALIGN(sizeof(dsa_area_control)) +
        MAXALIGN(sizeof(FreePageManager)) +
        total_pages * sizeof(dsa_pointer);
    /* Add padding up to next page boundary. */
    if (metadata_bytes % FPM_PAGE_SIZE != 0)
        metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
    Assert(metadata_bytes <= size);
    usable_pages = (size - metadata_bytes) / FPM_PAGE_SIZE;

    /*
     * Initialize the dsa_area_control object located at the start of the
     * space.
     */
    control = (dsa_area_control *) place;
    control->segment_header.magic =
        DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0;
    control->segment_header.next = DSA_SEGMENT_INDEX_NONE;
    control->segment_header.prev = DSA_SEGMENT_INDEX_NONE;
    control->segment_header.usable_pages = usable_pages;
    control->segment_header.freed = false;
    control->segment_header.size = size;
    control->handle = control_handle;
    control->max_total_segment_size = (Size) -1;
    control->total_segment_size = size;
    memset(&control->segment_handles[0], 0,
           sizeof(dsm_handle) * DSA_MAX_SEGMENTS);
    control->segment_handles[0] = control_handle;
    for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
        control->segment_bins[i] = DSA_SEGMENT_INDEX_NONE;
    control->high_segment_index = 0;
    control->refcnt = 1;
    control->freed_segment_counter = 0;
    control->lwlock_tranche_id = tranche_id;

    /*
     * Create the dsa_area object that this backend will use to access the
     * area. Other backends will need to obtain their own dsa_area object by
     * attaching.
     */
    area = palloc(sizeof(dsa_area));
    area->control = control;
    area->mapping_pinned = false;
    memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
    area->high_segment_index = 0;
    area->freed_segment_counter = 0;
    LWLockInitialize(&control->lock, control->lwlock_tranche_id);
    for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
        LWLockInitialize(DSA_SCLASS_LOCK(area, i),
                         control->lwlock_tranche_id);

    /* Set up the segment map for this process's mapping. */
    segment_map = &area->segment_maps[0];
    segment_map->segment = control_segment;
    segment_map->mapped_address = place;
    segment_map->header = (dsa_segment_header *) place;
    segment_map->fpm = (FreePageManager *)
        (segment_map->mapped_address +
         MAXALIGN(sizeof(dsa_area_control)));
    segment_map->pagemap = (dsa_pointer *)
        (segment_map->mapped_address +
         MAXALIGN(sizeof(dsa_area_control)) +
         MAXALIGN(sizeof(FreePageManager)));

    /* Set up the free page map. */
    FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
    /* There can be 0 usable pages if size is dsa_minimum_size(). */

    if (usable_pages > 0)
        FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
                           usable_pages);

    /* Put this segment into the appropriate bin. */
    control->segment_bins[contiguous_pages_to_segment_bin(usable_pages)] = 0;
    segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);

    return area;
}

/*
 * Workhorse function for dsa_attach and dsa_attach_in_place.
 */
static dsa_area *
attach_internal(void *place, dsm_segment *segment, dsa_handle handle)
{
    dsa_area_control *control;
    dsa_area   *area;
    dsa_segment_map *segment_map;

    control = (dsa_area_control *) place;
    Assert(control->handle == handle);
    Assert(control->segment_handles[0] == handle);
    Assert(control->segment_header.magic ==
           (DSA_SEGMENT_HEADER_MAGIC ^ handle ^ 0));

    /* Build the backend-local area object. */
    area = palloc(sizeof(dsa_area));
    area->control = control;
    area->mapping_pinned = false;
    memset(&area->segment_maps[0], 0,
           sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
    area->high_segment_index = 0;

    /* Set up the segment map for this process's mapping. */
    segment_map = &area->segment_maps[0];
    segment_map->segment = segment; /* NULL for in-place */
    segment_map->mapped_address = place;
    segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
    segment_map->fpm = (FreePageManager *)
        (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)));
    segment_map->pagemap = (dsa_pointer *)
        (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)) +
         MAXALIGN(sizeof(FreePageManager)));

    /* Bump the reference count. */
    LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
    if (control->refcnt == 0)
    {
        /* We can't attach to a DSA area that has already been destroyed. */
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("could not attach to dynamic shared area")));
    }
    ++control->refcnt;
    area->freed_segment_counter = control->freed_segment_counter;
    LWLockRelease(DSA_AREA_LOCK(area));

    return area;
}

/*
 * Add a new span to fullness class 1 of the indicated pool.
 */
static void
init_span(dsa_area *area,
          dsa_pointer span_pointer,
          dsa_area_pool *pool, dsa_pointer start, Size npages,
          uint16 size_class)
{
    dsa_area_span *span = dsa_get_address(area, span_pointer);
    Size        obsize = dsa_size_classes[size_class];

    /*
     * The per-pool lock must be held because we manipulate the span list for
     * this pool.
     */
    Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));

    /* Push this span onto the front of the span list for fullness class 1. */
    if (DsaPointerIsValid(pool->spans[1]))
    {
        dsa_area_span *head = (dsa_area_span *)
            dsa_get_address(area, pool->spans[1]);

        head->prevspan = span_pointer;
    }
    span->pool = DsaAreaPoolToDsaPointer(area, pool);
    span->nextspan = pool->spans[1];
    span->prevspan = InvalidDsaPointer;
    pool->spans[1] = span_pointer;

    span->start = start;
    span->npages = npages;
    span->size_class = size_class;
    span->ninitialized = 0;
    if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
    {
        /*
         * A block-of-spans contains its own descriptor, so mark one object as
         * initialized and reduce the count of allocatable objects by one.
         * Doing this here has the side effect of also reducing nmax by one,
         * which is important to make sure we free this object at the correct
         * time.
         */
        span->ninitialized = 1;
        span->nallocatable = FPM_PAGE_SIZE / obsize - 1;
    }
    else if (size_class != DSA_SCLASS_SPAN_LARGE)
        span->nallocatable = DSA_SUPERBLOCK_SIZE / obsize;
    span->firstfree = DSA_SPAN_NOTHING_FREE;
    span->nmax = span->nallocatable;
    span->fclass = 1;
}

/*
 * Transfer the first span in one fullness class to the head of another
 * fullness class.
 */
static bool
transfer_first_span(dsa_area *area,
                    dsa_area_pool *pool, int fromclass, int toclass)
{
    dsa_pointer span_pointer;
    dsa_area_span *span;
    dsa_area_span *nextspan;

    /* Can't do it if source list is empty. */
    span_pointer = pool->spans[fromclass];
    if (!DsaPointerIsValid(span_pointer))
        return false;

    /* Remove span from head of source list. */
    span = dsa_get_address(area, span_pointer);
    pool->spans[fromclass] = span->nextspan;
    if (DsaPointerIsValid(span->nextspan))
    {
        nextspan = (dsa_area_span *)
            dsa_get_address(area, span->nextspan);
        nextspan->prevspan = InvalidDsaPointer;
    }

    /* Add span to head of target list. */
    span->nextspan = pool->spans[toclass];
    pool->spans[toclass] = span_pointer;
    if (DsaPointerIsValid(span->nextspan))
    {
        nextspan = (dsa_area_span *)
            dsa_get_address(area, span->nextspan);
        nextspan->prevspan = span_pointer;
    }
    span->fclass = toclass;

    return true;
}

/*
 * Allocate one object of the requested size class from the given area.
 */
static inline dsa_pointer
alloc_object(dsa_area *area, int size_class)
{
    dsa_area_pool *pool = &area->control->pools[size_class];
    dsa_area_span *span;
    dsa_pointer block;
    dsa_pointer result;
    char       *object;
    Size        size;

    /*
     * Even though ensure_active_superblock can in turn call alloc_object if
     * it needs to allocate a new span, that's always from a different pool,
     * and the order of lock acquisition is always the same, so it's OK that
     * we hold this lock for the duration of this function.
     */
    Assert(!LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
    LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);

    /*
     * If there's no active superblock, we must successfully obtain one or
     * fail the request.
     */
    if (!DsaPointerIsValid(pool->spans[1]) &&
        !ensure_active_superblock(area, pool, size_class))
    {
        result = InvalidDsaPointer;
    }
    else
    {
        /*
         * There should be a block in fullness class 1 at this point, and it
         * should never be completely full. Thus we can either pop an object
         * from the free list or, failing that, initialize a new object.
         */
        Assert(DsaPointerIsValid(pool->spans[1]));
        span = (dsa_area_span *)
            dsa_get_address(area, pool->spans[1]);
        Assert(span->nallocatable > 0);
        block = span->start;
        Assert(size_class < DSA_NUM_SIZE_CLASSES);
        size = dsa_size_classes[size_class];
        if (span->firstfree != DSA_SPAN_NOTHING_FREE)
        {
            result = block + span->firstfree * size;
            object = dsa_get_address(area, result);
            span->firstfree = NextFreeObjectIndex(object);
        }
        else
        {
            result = block + span->ninitialized * size;
            ++span->ninitialized;
        }
        --span->nallocatable;

        /* If it's now full, move it to the highest-numbered fullness class. */
        if (span->nallocatable == 0)
            transfer_first_span(area, pool, 1, DSA_FULLNESS_CLASSES - 1);
    }

    Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
    LWLockRelease(DSA_SCLASS_LOCK(area, size_class));

    return result;
}
1495 
1496 /*
1497  * Ensure an active (i.e. fullness class 1) superblock, unless all existing
1498  * superblocks are completely full and no more can be allocated.
1499  *
1500  * Fullness classes K of 0..N are loosely intended to represent blocks whose
1501  * utilization percentage is at least K/N, but we only enforce this rigorously
1502  * for the highest-numbered fullness class, which always contains exactly
1503  * those blocks that are completely full. It's otherwise acceptable for a
1504  * block to be in a higher-numbered fullness class than the one to which it
1505  * logically belongs. In addition, the active block, which is always the
1506  * first block in fullness class 1, is permitted to have a higher allocation
1507  * percentage than would normally be allowable for that fullness class; we
1508  * don't move it until it's completely full, and then it goes to the
1509  * highest-numbered fullness class.
1510  *
1511  * It might seem odd that the active block is the head of fullness class 1
1512  * rather than fullness class 0, but experience with other allocators has
1513  * shown that it's usually better to allocate from a block that's moderately
1514  * full rather than one that's nearly empty. Insofar as is reasonably
1515  * possible, we want to avoid performing new allocations in a block that would
1516  * otherwise become empty soon.
1517  */
1518 static bool
1520  int size_class)
1521 {
1522  dsa_pointer span_pointer;
1523  dsa_pointer start_pointer;
1524  Size obsize = dsa_size_classes[size_class];
1525  Size nmax;
1526  int fclass;
1527  Size npages = 1;
1528  Size first_page;
1529  Size i;
1530  dsa_segment_map *segment_map;
1531 
1532  Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1533 
1534  /*
1535  * Compute the number of objects that will fit in a block of this size
1536  * class. Span-of-spans blocks are just a single page, and the first
1537  * object isn't available for use because it describes the block-of-spans
1538  * itself.
1539  */
1540  if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1541  nmax = FPM_PAGE_SIZE / obsize - 1;
1542  else
1543  nmax = DSA_SUPERBLOCK_SIZE / obsize;
1544 
1545  /*
1546  * If fullness class 1 is empty, try to find a span to put in it by
1547  * scanning higher-numbered fullness classes (excluding the last one,
1548  * whose blocks are certain to all be completely full).
1549  */
1550  for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1551  {
1552  span_pointer = pool->spans[fclass];
1553 
1554  while (DsaPointerIsValid(span_pointer))
1555  {
1556  int tfclass;
1557  dsa_area_span *span;
1558  dsa_area_span *nextspan;
1559  dsa_area_span *prevspan;
1560  dsa_pointer next_span_pointer;
1561 
1562  span = (dsa_area_span *)
1563  dsa_get_address(area, span_pointer);
1564  next_span_pointer = span->nextspan;
1565 
1566  /* Figure out what fullness class should contain this span. */
1567  tfclass = (nmax - span->nallocatable)
1568  * (DSA_FULLNESS_CLASSES - 1) / nmax;
1569 
1570  /* Look up next span. */
1571  if (DsaPointerIsValid(span->nextspan))
1572  nextspan = (dsa_area_span *)
1573  dsa_get_address(area, span->nextspan);
1574  else
1575  nextspan = NULL;
1576 
1577  /*
1578  * If utilization has dropped enough that this now belongs in some
1579  * other fullness class, move it there.
1580  */
1581  if (tfclass < fclass)
1582  {
1583  /* Remove from the current fullness class list. */
1584  if (pool->spans[fclass] == span_pointer)
1585  {
1586  /* It was the head; remove it. */
1588  pool->spans[fclass] = span->nextspan;
1589  if (nextspan != NULL)
1590  nextspan->prevspan = InvalidDsaPointer;
1591  }
1592  else
1593  {
1594  /* It was not the head. */
1596  prevspan = (dsa_area_span *)
1597  dsa_get_address(area, span->prevspan);
1598  prevspan->nextspan = span->nextspan;
1599  }
1600  if (nextspan != NULL)
1601  nextspan->prevspan = span->prevspan;
1602 
1603  /* Push onto the head of the new fullness class list. */
1604  span->nextspan = pool->spans[tfclass];
1605  pool->spans[tfclass] = span_pointer;
1606  span->prevspan = InvalidDsaPointer;
1607  if (DsaPointerIsValid(span->nextspan))
1608  {
1609  nextspan = (dsa_area_span *)
1610  dsa_get_address(area, span->nextspan);
1611  nextspan->prevspan = span_pointer;
1612  }
1613  span->fclass = tfclass;
1614  }
1615 
1616  /* Advance to next span on list. */
1617  span_pointer = next_span_pointer;
1618  }
1619 
1620  /* Stop now if we found a suitable block. */
1621  if (DsaPointerIsValid(pool->spans[1]))
1622  return true;
1623  }
1624 
1625  /*
1626  * If there are no blocks that properly belong in fullness class 1, pick
1627  * one from some other fullness class and move it there anyway, so that we
1628  * have an allocation target. Our last choice is to transfer a block
1629  * that's almost empty (and might become completely empty soon if left
1630  * alone), but even that is better than failing, which is what we must do
1631  * if there are no blocks at all with freespace.
1632  */
1633  Assert(!DsaPointerIsValid(pool->spans[1]));
1634  for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1635  if (transfer_first_span(area, pool, fclass, 1))
1636  return true;
1637  if (!DsaPointerIsValid(pool->spans[1]) &&
1638  transfer_first_span(area, pool, 0, 1))
1639  return true;
1640 
1641  /*
1642  * We failed to find an existing span with free objects, so we need to
1643  * allocate a new superblock and construct a new span to manage it.
1644  *
1645  * First, get a dsa_area_span object to describe the new superblock
1646  * ... unless this allocation is for a dsa_area_span object, in which case
1647  * that's surely not going to work. We handle that case by storing the
1648  * span describing a block-of-spans inline.
1649  */
1650  if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1651  {
1652  span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
1653  if (!DsaPointerIsValid(span_pointer))
1654  return false;
1655  npages = DSA_PAGES_PER_SUPERBLOCK;
1656  }
1657 
1658  /* Find or create a segment and allocate the superblock. */
1659  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1660  segment_map = get_best_segment(area, npages);
1661  if (segment_map == NULL)
1662  {
1663  segment_map = make_new_segment(area, npages);
1664  if (segment_map == NULL)
1665  {
1666  LWLockRelease(DSA_AREA_LOCK(area));
1667  return false;
1668  }
1669  }
1670  if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
1671  {
1672  LWLockRelease(DSA_AREA_LOCK(area));
1673  if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1674  dsa_free(area, span_pointer);
1675  return false;
1676  }
1677  LWLockRelease(DSA_AREA_LOCK(area));
1678 
1679  /* Compute the start of the superblock. */
1680  start_pointer =
1681  DSA_MAKE_POINTER(get_segment_index(area, segment_map),
1682  first_page * FPM_PAGE_SIZE);
1683 
1684  /*
1685  * If this is a block-of-spans, carve the descriptor right out of the
1686  * allocated space.
1687  */
1688  if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1689  {
1690  /*
1691  * We have a pointer into the segment. We need to build a dsa_pointer
1692  * from the segment index and offset into the segment.
1693  */
1694  span_pointer = start_pointer;
1695  }
1696 
1697  /* Initialize span and pagemap. */
1698  init_span(area, span_pointer, pool, start_pointer, npages, size_class);
1699  for (i = 0; i < npages; ++i)
1700  segment_map->pagemap[first_page + i] = span_pointer;
1701 
1702  return true;
1703 }
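The target fullness class above is (nmax - nallocatable) * (DSA_FULLNESS_CLASSES - 1) / nmax, i.e. integer-scaled utilization. A minimal standalone sketch of that arithmetic, assuming four fullness classes (the value dsa.c defines) and a hypothetical superblock capacity of 128 objects:

#include <stdio.h>

#define FULLNESS_CLASSES 4		/* assumption: dsa.c's DSA_FULLNESS_CLASSES */

/* Same arithmetic as the tfclass computation above. */
static int
fullness_class(int nmax, int nallocatable)
{
	return (nmax - nallocatable) * (FULLNESS_CLASSES - 1) / nmax;
}

int
main(void)
{
	printf("%d\n", fullness_class(128, 128));	/* 0: completely empty */
	printf("%d\n", fullness_class(128, 64));	/* 1: half used */
	printf("%d\n", fullness_class(128, 16));	/* 2: mostly used */
	printf("%d\n", fullness_class(128, 0));	/* 3: completely full */
	return 0;
}

Allocation is served from class 1, which is why the loop above works so hard to keep that list populated.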
1704 
1705 /*
1706  * Return the segment map corresponding to a given segment index, mapping the
1707  * segment in if necessary. For internal segment book-keeping, this is called
1708  * with the area lock held. It is also called by dsa_free and dsa_get_address
1709  * without any locking, relying on the fact that they have a known live
1710  * segment index and always call check_for_freed_segments to ensure that any
1711  * freed segment occupying the same slot is detached first.
1712  */
1713 static dsa_segment_map *
1714 get_segment_by_index(dsa_area *area, dsa_segment_index index)
1715 {
1716  if (unlikely(area->segment_maps[index].mapped_address == NULL))
1717  {
1718  dsm_handle handle;
1719  dsm_segment *segment;
1720  dsa_segment_map *segment_map;
1721 
1722  /*
1723  * If we are reached by dsa_free or dsa_get_address, there must be at
1724  * least one object allocated in the referenced segment. Otherwise,
1725  * their caller has a double-free or access-after-free bug, which we
1726  * have no hope of detecting. So we know it's safe to access this
1727  * array slot without holding a lock; it won't change underneath us.
1728  * Furthermore, we know that we can see the latest contents of the
1729  * slot, as explained in check_for_freed_segments, which those
1730  * functions call before arriving here.
1731  */
1732  handle = area->control->segment_handles[index];
1733 
1734  /* It's an error to try to access an unused slot. */
1735  if (handle == DSM_HANDLE_INVALID)
1736  elog(ERROR,
1737  "dsa_area could not attach to a segment that has been freed");
1738 
1739  segment = dsm_attach(handle);
1740  if (segment == NULL)
1741  elog(ERROR, "dsa_area could not attach to segment");
1742  if (area->mapping_pinned)
1743  dsm_pin_mapping(segment);
1744  segment_map = &area->segment_maps[index];
1745  segment_map->segment = segment;
1746  segment_map->mapped_address = dsm_segment_address(segment);
1747  segment_map->header =
1748  (dsa_segment_header *) segment_map->mapped_address;
1749  segment_map->fpm = (FreePageManager *)
1750  (segment_map->mapped_address +
1751  MAXALIGN(sizeof(dsa_segment_header)));
1752  segment_map->pagemap = (dsa_pointer *)
1753  (segment_map->mapped_address +
1754  MAXALIGN(sizeof(dsa_segment_header)) +
1755  MAXALIGN(sizeof(FreePageManager)));
1756 
1757  /* Remember the highest index this backend has ever mapped. */
1758  if (area->high_segment_index < index)
1759  area->high_segment_index = index;
1760 
1761  Assert(segment_map->header->magic ==
1762  (DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ index));
1763  }
1764 
1765  return &area->segment_maps[index];
1766 }
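Callers reach get_segment_by_index with the segment number extracted from a dsa_pointer, which packs a segment index in its high bits and a byte offset in its low bits (see DSA_MAKE_POINTER, DSA_EXTRACT_SEGMENT_NUMBER and DSA_EXTRACT_OFFSET). A standalone sketch of that encoding, assuming an illustrative 40-bit offset field:

#include <assert.h>
#include <stdint.h>

#define OFFSET_WIDTH 40			/* assumption, for illustration only */
#define OFFSET_MASK ((((uint64_t) 1) << OFFSET_WIDTH) - 1)

int
main(void)
{
	uint64_t	segment_number = 3;
	uint64_t	offset = 8192;
	uint64_t	dp = (segment_number << OFFSET_WIDTH) | offset;

	assert(dp >> OFFSET_WIDTH == segment_number);	/* recover segment */
	assert((dp & OFFSET_MASK) == offset);			/* recover offset */
	return 0;
}

Once the segment is mapped in, converting such a pointer to a usable address is just the segment's mapped_address plus the extracted offset.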
1767 
1768 /*
1769  * Return a superblock to the free page manager. If the underlying segment
1770  * has become entirely free, then return it to the operating system.
1771  *
1772  * The appropriate pool lock must be held.
1773  */
1774 static void
1775 destroy_superblock(dsa_area *area, dsa_pointer span_pointer)
1776 {
1777  dsa_area_span *span = dsa_get_address(area, span_pointer);
1778  int size_class = span->size_class;
1779  dsa_segment_map *segment_map;
1780 
1781  segment_map =
1782  get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start));
1783 
1784  /* Remove it from its fullness class list. */
1785  unlink_span(area, span);
1786 
1787  /*
1788  * Note: Here we acquire the area lock while we already hold a per-pool
1789  * lock. We never hold the area lock and then take a pool lock, or we
1790  * could deadlock.
1791  */
1792  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1793  FreePageManagerPut(segment_map->fpm,
1794  DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
1795  span->npages);
1796  /* Check if the segment is now entirely free. */
1797  if (fpm_largest(segment_map->fpm) == segment_map->header->usable_pages)
1798  {
1799  dsa_segment_index index = get_segment_index(area, segment_map);
1800 
1801  /* If it's not the segment with extra control data, free it. */
1802  if (index != 0)
1803  {
1804  /*
1805  * Give it back to the OS, and allow other backends to detect that
1806  * they need to detach.
1807  */
1808  unlink_segment(area, segment_map);
1809  segment_map->header->freed = true;
1810  Assert(area->control->total_segment_size >=
1811  segment_map->header->size);
1812  area->control->total_segment_size -=
1813  segment_map->header->size;
1814  dsm_unpin_segment(dsm_segment_handle(segment_map->segment));
1815  dsm_detach(segment_map->segment);
1816  area->control->segment_handles[index] = DSM_HANDLE_INVALID;
1817  ++area->control->freed_segment_counter;
1818  segment_map->segment = NULL;
1819  segment_map->header = NULL;
1820  segment_map->mapped_address = NULL;
1821  }
1822  }
1823  LWLockRelease(DSA_AREA_LOCK(area));
1824 
1825  /*
1826  * Span-of-spans blocks store the span which describes them within the
1827  * block itself, so freeing the storage implicitly frees the descriptor
1828  * also. If this is a block of any other type, we need to separately free
1829  * the span object also. This recursive call to dsa_free will acquire the
1830  * span pool's lock. We can't deadlock because the acquisition order is
1831  * always some other pool and then the span pool.
1832  */
1833  if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1834  dsa_free(area, span_pointer);
1835 }
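The two comments above encode a strict lock hierarchy: a per-pool lock may be held when the area lock is acquired, and some other pool's lock may be held when the span pool's lock is acquired, but never the reverse. A generic pthreads sketch (illustrative names, not dsa.c code) of why a fixed acquisition order cannot deadlock:

#include <pthread.h>

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;	/* level 1 */
static pthread_mutex_t area_lock = PTHREAD_MUTEX_INITIALIZER;	/* level 2 */

static void
return_pages(void)
{
	/* Every path takes level 1 before level 2, so no wait cycle can form. */
	pthread_mutex_lock(&pool_lock);
	pthread_mutex_lock(&area_lock);
	/* ... hand pages back to a shared free page map ... */
	pthread_mutex_unlock(&area_lock);
	pthread_mutex_unlock(&pool_lock);
}

int
main(void)
{
	return_pages();
	return 0;
}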
1836 
1837 static void
1838 unlink_span(dsa_area *area, dsa_area_span *span)
1839 {
1840  if (DsaPointerIsValid(span->nextspan))
1841  {
1842  dsa_area_span *next = dsa_get_address(area, span->nextspan);
1843 
1844  next->prevspan = span->prevspan;
1845  }
1846  if (DsaPointerIsValid(span->prevspan))
1847  {
1848  dsa_area_span *prev = dsa_get_address(area, span->prevspan);
1849 
1850  prev->nextspan = span->nextspan;
1851  }
1852  else
1853  {
1854  dsa_area_pool *pool = dsa_get_address(area, span->pool);
1855 
1856  pool->spans[span->fclass] = span->nextspan;
1857  }
1858 }
1859 
1860 static void
1861 add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
1862  dsa_pointer span_pointer,
1863  int fclass)
1864 {
1865  dsa_area_pool *pool = dsa_get_address(area, span->pool);
1866 
1867  if (DsaPointerIsValid(pool->spans[fclass]))
1868  {
1869  dsa_area_span *head = dsa_get_address(area,
1870  pool->spans[fclass]);
1871 
1872  head->prevspan = span_pointer;
1873  }
1874  span->prevspan = InvalidDsaPointer;
1875  span->nextspan = pool->spans[fclass];
1876  pool->spans[fclass] = span_pointer;
1877  span->fclass = fclass;
1878 }
1879 
1880 /*
1881  * Detach from an area that was either created or attached to by this process.
1882  */
1883 void
1884 dsa_detach(dsa_area *area)
1885 {
1886  int i;
1887 
1888  /* Detach from all segments. */
1889  for (i = 0; i <= area->high_segment_index; ++i)
1890  if (area->segment_maps[i].segment != NULL)
1891  dsm_detach(area->segment_maps[i].segment);
1892 
1893  /*
1894  * Note that 'detaching' (= detaching from DSM segments) doesn't include
1895  * 'releasing' (= adjusting the reference count). It would be nice to
1896  * combine these operations, but client code might never get around to
1897  * calling dsa_detach because of an error path, and a detach hook on any
1898  * particular segment is too late to detach other segments in the area
1899  * without risking a 'leak' warning in the non-error path.
1900  */
1901 
1902  /* Free the backend-local area object. */
1903  pfree(area);
1904 }
1905 
1906 /*
1907  * Unlink a segment from the bin that contains it.
1908  */
1909 static void
1910 unlink_segment(dsa_area *area, dsa_segment_map *segment_map)
1911 {
1912  if (segment_map->header->prev != DSA_SEGMENT_INDEX_NONE)
1913  {
1914  dsa_segment_map *prev;
1915 
1916  prev = get_segment_by_index(area, segment_map->header->prev);
1917  prev->header->next = segment_map->header->next;
1918  }
1919  else
1920  {
1921  Assert(area->control->segment_bins[segment_map->header->bin] ==
1922  get_segment_index(area, segment_map));
1923  area->control->segment_bins[segment_map->header->bin] =
1924  segment_map->header->next;
1925  }
1926  if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
1927  {
1928  dsa_segment_map *next;
1929 
1930  next = get_segment_by_index(area, segment_map->header->next);
1931  next->header->prev = segment_map->header->prev;
1932  }
1933 }
1934 
1935 /*
1936  * Find a segment that could satisfy a request for 'npages' of contiguous
1937  * memory, or return NULL if none can be found. This may involve attaching to
1938  * segments that weren't previously attached so that we can query their free
1939  * pages map.
1940  */
1941 static dsa_segment_map *
1942 get_best_segment(dsa_area *area, Size npages)
1943 {
1944  Size bin;
1945 
1946  Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
1947 
1948  /*
1949  * Start searching from the first bin that *might* have enough contiguous
1950  * pages.
1951  */
1952  for (bin = contiguous_pages_to_segment_bin(npages);
1953  bin < DSA_NUM_SEGMENT_BINS;
1954  ++bin)
1955  {
1956  /*
1957  * The minimum contiguous size that any segment in this bin should
1958  * have. We'll re-bin if we see segments with fewer.
1959  */
1960  Size threshold = (Size) 1 << (bin - 1);
1961  dsa_segment_index segment_index;
1962 
1963  /* Search this bin for a segment with enough contiguous space. */
1964  segment_index = area->control->segment_bins[bin];
1965  while (segment_index != DSA_SEGMENT_INDEX_NONE)
1966  {
1967  dsa_segment_map *segment_map;
1968  dsa_segment_index next_segment_index;
1969  Size contiguous_pages;
1970 
1971  segment_map = get_segment_by_index(area, segment_index);
1972  next_segment_index = segment_map->header->next;
1973  contiguous_pages = fpm_largest(segment_map->fpm);
1974 
1975  /* Not enough for the request, still enough for this bin. */
1976  if (contiguous_pages >= threshold && contiguous_pages < npages)
1977  {
1978  segment_index = next_segment_index;
1979  continue;
1980  }
1981 
1982  /* Re-bin it if it's no longer in the appropriate bin. */
1983  if (contiguous_pages < threshold)
1984  {
1985  Size new_bin;
1986 
1987  new_bin = contiguous_pages_to_segment_bin(contiguous_pages);
1988 
1989  /* Remove it from its current bin. */
1990  unlink_segment(area, segment_map);
1991 
1992  /* Push it onto the front of its new bin. */
1993  segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
1994  segment_map->header->next =
1995  area->control->segment_bins[new_bin];
1996  segment_map->header->bin = new_bin;
1997  area->control->segment_bins[new_bin] = segment_index;
1998  if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
1999  {
2000  dsa_segment_map *next;
2001 
2002  next = get_segment_by_index(area,
2003  segment_map->header->next);
2004  Assert(next->header->bin == new_bin);
2005  next->header->prev = segment_index;
2006  }
2007 
2008  /*
2009  * But fall through to see if it's enough to satisfy this
2010  * request anyway....
2011  */
2012  }
2013 
2014  /* Check if we are done. */
2015  if (contiguous_pages >= npages)
2016  return segment_map;
2017 
2018  /* Continue searching the same bin. */
2019  segment_index = next_segment_index;
2020  }
2021  }
2022 
2023  /* Not found. */
2024  return NULL;
2025 }
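The re-binning logic relies on bin b holding only segments whose largest run of contiguous free pages is at least 1 << (b - 1), with contiguous_pages_to_segment_bin effectively taking the position of the highest set bit, capped at the last bin. A standalone sketch of that relationship (the bin count of 16 is an assumption standing in for DSA_NUM_SEGMENT_BINS):

#include <stdio.h>

#define NUM_BINS 16				/* assumption: DSA_NUM_SEGMENT_BINS */

/* 1-based position of the highest set bit; 0 for n == 0. */
static unsigned
fls_like(unsigned long n)
{
	unsigned	b = 0;

	while (n != 0)
	{
		++b;
		n >>= 1;
	}
	return b;
}

int
main(void)
{
	unsigned long n;

	for (n = 0; n <= 8; ++n)
	{
		unsigned	bin = fls_like(n);

		if (bin > NUM_BINS - 1)
			bin = NUM_BINS - 1;
		printf("largest free run %2lu pages -> bin %u (threshold %lu)\n",
			   n, bin, bin == 0 ? 0UL : 1UL << (bin - 1));
	}
	return 0;
}

So a request for npages can safely start scanning at the bin whose threshold first reaches npages, as the loop above does.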
2026 
2027 /*
2028  * Create a new segment that can handle at least requested_pages. Returns
2029  * NULL if the requested total size limit or maximum allowed number of
2030  * segments would be exceeded.
2031  */
2032 static dsa_segment_map *
2033 make_new_segment(dsa_area *area, Size requested_pages)
2034 {
2035  dsa_segment_index new_index;
2036  Size metadata_bytes;
2037  Size total_size;
2038  Size total_pages;
2039  Size usable_pages;
2040  dsa_segment_map *segment_map;
2041  dsm_segment *segment;
2042 
2043  Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2044 
2045  /* Find a segment slot that is not in use (linearly for now). */
2046  for (new_index = 1; new_index < DSA_MAX_SEGMENTS; ++new_index)
2047  {
2048  if (area->control->segment_handles[new_index] == DSM_HANDLE_INVALID)
2049  break;
2050  }
2051  if (new_index == DSA_MAX_SEGMENTS)
2052  return NULL;
2053 
2054  /*
2055  * If the total size limit is already exceeded, then we exit early and
2056  * avoid arithmetic wraparound in the unsigned expressions below.
2057  */
2058  if (area->control->total_segment_size >=
2059  area->control->max_total_segment_size)
2060  return NULL;
2061 
2062  /*
2063  * The size should be at least as big as requested, and at least big
2064  * enough to follow a geometric series that approximately doubles the
2065  * total storage each time we create a new segment. We use geometric
2066  * growth because the underlying DSM system isn't designed for large
2067  * numbers of segments (otherwise we might even consider just using one
2068  * DSM segment for each large allocation and for each superblock, and then
2069  * we wouldn't need to use FreePageManager).
2070  *
2071  * We decide on a total segment size first, so that we produce tidy
2072  * power-of-two sized segments. This is a good property to have if we
2073  * move to huge pages in the future. Then we work back to the number of
2074  * pages we can fit.
2075  */
2076  total_size = DSA_INITIAL_SEGMENT_SIZE *
2077  ((Size) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE));
2078  total_size = Min(total_size, DSA_MAX_SEGMENT_SIZE);
2079  total_size = Min(total_size,
2080  area->control->max_total_segment_size -
2081  area->control->total_segment_size);
2082 
2083  total_pages = total_size / FPM_PAGE_SIZE;
2084  metadata_bytes =
2085  MAXALIGN(sizeof(dsa_segment_header)) +
2086  MAXALIGN(sizeof(FreePageManager)) +
2087  sizeof(dsa_pointer) * total_pages;
2088 
2089  /* Add padding up to next page boundary. */
2090  if (metadata_bytes % FPM_PAGE_SIZE != 0)
2091  metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2092  if (total_size <= metadata_bytes)
2093  return NULL;
2094  usable_pages = (total_size - metadata_bytes) / FPM_PAGE_SIZE;
2095  Assert(metadata_bytes + usable_pages * FPM_PAGE_SIZE <= total_size);
2096 
2097  /* See if that is enough... */
2098  if (requested_pages > usable_pages)
2099  {
2100  /*
2101  * We'll make an odd-sized segment, working forward from the requested
2102  * number of pages.
2103  */
2104  usable_pages = requested_pages;
2105  metadata_bytes =
2106  MAXALIGN(sizeof(dsa_segment_header)) +
2107  MAXALIGN(sizeof(FreePageManager)) +
2108  usable_pages * sizeof(dsa_pointer);
2109 
2110  /* Add padding up to next page boundary. */
2111  if (metadata_bytes % FPM_PAGE_SIZE != 0)
2112  metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2113  total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE;
2114 
2115  /* Is that too large for dsa_pointer's addressing scheme? */
2116  if (total_size > DSA_MAX_SEGMENT_SIZE)
2117  return NULL;
2118 
2119  /* Would that exceed the limit? */
2120  if (total_size > area->control->max_total_segment_size -
2121  area->control->total_segment_size)
2122  return NULL;
2123  }
2124 
2125  /* Create the segment. */
2126  segment = dsm_create(total_size, 0);
2127  if (segment == NULL)
2128  return NULL;
2129  dsm_pin_segment(segment);
2130  if (area->mapping_pinned)
2131  dsm_pin_mapping(segment);
2132 
2133  /* Store the handle in shared memory to be found by index. */
2134  area->control->segment_handles[new_index] =
2135  dsm_segment_handle(segment);
2136  /* Track the highest segment index in the history of the area. */
2137  if (area->control->high_segment_index < new_index)
2138  area->control->high_segment_index = new_index;
2139  /* Track the highest segment index this backend has ever mapped. */
2140  if (area->high_segment_index < new_index)
2141  area->high_segment_index = new_index;
2142  /* Track total size of all segments. */
2143  area->control->total_segment_size += total_size;
2144  Assert(area->control->total_segment_size <=
2145  area->control->max_total_segment_size);
2146 
2147  /* Build a segment map for this segment in this backend. */
2148  segment_map = &area->segment_maps[new_index];
2149  segment_map->segment = segment;
2150  segment_map->mapped_address = dsm_segment_address(segment);
2151  segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
2152  segment_map->fpm = (FreePageManager *)
2153  (segment_map->mapped_address +
2154  MAXALIGN(sizeof(dsa_segment_header)));
2155  segment_map->pagemap = (dsa_pointer *)
2156  (segment_map->mapped_address +
2157  MAXALIGN(sizeof(dsa_segment_header)) +
2158  MAXALIGN(sizeof(FreePageManager)));
2159 
2160  /* Set up the free page map. */
2161  FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
2162  FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
2163  usable_pages);
2164 
2165  /* Set up the segment header and put it in the appropriate bin. */
2166  segment_map->header->magic =
2167  DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ new_index;
2168  segment_map->header->usable_pages = usable_pages;
2169  segment_map->header->size = total_size;
2170  segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
2171  segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2172  segment_map->header->next =
2173  area->control->segment_bins[segment_map->header->bin];
2174  segment_map->header->freed = false;
2175  area->control->segment_bins[segment_map->header->bin] = new_index;
2176  if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2177  {
2178  dsa_segment_map *next =
2179  get_segment_by_index(area, segment_map->header->next);
2180 
2181  Assert(next->header->bin == segment_map->header->bin);
2182  next->header->prev = new_index;
2183  }
2184 
2185  return segment_map;
2186 }
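The geometric sizing rule above can be seen in isolation: each new segment's size is DSA_INITIAL_SEGMENT_SIZE shifted left once per DSA_NUM_SEGMENTS_AT_EACH_SIZE segments created, before the caps are applied. A standalone sketch, assuming 1MB initial segments and four segments at each size (believed to be the defaults here), and ignoring the DSA_MAX_SEGMENT_SIZE and remaining-budget clamps:

#include <stdio.h>

#define INITIAL_SEGMENT_SIZE ((size_t) 1024 * 1024)	/* assumption: 1MB */
#define NUM_SEGMENTS_AT_EACH_SIZE 4					/* assumption */

int
main(void)
{
	size_t		total = INITIAL_SEGMENT_SIZE;	/* segment 0 */
	int			new_index;

	for (new_index = 1; new_index <= 12; ++new_index)
	{
		/* Same shift as make_new_segment, before clamping. */
		size_t		size = INITIAL_SEGMENT_SIZE <<
		(new_index / NUM_SEGMENTS_AT_EACH_SIZE);

		total += size;
		printf("segment %2d: %4zuMB, total %4zuMB\n",
			   new_index, size >> 20, total >> 20);
	}
	return 0;
}

Each run of equal-sized segments roughly doubles the area's total size, which keeps the number of DSM segments logarithmic in the memory actually used.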
2187 
2188 /*
2189  * Check if any segments have been freed by destroy_superblock, so we can
2190  * detach from them in this backend. This function is called by
2191  * dsa_get_address and dsa_free to make sure that a dsa_pointer they have
2192  * received can be resolved to the correct segment.
2193  *
2194  * The danger we want to defend against is that there could be an old segment
2195  * mapped into a given slot in this backend, and the dsa_pointer they have
2196  * might refer to some new segment in the same slot. So those functions must
2197  * be sure to process all instructions to detach from a freed segment that had
2198  * been generated by the time this process received the dsa_pointer, before
2199  * they call get_segment_by_index.
2200  */
2201 static void
2202 check_for_freed_segments(dsa_area *area)
2203 {
2204  Size freed_segment_counter;
2205 
2206  /*
2207  * Any other process that has freed a segment has incremented
2208  * freed_segment_counter while holding an LWLock, and that must precede any
2209  * backend creating a new segment in the same slot while holding an
2210  * LWLock, and that must precede the creation of any dsa_pointer pointing
2211  * into the new segment which might reach us here, and the caller must
2212  * have sent the dsa_pointer to this process using appropriate memory
2213  * synchronization (some kind of locking or atomic primitive or system
2214  * call). So all we need to do on the reading side is ask for the load of
2215  * freed_segment_counter to follow the caller's load of the dsa_pointer it
2216  * has, and we can be sure to detect any segments that had been freed as
2217  * of the time that the dsa_pointer reached this process.
2218  */
2219  pg_read_barrier();
2220  freed_segment_counter = area->control->freed_segment_counter;
2221  if (unlikely(area->freed_segment_counter != freed_segment_counter))
2222  {
2223  int i;
2224 
2225  /* Check all currently mapped segments to find what's been freed. */
2226  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
2227  for (i = 0; i <= area->high_segment_index; ++i)
2228  {
2229  if (area->segment_maps[i].header != NULL &&
2230  area->segment_maps[i].header->freed)
2231  {
2232  dsm_detach(area->segment_maps[i].segment);
2233  area->segment_maps[i].segment = NULL;
2234  area->segment_maps[i].header = NULL;
2235  area->segment_maps[i].mapped_address = NULL;
2236  }
2237  }
2238  LWLockRelease(DSA_AREA_LOCK(area));
2239  area->freed_segment_counter = freed_segment_counter;
2240  }
2241 }
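A compressed sketch of the reader-side pattern the comment above justifies; the struct and function names are hypothetical stand-ins for the dsa_area_control and dsa_area counters, and the writer side corresponds to destroy_superblock incrementing the shared counter under the area lock:

#include "postgres.h"
#include "port/atomics.h"

typedef struct demo_shared
{
	Size		freed_segment_counter;	/* stand-in for the shared counter */
} demo_shared;

static void
demo_check(demo_shared *shared, Size *local_counter)
{
	Size		seen;

	pg_read_barrier();			/* order this load after the caller's load
								 * of the dsa_pointer it was handed */
	seen = shared->freed_segment_counter;
	if (*local_counter != seen)
	{
		/* ... detach any mapped segment whose header->freed is set ... */
		*local_counter = seen;
	}
}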
Definition: dsa.c:1714