1 /*-------------------------------------------------------------------------
2  *
3  * dsa.c
4  * Dynamic shared memory areas.
5  *
6  * This module provides dynamic shared memory areas which are built on top of
7  * DSM segments. While dsm.c allows segments of shared memory to be
8  * created and shared between backends, it isn't designed to deal with small
9  * objects. A DSA area is a shared memory heap usually backed by one or more
10  * DSM segments which can allocate memory using dsa_allocate() and dsa_free().
11  * Alternatively, it can be created in pre-existing shared memory, including a
12  * DSM segment, and then create extra DSM segments as required. Unlike the
13  * regular system heap, it deals in pseudo-pointers which must be converted to
14  * backend-local pointers before they are dereferenced. These pseudo-pointers
15  * can however be shared with other backends, and can be used to construct
16  * shared data structures.
17  *
18  * Each DSA area manages a set of DSM segments, adding new segments as
19  * required and detaching them when they are no longer needed. Each segment
20  * contains a number of 4KB pages, a free page manager for tracking
21  * consecutive runs of free pages, and a page map for tracking the source of
22  * objects allocated on each page. Allocation requests above 8KB are handled
23  * by choosing a segment and finding consecutive free pages in its free page
24  * manager. Allocation requests for smaller sizes are handled using pools of
25  * objects of a selection of sizes. Each pool consists of a number of 16 page
26  * (64KB) superblocks allocated in the same way as large objects. Allocation
27  * of large objects and new superblocks is serialized by a single LWLock, but
28  * allocation of small objects from pre-existing superblocks uses one LWLock
29  * per pool. Currently there is one pool, and therefore one lock, per size
30  * class. Per-core pools to increase concurrency and strategies for reducing
31  * the resulting fragmentation are areas for future research. Each superblock
32  * is managed with a 'span', which tracks the superblock's freelist. Free
33  * requests are handled by looking in the page map to find which span an
34  * address was allocated from, so that small objects can be returned to the
35  * appropriate free list, and large object pages can be returned directly to
36  * the free page map. When allocating, simple heuristics for selecting
37  * segments and superblocks try to encourage occupied memory to be
38  * concentrated, increasing the likelihood that whole superblocks can become
39  * empty and be returned to the free page manager, and whole segments can
40  * become empty and be returned to the operating system.
41  *
42  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
43  * Portions Copyright (c) 1994, Regents of the University of California
44  *
45  * IDENTIFICATION
46  * src/backend/utils/mmgr/dsa.c
47  *
48  *-------------------------------------------------------------------------
49  */
50 
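/*
 * Illustrative usage sketch (not part of this file; the tranche id is an
 * assumed caller-supplied value).  A backend creates an area, allocates from
 * it, converts the pseudo-pointer to a local address, and frees it:
 *
 *		dsa_area   *area = dsa_create(my_tranche_id);
 *		dsa_pointer dp = dsa_allocate(area, 1024);
 *		char	   *p = dsa_get_address(area, dp);
 *
 *		memset(p, 0, 1024);
 *		// dp, not p, is what can be stored in shared memory; another backend
 *		// attached to the same area calls dsa_get_address(area, dp) to get
 *		// its own local address for the same object.
 *		dsa_free(area, dp);
 */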
51 #include "postgres.h"
52 
53 #include "port/atomics.h"
54 #include "storage/dsm.h"
55 #include "storage/ipc.h"
56 #include "storage/lwlock.h"
57 #include "storage/shmem.h"
58 #include "utils/dsa.h"
59 #include "utils/freepage.h"
60 #include "utils/memutils.h"
61 
62 /*
63  * The size of the initial DSM segment that backs a dsa_area created by
64  * dsa_create. After creating some number of segments of this size we'll
65  * double this size, and so on. Larger segments may be created if necessary
66  * to satisfy large requests.
67  */
68 #define DSA_INITIAL_SEGMENT_SIZE ((size_t) (1 * 1024 * 1024))
69 
70 /*
71  * How many segments to create before we double the segment size. If this is
72  * low, then there is likely to be a lot of wasted space in the largest
73  * segment. If it is high, then we risk running out of segment slots (see
74  * dsm.c's limits on total number of segments), or limiting the total size
75  * an area can manage when using small pointers.
76  */
77 #define DSA_NUM_SEGMENTS_AT_EACH_SIZE 4
78 
79 /*
80  * The number of bits used to represent the offset part of a dsa_pointer.
81  * This controls the maximum size of a segment, the maximum possible
82  * allocation size and also the maximum number of segments per area.
83  */
84 #if SIZEOF_DSA_POINTER == 4
85 #define DSA_OFFSET_WIDTH 27 /* 32 segments of size up to 128MB */
86 #else
87 #define DSA_OFFSET_WIDTH 40 /* 1024 segments of size up to 1TB */
88 #endif
89 
90 /*
91  * The maximum number of DSM segments that an area can own, determined by
92  * the number of bits remaining (but capped at 1024).
93  */
94 #define DSA_MAX_SEGMENTS \
95  Min(1024, (1 << ((SIZEOF_DSA_POINTER * 8) - DSA_OFFSET_WIDTH)))
96 
97 /* The bitmask for extracting the offset from a dsa_pointer. */
98 #define DSA_OFFSET_BITMASK (((dsa_pointer) 1 << DSA_OFFSET_WIDTH) - 1)
99 
100 /* The maximum size of a DSM segment. */
101 #define DSA_MAX_SEGMENT_SIZE ((size_t) 1 << DSA_OFFSET_WIDTH)
102 
103 /* Number of pages (see FPM_PAGE_SIZE) per regular superblock. */
104 #define DSA_PAGES_PER_SUPERBLOCK 16
105 
106 /*
107  * A magic number used as a sanity check for following DSM segments belonging
108  * to a DSA area (this number will be XORed with the area handle and
109  * the segment index).
110  */
111 #define DSA_SEGMENT_HEADER_MAGIC 0x0ce26608
112 
113 /* Build a dsa_pointer given a segment number and offset. */
114 #define DSA_MAKE_POINTER(segment_number, offset) \
115  (((dsa_pointer) (segment_number) << DSA_OFFSET_WIDTH) | (offset))
116 
117 /* Extract the segment number from a dsa_pointer. */
118 #define DSA_EXTRACT_SEGMENT_NUMBER(dp) ((dp) >> DSA_OFFSET_WIDTH)
119 
120 /* Extract the offset from a dsa_pointer. */
121 #define DSA_EXTRACT_OFFSET(dp) ((dp) & DSA_OFFSET_BITMASK)
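/*
 * Worked example (values assumed for illustration): with a 64-bit
 * dsa_pointer and DSA_OFFSET_WIDTH == 40, segment 3 at byte offset 0x2000
 * packs as
 *
 *		DSA_MAKE_POINTER(3, 0x2000) == ((dsa_pointer) 3 << 40) | 0x2000
 *									== 0x0000030000002000
 *
 * and DSA_EXTRACT_SEGMENT_NUMBER()/DSA_EXTRACT_OFFSET() recover 3 and 0x2000.
 */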
122 
123  /* The type used for segment indexes (zero based). */
124 typedef size_t dsa_segment_index;
125 
126 /* Sentinel value for dsa_segment_index indicating 'none' or 'end'. */
127 #define DSA_SEGMENT_INDEX_NONE (~(dsa_segment_index)0)
128 
129 /*
130  * How many bins of segments do we have? The bins are used to categorize
131  * segments by their largest contiguous run of free pages.
132  */
133 #define DSA_NUM_SEGMENT_BINS 16
134 
135 /*
136  * What is the lowest bin that holds segments that *might* have n contiguous
137  * free pages? There is no point in looking in segments in lower bins; they
138  * definitely can't service a request for n free pages.
139  */
140 #define contiguous_pages_to_segment_bin(n) Min(fls(n), DSA_NUM_SEGMENT_BINS - 1)
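/*
 * For illustration: a request for 300 contiguous pages has fls(300) == 9, so
 * only bins 9 and above can possibly contain a segment with a long enough
 * run of free pages; a request for a single page maps to bin 1.
 */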
141 
142 /* Macros for access to locks. */
143 #define DSA_AREA_LOCK(area) (&area->control->lock)
144 #define DSA_SCLASS_LOCK(area, sclass) (&area->control->pools[sclass].lock)
145 
146 /*
147  * The header for an individual segment. This lives at the start of each DSM
148  * segment owned by a DSA area including the first segment (where it appears
149  * as part of the dsa_area_control struct).
150  */
151 typedef struct
152 {
153  /* Sanity check magic value. */
154  uint32 magic;
155  /* Total number of pages in this segment (excluding metadata area). */
156  size_t usable_pages;
157  /* Total size of this segment in bytes. */
158  size_t size;
159 
160  /*
161  * Index of the segment that precedes this one in the same segment bin, or
162  * DSA_SEGMENT_INDEX_NONE if this is the first one.
163  */
164  dsa_segment_index prev;
165 
166  /*
167  * Index of the segment that follows this one in the same segment bin, or
168  * DSA_SEGMENT_INDEX_NONE if this is the last one.
169  */
170  dsa_segment_index next;
171  /* The index of the bin that contains this segment. */
172  size_t bin;
173 
174  /*
175  * A flag raised to indicate that this segment is being returned to the
176  * operating system and has been unpinned.
177  */
178  bool freed;
179 } dsa_segment_header;
180 
181 /*
182  * Metadata for one superblock.
183  *
184  * For most blocks, span objects are stored out-of-line; that is, the span
185  * object is not stored within the block itself. But, as an exception, for a
186  * "span of spans", the span object is stored "inline". The allocation is
187  * always exactly one page, and the dsa_area_span object is located at
188  * the beginning of that page. The size class is DSA_SCLASS_BLOCK_OF_SPANS,
189  * and the remaining fields are used just as they would be in an ordinary
190  * block. We can't allocate spans out of ordinary superblocks because
191  * creating an ordinary superblock requires us to be able to allocate a span
192  * *first*. Doing it this way avoids that circularity.
193  */
194 typedef struct
195 {
196  dsa_pointer pool; /* Containing pool. */
197  dsa_pointer prevspan; /* Previous span. */
198  dsa_pointer nextspan; /* Next span. */
199  dsa_pointer start; /* Starting address. */
200  size_t npages; /* Length of span in pages. */
201  uint16 size_class; /* Size class. */
202  uint16 ninitialized; /* Maximum number of objects ever allocated. */
203  uint16 nallocatable; /* Number of objects currently allocatable. */
204  uint16 firstfree; /* First object on free list. */
205  uint16 nmax; /* Maximum number of objects ever possible. */
206  uint16 fclass; /* Current fullness class. */
207 } dsa_area_span;
208 
209 /*
210  * Given a pointer to an object in a span, access the index of the next free
211  * object in the same span (ie in the span's freelist) as an L-value.
212  */
213 #define NextFreeObjectIndex(object) (* (uint16 *) (object))
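/*
 * Illustrative note: a free object's first two bytes hold the index of the
 * next free object in the same span, so starting from span->firstfree and
 * repeatedly applying NextFreeObjectIndex() walks the span's entire freelist
 * without any separate bookkeeping storage.
 */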
214 
215 /*
216  * Small allocations are handled by dividing a single block of memory into
217  * many small objects of equal size. The possible allocation sizes are
218  * defined by the following array. Larger size classes are spaced more widely
219  * than smaller size classes. We fudge the spacing for size classes >1kB to
220  * avoid space wastage: based on the knowledge that we plan to allocate 64kB
221  * blocks, we bump the maximum object size up to the largest multiple of
222  * 8 bytes that still lets us fit the same number of objects into one block.
223  *
224  * NB: Because of this fudging, if we were ever to use differently-sized blocks
225  * for small allocations, these size classes would need to be reworked to be
226  * optimal for the new size.
227  *
228  * NB: The optimal spacing for size classes, as well as the size of the blocks
229  * out of which small objects are allocated, is not a question that has one
230  * right answer. Some allocators (such as tcmalloc) use more closely-spaced
231  * size classes than we do here, while others (like aset.c) use more
232  * widely-spaced classes. Spacing the classes more closely avoids wasting
233  * memory within individual chunks, but also means a larger number of
234  * potentially-unfilled blocks.
235  */
236 static const uint16 dsa_size_classes[] = {
237  sizeof(dsa_area_span), 0, /* special size classes */
238  8, 16, 24, 32, 40, 48, 56, 64, /* 8 classes separated by 8 bytes */
239  80, 96, 112, 128, /* 4 classes separated by 16 bytes */
240  160, 192, 224, 256, /* 4 classes separated by 32 bytes */
241  320, 384, 448, 512, /* 4 classes separated by 64 bytes */
242  640, 768, 896, 1024, /* 4 classes separated by 128 bytes */
243  1280, 1560, 1816, 2048, /* 4 classes separated by ~256 bytes */
244  2616, 3120, 3640, 4096, /* 4 classes separated by ~512 bytes */
245  5456, 6552, 7280, 8192 /* 4 classes separated by ~1024 bytes */
246 };
247 #define DSA_NUM_SIZE_CLASSES lengthof(dsa_size_classes)
248 
249 /* Special size classes. */
250 #define DSA_SCLASS_BLOCK_OF_SPANS 0
251 #define DSA_SCLASS_SPAN_LARGE 1
252 
253 /*
254  * The following lookup table is used to map the size of small objects
255  * (less than 1kB) onto the corresponding size class. To use this table,
256  * round the size of the object up to the next multiple of 8 bytes, and then
257  * index into this array.
258  */
259 static const uint8 dsa_size_class_map[] = {
260  2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13,
261  14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17,
262  18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
263  20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21,
264  22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
265  23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
266  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
267  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25
268 };
269 #define DSA_SIZE_CLASS_MAP_QUANTUM 8
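/*
 * Worked example using the tables above: a 20-byte request gives map index
 * ((20 + DSA_SIZE_CLASS_MAP_QUANTUM - 1) / DSA_SIZE_CLASS_MAP_QUANTUM) - 1,
 * which is 2, so dsa_size_class_map[2] == 4 and the object is allocated from
 * size class 4, whose objects are dsa_size_classes[4] == 24 bytes.
 */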
270 
271 /*
272  * Superblocks are binned by how full they are. Generally, each fullness
273  * class corresponds to one quartile, but the block being used for
274  * allocations is always at the head of the list for fullness class 1,
275  * regardless of how full it really is.
276  */
277 #define DSA_FULLNESS_CLASSES 4
278 
279 /*
280  * A dsa_area_pool represents a set of objects of a given size class.
281  *
282  * Perhaps there should be multiple pools for the same size class for
283  * contention avoidance, but for now there is just one!
284  */
285 typedef struct
286 {
287  /* A lock protecting access to this pool. */
288  LWLock lock;
289  /* A set of linked lists of spans, arranged by fullness. */
290  dsa_pointer spans[DSA_FULLNESS_CLASSES];
291  /* Should we pad this out to a cacheline boundary? */
292 } dsa_area_pool;
293 
294 /*
295  * The control block for an area. This lives in shared memory, at the start of
296  * the first DSM segment controlled by this area.
297  */
298 typedef struct
299 {
300  /* The segment header for the first segment. */
301  dsa_segment_header segment_header;
302  /* The handle for this area. */
303  dsa_handle handle;
304  /* The handles of the segments owned by this area. */
305  dsm_handle segment_handles[DSA_MAX_SEGMENTS];
306  /* Lists of segments, binned by maximum contiguous run of free pages. */
307  dsa_segment_index segment_bins[DSA_NUM_SEGMENT_BINS];
308  /* The object pools for each size class. */
309  dsa_area_pool pools[DSA_NUM_SIZE_CLASSES];
310  /* The total size of all active segments. */
311  size_t total_segment_size;
312  /* The maximum total size of backing storage we are allowed. */
313  size_t max_total_segment_size;
314  /* Highest used segment index in the history of this area. */
315  dsa_segment_index high_segment_index;
316  /* The reference count for this area. */
317  int refcnt;
318  /* A flag indicating that this area has been pinned. */
319  bool pinned;
320  /* The number of times that segments have been freed. */
321  size_t freed_segment_counter;
322  /* The LWLock tranche ID. */
323  int lwlock_tranche_id;
324  /* The general lock (protects everything except object pools). */
325  LWLock lock;
326 } dsa_area_control;
327 
328 /* Given a pointer to a pool, find a dsa_pointer. */
329 #define DsaAreaPoolToDsaPointer(area, p) \
330  DSA_MAKE_POINTER(0, (char *) p - (char *) area->control)
331 
332 /*
333  * A dsa_segment_map is stored within the backend-private memory of each
334  * individual backend. It holds the base address of the segment within that
335  * backend, plus the addresses of key objects within the segment. Those
336  * could instead be derived from the base address but it's handy to have them
337  * around.
338  */
339 typedef struct
340 {
341  dsm_segment *segment; /* DSM segment */
342  char *mapped_address; /* Address at which segment is mapped */
343  dsa_segment_header *header; /* Header (same as mapped_address) */
344  FreePageManager *fpm; /* Free page manager within segment. */
345  dsa_pointer *pagemap; /* Page map within segment. */
346 } dsa_segment_map;
347 
348 /*
349  * Per-backend state for a storage area. Backends obtain one of these by
350  * creating an area or attaching to an existing one using a handle. Each
351  * process that needs to use an area uses its own object to track where the
352  * segments are mapped.
353  */
354 struct dsa_area
355 {
356  /* Pointer to the control object in shared memory. */
357  dsa_area_control *control;
358 
359  /* Has the mapping been pinned? */
360  bool mapping_pinned;
361 
362  /*
363  * This backend's array of segment maps, ordered by segment index
364  * corresponding to control->segment_handles. Some of the area's segments
365  * may not be mapped in this backend yet, and some slots may have been
366  * freed and need to be detached; these operations happen on demand.
367  */
368  dsa_segment_map segment_maps[DSA_MAX_SEGMENTS];
369 
370  /* The highest segment index this backend has ever mapped. */
371  dsa_segment_index high_segment_index;
372 
373  /* The last observed freed_segment_counter. */
374  size_t freed_segment_counter;
375 };
376 
377 #define DSA_SPAN_NOTHING_FREE ((uint16) -1)
378 #define DSA_SUPERBLOCK_SIZE (DSA_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE)
379 
380 /* Given a pointer to a segment_map, obtain a segment index number. */
381 #define get_segment_index(area, segment_map_ptr) \
382  (segment_map_ptr - &area->segment_maps[0])
383 
384 static void init_span(dsa_area *area, dsa_pointer span_pointer,
385  dsa_area_pool *pool, dsa_pointer start, size_t npages,
386  uint16 size_class);
387 static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool,
388  int fromclass, int toclass);
389 static inline dsa_pointer alloc_object(dsa_area *area, int size_class);
390 static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
391  int size_class);
392 static dsa_segment_map *get_segment_by_index(dsa_area *area,
393  dsa_segment_index index);
394 static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer);
395 static void unlink_span(dsa_area *area, dsa_area_span *span);
396 static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
397  dsa_pointer span_pointer, int fclass);
398 static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map);
399 static dsa_segment_map *get_best_segment(dsa_area *area, size_t npages);
400 static dsa_segment_map *make_new_segment(dsa_area *area, size_t requested_pages);
401 static dsa_area *create_internal(void *place, size_t size,
402  int tranche_id,
403  dsm_handle control_handle,
404  dsm_segment *control_segment);
405 static dsa_area *attach_internal(void *place, dsm_segment *segment,
406  dsa_handle handle);
407 static void check_for_freed_segments(dsa_area *area);
408 static void check_for_freed_segments_locked(dsa_area *area);
409 
410 /*
411  * Create a new shared area in a new DSM segment. Further DSM segments will
412  * be allocated as required to extend the available space.
413  *
414  * We can't allocate a LWLock tranche_id within this function, because tranche
415  * IDs are a scarce resource; there are only 64k available, using low numbers
416  * when possible matters, and we have no provision for recycling them. So,
417  * we require the caller to provide one.
418  */
419 dsa_area *
420 dsa_create(int tranche_id)
421 {
422  dsm_segment *segment;
423  dsa_area *area;
424 
425  /*
426  * Create the DSM segment that will hold the shared control object and the
427  * first segment of usable space.
428  */
429  segment = dsm_create(DSA_INITIAL_SEGMENT_SIZE, 0);
430 
431  /*
432  * All segments backing this area are pinned, so that DSA can explicitly
433  * control their lifetime (otherwise a newly created segment belonging to
434  * this area might be freed when the only backend that happens to have it
435  * mapped in ends, corrupting the area).
436  */
437  dsm_pin_segment(segment);
438 
439  /* Create a new DSA area with the control object in this segment. */
440  area = create_internal(dsm_segment_address(segment),
441  DSA_INITIAL_SEGMENT_SIZE,
442  tranche_id,
443  dsm_segment_handle(segment), segment);
444 
445  /* Clean up when the control segment detaches. */
446  on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
447  PointerGetDatum(dsm_segment_address(segment)));
448 
449  return area;
450 }
451 
452 /*
453  * Create a new shared area in an existing shared memory space, which may be
454  * either DSM or Postmaster-initialized memory. DSM segments will be
455  * allocated as required to extend the available space, though that can be
456  * prevented with dsa_set_size_limit(area, size) using the same size provided
457  * to dsa_create_in_place.
458  *
459  * Areas created in-place must eventually be released by the backend that
460  * created them and all backends that attach to them. This can be done
461  * explicitly with dsa_release_in_place, or, in the special case that 'place'
462  * happens to be in a pre-existing DSM segment, by passing in a pointer to the
463  * segment so that a detach hook can be registered with the containing DSM
464  * segment.
465  *
466  * See dsa_create() for a note about the tranche arguments.
467  */
468 dsa_area *
469 dsa_create_in_place(void *place, size_t size,
470  int tranche_id, dsm_segment *segment)
471 {
472  dsa_area *area;
473 
474  area = create_internal(place, size, tranche_id,
475  DSM_HANDLE_INVALID, NULL);
476 
477  /*
478  * Clean up when the control segment detaches, if a containing DSM segment
479  * was provided.
480  */
481  if (segment != NULL)
482  on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
483  PointerGetDatum(place));
484 
485  return area;
486 }
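/*
 * Illustrative sketch (the tranche id and the extra 64kB are assumptions, not
 * part of this file): creating an area inside traditional shared memory and
 * preventing it from ever creating extra DSM segments.
 *
 *		size_t		size = dsa_minimum_size() + 65536;
 *		void	   *place = ShmemAlloc(size);
 *		dsa_area   *area = dsa_create_in_place(place, size, tranche_id, NULL);
 *
 *		dsa_set_size_limit(area, size);
 */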
487 
488 /*
489  * Obtain a handle that can be passed to other processes so that they can
490  * attach to the given area. Cannot be called for areas created with
491  * dsa_create_in_place.
492  */
493 dsa_handle
494 dsa_get_handle(dsa_area *area)
495 {
496  Assert(area->control->handle != DSM_HANDLE_INVALID);
497  return area->control->handle;
498 }
499 
500 /*
501  * Attach to an area given a handle generated (possibly in another process) by
502  * dsa_get_handle. The area must have been created with dsa_create (not
503  * dsa_create_in_place).
504  */
505 dsa_area *
506 dsa_attach(dsa_handle handle)
507 {
508  dsm_segment *segment;
509  dsa_area *area;
510 
511  /*
512  * An area handle is really a DSM segment handle for the first segment, so
513  * we go ahead and attach to that.
514  */
515  segment = dsm_attach(handle);
516  if (segment == NULL)
517  ereport(ERROR,
518  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
519  errmsg("could not attach to dynamic shared area")));
520 
521  area = attach_internal(dsm_segment_address(segment), segment, handle);
522 
523  /* Clean up when the control segment detaches. */
524  on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
525  PointerGetDatum(dsm_segment_address(segment)));
526 
527  return area;
528 }
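/*
 * Illustrative handle hand-off (the 'shared' struct and its area_handle
 * member are assumed, not defined in this file):
 *
 *		// creating backend
 *		shared->area_handle = dsa_get_handle(area);
 *
 *		// cooperating backend
 *		dsa_area   *area = dsa_attach(shared->area_handle);
 *		...
 *		dsa_detach(area);
 */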
529 
530 /*
531  * Attach to an area that was created with dsa_create_in_place. The caller
532  * must somehow know the location in memory that was used when the area was
533  * created, though it may be mapped at a different virtual address in this
534  * process.
535  *
536  * See dsa_create_in_place for note about releasing in-place areas, and the
537  * optional 'segment' argument which can be provided to allow automatic
538  * release if the containing memory happens to be a DSM segment.
539  */
540 dsa_area *
541 dsa_attach_in_place(void *place, dsm_segment *segment)
542 {
543  dsa_area *area;
544 
545  area = attach_internal(place, NULL, DSM_HANDLE_INVALID);
546 
547  /*
548  * Clean up when the control segment detaches, if a containing DSM segment
549  * was provided.
550  */
551  if (segment != NULL)
552  on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
553  PointerGetDatum(place));
554 
555  return area;
556 }
557 
558 /*
559  * Release a DSA area that was produced by dsa_create_in_place or
560  * dsa_attach_in_place. The 'segment' argument is ignored but provides an
561  * interface suitable for on_dsm_detach, for the convenience of users who want
562  * to create a DSA segment inside an existing DSM segment and have it
563  * automatically released when the containing DSM segment is detached.
564  * 'place' should be the address of the place where the area was created.
565  *
566  * This callback is automatically registered for the DSM segment containing
567  * the control object of in-place areas when a segment is provided to
568  * dsa_create_in_place or dsa_attach_in_place, and also for all areas created
569  * with dsa_create.
570  */
571 void
572 dsa_on_dsm_detach_release_in_place(dsm_segment *segment, Datum place)
573 {
574  dsa_release_in_place(DatumGetPointer(place));
575 }
576 
577 /*
578  * Release a DSA area that was produced by dsa_create_in_place or
579  * dsa_attach_in_place. The 'code' argument is ignored but provides an
580  * interface suitable for on_shmem_exit or before_shmem_exit, for the
581  * convenience of users who want to create a DSA segment inside shared memory
582  * other than a DSM segment and have it automatically release at backend exit.
583  * 'place' should be the address of the place where the area was created.
584  */
585 void
586 dsa_on_shmem_exit_release_in_place(int code, Datum place)
587 {
588  dsa_release_in_place(DatumGetPointer(place));
589 }
590 
591 /*
592  * Release a DSA area that was produced by dsa_create_in_place or
593  * dsa_attach_in_place. It is preferable to use one of the 'dsa_on_XXX'
594  * callbacks so that this is managed automatically, because failure to release
595  * an area created in-place leaks its segments permanently.
596  *
597  * This is also called automatically for areas produced by dsa_create or
598  * dsa_attach as an implementation detail.
599  */
600 void
601 dsa_release_in_place(void *place)
602 {
603  dsa_area_control *control = (dsa_area_control *) place;
604  int i;
605 
606  LWLockAcquire(&control->lock, LW_EXCLUSIVE);
607  Assert(control->segment_header.magic ==
608  (DSA_SEGMENT_HEADER_MAGIC ^ control->handle ^ 0));
609  Assert(control->refcnt > 0);
610  if (--control->refcnt == 0)
611  {
612  for (i = 0; i <= control->high_segment_index; ++i)
613  {
614  dsm_handle handle;
615 
616  handle = control->segment_handles[i];
617  if (handle != DSM_HANDLE_INVALID)
618  dsm_unpin_segment(handle);
619  }
620  }
621  LWLockRelease(&control->lock);
622 }
623 
624 /*
625  * Keep a DSA area attached until end of session or explicit detach.
626  *
627  * By default, areas are owned by the current resource owner, which means they
628  * are detached automatically when that scope ends.
629  */
630 void
631 dsa_pin_mapping(dsa_area *area)
632 {
633  int i;
634 
635  Assert(!area->mapping_pinned);
636  area->mapping_pinned = true;
637 
638  for (i = 0; i <= area->high_segment_index; ++i)
639  if (area->segment_maps[i].segment != NULL)
640  dsm_pin_mapping(area->segment_maps[i].segment);
641 }
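/*
 * One possible pattern (illustrative; tranche_id is assumed): an area meant
 * to live for the whole session is typically pinned right after creation,
 * both the area itself and this backend's mapping of it.
 *
 *		dsa_area   *area = dsa_create(tranche_id);
 *
 *		dsa_pin(area);			// survive even with no attached backends
 *		dsa_pin_mapping(area);	// keep this backend attached until exit
 */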
642 
643 /*
644  * Allocate memory in this storage area. The return value is a dsa_pointer
645  * that can be passed to other processes, and converted to a local pointer
646  * with dsa_get_address. 'flags' is a bitmap which should be constructed
647  * from the following values:
648  *
649  * DSA_ALLOC_HUGE allows allocations >= 1GB. Otherwise, such allocations
650  * will result in an ERROR.
651  *
652  * DSA_ALLOC_NO_OOM causes this function to return InvalidDsaPointer when
653  * no memory is available or a size limit established by dsa_set_size_limit
654  * would be exceeded. Otherwise, such allocations will result in an ERROR.
655  *
656  * DSA_ALLOC_ZERO causes the allocated memory to be zeroed. Otherwise, the
657  * contents of newly-allocated memory are indeterminate.
658  *
659  * These flags correspond to similarly named flags used by
660  * MemoryContextAllocExtended(). See also the macros dsa_allocate and
661  * dsa_allocate0 which expand to a call to this function with commonly used
662  * flags.
663  */
664 dsa_pointer
665 dsa_allocate_extended(dsa_area *area, size_t size, int flags)
666 {
667  uint16 size_class;
668  dsa_pointer start_pointer;
669  dsa_segment_map *segment_map;
670  dsa_pointer result;
671 
672  Assert(size > 0);
673 
674  /* Sanity check on huge individual allocation size. */
675  if (((flags & DSA_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) ||
676  ((flags & DSA_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size)))
677  elog(ERROR, "invalid DSA memory alloc request size %zu", size);
678 
679  /*
680  * If bigger than the largest size class, just grab a run of pages from
681  * the free page manager, instead of allocating an object from a pool.
682  * There will still be a span, but it's a special class of span that
683  * manages this whole allocation and simply gives all pages back to the
684  * free page manager when dsa_free is called.
685  */
686  if (size > dsa_size_classes[lengthof(dsa_size_classes) - 1])
687  {
688  size_t npages = fpm_size_to_pages(size);
689  size_t first_page;
690  dsa_pointer span_pointer;
691  dsa_area_pool *pool = &area->control->pools[DSA_SCLASS_SPAN_LARGE];
692 
693  /* Obtain a span object. */
694  span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
695  if (!DsaPointerIsValid(span_pointer))
696  {
697  /* Raise error unless asked not to. */
698  if ((flags & DSA_ALLOC_NO_OOM) == 0)
699  ereport(ERROR,
700  (errcode(ERRCODE_OUT_OF_MEMORY),
701  errmsg("out of memory"),
702  errdetail("Failed on DSA request of size %zu.",
703  size)));
704  return InvalidDsaPointer;
705  }
706 
707  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
708 
709  /* Find a segment from which to allocate. */
710  segment_map = get_best_segment(area, npages);
711  if (segment_map == NULL)
712  segment_map = make_new_segment(area, npages);
713  if (segment_map == NULL)
714  {
715  /* Can't make any more segments: game over. */
716  LWLockRelease(DSA_AREA_LOCK(area));
717  dsa_free(area, span_pointer);
718 
719  /* Raise error unless asked not to. */
720  if ((flags & DSA_ALLOC_NO_OOM) == 0)
721  ereport(ERROR,
722  (errcode(ERRCODE_OUT_OF_MEMORY),
723  errmsg("out of memory"),
724  errdetail("Failed on DSA request of size %zu.",
725  size)));
726  return InvalidDsaPointer;
727  }
728 
729  /*
730  * Ask the free page manager for a run of pages. This should always
731  * succeed, since both get_best_segment and make_new_segment should
732  * only return a non-NULL pointer if it actually contains enough
733  * contiguous freespace. If it does fail, something in our backend
734  * private state is out of whack, so use FATAL to kill the process.
735  */
736  if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
737  elog(FATAL,
738  "dsa_allocate could not find %zu free pages", npages);
740 
741  start_pointer = DSA_MAKE_POINTER(get_segment_index(area, segment_map),
742  first_page * FPM_PAGE_SIZE);
743 
744  /* Initialize span and pagemap. */
745  LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
746  LW_EXCLUSIVE);
747  init_span(area, span_pointer, pool, start_pointer, npages,
748  DSA_SCLASS_SPAN_LARGE);
749  segment_map->pagemap[first_page] = span_pointer;
750  LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
751 
752  /* Zero-initialize the memory if requested. */
753  if ((flags & DSA_ALLOC_ZERO) != 0)
754  memset(dsa_get_address(area, start_pointer), 0, size);
755 
756  return start_pointer;
757  }
758 
759  /* Map allocation to a size class. */
760  if (size < lengthof(dsa_size_class_map) * DSA_SIZE_CLASS_MAP_QUANTUM)
761  {
762  int mapidx;
763 
764  /* For smaller sizes we have a lookup table... */
765  mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) /
766  DSA_SIZE_CLASS_MAP_QUANTUM) - 1;
767  size_class = dsa_size_class_map[mapidx];
768  }
769  else
770  {
771  uint16 min;
772  uint16 max;
773 
774  /* ... and for the rest we search by binary chop. */
775  min = dsa_size_class_map[lengthof(dsa_size_class_map) - 1];
776  max = lengthof(dsa_size_classes) - 1;
777 
778  while (min < max)
779  {
780  uint16 mid = (min + max) / 2;
781  uint16 class_size = dsa_size_classes[mid];
782 
783  if (class_size < size)
784  min = mid + 1;
785  else
786  max = mid;
787  }
788 
789  size_class = min;
790  }
791  Assert(size <= dsa_size_classes[size_class]);
792  Assert(size_class == 0 || size > dsa_size_classes[size_class - 1]);
793 
794  /* Attempt to allocate an object from the appropriate pool. */
795  result = alloc_object(area, size_class);
796 
797  /* Check for failure to allocate. */
798  if (!DsaPointerIsValid(result))
799  {
800  /* Raise error unless asked not to. */
801  if ((flags & DSA_ALLOC_NO_OOM) == 0)
802  ereport(ERROR,
803  (errcode(ERRCODE_OUT_OF_MEMORY),
804  errmsg("out of memory"),
805  errdetail("Failed on DSA request of size %zu.", size)));
806  return InvalidDsaPointer;
807  }
808 
809  /* Zero-initialize the memory if requested. */
810  if ((flags & DSA_ALLOC_ZERO) != 0)
811  memset(dsa_get_address(area, result), 0, size);
812 
813  return result;
814 }
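/*
 * Illustrative call patterns for the flags above ('nbytes' is assumed):
 *
 *		dsa_pointer p1 = dsa_allocate(area, 512);	// ERROR on failure
 *		dsa_pointer p2 = dsa_allocate0(area, 512);	// as above, but zeroed
 *		dsa_pointer p3 = dsa_allocate_extended(area, nbytes, DSA_ALLOC_NO_OOM);
 *
 *		if (!DsaPointerIsValid(p3))
 *			...			// handle out-of-memory without an ERROR
 */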
815 
816 /*
817  * Free memory obtained with dsa_allocate.
818  */
819 void
820 dsa_free(dsa_area *area, dsa_pointer dp)
821 {
822  dsa_segment_map *segment_map;
823  int pageno;
824  dsa_pointer span_pointer;
825  dsa_area_span *span;
826  char *superblock;
827  char *object;
828  size_t size;
829  int size_class;
830 
831  /* Make sure we don't have a stale segment in the slot 'dp' refers to. */
832  check_for_freed_segments(area);
833 
834  /* Locate the object, span and pool. */
835  segment_map = get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(dp));
836  pageno = DSA_EXTRACT_OFFSET(dp) / FPM_PAGE_SIZE;
837  span_pointer = segment_map->pagemap[pageno];
838  span = dsa_get_address(area, span_pointer);
839  superblock = dsa_get_address(area, span->start);
840  object = dsa_get_address(area, dp);
841  size_class = span->size_class;
842  size = dsa_size_classes[size_class];
843 
844  /*
845  * Special case for large objects that live in a special span: we return
846  * those pages directly to the free page manager and free the span.
847  */
848  if (span->size_class == DSA_SCLASS_SPAN_LARGE)
849  {
850 
851 #ifdef CLOBBER_FREED_MEMORY
852  memset(object, 0x7f, span->npages * FPM_PAGE_SIZE);
853 #endif
854 
855  /* Give pages back to free page manager. */
856  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
857  FreePageManagerPut(segment_map->fpm,
858  DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
859  span->npages);
860  LWLockRelease(DSA_AREA_LOCK(area));
861  /* Unlink span. */
862  LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
863  LW_EXCLUSIVE);
864  unlink_span(area, span);
865  LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
866  /* Free the span object so it can be reused. */
867  dsa_free(area, span_pointer);
868  return;
869  }
870 
871 #ifdef CLOBBER_FREED_MEMORY
872  memset(object, 0x7f, size);
873 #endif
874 
875  LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
876 
877  /* Put the object on the span's freelist. */
878  Assert(object >= superblock);
879  Assert(object < superblock + DSA_SUPERBLOCK_SIZE);
880  Assert((object - superblock) % size == 0);
881  NextFreeObjectIndex(object) = span->firstfree;
882  span->firstfree = (object - superblock) / size;
883  ++span->nallocatable;
884 
885  /*
886  * See if the span needs to be moved to a different fullness class, or be
887  * freed so its pages can be given back to the segment.
888  */
889  if (span->nallocatable == 1 && span->fclass == DSA_FULLNESS_CLASSES - 1)
890  {
891  /*
892  * The block was completely full and is located in the
893  * highest-numbered fullness class, which is never scanned for free
894  * chunks. We must move it to the next-lower fullness class.
895  */
896  unlink_span(area, span);
897  add_span_to_fullness_class(area, span, span_pointer,
898  DSA_FULLNESS_CLASSES - 2);
899 
900  /*
901  * If this is the only span, and there is no active span, then we
902  * should probably move this span to fullness class 1. (Otherwise if
903  * you allocate exactly all the objects in the only span, it moves to
904  * class 3, then you free them all, it moves to 2, and then is given
905  * back, leaving no active span).
906  */
907  }
908  else if (span->nallocatable == span->nmax &&
909  (span->fclass != 1 || span->prevspan != InvalidDsaPointer))
910  {
911  /*
912  * This entire block is free, and it's not the active block for this
913  * size class. Return the memory to the free page manager. We don't
914  * do this for the active block to prevent hysteresis: if we
915  * repeatedly allocate and free the only chunk in the active block, it
916  * will be very inefficient if we deallocate and reallocate the block
917  * every time.
918  */
919  destroy_superblock(area, span_pointer);
920  }
921 
922  LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
923 }
924 
925 /*
926  * Obtain a backend-local address for a dsa_pointer. 'dp' must point to
927  * memory allocated by the given area (possibly in another process) that
928  * hasn't yet been freed. This may cause a segment to be mapped into the
929  * current process if required, and may cause freed segments to be unmapped.
930  */
931 void *
932 dsa_get_address(dsa_area *area, dsa_pointer dp)
933 {
934  dsa_segment_index index;
935  size_t offset;
936 
937  /* Convert InvalidDsaPointer to NULL. */
938  if (!DsaPointerIsValid(dp))
939  return NULL;
940 
941  /* Process any requests to detach from freed segments. */
942  check_for_freed_segments(area);
943 
944  /* Break the dsa_pointer into its components. */
945  index = DSA_EXTRACT_SEGMENT_NUMBER(dp);
946  offset = DSA_EXTRACT_OFFSET(dp);
947  Assert(index < DSA_MAX_SEGMENTS);
948 
949  /* Check if we need to cause this segment to be mapped in. */
950  if (unlikely(area->segment_maps[index].mapped_address == NULL))
951  {
952  /* Call for effect (we don't need the result). */
953  get_segment_by_index(area, index);
954  }
955 
956  return area->segment_maps[index].mapped_address + offset;
957 }
958 
959 /*
960  * Pin this area, so that it will continue to exist even if all backends
961  * detach from it. In that case, the area can still be reattached to if a
962  * handle has been recorded somewhere.
963  */
964 void
965 dsa_pin(dsa_area *area)
966 {
967  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
968  if (area->control->pinned)
969  {
970  LWLockRelease(DSA_AREA_LOCK(area));
971  elog(ERROR, "dsa_area already pinned");
972  }
973  area->control->pinned = true;
974  ++area->control->refcnt;
975  LWLockRelease(DSA_AREA_LOCK(area));
976 }
977 
978 /*
979  * Undo the effects of dsa_pin, so that the given area can be freed when no
980  * backends are attached to it. May be called only if dsa_pin has been
981  * called.
982  */
983 void
984 dsa_unpin(dsa_area *area)
985 {
986  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
987  Assert(area->control->refcnt > 1);
988  if (!area->control->pinned)
989  {
990  LWLockRelease(DSA_AREA_LOCK(area));
991  elog(ERROR, "dsa_area not pinned");
992  }
993  area->control->pinned = false;
994  --area->control->refcnt;
995  LWLockRelease(DSA_AREA_LOCK(area));
996 }
997 
998 /*
999  * Set the total size limit for this area. This limit is checked whenever new
1000  * segments need to be allocated from the operating system. If the new size
1001  * limit is already exceeded, this has no immediate effect.
1002  *
1003  * Note that the total virtual memory usage may be temporarily larger than
1004  * this limit when segments have been freed, but not yet detached by all
1005  * backends that have attached to them.
1006  */
1007 void
1008 dsa_set_size_limit(dsa_area *area, size_t limit)
1009 {
1010  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1011  area->control->max_total_segment_size = limit;
1012  LWLockRelease(DSA_AREA_LOCK(area));
1013 }
1014 
1015 /*
1016  * Aggressively free all spare memory in the hope of returning DSM segments to
1017  * the operating system.
1018  */
1019 void
1020 dsa_trim(dsa_area *area)
1021 {
1022  int size_class;
1023 
1024  /*
1025  * Trim in reverse pool order so we get to the spans-of-spans last, just
1026  * in case any become entirely free while processing all the other pools.
1027  */
1028  for (size_class = DSA_NUM_SIZE_CLASSES - 1; size_class >= 0; --size_class)
1029  {
1030  dsa_area_pool *pool = &area->control->pools[size_class];
1031  dsa_pointer span_pointer;
1032 
1033  if (size_class == DSA_SCLASS_SPAN_LARGE)
1034  {
1035  /* Large object frees give back segments aggressively already. */
1036  continue;
1037  }
1038 
1039  /*
1040  * Search fullness class 1 only. That is where we expect to find an
1041  * entirely empty superblock (entirely empty superblocks in other
1042  * fullness classes are returned to the free page map by dsa_free).
1043  */
1044  LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1045  span_pointer = pool->spans[1];
1046  while (DsaPointerIsValid(span_pointer))
1047  {
1048  dsa_area_span *span = dsa_get_address(area, span_pointer);
1049  dsa_pointer next = span->nextspan;
1050 
1051  if (span->nallocatable == span->nmax)
1052  destroy_superblock(area, span_pointer);
1053 
1054  span_pointer = next;
1055  }
1056  LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1057  }
1058 }
1059 
1060 /*
1061  * Print out debugging information about the internal state of the shared
1062  * memory area.
1063  */
1064 void
1065 dsa_dump(dsa_area *area)
1066 {
1067  size_t i,
1068  j;
1069 
1070  /*
1071  * Note: This gives an inconsistent snapshot as it acquires and releases
1072  * individual locks as it goes...
1073  */
1074 
1075  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1076  check_for_freed_segments_locked(area);
1077  fprintf(stderr, "dsa_area handle %x:\n", area->control->handle);
1078  fprintf(stderr, " max_total_segment_size: %zu\n",
1079  area->control->max_total_segment_size);
1080  fprintf(stderr, " total_segment_size: %zu\n",
1081  area->control->total_segment_size);
1082  fprintf(stderr, " refcnt: %d\n", area->control->refcnt);
1083  fprintf(stderr, " pinned: %c\n", area->control->pinned ? 't' : 'f');
1084  fprintf(stderr, " segment bins:\n");
1085  for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1086  {
1087  if (area->control->segment_bins[i] != DSA_SEGMENT_INDEX_NONE)
1088  {
1089  dsa_segment_index segment_index;
1090 
1091  fprintf(stderr,
1092  " segment bin %zu (at least %d contiguous pages free):\n",
1093  i, 1 << (i - 1));
1094  segment_index = area->control->segment_bins[i];
1095  while (segment_index != DSA_SEGMENT_INDEX_NONE)
1096  {
1097  dsa_segment_map *segment_map;
1098 
1099  segment_map =
1100  get_segment_by_index(area, segment_index);
1101 
1102  fprintf(stderr,
1103  " segment index %zu, usable_pages = %zu, "
1104  "contiguous_pages = %zu, mapped at %p\n",
1105  segment_index,
1106  segment_map->header->usable_pages,
1107  fpm_largest(segment_map->fpm),
1108  segment_map->mapped_address);
1109  segment_index = segment_map->header->next;
1110  }
1111  }
1112  }
1113  LWLockRelease(DSA_AREA_LOCK(area));
1114 
1115  fprintf(stderr, " pools:\n");
1116  for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1117  {
1118  bool found = false;
1119 
1120  LWLockAcquire(DSA_SCLASS_LOCK(area, i), LW_EXCLUSIVE);
1121  for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1122  if (DsaPointerIsValid(area->control->pools[i].spans[j]))
1123  found = true;
1124  if (found)
1125  {
1126  if (i == DSA_SCLASS_BLOCK_OF_SPANS)
1127  fprintf(stderr, " pool for blocks of span objects:\n");
1128  else if (i == DSA_SCLASS_SPAN_LARGE)
1129  fprintf(stderr, " pool for large object spans:\n");
1130  else
1131  fprintf(stderr,
1132  " pool for size class %zu (object size %hu bytes):\n",
1133  i, dsa_size_classes[i]);
1134  for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1135  {
1136  if (!DsaPointerIsValid(area->control->pools[i].spans[j]))
1137  fprintf(stderr, " fullness class %zu is empty\n", j);
1138  else
1139  {
1140  dsa_pointer span_pointer = area->control->pools[i].spans[j];
1141 
1142  fprintf(stderr, " fullness class %zu:\n", j);
1143  while (DsaPointerIsValid(span_pointer))
1144  {
1145  dsa_area_span *span;
1146 
1147  span = dsa_get_address(area, span_pointer);
1148  fprintf(stderr,
1149  " span descriptor at "
1150  DSA_POINTER_FORMAT ", superblock at "
1151  DSA_POINTER_FORMAT
1152  ", pages = %zu, objects free = %hu/%hu\n",
1153  span_pointer, span->start, span->npages,
1154  span->nallocatable, span->nmax);
1155  span_pointer = span->nextspan;
1156  }
1157  }
1158  }
1159  }
1160  LWLockRelease(DSA_SCLASS_LOCK(area, i));
1161  }
1162 }
1163 
1164 /*
1165  * Return the smallest size that you can successfully provide to
1166  * dsa_create_in_place.
1167  */
1168 size_t
1169 dsa_minimum_size(void)
1170 {
1171  size_t size;
1172  int pages = 0;
1173 
1174  size = MAXALIGN(sizeof(dsa_area_control)) +
1175  MAXALIGN(sizeof(FreePageManager));
1176 
1177  /* Figure out how many pages we need, including the page map... */
1178  while (((size + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE) > pages)
1179  {
1180  ++pages;
1181  size += sizeof(dsa_pointer);
1182  }
1183 
1184  return pages * FPM_PAGE_SIZE;
1185 }
1186 
1187 /*
1188  * Workhorse function for dsa_create and dsa_create_in_place.
1189  */
1190 static dsa_area *
1191 create_internal(void *place, size_t size,
1192  int tranche_id,
1193  dsm_handle control_handle,
1194  dsm_segment *control_segment)
1195 {
1196  dsa_area_control *control;
1197  dsa_area *area;
1198  dsa_segment_map *segment_map;
1199  size_t usable_pages;
1200  size_t total_pages;
1201  size_t metadata_bytes;
1202  int i;
1203 
1204  /* Sanity check on the space we have to work in. */
1205  if (size < dsa_minimum_size())
1206  elog(ERROR, "dsa_area space must be at least %zu, but %zu provided",
1207  dsa_minimum_size(), size);
1208 
1209  /* Now figure out how much space is usable */
1210  total_pages = size / FPM_PAGE_SIZE;
1211  metadata_bytes =
1212  MAXALIGN(sizeof(dsa_area_control)) +
1213  MAXALIGN(sizeof(FreePageManager)) +
1214  total_pages * sizeof(dsa_pointer);
1215  /* Add padding up to next page boundary. */
1216  if (metadata_bytes % FPM_PAGE_SIZE != 0)
1217  metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
1218  Assert(metadata_bytes <= size);
1219  usable_pages = (size - metadata_bytes) / FPM_PAGE_SIZE;
1220 
1221  /*
1222  * Initialize the dsa_area_control object located at the start of the
1223  * space.
1224  */
1225  control = (dsa_area_control *) place;
1226  control->segment_header.magic =
1227  DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0;
1228  control->segment_header.next = DSA_SEGMENT_INDEX_NONE;
1229  control->segment_header.prev = DSA_SEGMENT_INDEX_NONE;
1230  control->segment_header.usable_pages = usable_pages;
1231  control->segment_header.freed = false;
1233  control->handle = control_handle;
1234  control->max_total_segment_size = (size_t) -1;
1235  control->total_segment_size = size;
1236  memset(&control->segment_handles[0], 0,
1237  sizeof(dsm_handle) * DSA_MAX_SEGMENTS);
1238  control->segment_handles[0] = control_handle;
1239  for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1240  control->segment_bins[i] = DSA_SEGMENT_INDEX_NONE;
1241  control->high_segment_index = 0;
1242  control->refcnt = 1;
1243  control->freed_segment_counter = 0;
1244  control->lwlock_tranche_id = tranche_id;
1245 
1246  /*
1247  * Create the dsa_area object that this backend will use to access the
1248  * area. Other backends will need to obtain their own dsa_area object by
1249  * attaching.
1250  */
1251  area = palloc(sizeof(dsa_area));
1252  area->control = control;
1253  area->mapping_pinned = false;
1254  memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1255  area->high_segment_index = 0;
1256  area->freed_segment_counter = 0;
1257  LWLockInitialize(&control->lock, control->lwlock_tranche_id);
1258  for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1259  LWLockInitialize(DSA_SCLASS_LOCK(area, i),
1260  control->lwlock_tranche_id);
1261 
1262  /* Set up the segment map for this process's mapping. */
1263  segment_map = &area->segment_maps[0];
1264  segment_map->segment = control_segment;
1265  segment_map->mapped_address = place;
1266  segment_map->header = (dsa_segment_header *) place;
1267  segment_map->fpm = (FreePageManager *)
1268  (segment_map->mapped_address +
1269  MAXALIGN(sizeof(dsa_area_control)));
1270  segment_map->pagemap = (dsa_pointer *)
1271  (segment_map->mapped_address +
1272  MAXALIGN(sizeof(dsa_area_control)) +
1273  MAXALIGN(sizeof(FreePageManager)));
1274 
1275  /* Set up the free page map. */
1276  FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
1277  /* There can be 0 usable pages if size is dsa_minimum_size(). */
1278 
1279  if (usable_pages > 0)
1280  FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
1281  usable_pages);
1282 
1283  /* Put this segment into the appropriate bin. */
1284  control->segment_bins[contiguous_pages_to_segment_bin(usable_pages)] = 0;
1285  segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
1286 
1287  return area;
1288 }
1289 
1290 /*
1291  * Workhorse function for dsa_attach and dsa_attach_in_place.
1292  */
1293 static dsa_area *
1294 attach_internal(void *place, dsm_segment *segment, dsa_handle handle)
1295 {
1296  dsa_area_control *control;
1297  dsa_area *area;
1298  dsa_segment_map *segment_map;
1299 
1300  control = (dsa_area_control *) place;
1301  Assert(control->handle == handle);
1302  Assert(control->segment_handles[0] == handle);
1303  Assert(control->segment_header.magic ==
1304  (DSA_SEGMENT_HEADER_MAGIC ^ handle ^ 0));
1305 
1306  /* Build the backend-local area object. */
1307  area = palloc(sizeof(dsa_area));
1308  area->control = control;
1309  area->mapping_pinned = false;
1310  memset(&area->segment_maps[0], 0,
1311  sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1312  area->high_segment_index = 0;
1313 
1314  /* Set up the segment map for this process's mapping. */
1315  segment_map = &area->segment_maps[0];
1316  segment_map->segment = segment; /* NULL for in-place */
1317  segment_map->mapped_address = place;
1318  segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
1319  segment_map->fpm = (FreePageManager *)
1320  (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)));
1321  segment_map->pagemap = (dsa_pointer *)
1322  (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)) +
1323  MAXALIGN(sizeof(FreePageManager)));
1324 
1325  /* Bump the reference count. */
1326  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1327  if (control->refcnt == 0)
1328  {
1329  /* We can't attach to a DSA area that has already been destroyed. */
1330  ereport(ERROR,
1331  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1332  errmsg("could not attach to dynamic shared area")));
1333  }
1334  ++control->refcnt;
1335  area->freed_segment_counter = area->control->freed_segment_counter;
1336  LWLockRelease(DSA_AREA_LOCK(area));
1337 
1338  return area;
1339 }
1340 
1341 /*
1342  * Add a new span to fullness class 1 of the indicated pool.
1343  */
1344 static void
1345 init_span(dsa_area *area,
1346  dsa_pointer span_pointer,
1347  dsa_area_pool *pool, dsa_pointer start, size_t npages,
1348  uint16 size_class)
1349 {
1350  dsa_area_span *span = dsa_get_address(area, span_pointer);
1351  size_t obsize = dsa_size_classes[size_class];
1352 
1353  /*
1354  * The per-pool lock must be held because we manipulate the span list for
1355  * this pool.
1356  */
1357  Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1358 
1359  /* Push this span onto the front of the span list for fullness class 1. */
1360  if (DsaPointerIsValid(pool->spans[1]))
1361  {
1362  dsa_area_span *head = (dsa_area_span *)
1363  dsa_get_address(area, pool->spans[1]);
1364 
1365  head->prevspan = span_pointer;
1366  }
1367  span->pool = DsaAreaPoolToDsaPointer(area, pool);
1368  span->nextspan = pool->spans[1];
1369  span->prevspan = InvalidDsaPointer;
1370  pool->spans[1] = span_pointer;
1371 
1372  span->start = start;
1373  span->npages = npages;
1374  span->size_class = size_class;
1375  span->ninitialized = 0;
1376  if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1377  {
1378  /*
1379  * A block-of-spans contains its own descriptor, so mark one object as
1380  * initialized and reduce the count of allocatable objects by one.
1381  * Doing this here has the side effect of also reducing nmax by one,
1382  * which is important to make sure we free this object at the correct
1383  * time.
1384  */
1385  span->ninitialized = 1;
1386  span->nallocatable = FPM_PAGE_SIZE / obsize - 1;
1387  }
1388  else if (size_class != DSA_SCLASS_SPAN_LARGE)
1389  span->nallocatable = DSA_SUPERBLOCK_SIZE / obsize;
1390  span->firstfree = DSA_SPAN_NOTHING_FREE;
1391  span->nmax = span->nallocatable;
1392  span->fclass = 1;
1393 }
1394 
1395 /*
1396  * Transfer the first span in one fullness class to the head of another
1397  * fullness class.
1398  */
1399 static bool
1400 transfer_first_span(dsa_area *area,
1401  dsa_area_pool *pool, int fromclass, int toclass)
1402 {
1403  dsa_pointer span_pointer;
1404  dsa_area_span *span;
1405  dsa_area_span *nextspan;
1406 
1407  /* Can't do it if source list is empty. */
1408  span_pointer = pool->spans[fromclass];
1409  if (!DsaPointerIsValid(span_pointer))
1410  return false;
1411 
1412  /* Remove span from head of source list. */
1413  span = dsa_get_address(area, span_pointer);
1414  pool->spans[fromclass] = span->nextspan;
1415  if (DsaPointerIsValid(span->nextspan))
1416  {
1417  nextspan = (dsa_area_span *)
1418  dsa_get_address(area, span->nextspan);
1419  nextspan->prevspan = InvalidDsaPointer;
1420  }
1421 
1422  /* Add span to head of target list. */
1423  span->nextspan = pool->spans[toclass];
1424  pool->spans[toclass] = span_pointer;
1425  if (DsaPointerIsValid(span->nextspan))
1426  {
1427  nextspan = (dsa_area_span *)
1428  dsa_get_address(area, span->nextspan);
1429  nextspan->prevspan = span_pointer;
1430  }
1431  span->fclass = toclass;
1432 
1433  return true;
1434 }
1435 
1436 /*
1437  * Allocate one object of the requested size class from the given area.
1438  */
1439 static inline dsa_pointer
1440 alloc_object(dsa_area *area, int size_class)
1441 {
1442  dsa_area_pool *pool = &area->control->pools[size_class];
1443  dsa_area_span *span;
1444  dsa_pointer block;
1445  dsa_pointer result;
1446  char *object;
1447  size_t size;
1448 
1449  /*
1450  * Even though ensure_active_superblock can in turn call alloc_object if
1451  * it needs to allocate a new span, that's always from a different pool,
1452  * and the order of lock acquisition is always the same, so it's OK that
1453  * we hold this lock for the duration of this function.
1454  */
1455  Assert(!LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1456  LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1457 
1458  /*
1459  * If there's no active superblock, we must successfully obtain one or
1460  * fail the request.
1461  */
1462  if (!DsaPointerIsValid(pool->spans[1]) &&
1463  !ensure_active_superblock(area, pool, size_class))
1464  {
1465  result = InvalidDsaPointer;
1466  }
1467  else
1468  {
1469  /*
1470  * There should be a block in fullness class 1 at this point, and it
1471  * should never be completely full. Thus we can either pop an object
1472  * from the free list or, failing that, initialize a new object.
1473  */
1474  Assert(DsaPointerIsValid(pool->spans[1]));
1475  span = (dsa_area_span *)
1476  dsa_get_address(area, pool->spans[1]);
1477  Assert(span->nallocatable > 0);
1478  block = span->start;
1479  Assert(size_class < DSA_NUM_SIZE_CLASSES);
1480  size = dsa_size_classes[size_class];
1481  if (span->firstfree != DSA_SPAN_NOTHING_FREE)
1482  {
1483  result = block + span->firstfree * size;
1484  object = dsa_get_address(area, result);
1485  span->firstfree = NextFreeObjectIndex(object);
1486  }
1487  else
1488  {
1489  result = block + span->ninitialized * size;
1490  ++span->ninitialized;
1491  }
1492  --span->nallocatable;
1493 
1494  /* If it's now full, move it to the highest-numbered fullness class. */
1495  if (span->nallocatable == 0)
1496  transfer_first_span(area, pool, 1, DSA_FULLNESS_CLASSES - 1);
1497  }
1498 
1499  Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1500  LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1501 
1502  return result;
1503 }
1504 
1505 /*
1506  * Ensure an active (i.e. fullness class 1) superblock, unless all existing
1507  * superblocks are completely full and no more can be allocated.
1508  *
1509  * Fullness classes K of 0..N are loosely intended to represent blocks whose
1510  * utilization percentage is at least K/N, but we only enforce this rigorously
1511  * for the highest-numbered fullness class, which always contains exactly
1512  * those blocks that are completely full. It's otherwise acceptable for a
1513  * block to be in a higher-numbered fullness class than the one to which it
1514  * logically belongs. In addition, the active block, which is always the
1515  * first block in fullness class 1, is permitted to have a higher allocation
1516  * percentage than would normally be allowable for that fullness class; we
1517  * don't move it until it's completely full, and then it goes to the
1518  * highest-numbered fullness class.
1519  *
1520  * It might seem odd that the active block is the head of fullness class 1
1521  * rather than fullness class 0, but experience with other allocators has
1522  * shown that it's usually better to allocate from a block that's moderately
1523  * full rather than one that's nearly empty. Insofar as is reasonably
1524  * possible, we want to avoid performing new allocations in a block that would
1525  * otherwise become empty soon.
1526  */
1527 static bool
1528 ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
1529  int size_class)
1530 {
1531  dsa_pointer span_pointer;
1532  dsa_pointer start_pointer;
1533  size_t obsize = dsa_size_classes[size_class];
1534  size_t nmax;
1535  int fclass;
1536  size_t npages = 1;
1537  size_t first_page;
1538  size_t i;
1539  dsa_segment_map *segment_map;
1540 
1541  Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1542 
1543  /*
1544  * Compute the number of objects that will fit in a block of this size
1545  * class. Span-of-spans blocks are just a single page, and the first
1546  * object isn't available for use because it describes the block-of-spans
1547  * itself.
1548  */
1549  if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1550  nmax = FPM_PAGE_SIZE / obsize - 1;
1551  else
1552  nmax = DSA_SUPERBLOCK_SIZE / obsize;
1553 
1554  /*
1555  * If fullness class 1 is empty, try to find a span to put in it by
1556  * scanning higher-numbered fullness classes (excluding the last one,
1557  * whose blocks are certain to all be completely full).
1558  */
1559  for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1560  {
1561  span_pointer = pool->spans[fclass];
1562 
1563  while (DsaPointerIsValid(span_pointer))
1564  {
1565  int tfclass;
1566  dsa_area_span *span;
1567  dsa_area_span *nextspan;
1568  dsa_area_span *prevspan;
1569  dsa_pointer next_span_pointer;
1570 
1571  span = (dsa_area_span *)
1572  dsa_get_address(area, span_pointer);
1573  next_span_pointer = span->nextspan;
1574 
1575  /* Figure out what fullness class should contain this span. */
1576  tfclass = (nmax - span->nallocatable)
1577  * (DSA_FULLNESS_CLASSES - 1) / nmax;
1578 
1579  /* Look up next span. */
1580  if (DsaPointerIsValid(span->nextspan))
1581  nextspan = (dsa_area_span *)
1582  dsa_get_address(area, span->nextspan);
1583  else
1584  nextspan = NULL;
1585 
1586  /*
1587  * If utilization has dropped enough that this now belongs in some
1588  * other fullness class, move it there.
1589  */
1590  if (tfclass < fclass)
1591  {
1592  /* Remove from the current fullness class list. */
1593  if (pool->spans[fclass] == span_pointer)
1594  {
1595  /* It was the head; remove it. */
1596  Assert(!DsaPointerIsValid(span->prevspan));
1597  pool->spans[fclass] = span->nextspan;
1598  if (nextspan != NULL)
1599  nextspan->prevspan = InvalidDsaPointer;
1600  }
1601  else
1602  {
1603  /* It was not the head. */
1604  Assert(DsaPointerIsValid(span->prevspan));
1605  prevspan = (dsa_area_span *)
1606  dsa_get_address(area, span->prevspan);
1607  prevspan->nextspan = span->nextspan;
1608  }
1609  if (nextspan != NULL)
1610  nextspan->prevspan = span->prevspan;
1611 
1612  /* Push onto the head of the new fullness class list. */
1613  span->nextspan = pool->spans[tfclass];
1614  pool->spans[tfclass] = span_pointer;
1615  span->prevspan = InvalidDsaPointer;
1616  if (DsaPointerIsValid(span->nextspan))
1617  {
1618  nextspan = (dsa_area_span *)
1619  dsa_get_address(area, span->nextspan);
1620  nextspan->prevspan = span_pointer;
1621  }
1622  span->fclass = tfclass;
1623  }
1624 
1625  /* Advance to next span on list. */
1626  span_pointer = next_span_pointer;
1627  }
1628 
1629  /* Stop now if we found a suitable block. */
1630  if (DsaPointerIsValid(pool->spans[1]))
1631  return true;
1632  }
1633 
1634  /*
1635  * If there are no blocks that properly belong in fullness class 1, pick
1636  * one from some other fullness class and move it there anyway, so that we
1637  * have an allocation target. Our last choice is to transfer a block
1638  * that's almost empty (and might become completely empty soon if left
1639  * alone), but even that is better than failing, which is what we must do
1640  * if there are no blocks at all with freespace.
1641  */
1642  Assert(!DsaPointerIsValid(pool->spans[1]));
1643  for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1644  if (transfer_first_span(area, pool, fclass, 1))
1645  return true;
1646  if (!DsaPointerIsValid(pool->spans[1]) &&
1647  transfer_first_span(area, pool, 0, 1))
1648  return true;
1649 
1650  /*
1651  * We failed to find an existing span with free objects, so we need to
1652  * allocate a new superblock and construct a new span to manage it.
1653  *
1654  * First, get a dsa_area_span object to describe the new superblock
1655  * ... unless this allocation is for a dsa_area_span object, in which case
1656  * that's surely not going to work. We handle that case by storing the
1657  * span describing a block-of-spans inline.
1658  */
1659  if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1660  {
1661  span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
1662  if (!DsaPointerIsValid(span_pointer))
1663  return false;
1664  npages = DSA_PAGES_PER_SUPERBLOCK;
1665  }
1666 
1667  /* Find or create a segment and allocate the superblock. */
1668  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1669  segment_map = get_best_segment(area, npages);
1670  if (segment_map == NULL)
1671  {
1672  segment_map = make_new_segment(area, npages);
1673  if (segment_map == NULL)
1674  {
1675  LWLockRelease(DSA_AREA_LOCK(area));
1676  return false;
1677  }
1678  }
1679 
1680  /*
1681  * This shouldn't happen: get_best_segment() or make_new_segment()
1682  * promised that we can successfully allocate npages.
1683  */
1684  if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
1685  elog(FATAL,
1686  "dsa_allocate could not find %zu free pages for superblock",
1687  npages);
1688  LWLockRelease(DSA_AREA_LOCK(area));
1689 
1690  /* Compute the start of the superblock. */
1691  start_pointer =
1692  DSA_MAKE_POINTER(get_segment_index(area, segment_map),
1693  first_page * FPM_PAGE_SIZE);
1694 
1695  /*
1696  * If this is a block-of-spans, carve the descriptor right out of the
1697  * allocated space.
1698  */
1699  if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1700  {
1701  /*
1702  * We have a pointer into the segment. We need to build a dsa_pointer
1703  * from the segment index and offset into the segment.
1704  */
1705  span_pointer = start_pointer;
1706  }
1707 
1708  /* Initialize span and pagemap. */
1709  init_span(area, span_pointer, pool, start_pointer, npages, size_class);
1710  for (i = 0; i < npages; ++i)
1711  segment_map->pagemap[first_page + i] = span_pointer;
1712 
1713  return true;
1714 }
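
As a worked illustration of the fullness-class arithmetic above (a sketch only: DSA_FULLNESS_CLASSES is assumed to be 4 here, and a superblock holding nmax = 128 objects is hypothetical):

    tfclass = (nmax - nallocatable) * (DSA_FULLNESS_CLASSES - 1) / nmax

    nallocatable = 128  (completely empty)  ->  (  0 * 3) / 128 = 0
    nallocatable =  64  (half full)         ->  ( 64 * 3) / 128 = 1
    nallocatable =   1  (nearly full)       ->  (127 * 3) / 128 = 2
    nallocatable =   0  (completely full)   ->  (128 * 3) / 128 = 3

Since a span is only moved when tfclass < fclass, spans migrate toward lower (emptier) classes as objects are freed, and fullness class 1 is refilled as the preferred allocation target.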
1715 
1716 /*
1717  * Return the segment map corresponding to a given segment index, mapping the
1718  * segment in if necessary. For internal segment book-keeping, this is called
1719  * with the area lock held. It is also called by dsa_free and dsa_get_address
1720  * without any locking, relying on the fact they have a known live segment
1721  * index and they always call check_for_freed_segments to ensure that any
1722  * freed segment occupying the same slot is detached first.
1723  */
1724 static dsa_segment_map *
1725 get_segment_by_index(dsa_area *area, dsa_segment_index index)
1726 {
1727  if (unlikely(area->segment_maps[index].mapped_address == NULL))
1728  {
1729  dsm_handle handle;
1730  dsm_segment *segment;
1731  dsa_segment_map *segment_map;
1732 
1733  /*
1734  * If we are reached by dsa_free or dsa_get_address, there must be at
1735  * least one object allocated in the referenced segment. Otherwise,
1736  * their caller has a double-free or access-after-free bug, which we
1737  * have no hope of detecting. So we know it's safe to access this
1738  * array slot without holding a lock; it won't change underneath us.
1739  * Furthermore, we know that we can see the latest contents of the
1740  * slot, as explained in check_for_freed_segments, which those
1741  * functions call before arriving here.
1742  */
1743  handle = area->control->segment_handles[index];
1744 
1745  /* It's an error to try to access an unused slot. */
1746  if (handle == DSM_HANDLE_INVALID)
1747  elog(ERROR,
1748  "dsa_area could not attach to a segment that has been freed");
1749 
1750  segment = dsm_attach(handle);
1751  if (segment == NULL)
1752  elog(ERROR, "dsa_area could not attach to segment");
1753  if (area->mapping_pinned)
1754  dsm_pin_mapping(segment);
1755  segment_map = &area->segment_maps[index];
1756  segment_map->segment = segment;
1757  segment_map->mapped_address = dsm_segment_address(segment);
1758  segment_map->header =
1759  (dsa_segment_header *) segment_map->mapped_address;
1760  segment_map->fpm = (FreePageManager *)
1761  (segment_map->mapped_address +
1762  MAXALIGN(sizeof(dsa_segment_header)));
1763  segment_map->pagemap = (dsa_pointer *)
1764  (segment_map->mapped_address +
1765  MAXALIGN(sizeof(dsa_segment_header)) +
1766  MAXALIGN(sizeof(FreePageManager)));
1767 
1768  /* Remember the highest index this backend has ever mapped. */
1769  if (area->high_segment_index < index)
1770  area->high_segment_index = index;
1771 
1772  Assert(segment_map->header->magic ==
1773  (DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ index));
1774  }
1775 
1776  /*
1777  * Callers of dsa_get_address() and dsa_free() don't hold the area lock,
1778  * but it's a bug in the calling code and undefined behavior if the
1779  * address is not live (ie if the segment might possibly have been freed,
1780  * they're trying to use a dangling pointer).
1781  *
1782  * For dsa.c code that holds the area lock to manipulate segment_bins
1783  * lists, it would be a bug if we ever reach a freed segment here. After
1784  * it's marked as freed, the only thing any backend should do with it is
1785  * unmap it, and it should always have done that in
1786  * check_for_freed_segments_locked() before arriving here to resolve an
1787  * index to a segment_map.
1788  *
1789  * Either way we can assert that we aren't returning a freed segment.
1790  */
1791  Assert(!area->segment_maps[index].header->freed);
1792 
1793  return &area->segment_maps[index];
1794 }
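
For context, a minimal sketch of how a dsa_pointer is turned into a backend-local address once the segment map above is available (this is not the exact body of dsa_get_address, just its shape):

    dsa_segment_index index = DSA_EXTRACT_SEGMENT_NUMBER(dp);
    size_t      offset = DSA_EXTRACT_OFFSET(dp);
    dsa_segment_map *segment_map;

    check_for_freed_segments(area);     /* detach stale mappings first */
    segment_map = get_segment_by_index(area, index);
    return (void *) (segment_map->mapped_address + offset);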
1795 
1796 /*
1797  * Return a superblock to the free page manager. If the underlying segment
1798  * has become entirely free, then return it to the operating system.
1799  *
1800  * The appropriate pool lock must be held.
1801  */
1802 static void
1803 destroy_superblock(dsa_area *area, dsa_pointer span_pointer)
1804 {
1805  dsa_area_span *span = dsa_get_address(area, span_pointer);
1806  int size_class = span->size_class;
1807  dsa_segment_map *segment_map;
1808 
1809 
1810  /* Remove it from its fullness class list. */
1811  unlink_span(area, span);
1812 
1813  /*
1814  * Note: Here we acquire the area lock while we already hold a per-pool
1815  * lock. We never hold the area lock and then take a pool lock, or we
1816  * could deadlock.
1817  */
1818  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1819  check_for_freed_segments_locked(area);
1820  segment_map =
1821  get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start));
1822  FreePageManagerPut(segment_map->fpm,
1823  DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
1824  span->npages);
1825  /* Check if the segment is now entirely free. */
1826  if (fpm_largest(segment_map->fpm) == segment_map->header->usable_pages)
1827  {
1828  dsa_segment_index index = get_segment_index(area, segment_map);
1829 
1830  /* If it's not the segment with extra control data, free it. */
1831  if (index != 0)
1832  {
1833  /*
1834  * Give it back to the OS, and allow other backends to detect that
1835  * they need to detach.
1836  */
1837  unlink_segment(area, segment_map);
1838  segment_map->header->freed = true;
1839  Assert(area->control->total_segment_size >=
1840  segment_map->header->size);
1841  area->control->total_segment_size -=
1842  segment_map->header->size;
1843  dsm_unpin_segment(dsm_segment_handle(segment_map->segment));
1844  dsm_detach(segment_map->segment);
1845  area->control->segment_handles[index] = DSM_HANDLE_INVALID;
1846  ++area->control->freed_segment_counter;
1847  segment_map->segment = NULL;
1848  segment_map->header = NULL;
1849  segment_map->mapped_address = NULL;
1850  }
1851  }
1852  LWLockRelease(DSA_AREA_LOCK(area));
1853 
1854  /*
1855  * Span-of-spans blocks store the span which describes them within the
1856  * block itself, so freeing the storage implicitly frees the descriptor
1857  * also. If this is a block of any other type, we need to separately free
1858  * the span object also. This recursive call to dsa_free will acquire the
1859  * span pool's lock. We can't deadlock because the acquisition order is
1860  * always some other pool and then the span pool.
1861  */
1862  if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1863  dsa_free(area, span_pointer);
1864 }
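
The lock-ordering rule mentioned in the comment above boils down to: a per-pool (size-class) lock may be held while the area lock is acquired, but never the reverse. A hedged sketch of the permitted nesting:

    /* OK: pool lock first, then area lock (as destroy_superblock does). */
    LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
    LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
    /* ... manipulate segments ... */
    LWLockRelease(DSA_AREA_LOCK(area));
    LWLockRelease(DSA_SCLASS_LOCK(area, size_class));

Taking the two locks in the opposite order in any code path could deadlock against this one.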
1865 
1866 static void
1867 unlink_span(dsa_area *area, dsa_area_span *span)
1868 {
1869  if (DsaPointerIsValid(span->nextspan))
1870  {
1871  dsa_area_span *next = dsa_get_address(area, span->nextspan);
1872 
1873  next->prevspan = span->prevspan;
1874  }
1875  if (DsaPointerIsValid(span->prevspan))
1876  {
1877  dsa_area_span *prev = dsa_get_address(area, span->prevspan);
1878 
1879  prev->nextspan = span->nextspan;
1880  }
1881  else
1882  {
1883  dsa_area_pool *pool = dsa_get_address(area, span->pool);
1884 
1885  pool->spans[span->fclass] = span->nextspan;
1886  }
1887 }
1888 
1889 static void
1890 add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
1891  dsa_pointer span_pointer,
1892  int fclass)
1893 {
1894  dsa_area_pool *pool = dsa_get_address(area, span->pool);
1895 
1896  if (DsaPointerIsValid(pool->spans[fclass]))
1897  {
1898  dsa_area_span *head = dsa_get_address(area,
1899  pool->spans[fclass]);
1900 
1901  head->prevspan = span_pointer;
1902  }
1903  span->prevspan = InvalidDsaPointer;
1904  span->nextspan = pool->spans[fclass];
1905  pool->spans[fclass] = span_pointer;
1906  span->fclass = fclass;
1907 }
1908 
1909 /*
1910  * Detach from an area that was either created or attached to by this process.
1911  */
1912 void
1913 dsa_detach(dsa_area *area)
1914 {
1915  int i;
1916 
1917  /* Detach from all segments. */
1918  for (i = 0; i <= area->high_segment_index; ++i)
1919  if (area->segment_maps[i].segment != NULL)
1920  dsm_detach(area->segment_maps[i].segment);
1921 
1922  /*
1923  * Note that 'detaching' (= detaching from DSM segments) doesn't include
1924  * 'releasing' (= adjusting the reference count). It would be nice to
1925  * combine these operations, but client code might never get around to
1926  * calling dsa_detach because of an error path, and a detach hook on any
1927  * particular segment is too late to detach other segments in the area
1928  * without risking a 'leak' warning in the non-error path.
1929  */
1930 
1931  /* Free the backend-local area object. */
1932  pfree(area);
1933 }
1934 
1935 /*
1936  * Unlink a segment from the bin that contains it.
1937  */
1938 static void
1939 unlink_segment(dsa_area *area, dsa_segment_map *segment_map)
1940 {
1941  if (segment_map->header->prev != DSA_SEGMENT_INDEX_NONE)
1942  {
1943  dsa_segment_map *prev;
1944 
1945  prev = get_segment_by_index(area, segment_map->header->prev);
1946  prev->header->next = segment_map->header->next;
1947  }
1948  else
1949  {
1950  Assert(area->control->segment_bins[segment_map->header->bin] ==
1951  get_segment_index(area, segment_map));
1952  area->control->segment_bins[segment_map->header->bin] =
1953  segment_map->header->next;
1954  }
1955  if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
1956  {
1957  dsa_segment_map *next;
1958 
1959  next = get_segment_by_index(area, segment_map->header->next);
1960  next->header->prev = segment_map->header->prev;
1961  }
1962 }
1963 
1964 /*
1965  * Find a segment that could satisfy a request for 'npages' of contiguous
1966  * memory, or return NULL if none can be found. This may involve attaching to
1967  * segments that weren't previously attached so that we can query their free
1968  * pages map.
1969  */
1970 static dsa_segment_map *
1971 get_best_segment(dsa_area *area, size_t npages)
1972 {
1973  size_t bin;
1974 
1977 
1978  /*
1979  * Start searching from the first bin that *might* have enough contiguous
1980  * pages.
1981  */
1982  for (bin = contiguous_pages_to_segment_bin(npages);
1983  bin < DSA_NUM_SEGMENT_BINS;
1984  ++bin)
1985  {
1986  /*
1987  * The minimum contiguous size that any segment in this bin should
1988  * have. We'll re-bin if we see segments with fewer.
1989  */
1990  size_t threshold = (size_t) 1 << (bin - 1);
1991  dsa_segment_index segment_index;
1992 
1993  /* Search this bin for a segment with enough contiguous space. */
1994  segment_index = area->control->segment_bins[bin];
1995  while (segment_index != DSA_SEGMENT_INDEX_NONE)
1996  {
1997  dsa_segment_map *segment_map;
1998  dsa_segment_index next_segment_index;
1999  size_t contiguous_pages;
2000 
2001  segment_map = get_segment_by_index(area, segment_index);
2002  next_segment_index = segment_map->header->next;
2003  contiguous_pages = fpm_largest(segment_map->fpm);
2004 
2005  /* Not enough for the request, still enough for this bin. */
2006  if (contiguous_pages >= threshold && contiguous_pages < npages)
2007  {
2008  segment_index = next_segment_index;
2009  continue;
2010  }
2011 
2012  /* Re-bin it if it's no longer in the appropriate bin. */
2013  if (contiguous_pages < threshold)
2014  {
2015  size_t new_bin;
2016 
2017  new_bin = contiguous_pages_to_segment_bin(contiguous_pages);
2018 
2019  /* Remove it from its current bin. */
2020  unlink_segment(area, segment_map);
2021 
2022  /* Push it onto the front of its new bin. */
2023  segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2024  segment_map->header->next =
2025  area->control->segment_bins[new_bin];
2026  segment_map->header->bin = new_bin;
2027  area->control->segment_bins[new_bin] = segment_index;
2028  if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2029  {
2030  dsa_segment_map *next;
2031 
2032  next = get_segment_by_index(area,
2033  segment_map->header->next);
2034  Assert(next->header->bin == new_bin);
2035  next->header->prev = segment_index;
2036  }
2037 
2038  /*
2039  * But fall through to see if it's enough to satisfy this
2040  * request anyway....
2041  */
2042  }
2043 
2044  /* Check if we are done. */
2045  if (contiguous_pages >= npages)
2046  return segment_map;
2047 
2048  /* Continue searching the same bin. */
2049  segment_index = next_segment_index;
2050  }
2051  }
2052 
2053  /* Not found. */
2054  return NULL;
2055 }
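
A worked example of the binning arithmetic above, assuming for illustration that contiguous_pages_to_segment_bin(n) maps n to the position of its highest set bit (so bin n holds segments whose largest free run is at least 2^(n-1) pages):

    A request for npages = 300 starts scanning at
    contiguous_pages_to_segment_bin(300) = 9, because 2^8 = 256 <= 300 < 512.

    Every segment in bin 9 is expected to offer at least
    threshold = (size_t) 1 << (9 - 1) = 256 contiguous pages.  If a segment's
    largest run has meanwhile shrunk to 100 pages, it is re-binned to
    contiguous_pages_to_segment_bin(100) = 7 (since 2^6 = 64 <= 100 < 128),
    and the search then falls through to check whether 100 pages still
    satisfies the request.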
2056 
2057 /*
2058  * Create a new segment that can handle at least requested_pages. Returns
2059  * NULL if the requested total size limit or maximum allowed number of
2060  * segments would be exceeded.
2061  */
2062 static dsa_segment_map *
2063 make_new_segment(dsa_area *area, size_t requested_pages)
2064 {
2065  dsa_segment_index new_index;
2066  size_t metadata_bytes;
2067  size_t total_size;
2068  size_t total_pages;
2069  size_t usable_pages;
2070  dsa_segment_map *segment_map;
2071  dsm_segment *segment;
2072 
2073  Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2074 
2075  /* Find a segment slot that is not in use (linearly for now). */
2076  for (new_index = 1; new_index < DSA_MAX_SEGMENTS; ++new_index)
2077  {
2078  if (area->control->segment_handles[new_index] == DSM_HANDLE_INVALID)
2079  break;
2080  }
2081  if (new_index == DSA_MAX_SEGMENTS)
2082  return NULL;
2083 
2084  /*
2085  * If the total size limit is already exceeded, then we exit early and
2086  * avoid arithmetic wraparound in the unsigned expressions below.
2087  */
2088  if (area->control->total_segment_size >=
2089  area->control->max_total_segment_size)
2090  return NULL;
2091 
2092  /*
2093  * The size should be at least as big as requested, and at least big
2094  * enough to follow a geometric series that approximately doubles the
2095  * total storage each time we create a new segment. We use geometric
2096  * growth because the underlying DSM system isn't designed for large
2097  * numbers of segments (otherwise we might even consider just using one
2098  * DSM segment for each large allocation and for each superblock, and then
2099  * we wouldn't need to use FreePageManager).
2100  *
2101  * We decide on a total segment size first, so that we produce tidy
2102  * power-of-two sized segments. This is a good property to have if we
2103  * move to huge pages in the future. Then we work back to the number of
2104  * pages we can fit.
2105  */
2106  total_size = DSA_INITIAL_SEGMENT_SIZE *
2107  ((size_t) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE));
2108  total_size = Min(total_size, DSA_MAX_SEGMENT_SIZE);
2109  total_size = Min(total_size,
2110  area->control->max_total_segment_size -
2111  area->control->total_segment_size);
2112 
2113  total_pages = total_size / FPM_PAGE_SIZE;
2114  metadata_bytes =
2115  MAXALIGN(sizeof(dsa_segment_header)) +
2116  MAXALIGN(sizeof(FreePageManager)) +
2117  sizeof(dsa_pointer) * total_pages;
2118 
2119  /* Add padding up to next page boundary. */
2120  if (metadata_bytes % FPM_PAGE_SIZE != 0)
2121  metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2122  if (total_size <= metadata_bytes)
2123  return NULL;
2124  usable_pages = (total_size - metadata_bytes) / FPM_PAGE_SIZE;
2125  Assert(metadata_bytes + usable_pages * FPM_PAGE_SIZE <= total_size);
2126 
2127  /* See if that is enough... */
2128  if (requested_pages > usable_pages)
2129  {
2130  /*
2131  * We'll make an odd-sized segment, working forward from the requested
2132  * number of pages.
2133  */
2134  usable_pages = requested_pages;
2135  metadata_bytes =
2136  MAXALIGN(sizeof(dsa_segment_header)) +
2137  MAXALIGN(sizeof(FreePageManager)) +
2138  usable_pages * sizeof(dsa_pointer);
2139 
2140  /* Add padding up to next page boundary. */
2141  if (metadata_bytes % FPM_PAGE_SIZE != 0)
2142  metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2143  total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE;
2144 
2145  /* Is that too large for dsa_pointer's addressing scheme? */
2146  if (total_size > DSA_MAX_SEGMENT_SIZE)
2147  return NULL;
2148 
2149  /* Would that exceed the limit? */
2150  if (total_size > area->control->max_total_segment_size -
2151  area->control->total_segment_size)
2152  return NULL;
2153  }
2154 
2155  /* Create the segment. */
2156  segment = dsm_create(total_size, 0);
2157  if (segment == NULL)
2158  return NULL;
2159  dsm_pin_segment(segment);
2160  if (area->mapping_pinned)
2161  dsm_pin_mapping(segment);
2162 
2163  /* Store the handle in shared memory to be found by index. */
2164  area->control->segment_handles[new_index] =
2165  dsm_segment_handle(segment);
2166  /* Track the highest segment index in the history of the area. */
2167  if (area->control->high_segment_index < new_index)
2168  area->control->high_segment_index = new_index;
2169  /* Track the highest segment index this backend has ever mapped. */
2170  if (area->high_segment_index < new_index)
2171  area->high_segment_index = new_index;
2172  /* Track total size of all segments. */
2173  area->control->total_segment_size += total_size;
2174  Assert(area->control->total_segment_size <=
2175  area->control->max_total_segment_size);
2176 
2177  /* Build a segment map for this segment in this backend. */
2178  segment_map = &area->segment_maps[new_index];
2179  segment_map->segment = segment;
2180  segment_map->mapped_address = dsm_segment_address(segment);
2181  segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
2182  segment_map->fpm = (FreePageManager *)
2183  (segment_map->mapped_address +
2184  MAXALIGN(sizeof(dsa_segment_header)));
2185  segment_map->pagemap = (dsa_pointer *)
2186  (segment_map->mapped_address +
2187  MAXALIGN(sizeof(dsa_segment_header)) +
2188  MAXALIGN(sizeof(FreePageManager)));
2189 
2190  /* Set up the free page map. */
2191  FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
2192  FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
2193  usable_pages);
2194 
2195  /* Set up the segment header and put it in the appropriate bin. */
2196  segment_map->header->magic =
2197  DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ new_index;
2198  segment_map->header->usable_pages = usable_pages;
2199  segment_map->header->size = total_size;
2200  segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
2201  segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2202  segment_map->header->next =
2203  area->control->segment_bins[segment_map->header->bin];
2204  segment_map->header->freed = false;
2205  area->control->segment_bins[segment_map->header->bin] = new_index;
2206  if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2207  {
2208  dsa_segment_map *next =
2209  get_segment_by_index(area, segment_map->header->next);
2210 
2211  Assert(next->header->bin == segment_map->header->bin);
2212  next->header->prev = new_index;
2213  }
2214 
2215  return segment_map;
2216 }
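
A worked example of the sizing rule above, assuming for illustration that DSA_INITIAL_SEGMENT_SIZE is 1MB and DSA_NUM_SEGMENTS_AT_EACH_SIZE is 2 (the actual values are defined near the top of this file):

    total_size = DSA_INITIAL_SEGMENT_SIZE *
        ((size_t) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE));

    new_index = 1       ->  1MB * (1 << 0) = 1MB
    new_index = 2, 3    ->  1MB * (1 << 1) = 2MB
    new_index = 4, 5    ->  1MB * (1 << 2) = 4MB
    new_index = 6, 7    ->  1MB * (1 << 3) = 8MB

    so the per-segment size doubles after every DSA_NUM_SEGMENTS_AT_EACH_SIZE
    slots, the total managed size grows roughly geometrically, and the result
    is clamped by DSA_MAX_SEGMENT_SIZE and by the area's remaining
    max_total_segment_size budget.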
2217 
2218 /*
2219  * Check if any segments have been freed by destroy_superblock, so we can
2220  * detach from them in this backend. This function is called by
2221  * dsa_get_address and dsa_free to make sure that a dsa_pointer they have
2222  * received can be resolved to the correct segment.
2223  *
2224  * The danger we want to defend against is that there could be an old segment
2225  * mapped into a given slot in this backend, and the dsa_pointer they have
2226  * might refer to some new segment in the same slot. So those functions must
2227  * be sure to process all instructions to detach from a freed segment that had
2228  * been generated by the time this process received the dsa_pointer, before
2229  * they call get_segment_by_index.
2230  */
2231 static void
2232 check_for_freed_segments(dsa_area *area)
2233 {
2234  size_t freed_segment_counter;
2235 
2236  /*
2237  * Any other process that has freed a segment has incremented
2238  * freed_segment_counter while holding an LWLock, and that must precede
2239  * any backend creating a new segment in the same slot while holding an
2240  * LWLock, and that must precede the creation of any dsa_pointer pointing
2241  * into the new segment which might reach us here, and the caller must
2242  * have sent the dsa_pointer to this process using appropriate memory
2243  * synchronization (some kind of locking or atomic primitive or system
2244  * call). So all we need to do on the reading side is ask for the load of
2245  * freed_segment_counter to follow the caller's load of the dsa_pointer it
2246  * has, and we can be sure to detect any segments that had been freed as
2247  * of the time that the dsa_pointer reached this process.
2248  */
2249  pg_read_barrier();
2250  freed_segment_counter = area->control->freed_segment_counter;
2251  if (unlikely(area->freed_segment_counter != freed_segment_counter))
2252  {
2253  /* Check all currently mapped segments to find what's been freed. */
2254  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
2255  check_for_freed_segments_locked(area);
2256  LWLockRelease(DSA_AREA_LOCK(area));
2257  }
2258 }
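
The ordering this depends on pairs the writer side in destroy_superblock with the reader side here; a condensed sketch:

    /* freeing backend, holding the area lock (see destroy_superblock): */
    segment_map->header->freed = true;
    ++area->control->freed_segment_counter;

    /* any backend that later receives a dsa_pointer into a reused slot: */
    pg_read_barrier();
    /* the counter is seen to have advanced, so stale mappings are detached
     * in check_for_freed_segments_locked() before get_segment_by_index()
     * resolves the index to the new segment. */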
2259 
2260 /*
2261  * Workhorse for check_for_freed_segments(), and also used directly in paths
2262  * where the area lock is already held. This should be called after acquiring
2263  * the lock but before looking up any segment by index number, to make sure we
2264  * unmap any stale segments that might have previously had the same index as a
2265  * current segment.
2266  */
2267 static void
2268 check_for_freed_segments_locked(dsa_area *area)
2269 {
2270  size_t freed_segment_counter;
2271  int i;
2272 
2273  Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2274  freed_segment_counter = area->control->freed_segment_counter;
2275  if (unlikely(area->freed_segment_counter != freed_segment_counter))
2276  {
2277  for (i = 0; i <= area->high_segment_index; ++i)
2278  {
2279  if (area->segment_maps[i].header != NULL &&
2280  area->segment_maps[i].header->freed)
2281  {
2282  dsm_detach(area->segment_maps[i].segment);
2283  area->segment_maps[i].segment = NULL;
2284  area->segment_maps[i].header = NULL;
2285  area->segment_maps[i].mapped_address = NULL;
2286  }
2287  }
2288  area->freed_segment_counter = freed_segment_counter;
2289  }
2290 }