1 /*-------------------------------------------------------------------------
2  *
3  * dsa.c
4  * Dynamic shared memory areas.
5  *
6  * This module provides dynamic shared memory areas which are built on top of
7  * DSM segments. While dsm.c allows segments of shared memory to be
8  * created and shared between backends, it isn't designed to deal with small
9  * objects. A DSA area is a shared memory heap usually backed by one or more
10  * DSM segments which can allocate memory using dsa_allocate() and dsa_free().
11  * Alternatively, it can be created in pre-existing shared memory, including a
12  * DSM segment, and then create extra DSM segments as required. Unlike the
13  * regular system heap, it deals in pseudo-pointers which must be converted to
14  * backend-local pointers before they are dereferenced. These pseudo-pointers
15  * can however be shared with other backends, and can be used to construct
16  * shared data structures.
17  *
18  * Each DSA area manages a set of DSM segments, adding new segments as
19  * required and detaching them when they are no longer needed. Each segment
20  * contains a number of 4KB pages, a free page manager for tracking
21  * consecutive runs of free pages, and a page map for tracking the source of
22  * objects allocated on each page. Allocation requests above 8KB are handled
23  * by choosing a segment and finding consecutive free pages in its free page
24  * manager. Allocation requests for smaller sizes are handled using pools of
25  * objects of a selection of sizes. Each pool consists of a number of 16 page
26  * (64KB) superblocks allocated in the same way as large objects. Allocation
27  * of large objects and new superblocks is serialized by a single LWLock, but
28  * allocation of small objects from pre-existing superblocks uses one LWLock
29  * per pool. Currently there is one pool, and therefore one lock, per size
30  * class. Per-core pools to increase concurrency and strategies for reducing
31  * the resulting fragmentation are areas for future research. Each superblock
32  * is managed with a 'span', which tracks the superblock's freelist. Free
33  * requests are handled by looking in the page map to find which span an
34  * address was allocated from, so that small objects can be returned to the
35  * appropriate free list, and large object pages can be returned directly to
36  * the free page map. When allocating, simple heuristics for selecting
37  * segments and superblocks try to encourage occupied memory to be
38  * concentrated, increasing the likelihood that whole superblocks can become
39  * empty and be returned to the free page manager, and whole segments can
40  * become empty and be returned to the operating system.
41  *
42  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
43  * Portions Copyright (c) 1994, Regents of the University of California
44  *
45  * IDENTIFICATION
46  * src/backend/utils/mmgr/dsa.c
47  *
48  *-------------------------------------------------------------------------
49  */
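
To make the calling pattern concrete, here is a minimal usage sketch (editorial illustration, not part of dsa.c; dsa_detach() and the dsa_allocate()/dsa_allocate0() macros are declared in utils/dsa.h, and LWLockNewTrancheId() in storage/lwlock.h):

static void
dsa_usage_sketch(void)
{
	/* One backend creates the area; the caller must supply a tranche ID. */
	int			tranche_id = LWLockNewTrancheId();
	dsa_area   *area = dsa_create(tranche_id);
	dsa_pointer dp;
	char	   *mem;

	/* dp is a pseudo-pointer, meaningful in any backend attached to the area. */
	dp = dsa_allocate(area, 100);

	/* It must be converted to a backend-local pointer before dereferencing. */
	mem = (char *) dsa_get_address(area, dp);
	memset(mem, 0, 100);

	dsa_free(area, dp);
	dsa_detach(area);
}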
50 
51 #include "postgres.h"
52 
53 #include "port/atomics.h"
54 #include "port/pg_bitutils.h"
55 #include "storage/dsm.h"
56 #include "storage/ipc.h"
57 #include "storage/lwlock.h"
58 #include "storage/shmem.h"
59 #include "utils/dsa.h"
60 #include "utils/freepage.h"
61 #include "utils/memutils.h"
62 #include "utils/resowner.h"
63 
64 /*
65  * The size of the initial DSM segment that backs a dsa_area created by
66  * dsa_create. After creating some number of segments of this size we'll
67  * double this size, and so on. Larger segments may be created if necessary
68  * to satisfy large requests.
69  */
70 #define DSA_INITIAL_SEGMENT_SIZE ((size_t) (1 * 1024 * 1024))
71 
72 /*
73  * How many segments to create before we double the segment size. If this is
74  * low, then there is likely to be a lot of wasted space in the largest
75  * segment. If it is high, then we risk running out of segment slots (see
76  * dsm.c's limits on total number of segments), or limiting the total size
77  * an area can manage when using small pointers.
78  */
79 #define DSA_NUM_SEGMENTS_AT_EACH_SIZE 2
80 
81 /*
82  * The number of bits used to represent the offset part of a dsa_pointer.
83  * This controls the maximum size of a segment, the maximum possible
84  * allocation size and also the maximum number of segments per area.
85  */
86 #if SIZEOF_DSA_POINTER == 4
87 #define DSA_OFFSET_WIDTH 27 /* 32 segments of size up to 128MB */
88 #else
89 #define DSA_OFFSET_WIDTH 40 /* 1024 segments of size up to 1TB */
90 #endif
91 
92 /*
93  * The maximum number of DSM segments that an area can own, determined by
94  * the number of bits remaining (but capped at 1024).
95  */
96 #define DSA_MAX_SEGMENTS \
97  Min(1024, (1 << ((SIZEOF_DSA_POINTER * 8) - DSA_OFFSET_WIDTH)))
98 
99 /* The bitmask for extracting the offset from a dsa_pointer. */
100 #define DSA_OFFSET_BITMASK (((dsa_pointer) 1 << DSA_OFFSET_WIDTH) - 1)
101 
102 /* The maximum size of a DSM segment. */
103 #define DSA_MAX_SEGMENT_SIZE ((size_t) 1 << DSA_OFFSET_WIDTH)
104 
105 /* Number of pages (see FPM_PAGE_SIZE) per regular superblock. */
106 #define DSA_PAGES_PER_SUPERBLOCK 16
107 
108 /*
109  * A magic number used as a sanity check for following DSM segments belonging
110  * to a DSA area (this number will be XORed with the area handle and
111  * the segment index).
112  */
113 #define DSA_SEGMENT_HEADER_MAGIC 0x0ce26608
114 
115 /* Build a dsa_pointer given a segment number and offset. */
116 #define DSA_MAKE_POINTER(segment_number, offset) \
117  (((dsa_pointer) (segment_number) << DSA_OFFSET_WIDTH) | (offset))
118 
119 /* Extract the segment number from a dsa_pointer. */
120 #define DSA_EXTRACT_SEGMENT_NUMBER(dp) ((dp) >> DSA_OFFSET_WIDTH)
121 
122 /* Extract the offset from a dsa_pointer. */
123 #define DSA_EXTRACT_OFFSET(dp) ((dp) & DSA_OFFSET_BITMASK)
124 
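For illustration (not in the original file): with SIZEOF_DSA_POINTER == 8 and therefore DSA_OFFSET_WIDTH == 40, a pointer into segment 3 at byte offset 8192 round-trips like this:

dsa_pointer dp = DSA_MAKE_POINTER(3, 8192);	/* ((dsa_pointer) 3 << 40) | 8192 */

Assert(DSA_EXTRACT_SEGMENT_NUMBER(dp) == 3);	/* the high bits */
Assert(DSA_EXTRACT_OFFSET(dp) == 8192);		/* the low 40 bits */
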
125 /* The type used for segment indexes (zero based). */
126 typedef size_t dsa_segment_index;
127 
128 /* Sentinel value for dsa_segment_index indicating 'none' or 'end'. */
129 #define DSA_SEGMENT_INDEX_NONE (~(dsa_segment_index)0)
130 
131 /*
132  * How many bins of segments do we have? The bins are used to categorize
133  * segments by their largest contiguous run of free pages.
134  */
135 #define DSA_NUM_SEGMENT_BINS 16
136 
137 /*
138  * What is the lowest bin that holds segments that *might* have n contiguous
139  * free pages? There is no point in looking in segments in lower bins; they
140  * definitely can't service a request for n free pages.
141  */
142 static inline size_t
143 contiguous_pages_to_segment_bin(size_t n)
144 {
145  size_t bin;
146 
147  if (n == 0)
148  bin = 0;
149  else
150  bin = pg_leftmost_one_pos_size_t(n) + 1;
151 
152  return Min(bin, DSA_NUM_SEGMENT_BINS - 1);
153 }
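
A worked example (editorial illustration): for a request of n = 5 contiguous pages, pg_leftmost_one_pos_size_t(5) == 2, so the search starts at bin 3. Bin 3 holds segments whose largest free run is 4 to 7 pages, which might satisfy the request; segments in bins 0-2 have runs of at most 3 pages and definitely cannot.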
154 
155 /* Macros for access to locks. */
156 #define DSA_AREA_LOCK(area) (&area->control->lock)
157 #define DSA_SCLASS_LOCK(area, sclass) (&area->control->pools[sclass].lock)
158 
159 /*
160  * The header for an individual segment. This lives at the start of each DSM
161  * segment owned by a DSA area including the first segment (where it appears
162  * as part of the dsa_area_control struct).
163  */
164 typedef struct
165 {
166  /* Sanity check magic value. */
167  uint32 magic;
168  /* Total number of pages in this segment (excluding metadata area). */
169  size_t usable_pages;
170  /* Total size of this segment in bytes. */
171  size_t size;
172 
173  /*
174  * Index of the segment that precedes this one in the same segment bin, or
175  * DSA_SEGMENT_INDEX_NONE if this is the first one.
176  */
177  dsa_segment_index prev;
178 
179  /*
180  * Index of the segment that follows this one in the same segment bin, or
181  * DSA_SEGMENT_INDEX_NONE if this is the last one.
182  */
183  dsa_segment_index next;
184  /* The index of the bin that contains this segment. */
185  size_t bin;
186 
187  /*
188  * A flag raised to indicate that this segment is being returned to the
189  * operating system and has been unpinned.
190  */
191  bool freed;
192 } dsa_segment_header;
193 
194 /*
195  * Metadata for one superblock.
196  *
197  * For most blocks, span objects are stored out-of-line; that is, the span
198  * object is not stored within the block itself. But, as an exception, for a
199  * "span of spans", the span object is stored "inline". The allocation is
200  * always exactly one page, and the dsa_area_span object is located at
201  * the beginning of that page. The size class is DSA_SCLASS_BLOCK_OF_SPANS,
202  * and the remaining fields are used just as they would be in an ordinary
203  * block. We can't allocate spans out of ordinary superblocks because
204  * creating an ordinary superblock requires us to be able to allocate a span
205  * *first*. Doing it this way avoids that circularity.
206  */
207 typedef struct
208 {
209  dsa_pointer pool; /* Containing pool. */
210  dsa_pointer prevspan; /* Previous span. */
211  dsa_pointer nextspan; /* Next span. */
212  dsa_pointer start; /* Starting address. */
213  size_t npages; /* Length of span in pages. */
214  uint16 size_class; /* Size class. */
215  uint16 ninitialized; /* Maximum number of objects ever allocated. */
216  uint16 nallocatable; /* Number of objects currently allocatable. */
217  uint16 firstfree; /* First object on free list. */
218  uint16 nmax; /* Maximum number of objects ever possible. */
219  uint16 fclass; /* Current fullness class. */
220 } dsa_area_span;
221 
222 /*
223  * Given a pointer to an object in a span, access the index of the next free
224  * object in the same span (ie in the span's freelist) as an L-value.
225  */
226 #define NextFreeObjectIndex(object) (* (uint16 *) (object))
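
For illustration, this is roughly how the freelist is popped during allocation (sketch only; 'span', 'superblock', and 'size' stand for the per-span state that alloc_object() below sets up):

/* The head index selects the object; its first two bytes hold the link. */
uint16		firstfree = span->firstfree;
char	   *object = superblock + firstfree * size;

span->firstfree = NextFreeObjectIndex(object);	/* advance to next free object */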
227 
228 /*
229  * Small allocations are handled by dividing a single block of memory into
230  * many small objects of equal size. The possible allocation sizes are
231  * defined by the following array. Larger size classes are spaced more widely
232  * than smaller size classes. We fudge the spacing for size classes >1kB to
233  * avoid space wastage: based on the knowledge that we plan to allocate 64kB
234  * blocks, we bump the maximum object size up to the largest multiple of
235  * 8 bytes that still lets us fit the same number of objects into one block.
236  *
237  * NB: Because of this fudging, if we were ever to use differently-sized blocks
238  * for small allocations, these size classes would need to be reworked to be
239  * optimal for the new size.
240  *
241  * NB: The optimal spacing for size classes, as well as the size of the blocks
242  * out of which small objects are allocated, is not a question that has one
243  * right answer. Some allocators (such as tcmalloc) use more closely-spaced
244  * size classes than we do here, while others (like aset.c) use more
245  * widely-spaced classes. Spacing the classes more closely avoids wasting
246  * memory within individual chunks, but also means a larger number of
247  * potentially-unfilled blocks.
248  */
249 static const uint16 dsa_size_classes[] = {
250  sizeof(dsa_area_span), 0, /* special size classes */
251  8, 16, 24, 32, 40, 48, 56, 64, /* 8 classes separated by 8 bytes */
252  80, 96, 112, 128, /* 4 classes separated by 16 bytes */
253  160, 192, 224, 256, /* 4 classes separated by 32 bytes */
254  320, 384, 448, 512, /* 4 classes separated by 64 bytes */
255  640, 768, 896, 1024, /* 4 classes separated by 128 bytes */
256  1280, 1560, 1816, 2048, /* 4 classes separated by ~256 bytes */
257  2616, 3120, 3640, 4096, /* 4 classes separated by ~512 bytes */
258  5456, 6552, 7280, 8192 /* 4 classes separated by ~1024 bytes */
259 };
260 #define DSA_NUM_SIZE_CLASSES lengthof(dsa_size_classes)
261 
262 /* Special size classes. */
263 #define DSA_SCLASS_BLOCK_OF_SPANS 0
264 #define DSA_SCLASS_SPAN_LARGE 1
265 
266 /*
267  * The following lookup table is used to map the size of small objects
268  * (less than 1kB) onto the corresponding size class. To use this table,
269  * round the size of the object up to the next multiple of 8 bytes, and then
270  * index into this array.
271  */
272 static const uint8 dsa_size_class_map[] = {
273  2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13,
274  14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17,
275  18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
276  20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21,
277  22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
278  23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
279  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
280  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25
281 };
282 #define DSA_SIZE_CLASS_MAP_QUANTUM 8
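
A worked example (editorial illustration): a request for 100 bytes rounds up to 104, giving mapidx = 104 / DSA_SIZE_CLASS_MAP_QUANTUM - 1 = 12; dsa_size_class_map[12] is 12, and dsa_size_classes[12] is 112, so the request is served from the 112-byte size class.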
283 
284 /*
285  * Superblocks are binned by how full they are. Generally, each fullness
286  * class corresponds to one quartile, but the block being used for
287  * allocations is always at the head of the list for fullness class 1,
288  * regardless of how full it really is.
289  */
290 #define DSA_FULLNESS_CLASSES 4
291 
292 /*
293  * A dsa_area_pool represents a set of objects of a given size class.
294  *
295  * Perhaps there should be multiple pools for the same size class for
296  * contention avoidance, but for now there is just one!
297  */
298 typedef struct
299 {
300  /* A lock protecting access to this pool. */
301  LWLock lock;
302  /* A set of linked lists of spans, arranged by fullness. */
303  dsa_pointer spans[DSA_FULLNESS_CLASSES];
304  /* Should we pad this out to a cacheline boundary? */
305 } dsa_area_pool;
306 
307 /*
308  * The control block for an area. This lives in shared memory, at the start of
309  * the first DSM segment controlled by this area.
310  */
311 typedef struct
312 {
313  /* The segment header for the first segment. */
314  dsa_segment_header segment_header;
315  /* The handle for this area. */
316  dsa_handle handle;
317  /* The handles of the segments owned by this area. */
318  dsm_handle segment_handles[DSA_MAX_SEGMENTS];
319  /* Lists of segments, binned by maximum contiguous run of free pages. */
320  dsa_segment_index segment_bins[DSA_NUM_SEGMENT_BINS];
321  /* The object pools for each size class. */
322  dsa_area_pool pools[DSA_NUM_SIZE_CLASSES];
323  /* The total size of all active segments. */
324  size_t total_segment_size;
325  /* The maximum total size of backing storage we are allowed. */
326  size_t max_total_segment_size;
327  /* Highest used segment index in the history of this area. */
328  dsa_segment_index high_segment_index;
329  /* The reference count for this area. */
330  int refcnt;
331  /* A flag indicating that this area has been pinned. */
332  bool pinned;
333  /* The number of times that segments have been freed. */
334  size_t freed_segment_counter;
335  /* The LWLock tranche ID. */
336  int lwlock_tranche_id;
337  /* The general lock (protects everything except object pools). */
338  LWLock lock;
339 } dsa_area_control;
340 
341 /* Given a pointer to a pool, find a dsa_pointer. */
342 #define DsaAreaPoolToDsaPointer(area, p) \
343  DSA_MAKE_POINTER(0, (char *) p - (char *) area->control)
344 
345 /*
346  * A dsa_segment_map is stored within the backend-private memory of each
347  * individual backend. It holds the base address of the segment within that
348  * backend, plus the addresses of key objects within the segment. Those
349  * could instead be derived from the base address but it's handy to have them
350  * around.
351  */
352 typedef struct
353 {
354  dsm_segment *segment; /* DSM segment */
355  char *mapped_address; /* Address at which segment is mapped */
356  dsa_segment_header *header; /* Header (same as mapped_address) */
357  FreePageManager *fpm; /* Free page manager within segment. */
358  dsa_pointer *pagemap; /* Page map within segment. */
359 } dsa_segment_map;
360 
361 /*
362  * Per-backend state for a storage area. Backends obtain one of these by
363  * creating an area or attaching to an existing one using a handle. Each
364  * process that needs to use an area uses its own object to track where the
365  * segments are mapped.
366  */
367 struct dsa_area
368 {
369  /* Pointer to the control object in shared memory. */
370  dsa_area_control *control;
371 
372  /*
373  * All the mappings are owned by this. The dsa_area itself is not
374  * directly tracked by the ResourceOwner, but the effect is the same. NULL
375  * if the attachment has session lifespan, i.e if dsa_pin_mapping() has
376  * been called.
377  */
378  ResourceOwner resowner;
379 
380  /*
381  * This backend's array of segment maps, ordered by segment index
382  * corresponding to control->segment_handles. Some of the area's segments
383  * may not be mapped in this backend yet, and some slots may have been
384  * freed and need to be detached; these operations happen on demand.
385  */
386  dsa_segment_map segment_maps[DSA_MAX_SEGMENTS];
387 
388  /* The highest segment index this backend has ever mapped. */
389  dsa_segment_index high_segment_index;
390 
391  /* The last observed freed_segment_counter. */
392  size_t freed_segment_counter;
393 };
394 
395 #define DSA_SPAN_NOTHING_FREE ((uint16) -1)
396 #define DSA_SUPERBLOCK_SIZE (DSA_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE)
397 
398 /* Given a pointer to a segment_map, obtain a segment index number. */
399 #define get_segment_index(area, segment_map_ptr) \
400  (segment_map_ptr - &area->segment_maps[0])
401 
402 static void init_span(dsa_area *area, dsa_pointer span_pointer,
403  dsa_area_pool *pool, dsa_pointer start, size_t npages,
404  uint16 size_class);
405 static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool,
406  int fromclass, int toclass);
407 static inline dsa_pointer alloc_object(dsa_area *area, int size_class);
408 static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
409  int size_class);
410 static dsa_segment_map *get_segment_by_index(dsa_area *area,
411  dsa_segment_index index);
412 static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer);
413 static void unlink_span(dsa_area *area, dsa_area_span *span);
414 static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
415  dsa_pointer span_pointer, int fclass);
416 static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map);
417 static dsa_segment_map *get_best_segment(dsa_area *area, size_t npages);
418 static dsa_segment_map *make_new_segment(dsa_area *area, size_t requested_pages);
419 static dsa_area *create_internal(void *place, size_t size,
420  int tranche_id,
421  dsm_handle control_handle,
422  dsm_segment *control_segment);
423 static dsa_area *attach_internal(void *place, dsm_segment *segment,
424  dsa_handle handle);
425 static void check_for_freed_segments(dsa_area *area);
426 static void check_for_freed_segments_locked(dsa_area *area);
427 static void rebin_segment(dsa_area *area, dsa_segment_map *segment_map);
428 
429 /*
430  * Create a new shared area in a new DSM segment. Further DSM segments will
431  * be allocated as required to extend the available space.
432  *
433  * We can't allocate a LWLock tranche_id within this function, because tranche
434  * IDs are a scarce resource; there are only 64k available, using low numbers
435  * when possible matters, and we have no provision for recycling them. So,
436  * we require the caller to provide one.
437  */
438 dsa_area *
439 dsa_create(int tranche_id)
440 {
441  dsm_segment *segment;
442  dsa_area *area;
443 
444  /*
445  * Create the DSM segment that will hold the shared control object and the
446  * first segment of usable space.
447  */
448  segment = dsm_create(DSA_INITIAL_SEGMENT_SIZE, 0);
449 
450  /*
451  * All segments backing this area are pinned, so that DSA can explicitly
452  * control their lifetime (otherwise a newly created segment belonging to
453  * this area might be freed when the only backend that happens to have it
454  * mapped in ends, corrupting the area).
455  */
456  dsm_pin_segment(segment);
457 
458  /* Create a new DSA area with the control object in this segment. */
459  area = create_internal(dsm_segment_address(segment),
460  DSA_INITIAL_SEGMENT_SIZE,
461  tranche_id,
462  dsm_segment_handle(segment), segment);
463 
464  /* Clean up when the control segment detaches. */
465  on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
466  PointerGetDatum(dsm_segment_address(segment)));
467 
468  return area;
469 }
470 
471 /*
472  * Create a new shared area in an existing shared memory space, which may be
473  * either DSM or Postmaster-initialized memory. DSM segments will be
474  * allocated as required to extend the available space, though that can be
475  * prevented with dsa_set_size_limit(area, size) using the same size provided
476  * to dsa_create_in_place.
477  *
478  * Areas created in-place must eventually be released by the backend that
479  * created them and all backends that attach to them. This can be done
480  * explicitly with dsa_release_in_place, or, in the special case that 'place'
481  * happens to be in a pre-existing DSM segment, by passing in a pointer to the
482  * segment so that a detach hook can be registered with the containing DSM
483  * segment.
484  *
485  * See dsa_create() for a note about the tranche arguments.
486  */
487 dsa_area *
488 dsa_create_in_place(void *place, size_t size,
489  int tranche_id, dsm_segment *segment)
490 {
491  dsa_area *area;
492 
493  area = create_internal(place, size, tranche_id,
494  DSM_HANDLE_INVALID, NULL);
495 
496  /*
497  * Clean up when the control segment detaches, if a containing DSM segment
498  * was provided.
499  */
500  if (segment != NULL)
501  on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
502  PointerGetDatum(place));
503 
504  return area;
505 }
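
A minimal sketch of the in-place variant (editorial illustration; assumes the caller has already obtained 'my_tranche_id' and wants the area confined to a single pre-created DSM segment):

size_t		size = 1024 * 1024;
dsm_segment *seg = dsm_create(size, 0);
dsa_area   *area = dsa_create_in_place(dsm_segment_address(seg), size,
									   my_tranche_id, seg);

/* Prevent the area from creating any additional DSM segments. */
dsa_set_size_limit(area, size);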
506 
507 /*
508  * Obtain a handle that can be passed to other processes so that they can
509  * attach to the given area. Cannot be called for areas created with
510  * dsa_create_in_place.
511  */
512 dsa_handle
513 dsa_get_handle(dsa_area *area)
514 {
515  Assert(area->control->handle != DSA_HANDLE_INVALID);
516  return area->control->handle;
517 }
518 
519 /*
520  * Attach to an area given a handle generated (possibly in another process) by
521  * dsa_get_handle. The area must have been created with dsa_create (not
522  * dsa_create_in_place).
523  */
524 dsa_area *
525 dsa_attach(dsa_handle handle)
526 {
527  dsm_segment *segment;
528  dsa_area *area;
529 
530  /*
531  * An area handle is really a DSM segment handle for the first segment, so
532  * we go ahead and attach to that.
533  */
534  segment = dsm_attach(handle);
535  if (segment == NULL)
536  ereport(ERROR,
537  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
538  errmsg("could not attach to dynamic shared area")));
539 
540  area = attach_internal(dsm_segment_address(segment), segment, handle);
541 
542  /* Clean up when the control segment detaches. */
543  on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
544  PointerGetDatum(dsm_segment_address(segment)));
545 
546  return area;
547 }
548 
549 /*
550  * Attach to an area that was created with dsa_create_in_place. The caller
551  * must somehow know the location in memory that was used when the area was
552  * created, though it may be mapped at a different virtual address in this
553  * process.
554  *
555  * See dsa_create_in_place for note about releasing in-place areas, and the
556  * optional 'segment' argument which can be provided to allow automatic
557  * release if the containing memory happens to be a DSM segment.
558  */
559 dsa_area *
560 dsa_attach_in_place(void *place, dsm_segment *segment)
561 {
562  dsa_area *area;
563 
564  area = attach_internal(place, NULL, DSA_HANDLE_INVALID);
565 
566  /*
567  * Clean up when the control segment detaches, if a containing DSM segment
568  * was provided.
569  */
570  if (segment != NULL)
571  on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
572  PointerGetDatum(place));
573 
574  return area;
575 }
576 
577 /*
578  * Release a DSA area that was produced by dsa_create_in_place or
579  * dsa_attach_in_place. The 'segment' argument is ignored but provides an
580  * interface suitable for on_dsm_detach, for the convenience of users who want
581  * to create a DSA segment inside an existing DSM segment and have it
582  * automatically released when the containing DSM segment is detached.
583  * 'place' should be the address of the place where the area was created.
584  *
585  * This callback is automatically registered for the DSM segment containing
586  * the control object of in-place areas when a segment is provided to
587  * dsa_create_in_place or dsa_attach_in_place, and also for all areas created
588  * with dsa_create.
589  */
590 void
591 dsa_on_dsm_detach_release_in_place(dsm_segment *segment, Datum place)
592 {
593  dsa_release_in_place(DatumGetPointer(place));
594 }
595 
596 /*
597  * Release a DSA area that was produced by dsa_create_in_place or
598  * dsa_attach_in_place. The 'code' argument is ignored but provides an
599  * interface suitable for on_shmem_exit or before_shmem_exit, for the
600  * convenience of users who want to create a DSA segment inside shared memory
601  * other than a DSM segment and have it automatically release at backend exit.
602  * 'place' should be the address of the place where the area was created.
603  */
604 void
605 dsa_on_shmem_exit_release_in_place(int code, Datum place)
606 {
607  dsa_release_in_place(DatumGetPointer(place));
608 }
609 
610 /*
611  * Release a DSA area that was produced by dsa_create_in_place or
612  * dsa_attach_in_place. It is preferable to use one of the 'dsa_on_XXX'
613  * callbacks so that this is managed automatically, because failure to release
614  * an area created in-place leaks its segments permanently.
615  *
616  * This is also called automatically for areas produced by dsa_create or
617  * dsa_attach as an implementation detail.
618  */
619 void
620 dsa_release_in_place(void *place)
621 {
622  dsa_area_control *control = (dsa_area_control *) place;
623  int i;
624 
625  LWLockAcquire(&control->lock, LW_EXCLUSIVE);
626  Assert(control->segment_header.magic ==
627  (DSA_SEGMENT_HEADER_MAGIC ^ control->handle ^ 0));
628  Assert(control->refcnt > 0);
629  if (--control->refcnt == 0)
630  {
631  for (i = 0; i <= control->high_segment_index; ++i)
632  {
633  dsm_handle handle;
634 
635  handle = control->segment_handles[i];
636  if (handle != DSM_HANDLE_INVALID)
637  dsm_unpin_segment(handle);
638  }
639  }
640  LWLockRelease(&control->lock);
641 }
642 
643 /*
644  * Keep a DSA area attached until end of session or explicit detach.
645  *
646  * By default, areas are owned by the current resource owner, which means they
647  * are detached automatically when that scope ends.
648  */
649 void
650 dsa_pin_mapping(dsa_area *area)
651 {
652  int i;
653 
654  if (area->resowner != NULL)
655  {
656  area->resowner = NULL;
657 
658  for (i = 0; i <= area->high_segment_index; ++i)
659  if (area->segment_maps[i].segment != NULL)
660  dsm_pin_mapping(area->segment_maps[i].segment);
661  }
662 }
663 
664 /*
665  * Allocate memory in this storage area. The return value is a dsa_pointer
666  * that can be passed to other processes, and converted to a local pointer
667  * with dsa_get_address. 'flags' is a bitmap which should be constructed
668  * from the following values:
669  *
670  * DSA_ALLOC_HUGE allows allocations >= 1GB. Otherwise, such allocations
671  * will result in an ERROR.
672  *
673  * DSA_ALLOC_NO_OOM causes this function to return InvalidDsaPointer when
674  * no memory is available or a size limit established by dsa_set_size_limit
675  * would be exceeded. Otherwise, such allocations will result in an ERROR.
676  *
677  * DSA_ALLOC_ZERO causes the allocated memory to be zeroed. Otherwise, the
678  * contents of newly-allocated memory are indeterminate.
679  *
680  * These flags correspond to similarly named flags used by
681  * MemoryContextAllocExtended(). See also the macros dsa_allocate and
682  * dsa_allocate0 which expand to a call to this function with commonly used
683  * flags.
684  */
685 dsa_pointer
686 dsa_allocate_extended(dsa_area *area, size_t size, int flags)
687 {
688  uint16 size_class;
689  dsa_pointer start_pointer;
690  dsa_segment_map *segment_map;
691  dsa_pointer result;
692 
693  Assert(size > 0);
694 
695  /* Sanity check on huge individual allocation size. */
696  if (((flags & DSA_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) ||
697  ((flags & DSA_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size)))
698  elog(ERROR, "invalid DSA memory alloc request size %zu", size);
699 
700  /*
701  * If bigger than the largest size class, just grab a run of pages from
702  * the free page manager, instead of allocating an object from a pool.
703  * There will still be a span, but it's a special class of span that
704  * manages this whole allocation and simply gives all pages back to the
705  * free page manager when dsa_free is called.
706  */
707  if (size > dsa_size_classes[lengthof(dsa_size_classes) - 1])
708  {
709  size_t npages = fpm_size_to_pages(size);
710  size_t first_page;
711  dsa_pointer span_pointer;
712  dsa_area_pool *pool = &area->control->pools[DSA_SCLASS_SPAN_LARGE];
713 
714  /* Obtain a span object. */
715  span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
716  if (!DsaPointerIsValid(span_pointer))
717  {
718  /* Raise error unless asked not to. */
719  if ((flags & DSA_ALLOC_NO_OOM) == 0)
720  ereport(ERROR,
721  (errcode(ERRCODE_OUT_OF_MEMORY),
722  errmsg("out of memory"),
723  errdetail("Failed on DSA request of size %zu.",
724  size)));
725  return InvalidDsaPointer;
726  }
727 
728  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
729 
730  /* Find a segment from which to allocate. */
731  segment_map = get_best_segment(area, npages);
732  if (segment_map == NULL)
733  segment_map = make_new_segment(area, npages);
734  if (segment_map == NULL)
735  {
736  /* Can't make any more segments: game over. */
737  LWLockRelease(DSA_AREA_LOCK(area));
738  dsa_free(area, span_pointer);
739 
740  /* Raise error unless asked not to. */
741  if ((flags & DSA_ALLOC_NO_OOM) == 0)
742  ereport(ERROR,
743  (errcode(ERRCODE_OUT_OF_MEMORY),
744  errmsg("out of memory"),
745  errdetail("Failed on DSA request of size %zu.",
746  size)));
747  return InvalidDsaPointer;
748  }
749 
750  /*
751  * Ask the free page manager for a run of pages. This should always
752  * succeed, since both get_best_segment and make_new_segment should
753  * only return a non-NULL pointer if it actually contains enough
754  * contiguous freespace. If it does fail, something in our backend
755  * private state is out of whack, so use FATAL to kill the process.
756  */
757  if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
758  elog(FATAL,
759  "dsa_allocate could not find %zu free pages", npages);
761 
762  start_pointer = DSA_MAKE_POINTER(get_segment_index(area, segment_map),
763  first_page * FPM_PAGE_SIZE);
764 
765  /* Initialize span and pagemap. */
766  LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
767  LW_EXCLUSIVE);
768  init_span(area, span_pointer, pool, start_pointer, npages,
769  DSA_SCLASS_SPAN_LARGE);
770  segment_map->pagemap[first_page] = span_pointer;
771  LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
772 
773  /* Zero-initialize the memory if requested. */
774  if ((flags & DSA_ALLOC_ZERO) != 0)
775  memset(dsa_get_address(area, start_pointer), 0, size);
776 
777  return start_pointer;
778  }
779 
780  /* Map allocation to a size class. */
781  if (size < lengthof(dsa_size_class_map) * DSA_SIZE_CLASS_MAP_QUANTUM)
782  {
783  int mapidx;
784 
785  /* For smaller sizes we have a lookup table... */
786  mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) /
787  DSA_SIZE_CLASS_MAP_QUANTUM) - 1;
788  size_class = dsa_size_class_map[mapidx];
789  }
790  else
791  {
792  uint16 min;
793  uint16 max;
794 
795  /* ... and for the rest we search by binary chop. */
796  min = dsa_size_class_map[lengthof(dsa_size_class_map) - 1];
797  max = lengthof(dsa_size_classes) - 1;
798 
799  while (min < max)
800  {
801  uint16 mid = (min + max) / 2;
802  uint16 class_size = dsa_size_classes[mid];
803 
804  if (class_size < size)
805  min = mid + 1;
806  else
807  max = mid;
808  }
809 
810  size_class = min;
811  }
812  Assert(size <= dsa_size_classes[size_class]);
813  Assert(size_class == 0 || size > dsa_size_classes[size_class - 1]);
814 
815  /* Attempt to allocate an object from the appropriate pool. */
816  result = alloc_object(area, size_class);
817 
818  /* Check for failure to allocate. */
819  if (!DsaPointerIsValid(result))
820  {
821  /* Raise error unless asked not to. */
822  if ((flags & DSA_ALLOC_NO_OOM) == 0)
823  ereport(ERROR,
824  (errcode(ERRCODE_OUT_OF_MEMORY),
825  errmsg("out of memory"),
826  errdetail("Failed on DSA request of size %zu.", size)));
827  return InvalidDsaPointer;
828  }
829 
830  /* Zero-initialize the memory if requested. */
831  if ((flags & DSA_ALLOC_ZERO) != 0)
832  memset(dsa_get_address(area, result), 0, size);
833 
834  return result;
835 }
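
A short sketch of the flag combinations described above (editorial illustration):

/* Zeroed allocation; raises ERROR on failure. */
dsa_pointer dp1 = dsa_allocate0(area, 1024);

/* Soft-failure variant: returns InvalidDsaPointer instead of erroring. */
dsa_pointer dp2 = dsa_allocate_extended(area, 1024,
										DSA_ALLOC_NO_OOM | DSA_ALLOC_ZERO);

if (!DsaPointerIsValid(dp2))
{
	/* ... recover from out-of-memory ... */
}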
836 
837 /*
838  * Free memory obtained with dsa_allocate.
839  */
840 void
841 dsa_free(dsa_area *area, dsa_pointer dp)
842 {
843  dsa_segment_map *segment_map;
844  int pageno;
845  dsa_pointer span_pointer;
846  dsa_area_span *span;
847  char *superblock;
848  char *object;
849  size_t size;
850  int size_class;
851 
852  /* Make sure we don't have a stale segment in the slot 'dp' refers to. */
853  check_for_freed_segments(area);
854 
855  /* Locate the object, span and pool. */
856  segment_map = get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(dp));
857  pageno = DSA_EXTRACT_OFFSET(dp) / FPM_PAGE_SIZE;
858  span_pointer = segment_map->pagemap[pageno];
859  span = dsa_get_address(area, span_pointer);
860  superblock = dsa_get_address(area, span->start);
861  object = dsa_get_address(area, dp);
862  size_class = span->size_class;
863  size = dsa_size_classes[size_class];
864 
865  /*
866  * Special case for large objects that live in a special span: we return
867  * those pages directly to the free page manager and free the span.
868  */
869  if (span->size_class == DSA_SCLASS_SPAN_LARGE)
870  {
871 
872 #ifdef CLOBBER_FREED_MEMORY
873  memset(object, 0x7f, span->npages * FPM_PAGE_SIZE);
874 #endif
875 
876  /* Give pages back to free page manager. */
877  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
878  FreePageManagerPut(segment_map->fpm,
879  DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
880  span->npages);
881 
882  /* Move segment to appropriate bin if necessary. */
883  rebin_segment(area, segment_map);
884  LWLockRelease(DSA_AREA_LOCK(area));
885 
886  /* Unlink span. */
887  LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
888  LW_EXCLUSIVE);
889  unlink_span(area, span);
890  LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
891  /* Free the span object so it can be reused. */
892  dsa_free(area, span_pointer);
893  return;
894  }
895 
896 #ifdef CLOBBER_FREED_MEMORY
897  memset(object, 0x7f, size);
898 #endif
899 
900  LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
901 
902  /* Put the object on the span's freelist. */
903  Assert(object >= superblock);
904  Assert(object < superblock + DSA_SUPERBLOCK_SIZE);
905  Assert((object - superblock) % size == 0);
906  NextFreeObjectIndex(object) = span->firstfree;
907  span->firstfree = (object - superblock) / size;
908  ++span->nallocatable;
909 
910  /*
911  * See if the span needs to be moved to a different fullness class, or be
912  * freed so its pages can be given back to the segment.
913  */
914  if (span->nallocatable == 1 && span->fclass == DSA_FULLNESS_CLASSES - 1)
915  {
916  /*
917  * The block was completely full and is located in the
918  * highest-numbered fullness class, which is never scanned for free
919  * chunks. We must move it to the next-lower fullness class.
920  */
921  unlink_span(area, span);
922  add_span_to_fullness_class(area, span, span_pointer,
923  DSA_FULLNESS_CLASSES - 2);
924 
925  /*
926  * If this is the only span, and there is no active span, then we
927  * should probably move this span to fullness class 1. (Otherwise if
928  * you allocate exactly all the objects in the only span, it moves to
929  * class 3, then you free them all, it moves to 2, and then is given
930  * back, leaving no active span).
931  */
932  }
933  else if (span->nallocatable == span->nmax &&
934  (span->fclass != 1 || span->prevspan != InvalidDsaPointer))
935  {
936  /*
937  * This entire block is free, and it's not the active block for this
938  * size class. Return the memory to the free page manager. We don't
939  * do this for the active block to prevent hysteresis: if we
940  * repeatedly allocate and free the only chunk in the active block, it
941  * will be very inefficient if we deallocate and reallocate the block
942  * every time.
943  */
944  destroy_superblock(area, span_pointer);
945  }
946 
947  LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
948 }
949 
950 /*
951  * Obtain a backend-local address for a dsa_pointer. 'dp' must point to
952  * memory allocated by the given area (possibly in another process) that
953  * hasn't yet been freed. This may cause a segment to be mapped into the
954  * current process if required, and may cause freed segments to be unmapped.
955  */
956 void *
957 dsa_get_address(dsa_area *area, dsa_pointer dp)
958 {
959  dsa_segment_index index;
960  size_t offset;
961 
962  /* Convert InvalidDsaPointer to NULL. */
963  if (!DsaPointerIsValid(dp))
964  return NULL;
965 
966  /* Process any requests to detach from freed segments. */
967  check_for_freed_segments(area);
968 
969  /* Break the dsa_pointer into its components. */
970  index = DSA_EXTRACT_SEGMENT_NUMBER(dp);
971  offset = DSA_EXTRACT_OFFSET(dp);
972  Assert(index < DSA_MAX_SEGMENTS);
973 
974  /* Check if we need to cause this segment to be mapped in. */
975  if (unlikely(area->segment_maps[index].mapped_address == NULL))
976  {
977  /* Call for effect (we don't need the result). */
978  (void) get_segment_by_index(area, index);
979  }
980 
981  return area->segment_maps[index].mapped_address + offset;
982 }
983 
984 /*
985  * Pin this area, so that it will continue to exist even if all backends
986  * detach from it. In that case, the area can still be reattached to if a
987  * handle has been recorded somewhere.
988  */
989 void
990 dsa_pin(dsa_area *area)
991 {
992  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
993  if (area->control->pinned)
994  {
995  LWLockRelease(DSA_AREA_LOCK(area));
996  elog(ERROR, "dsa_area already pinned");
997  }
998  area->control->pinned = true;
999  ++area->control->refcnt;
1000  LWLockRelease(DSA_AREA_LOCK(area));
1001 }
1002 
1003 /*
1004  * Undo the effects of dsa_pin, so that the given area can be freed when no
1005  * backends are attached to it. May be called only if dsa_pin has been
1006  * called.
1007  */
1008 void
1009 dsa_unpin(dsa_area *area)
1010 {
1011  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1012  Assert(area->control->refcnt > 1);
1013  if (!area->control->pinned)
1014  {
1015  LWLockRelease(DSA_AREA_LOCK(area));
1016  elog(ERROR, "dsa_area not pinned");
1017  }
1018  area->control->pinned = false;
1019  --area->control->refcnt;
1020  LWLockRelease(DSA_AREA_LOCK(area));
1021 }
1022 
1023 /*
1024  * Set the total size limit for this area. This limit is checked whenever new
1025  * segments need to be allocated from the operating system. If the new size
1026  * limit is already exceeded, this has no immediate effect.
1027  *
1028  * Note that the total virtual memory usage may be temporarily larger than
1029  * this limit when segments have been freed, but not yet detached by all
1030  * backends that have attached to them.
1031  */
1032 void
1033 dsa_set_size_limit(dsa_area *area, size_t limit)
1034 {
1035  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1036  area->control->max_total_segment_size = limit;
1037  LWLockRelease(DSA_AREA_LOCK(area));
1038 }
1039 
1040 /*
1041  * Aggressively free all spare memory in the hope of returning DSM segments to
1042  * the operating system.
1043  */
1044 void
1045 dsa_trim(dsa_area *area)
1046 {
1047  int size_class;
1048 
1049  /*
1050  * Trim in reverse pool order so we get to the spans-of-spans last, just
1051  * in case any become entirely free while processing all the other pools.
1052  */
1053  for (size_class = DSA_NUM_SIZE_CLASSES - 1; size_class >= 0; --size_class)
1054  {
1055  dsa_area_pool *pool = &area->control->pools[size_class];
1056  dsa_pointer span_pointer;
1057 
1058  if (size_class == DSA_SCLASS_SPAN_LARGE)
1059  {
1060  /* Large object frees give back segments aggressively already. */
1061  continue;
1062  }
1063 
1064  /*
1065  * Search fullness class 1 only. That is where we expect to find an
1066  * entirely empty superblock (entirely empty superblocks in other
1067  * fullness classes are returned to the free page map by dsa_free).
1068  */
1069  LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1070  span_pointer = pool->spans[1];
1071  while (DsaPointerIsValid(span_pointer))
1072  {
1073  dsa_area_span *span = dsa_get_address(area, span_pointer);
1074  dsa_pointer next = span->nextspan;
1075 
1076  if (span->nallocatable == span->nmax)
1077  destroy_superblock(area, span_pointer);
1078 
1079  span_pointer = next;
1080  }
1081  LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1082  }
1083 }
1084 
1085 /*
1086  * Print out debugging information about the internal state of the shared
1087  * memory area.
1088  */
1089 void
1090 dsa_dump(dsa_area *area)
1091 {
1092  size_t i,
1093  j;
1094 
1095  /*
1096  * Note: This gives an inconsistent snapshot as it acquires and releases
1097  * individual locks as it goes...
1098  */
1099 
1100  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1101  check_for_freed_segments_locked(area);
1102  fprintf(stderr, "dsa_area handle %x:\n", area->control->handle);
1103  fprintf(stderr, " max_total_segment_size: %zu\n",
1104  area->control->max_total_segment_size);
1105  fprintf(stderr, " total_segment_size: %zu\n",
1106  area->control->total_segment_size);
1107  fprintf(stderr, " refcnt: %d\n", area->control->refcnt);
1108  fprintf(stderr, " pinned: %c\n", area->control->pinned ? 't' : 'f');
1109  fprintf(stderr, " segment bins:\n");
1110  for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1111  {
1112  if (area->control->segment_bins[i] != DSA_SEGMENT_INDEX_NONE)
1113  {
1114  dsa_segment_index segment_index;
1115 
1116  fprintf(stderr,
1117  " segment bin %zu (at least %d contiguous pages free):\n",
1118  i, 1 << (i - 1));
1119  segment_index = area->control->segment_bins[i];
1120  while (segment_index != DSA_SEGMENT_INDEX_NONE)
1121  {
1122  dsa_segment_map *segment_map;
1123 
1124  segment_map =
1125  get_segment_by_index(area, segment_index);
1126 
1127  fprintf(stderr,
1128  " segment index %zu, usable_pages = %zu, "
1129  "contiguous_pages = %zu, mapped at %p\n",
1130  segment_index,
1131  segment_map->header->usable_pages,
1132  fpm_largest(segment_map->fpm),
1133  segment_map->mapped_address);
1134  segment_index = segment_map->header->next;
1135  }
1136  }
1137  }
1138  LWLockRelease(DSA_AREA_LOCK(area));
1139 
1140  fprintf(stderr, " pools:\n");
1141  for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1142  {
1143  bool found = false;
1144 
1145  LWLockAcquire(DSA_SCLASS_LOCK(area, i), LW_EXCLUSIVE);
1146  for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1147  if (DsaPointerIsValid(area->control->pools[i].spans[j]))
1148  found = true;
1149  if (found)
1150  {
1151  if (i == DSA_SCLASS_BLOCK_OF_SPANS)
1152  fprintf(stderr, " pool for blocks of span objects:\n");
1153  else if (i == DSA_SCLASS_SPAN_LARGE)
1154  fprintf(stderr, " pool for large object spans:\n");
1155  else
1156  fprintf(stderr,
1157  " pool for size class %zu (object size %hu bytes):\n",
1158  i, dsa_size_classes[i]);
1159  for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1160  {
1161  if (!DsaPointerIsValid(area->control->pools[i].spans[j]))
1162  fprintf(stderr, " fullness class %zu is empty\n", j);
1163  else
1164  {
1165  dsa_pointer span_pointer = area->control->pools[i].spans[j];
1166 
1167  fprintf(stderr, " fullness class %zu:\n", j);
1168  while (DsaPointerIsValid(span_pointer))
1169  {
1170  dsa_area_span *span;
1171 
1172  span = dsa_get_address(area, span_pointer);
1173  fprintf(stderr,
1174  " span descriptor at "
1175  DSA_POINTER_FORMAT ", superblock at "
1176  DSA_POINTER_FORMAT
1177  ", pages = %zu, objects free = %hu/%hu\n",
1178  span_pointer, span->start, span->npages,
1179  span->nallocatable, span->nmax);
1180  span_pointer = span->nextspan;
1181  }
1182  }
1183  }
1184  }
1185  LWLockRelease(DSA_SCLASS_LOCK(area, i));
1186  }
1187 }
1188 
1189 /*
1190  * Return the smallest size that you can successfully provide to
1191  * dsa_create_in_place.
1192  */
1193 size_t
1194 dsa_minimum_size(void)
1195 {
1196  size_t size;
1197  int pages = 0;
1198 
1199  size = MAXALIGN(sizeof(dsa_area_control)) +
1200  MAXALIGN(sizeof(FreePageManager));
1201 
1202  /* Figure out how many pages we need, including the page map... */
1203  while (((size + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE) > pages)
1204  {
1205  ++pages;
1206  size += sizeof(dsa_pointer);
1207  }
1208 
1209  return pages * FPM_PAGE_SIZE;
1210 }
1211 
1212 /*
1213  * Workhorse function for dsa_create and dsa_create_in_place.
1214  */
1215 static dsa_area *
1216 create_internal(void *place, size_t size,
1217  int tranche_id,
1218  dsm_handle control_handle,
1219  dsm_segment *control_segment)
1220 {
1221  dsa_area_control *control;
1222  dsa_area *area;
1223  dsa_segment_map *segment_map;
1224  size_t usable_pages;
1225  size_t total_pages;
1226  size_t metadata_bytes;
1227  int i;
1228 
1229  /* Sanity check on the space we have to work in. */
1230  if (size < dsa_minimum_size())
1231  elog(ERROR, "dsa_area space must be at least %zu, but %zu provided",
1232  dsa_minimum_size(), size);
1233 
1234  /* Now figure out how much space is usable */
1235  total_pages = size / FPM_PAGE_SIZE;
1236  metadata_bytes =
1237  MAXALIGN(sizeof(dsa_area_control)) +
1238  MAXALIGN(sizeof(FreePageManager)) +
1239  total_pages * sizeof(dsa_pointer);
1240  /* Add padding up to next page boundary. */
1241  if (metadata_bytes % FPM_PAGE_SIZE != 0)
1242  metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
1243  Assert(metadata_bytes <= size);
1244  usable_pages = (size - metadata_bytes) / FPM_PAGE_SIZE;
1245 
1246  /*
1247  * Initialize the dsa_area_control object located at the start of the
1248  * space.
1249  */
1250  control = (dsa_area_control *) place;
1251  memset(place, 0, sizeof(*control));
1252  control->segment_header.magic =
1253  DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0;
1254  control->segment_header.prev = DSA_SEGMENT_INDEX_NONE;
1255  control->segment_header.next = DSA_SEGMENT_INDEX_NONE;
1256  control->segment_header.usable_pages = usable_pages;
1257  control->segment_header.freed = false;
1258  control->segment_header.size = size;
1259  control->handle = control_handle;
1260  control->max_total_segment_size = (size_t) -1;
1261  control->total_segment_size = size;
1262  control->segment_handles[0] = control_handle;
1263  for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1264  control->segment_bins[i] = DSA_SEGMENT_INDEX_NONE;
1265  control->refcnt = 1;
1266  control->lwlock_tranche_id = tranche_id;
1267 
1268  /*
1269  * Create the dsa_area object that this backend will use to access the
1270  * area. Other backends will need to obtain their own dsa_area object by
1271  * attaching.
1272  */
1273  area = palloc(sizeof(dsa_area));
1274  area->control = control;
1275  area->resowner = CurrentResourceOwner;
1276  memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1277  area->high_segment_index = 0;
1278  area->freed_segment_counter = 0;
1279  LWLockInitialize(&control->lock, control->lwlock_tranche_id);
1280  for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1281  LWLockInitialize(DSA_SCLASS_LOCK(area, i),
1282  control->lwlock_tranche_id);
1283 
1284  /* Set up the segment map for this process's mapping. */
1285  segment_map = &area->segment_maps[0];
1286  segment_map->segment = control_segment;
1287  segment_map->mapped_address = place;
1288  segment_map->header = (dsa_segment_header *) place;
1289  segment_map->fpm = (FreePageManager *)
1290  (segment_map->mapped_address +
1291  MAXALIGN(sizeof(dsa_area_control)));
1292  segment_map->pagemap = (dsa_pointer *)
1293  (segment_map->mapped_address +
1294  MAXALIGN(sizeof(dsa_area_control)) +
1295  MAXALIGN(sizeof(FreePageManager)));
1296 
1297  /* Set up the free page map. */
1298  FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
1299  /* There can be 0 usable pages if size is dsa_minimum_size(). */
1300 
1301  if (usable_pages > 0)
1302  FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
1303  usable_pages);
1304 
1305  /* Put this segment into the appropriate bin. */
1306  control->segment_bins[contiguous_pages_to_segment_bin(usable_pages)] = 0;
1307  segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
1308 
1309  return area;
1310 }
1311 
1312 /*
1313  * Workhorse function for dsa_attach and dsa_attach_in_place.
1314  */
1315 static dsa_area *
1316 attach_internal(void *place, dsm_segment *segment, dsa_handle handle)
1317 {
1318  dsa_area_control *control;
1319  dsa_area *area;
1320  dsa_segment_map *segment_map;
1321 
1322  control = (dsa_area_control *) place;
1323  Assert(control->handle == handle);
1324  Assert(control->segment_handles[0] == handle);
1325  Assert(control->segment_header.magic ==
1326  (DSA_SEGMENT_HEADER_MAGIC ^ handle ^ 0));
1327 
1328  /* Build the backend-local area object. */
1329  area = palloc(sizeof(dsa_area));
1330  area->control = control;
1331  area->resowner = CurrentResourceOwner;
1332  memset(&area->segment_maps[0], 0,
1333  sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1334  area->high_segment_index = 0;
1335 
1336  /* Set up the segment map for this process's mapping. */
1337  segment_map = &area->segment_maps[0];
1338  segment_map->segment = segment; /* NULL for in-place */
1339  segment_map->mapped_address = place;
1340  segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
1341  segment_map->fpm = (FreePageManager *)
1342  (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)));
1343  segment_map->pagemap = (dsa_pointer *)
1344  (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)) +
1345  MAXALIGN(sizeof(FreePageManager)));
1346 
1347  /* Bump the reference count. */
1348  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1349  if (control->refcnt == 0)
1350  {
1351  /* We can't attach to a DSA area that has already been destroyed. */
1352  ereport(ERROR,
1353  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1354  errmsg("could not attach to dynamic shared area")));
1355  }
1356  ++control->refcnt;
1357  area->freed_segment_counter = control->freed_segment_counter;
1358  LWLockRelease(DSA_AREA_LOCK(area));
1359 
1360  return area;
1361 }
1362 
1363 /*
1364  * Add a new span to fullness class 1 of the indicated pool.
1365  */
1366 static void
1367 init_span(dsa_area *area,
1368  dsa_pointer span_pointer,
1369  dsa_area_pool *pool, dsa_pointer start, size_t npages,
1370  uint16 size_class)
1371 {
1372  dsa_area_span *span = dsa_get_address(area, span_pointer);
1373  size_t obsize = dsa_size_classes[size_class];
1374 
1375  /*
1376  * The per-pool lock must be held because we manipulate the span list for
1377  * this pool.
1378  */
1379  Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1380 
1381  /* Push this span onto the front of the span list for fullness class 1. */
1382  if (DsaPointerIsValid(pool->spans[1]))
1383  {
1384  dsa_area_span *head = (dsa_area_span *)
1385  dsa_get_address(area, pool->spans[1]);
1386 
1387  head->prevspan = span_pointer;
1388  }
1389  span->pool = DsaAreaPoolToDsaPointer(area, pool);
1390  span->nextspan = pool->spans[1];
1391  span->prevspan = InvalidDsaPointer;
1392  pool->spans[1] = span_pointer;
1393 
1394  span->start = start;
1395  span->npages = npages;
1396  span->size_class = size_class;
1397  span->ninitialized = 0;
1398  if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1399  {
1400  /*
1401  * A block-of-spans contains its own descriptor, so mark one object as
1402  * initialized and reduce the count of allocatable objects by one.
1403  * Doing this here has the side effect of also reducing nmax by one,
1404  * which is important to make sure we free this object at the correct
1405  * time.
1406  */
1407  span->ninitialized = 1;
1408  span->nallocatable = FPM_PAGE_SIZE / obsize - 1;
1409  }
1410  else if (size_class != DSA_SCLASS_SPAN_LARGE)
1411  span->nallocatable = DSA_SUPERBLOCK_SIZE / obsize;
1412  span->firstfree = DSA_SPAN_NOTHING_FREE;
1413  span->nmax = span->nallocatable;
1414  span->fclass = 1;
1415 }
1416 
1417 /*
1418  * Transfer the first span in one fullness class to the head of another
1419  * fullness class.
1420  */
1421 static bool
1422 transfer_first_span(dsa_area *area,
1423  dsa_area_pool *pool, int fromclass, int toclass)
1424 {
1425  dsa_pointer span_pointer;
1426  dsa_area_span *span;
1427  dsa_area_span *nextspan;
1428 
1429  /* Can't do it if source list is empty. */
1430  span_pointer = pool->spans[fromclass];
1431  if (!DsaPointerIsValid(span_pointer))
1432  return false;
1433 
1434  /* Remove span from head of source list. */
1435  span = dsa_get_address(area, span_pointer);
1436  pool->spans[fromclass] = span->nextspan;
1437  if (DsaPointerIsValid(span->nextspan))
1438  {
1439  nextspan = (dsa_area_span *)
1440  dsa_get_address(area, span->nextspan);
1441  nextspan->prevspan = InvalidDsaPointer;
1442  }
1443 
1444  /* Add span to head of target list. */
1445  span->nextspan = pool->spans[toclass];
1446  pool->spans[toclass] = span_pointer;
1447  if (DsaPointerIsValid(span->nextspan))
1448  {
1449  nextspan = (dsa_area_span *)
1450  dsa_get_address(area, span->nextspan);
1451  nextspan->prevspan = span_pointer;
1452  }
1453  span->fclass = toclass;
1454 
1455  return true;
1456 }
1457 
1458 /*
1459  * Allocate one object of the requested size class from the given area.
1460  */
1461 static inline dsa_pointer
1462 alloc_object(dsa_area *area, int size_class)
1463 {
1464  dsa_area_pool *pool = &area->control->pools[size_class];
1465  dsa_area_span *span;
1466  dsa_pointer block;
1467  dsa_pointer result;
1468  char *object;
1469  size_t size;
1470 
1471  /*
1472  * Even though ensure_active_superblock can in turn call alloc_object if
1473  * it needs to allocate a new span, that's always from a different pool,
1474  * and the order of lock acquisition is always the same, so it's OK that
1475  * we hold this lock for the duration of this function.
1476  */
1477  Assert(!LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1478  LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1479 
1480  /*
1481  * If there's no active superblock, we must successfully obtain one or
1482  * fail the request.
1483  */
1484  if (!DsaPointerIsValid(pool->spans[1]) &&
1485  !ensure_active_superblock(area, pool, size_class))
1486  {
1487  result = InvalidDsaPointer;
1488  }
1489  else
1490  {
1491  /*
1492  * There should be a block in fullness class 1 at this point, and it
1493  * should never be completely full. Thus we can either pop an object
1494  * from the free list or, failing that, initialize a new object.
1495  */
1496  Assert(DsaPointerIsValid(pool->spans[1]));
1497  span = (dsa_area_span *)
1498  dsa_get_address(area, pool->spans[1]);
1499  Assert(span->nallocatable > 0);
1500  block = span->start;
1501  Assert(size_class < DSA_NUM_SIZE_CLASSES);
1502  size = dsa_size_classes[size_class];
1503  if (span->firstfree != DSA_SPAN_NOTHING_FREE)
1504  {
1505  result = block + span->firstfree * size;
1506  object = dsa_get_address(area, result);
1507  span->firstfree = NextFreeObjectIndex(object);
1508  }
1509  else
1510  {
1511  result = block + span->ninitialized * size;
1512  ++span->ninitialized;
1513  }
1514  --span->nallocatable;
1515 
1516  /* If it's now full, move it to the highest-numbered fullness class. */
1517  if (span->nallocatable == 0)
1518  transfer_first_span(area, pool, 1, DSA_FULLNESS_CLASSES - 1);
1519  }
1520 
1521  Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1522  LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1523 
1524  return result;
1525 }
1526 
1527 /*
1528  * Ensure an active (i.e. fullness class 1) superblock, unless all existing
1529  * superblocks are completely full and no more can be allocated.
1530  *
1531  * Fullness classes K of 0..N are loosely intended to represent blocks whose
1532  * utilization percentage is at least K/N, but we only enforce this rigorously
1533  * for the highest-numbered fullness class, which always contains exactly
1534  * those blocks that are completely full. It's otherwise acceptable for a
1535  * block to be in a higher-numbered fullness class than the one to which it
1536  * logically belongs. In addition, the active block, which is always the
1537  * first block in fullness class 1, is permitted to have a higher allocation
1538  * percentage than would normally be allowable for that fullness class; we
1539  * don't move it until it's completely full, and then it goes to the
1540  * highest-numbered fullness class.
1541  *
1542  * It might seem odd that the active block is the head of fullness class 1
1543  * rather than fullness class 0, but experience with other allocators has
1544  * shown that it's usually better to allocate from a block that's moderately
1545  * full rather than one that's nearly empty. Insofar as is reasonably
1546  * possible, we want to avoid performing new allocations in a block that would
1547  * otherwise become empty soon.
1548  */
1549 static bool
1550 ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
1551  int size_class)
1552 {
1553  dsa_pointer span_pointer;
1554  dsa_pointer start_pointer;
1555  size_t obsize = dsa_size_classes[size_class];
1556  size_t nmax;
1557  int fclass;
1558  size_t npages = 1;
1559  size_t first_page;
1560  size_t i;
1561  dsa_segment_map *segment_map;
1562 
1563  Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1564 
1565  /*
1566  * Compute the number of objects that will fit in a block of this size
1567  * class. Span-of-spans blocks are just a single page, and the first
1568  * object isn't available for use because it describes the block-of-spans
1569  * itself.
1570  */
1571  if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1572  nmax = FPM_PAGE_SIZE / obsize - 1;
1573  else
1574  nmax = DSA_SUPERBLOCK_SIZE / obsize;
1575 
1576  /*
1577  * If fullness class 1 is empty, try to find a span to put in it by
1578  * scanning higher-numbered fullness classes (excluding the last one,
1579  * whose blocks are certain to all be completely full).
1580  */
1581  for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1582  {
1583  span_pointer = pool->spans[fclass];
1584 
1585  while (DsaPointerIsValid(span_pointer))
1586  {
1587  int tfclass;
1588  dsa_area_span *span;
1589  dsa_area_span *nextspan;
1590  dsa_area_span *prevspan;
1591  dsa_pointer next_span_pointer;
1592 
1593  span = (dsa_area_span *)
1594  dsa_get_address(area, span_pointer);
1595  next_span_pointer = span->nextspan;
1596 
1597  /* Figure out what fullness class should contain this span. */
1598  tfclass = (nmax - span->nallocatable)
1599  * (DSA_FULLNESS_CLASSES - 1) / nmax;
1600 
1601  /* Look up next span. */
1602  if (DsaPointerIsValid(span->nextspan))
1603  nextspan = (dsa_area_span *)
1604  dsa_get_address(area, span->nextspan);
1605  else
1606  nextspan = NULL;
1607 
1608  /*
1609  * If utilization has dropped enough that this now belongs in some
1610  * other fullness class, move it there.
1611  */
1612  if (tfclass < fclass)
1613  {
1614  /* Remove from the current fullness class list. */
1615  if (pool->spans[fclass] == span_pointer)
1616  {
1617  /* It was the head; remove it. */
1618  Assert(!DsaPointerIsValid(span->prevspan));
1619  pool->spans[fclass] = span->nextspan;
1620  if (nextspan != NULL)
1621  nextspan->prevspan = InvalidDsaPointer;
1622  }
1623  else
1624  {
1625  /* It was not the head. */
1626  Assert(DsaPointerIsValid(span->prevspan));
1627  prevspan = (dsa_area_span *)
1628  dsa_get_address(area, span->prevspan);
1629  prevspan->nextspan = span->nextspan;
1630  }
1631  if (nextspan != NULL)
1632  nextspan->prevspan = span->prevspan;
1633 
1634  /* Push onto the head of the new fullness class list. */
1635  span->nextspan = pool->spans[tfclass];
1636  pool->spans[tfclass] = span_pointer;
1637  span->prevspan = InvalidDsaPointer;
1638  if (DsaPointerIsValid(span->nextspan))
1639  {
1640  nextspan = (dsa_area_span *)
1641  dsa_get_address(area, span->nextspan);
1642  nextspan->prevspan = span_pointer;
1643  }
1644  span->fclass = tfclass;
1645  }
1646 
1647  /* Advance to next span on list. */
1648  span_pointer = next_span_pointer;
1649  }
1650 
1651  /* Stop now if we found a suitable block. */
1652  if (DsaPointerIsValid(pool->spans[1]))
1653  return true;
1654  }
1655 
1656  /*
1657  * If there are no blocks that properly belong in fullness class 1, pick
1658  * one from some other fullness class and move it there anyway, so that we
1659  * have an allocation target. Our last choice is to transfer a block
1660  * that's almost empty (and might become completely empty soon if left
1661  * alone), but even that is better than failing, which is what we must do
1662  * if there are no blocks at all with free space.
1663  */
1664  Assert(!DsaPointerIsValid(pool->spans[1]));
1665  for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1666  if (transfer_first_span(area, pool, fclass, 1))
1667  return true;
1668  if (!DsaPointerIsValid(pool->spans[1]) &&
1669  transfer_first_span(area, pool, 0, 1))
1670  return true;
1671 
1672  /*
1673  * We failed to find an existing span with free objects, so we need to
1674  * allocate a new superblock and construct a new span to manage it.
1675  *
1676  * First, get a dsa_area_span object to describe the new superblock
1677  * ... unless this allocation is for a dsa_area_span object, in which case
1678  * that's surely not going to work. We handle that case by storing the
1679  * span describing a block-of-spans inline.
1680  */
1681  if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1682  {
1683  span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
1684  if (!DsaPointerIsValid(span_pointer))
1685  return false;
1686  npages = DSA_PAGES_PER_SUPERBLOCK;
1687  }
1688 
1689  /* Find or create a segment and allocate the superblock. */
1690  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1691  segment_map = get_best_segment(area, npages);
1692  if (segment_map == NULL)
1693  {
1694  segment_map = make_new_segment(area, npages);
1695  if (segment_map == NULL)
1696  {
1697  LWLockRelease(DSA_AREA_LOCK(area));
1698  return false;
1699  }
1700  }
1701 
1702  /*
1703  * This shouldn't happen: get_best_segment() or make_new_segment()
1704  * promised that we can successfully allocate npages.
1705  */
1706  if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
1707  elog(FATAL,
1708  "dsa_allocate could not find %zu free pages for superblock",
1709  npages);
1710  LWLockRelease(DSA_AREA_LOCK(area));
1711 
1712  /* Compute the start of the superblock. */
1713  start_pointer =
1714  DSA_MAKE_POINTER(get_segment_index(area, segment_map),
1715  first_page * FPM_PAGE_SIZE);
1716 
1717  /*
1718  * If this is a block-of-spans, carve the descriptor right out of the
1719  * allocated space.
1720  */
1721  if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1722  {
1723  /*
1724  * We have a pointer into the segment. We need to build a dsa_pointer
1725  * from the segment index and offset into the segment.
1726  */
1727  span_pointer = start_pointer;
1728  }
1729 
1730  /* Initialize span and pagemap. */
1731  init_span(area, span_pointer, pool, start_pointer, npages, size_class);
1732  for (i = 0; i < npages; ++i)
1733  segment_map->pagemap[first_page + i] = span_pointer;
1734 
1735  return true;
1736 }
1737 
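/*
 * [Editorial sketch, not part of dsa.c] How the function above is used by
 * its caller, loosely following alloc_object(); the details of carving an
 * object out of the active span are elided:
 *
 *    LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
 *    if (!ensure_active_superblock(area, pool, size_class))
 *        result = InvalidDsaPointer;       (out of memory)
 *    else
 *        result = ...;                     (pop object from pool->spans[1])
 *    LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
 */
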
1738 /*
1739  * Return the segment map corresponding to a given segment index, mapping the
1740  * segment in if necessary. For internal segment book-keeping, this is called
1741  * with the area lock held. It is also called by dsa_free and dsa_get_address
1742  * without any locking, relying on the fact that they have a known live
1743  * segment index and always call check_for_freed_segments to ensure that any
1744  * freed segment occupying the same slot is detached first.
1745  */
1746 static dsa_segment_map *
1747 get_segment_by_index(dsa_area *area, dsa_segment_index index)
1748 {
1749  if (unlikely(area->segment_maps[index].mapped_address == NULL))
1750  {
1751  dsm_handle handle;
1752  dsm_segment *segment;
1753  dsa_segment_map *segment_map;
1754  ResourceOwner oldowner;
1755 
1756  /*
1757  * If we are reached by dsa_free or dsa_get_address, there must be at
1758  * least one object allocated in the referenced segment. Otherwise,
1759  * their caller has a double-free or access-after-free bug, which we
1760  * have no hope of detecting. So we know it's safe to access this
1761  * array slot without holding a lock; it won't change underneath us.
1762  * Furthermore, we know that we can see the latest contents of the
1763  * slot, as explained in check_for_freed_segments, which those
1764  * functions call before arriving here.
1765  */
1766  handle = area->control->segment_handles[index];
1767 
1768  /* It's an error to try to access an unused slot. */
1769  if (handle == DSM_HANDLE_INVALID)
1770  elog(ERROR,
1771  "dsa_area could not attach to a segment that has been freed");
1772 
1773  oldowner = CurrentResourceOwner;
1774  CurrentResourceOwner = area->resowner;
1775  segment = dsm_attach(handle);
1776  CurrentResourceOwner = oldowner;
1777  if (segment == NULL)
1778  elog(ERROR, "dsa_area could not attach to segment");
1779  segment_map = &area->segment_maps[index];
1780  segment_map->segment = segment;
1781  segment_map->mapped_address = dsm_segment_address(segment);
1782  segment_map->header =
1783  (dsa_segment_header *) segment_map->mapped_address;
1784  segment_map->fpm = (FreePageManager *)
1785  (segment_map->mapped_address +
1786  MAXALIGN(sizeof(dsa_segment_header)));
1787  segment_map->pagemap = (dsa_pointer *)
1788  (segment_map->mapped_address +
1789  MAXALIGN(sizeof(dsa_segment_header)) +
1790  MAXALIGN(sizeof(FreePageManager)));
1791 
1792  /* Remember the highest index this backend has ever mapped. */
1793  if (area->high_segment_index < index)
1794  area->high_segment_index = index;
1795 
1796  Assert(segment_map->header->magic ==
1797  (DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ index));
1798  }
1799 
1800  /*
1801  * Callers of dsa_get_address() and dsa_free() don't hold the area lock,
1802  * but it's a bug in the calling code and undefined behavior if the
1803  * address is not live (ie if the segment might possibly have been freed,
1804  * they're trying to use a dangling pointer).
1805  *
1806  * For dsa.c code that holds the area lock to manipulate segment_bins
1807  * lists, it would be a bug if we ever reach a freed segment here. After
1808  * it's marked as freed, the only thing any backend should do with it is
1809  * unmap it, and it should always have done that in
1810  * check_for_freed_segments_locked() before arriving here to resolve an
1811  * index to a segment_map.
1812  *
1813  * Either way we can assert that we aren't returning a freed segment.
1814  */
1815  Assert(!area->segment_maps[index].header->freed);
1816 
1817  return &area->segment_maps[index];
1818 }
1819 
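/*
 * [Editorial sketch] Resolving a dsa_pointer, roughly as dsa_get_address()
 * does it with the macros defined earlier in this file; validity checks and
 * the preceding check_for_freed_segments() call are omitted:
 *
 *    dsa_segment_index index = DSA_EXTRACT_SEGMENT_NUMBER(dp);
 *    size_t offset = DSA_EXTRACT_OFFSET(dp);
 *    dsa_segment_map *segment_map = get_segment_by_index(area, index);
 *
 *    return segment_map->mapped_address + offset;
 */
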
1820 /*
1821  * Return a superblock to the free page manager. If the underlying segment
1822  * has become entirely free, then return it to the operating system.
1823  *
1824  * The appropriate pool lock must be held.
1825  */
1826 static void
1827 destroy_superblock(dsa_area *area, dsa_pointer span_pointer)
1828 {
1829  dsa_area_span *span = dsa_get_address(area, span_pointer);
1830  int size_class = span->size_class;
1831  dsa_segment_map *segment_map;
1832 
1833 
1834  /* Remove it from its fullness class list. */
1835  unlink_span(area, span);
1836 
1837  /*
1838  * Note: Here we acquire the area lock while we already hold a per-pool
1839  * lock. We never hold the area lock and then take a pool lock, or we
1840  * could deadlock.
1841  */
1842  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1843  check_for_freed_segments_locked(area);
1844  segment_map =
1845  get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start));
1846  FreePageManagerPut(segment_map->fpm,
1847  DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
1848  span->npages);
1849  /* Check if the segment is now entirely free. */
1850  if (fpm_largest(segment_map->fpm) == segment_map->header->usable_pages)
1851  {
1852  dsa_segment_index index = get_segment_index(area, segment_map);
1853 
1854  /* If it's not the segment with extra control data, free it. */
1855  if (index != 0)
1856  {
1857  /*
1858  * Give it back to the OS, and allow other backends to detect that
1859  * they need to detach.
1860  */
1861  unlink_segment(area, segment_map);
1862  segment_map->header->freed = true;
1863  Assert(area->control->total_segment_size >=
1864  segment_map->header->size);
1865  area->control->total_segment_size -=
1866  segment_map->header->size;
1867  dsm_unpin_segment(dsm_segment_handle(segment_map->segment));
1868  dsm_detach(segment_map->segment);
1869  area->control->segment_handles[index] = DSM_HANDLE_INVALID;
1870  ++area->control->freed_segment_counter;
1871  segment_map->segment = NULL;
1872  segment_map->header = NULL;
1873  segment_map->mapped_address = NULL;
1874  }
1875  }
1876 
1877  /* Move segment to appropriate bin if necessary. */
1878  if (segment_map->header != NULL)
1879  rebin_segment(area, segment_map);
1880 
1881  LWLockRelease(DSA_AREA_LOCK(area));
1882 
1883  /*
1884  * Span-of-spans blocks store the span which describes them within the
1885  * block itself, so freeing the storage implicitly frees the descriptor
1886  * also. If this is a block of any other type, we need to separately free
1887  * the span object also. This recursive call to dsa_free will acquire the
1888  * span pool's lock. We can't deadlock because the acquisition order is
1889  * always some other pool and then the span pool.
1890  */
1891  if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1892  dsa_free(area, span_pointer);
1893 }
1894 
1895 static void
1896 unlink_span(dsa_area *area, dsa_area_span *span)
1897 {
1898  if (DsaPointerIsValid(span->nextspan))
1899  {
1900  dsa_area_span *next = dsa_get_address(area, span->nextspan);
1901 
1902  next->prevspan = span->prevspan;
1903  }
1904  if (DsaPointerIsValid(span->prevspan))
1905  {
1906  dsa_area_span *prev = dsa_get_address(area, span->prevspan);
1907 
1908  prev->nextspan = span->nextspan;
1909  }
1910  else
1911  {
1912  dsa_area_pool *pool = dsa_get_address(area, span->pool);
1913 
1914  pool->spans[span->fclass] = span->nextspan;
1915  }
1916 }
1917 
1918 static void
1919 add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
1920  dsa_pointer span_pointer,
1921  int fclass)
1922 {
1923  dsa_area_pool *pool = dsa_get_address(area, span->pool);
1924 
1925  if (DsaPointerIsValid(pool->spans[fclass]))
1926  {
1927  dsa_area_span *head = dsa_get_address(area,
1928  pool->spans[fclass]);
1929 
1930  head->prevspan = span_pointer;
1931  }
1932  span->prevspan = InvalidDsaPointer;
1933  span->nextspan = pool->spans[fclass];
1934  pool->spans[fclass] = span_pointer;
1935  span->fclass = fclass;
1936 }
1937 
1938 /*
1939  * Detach from an area that was either created or attached to by this process.
1940  */
1941 void
1942 dsa_detach(dsa_area *area)
1943 {
1944  int i;
1945 
1946  /* Detach from all segments. */
1947  for (i = 0; i <= area->high_segment_index; ++i)
1948  if (area->segment_maps[i].segment != NULL)
1949  dsm_detach(area->segment_maps[i].segment);
1950 
1951  /*
1952  * Note that 'detaching' (= detaching from DSM segments) doesn't include
1953  * 'releasing' (= adjusting the reference count). It would be nice to
1954  * combine these operations, but client code might never get around to
1955  * calling dsa_detach because of an error path, and a detach hook on any
1956  * particular segment is too late to detach other segments in the area
1957  * without risking a 'leak' warning in the non-error path.
1958  */
1959 
1960  /* Free the backend-local area object. */
1961  pfree(area);
1962 }
1963 
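/*
 * [Editorial sketch, hypothetical client code] A typical lifecycle in the
 * creating backend, assuming a caller-defined tranche id MY_TRANCHE; other
 * backends would call dsa_attach() with the area's handle instead:
 *
 *    dsa_area *area = dsa_create(MY_TRANCHE);
 *    dsa_pointer p = dsa_allocate(area, 42);
 *    char *mem = dsa_get_address(area, p);
 *    ... share p with other backends via shared memory ...
 *    dsa_free(area, p);
 *    dsa_detach(area);
 */
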
1964 /*
1965  * Unlink a segment from the bin that contains it.
1966  */
1967 static void
1968 unlink_segment(dsa_area *area, dsa_segment_map *segment_map)
1969 {
1970  if (segment_map->header->prev != DSA_SEGMENT_INDEX_NONE)
1971  {
1972  dsa_segment_map *prev;
1973 
1974  prev = get_segment_by_index(area, segment_map->header->prev);
1975  prev->header->next = segment_map->header->next;
1976  }
1977  else
1978  {
1979  Assert(area->control->segment_bins[segment_map->header->bin] ==
1980  get_segment_index(area, segment_map));
1981  area->control->segment_bins[segment_map->header->bin] =
1982  segment_map->header->next;
1983  }
1984  if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
1985  {
1986  dsa_segment_map *next;
1987 
1988  next = get_segment_by_index(area, segment_map->header->next);
1989  next->header->prev = segment_map->header->prev;
1990  }
1991 }
1992 
1993 /*
1994  * Find a segment that could satisfy a request for 'npages' of contiguous
1995  * memory, or return NULL if none can be found. This may involve attaching to
1996  * segments that weren't previously attached so that we can query their free
1997  * pages map.
1998  */
1999 static dsa_segment_map *
2000 get_best_segment(dsa_area *area, size_t npages)
2001 {
2002  size_t bin;
2003 
2004  Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2005  check_for_freed_segments_locked(area);
2006 
2007  /*
2008  * Start searching from the first bin that *might* have enough contiguous
2009  * pages.
2010  */
2011  for (bin = contiguous_pages_to_segment_bin(npages);
2012  bin < DSA_NUM_SEGMENT_BINS;
2013  ++bin)
2014  {
2015  /*
2016  * The minimum contiguous size that any segment in this bin should
2017  * have. We'll re-bin if we see segments with fewer.
2018  */
2019  size_t threshold = (size_t) 1 << (bin - 1);
2020  dsa_segment_index segment_index;
2021 
2022  /* Search this bin for a segment with enough contiguous space. */
2023  segment_index = area->control->segment_bins[bin];
2024  while (segment_index != DSA_SEGMENT_INDEX_NONE)
2025  {
2026  dsa_segment_map *segment_map;
2027  dsa_segment_index next_segment_index;
2028  size_t contiguous_pages;
2029 
2030  segment_map = get_segment_by_index(area, segment_index);
2031  next_segment_index = segment_map->header->next;
2032  contiguous_pages = fpm_largest(segment_map->fpm);
2033 
2034  /* Not enough for the request, still enough for this bin. */
2035  if (contiguous_pages >= threshold && contiguous_pages < npages)
2036  {
2037  segment_index = next_segment_index;
2038  continue;
2039  }
2040 
2041  /* Re-bin it if it's no longer in the appropriate bin. */
2042  if (contiguous_pages < threshold)
2043  {
2044  rebin_segment(area, segment_map);
2045 
2046  /*
2047  * But fall through to see if it's enough to satisfy this
2048  * request anyway....
2049  */
2050  }
2051 
2052  /* Check if we are done. */
2053  if (contiguous_pages >= npages)
2054  return segment_map;
2055 
2056  /* Continue searching the same bin. */
2057  segment_index = next_segment_index;
2058  }
2059  }
2060 
2061  /* Not found. */
2062  return NULL;
2063 }
2064 
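/*
 * [Editorial illustration] Assuming contiguous_pages_to_segment_bin() maps
 * n to one more than the position of its leftmost one bit (as the current
 * implementation does), a request for npages = 5 starts scanning at bin 3,
 * whose threshold is (size_t) 1 << (3 - 1) = 4: segments whose largest free
 * run is 4-7 pages live there, and any found with fewer are re-binned.
 */
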
2065 /*
2066  * Create a new segment that can handle at least requested_pages. Returns
2067  * NULL if the requested total size limit or maximum allowed number of
2068  * segments would be exceeded.
2069  */
2070 static dsa_segment_map *
2071 make_new_segment(dsa_area *area, size_t requested_pages)
2072 {
2073  dsa_segment_index new_index;
2074  size_t metadata_bytes;
2075  size_t total_size;
2076  size_t total_pages;
2077  size_t usable_pages;
2078  dsa_segment_map *segment_map;
2079  dsm_segment *segment;
2080  ResourceOwner oldowner;
2081 
2082  Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2083 
2084  /* Find a segment slot that is not in use (linearly for now). */
2085  for (new_index = 1; new_index < DSA_MAX_SEGMENTS; ++new_index)
2086  {
2087  if (area->control->segment_handles[new_index] == DSM_HANDLE_INVALID)
2088  break;
2089  }
2090  if (new_index == DSA_MAX_SEGMENTS)
2091  return NULL;
2092 
2093  /*
2094  * If the total size limit is already exceeded, then we exit early and
2095  * avoid arithmetic wraparound in the unsigned expressions below.
2096  */
2097  if (area->control->total_segment_size >=
2098  area->control->max_total_segment_size)
2099  return NULL;
2100 
2101  /*
2102  * The size should be at least as big as requested, and at least big
2103  * enough to follow a geometric series that approximately doubles the
2104  * total storage each time we create a new segment. We use geometric
2105  * growth because the underlying DSM system isn't designed for large
2106  * numbers of segments (otherwise we might even consider just using one
2107  * DSM segment for each large allocation and for each superblock, and then
2108  * we wouldn't need to use FreePageManager).
2109  *
2110  * We decide on a total segment size first, so that we produce tidy
2111  * power-of-two sized segments. This is a good property to have if we
2112  * move to huge pages in the future. Then we work back to the number of
2113  * pages we can fit.
2114  */
2115  total_size = DSA_INITIAL_SEGMENT_SIZE *
2116  ((size_t) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE));
2117  total_size = Min(total_size, DSA_MAX_SEGMENT_SIZE);
2118  total_size = Min(total_size,
2119  area->control->max_total_segment_size -
2120  area->control->total_segment_size);
2121 
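 /*
  * [Editorial illustration] Assuming DSA_INITIAL_SEGMENT_SIZE is 1MB and
  * DSA_NUM_SEGMENTS_AT_EACH_SIZE is 2, successive values of new_index yield
  * sizes of 1MB, 2MB, 2MB, 4MB, 4MB, 8MB, ... before the Min() clamps
  * above, so every two new segments roughly double the area's total size.
  */
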
2122  total_pages = total_size / FPM_PAGE_SIZE;
2123  metadata_bytes =
2124  MAXALIGN(sizeof(dsa_segment_header)) +
2125  MAXALIGN(sizeof(FreePageManager)) +
2126  sizeof(dsa_pointer) * total_pages;
2127 
2128  /* Add padding up to next page boundary. */
2129  if (metadata_bytes % FPM_PAGE_SIZE != 0)
2130  metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2131  if (total_size <= metadata_bytes)
2132  return NULL;
2133  usable_pages = (total_size - metadata_bytes) / FPM_PAGE_SIZE;
2134  Assert(metadata_bytes + usable_pages * FPM_PAGE_SIZE <= total_size);
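 /*
  * [Editorial illustration, with hypothetical struct sizes] For a 1MB
  * segment: total_pages = 1048576 / 4096 = 256, so the pagemap alone needs
  * 256 * sizeof(dsa_pointer) = 2048 bytes; if the two MAXALIGN'd headers
  * bring metadata_bytes to no more than 4096, it is padded to exactly one
  * page and usable_pages = 255.
  */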
2135 
2136  /* See if that is enough... */
2137  if (requested_pages > usable_pages)
2138  {
2139  /*
2140  * We'll make an odd-sized segment, working forward from the requested
2141  * number of pages.
2142  */
2143  usable_pages = requested_pages;
2144  metadata_bytes =
2145  MAXALIGN(sizeof(dsa_segment_header)) +
2146  MAXALIGN(sizeof(FreePageManager)) +
2147  usable_pages * sizeof(dsa_pointer);
2148 
2149  /* Add padding up to next page boundary. */
2150  if (metadata_bytes % FPM_PAGE_SIZE != 0)
2151  metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2152  total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE;
2153 
2154  /* Is that too large for dsa_pointer's addressing scheme? */
2155  if (total_size > DSA_MAX_SEGMENT_SIZE)
2156  return NULL;
2157 
2158  /* Would that exceed the limit? */
2159  if (total_size > area->control->max_total_segment_size -
2160  area->control->total_segment_size)
2161  return NULL;
2162  }
2163 
2164  /* Create the segment. */
2165  oldowner = CurrentResourceOwner;
2166  CurrentResourceOwner = area->resowner;
2167  segment = dsm_create(total_size, 0);
2168  CurrentResourceOwner = oldowner;
2169  if (segment == NULL)
2170  return NULL;
2171  dsm_pin_segment(segment);
2172 
2173  /* Store the handle in shared memory to be found by index. */
2174  area->control->segment_handles[new_index] =
2175  dsm_segment_handle(segment);
2176  /* Track the highest segment index in the history of the area. */
2177  if (area->control->high_segment_index < new_index)
2178  area->control->high_segment_index = new_index;
2179  /* Track the highest segment index this backend has ever mapped. */
2180  if (area->high_segment_index < new_index)
2181  area->high_segment_index = new_index;
2182  /* Track total size of all segments. */
2183  area->control->total_segment_size += total_size;
2184  Assert(area->control->total_segment_size <=
2185  area->control->max_total_segment_size);
2186 
2187  /* Build a segment map for this segment in this backend. */
2188  segment_map = &area->segment_maps[new_index];
2189  segment_map->segment = segment;
2190  segment_map->mapped_address = dsm_segment_address(segment);
2191  segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
2192  segment_map->fpm = (FreePageManager *)
2193  (segment_map->mapped_address +
2194  MAXALIGN(sizeof(dsa_segment_header)));
2195  segment_map->pagemap = (dsa_pointer *)
2196  (segment_map->mapped_address +
2197  MAXALIGN(sizeof(dsa_segment_header)) +
2198  MAXALIGN(sizeof(FreePageManager)));
2199 
2200  /* Set up the free page map. */
2201  FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
2202  FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
2203  usable_pages);
2204 
2205  /* Set up the segment header and put it in the appropriate bin. */
2206  segment_map->header->magic =
2207  DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ new_index;
2208  segment_map->header->usable_pages = usable_pages;
2209  segment_map->header->size = total_size;
2210  segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
2211  segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2212  segment_map->header->next =
2213  area->control->segment_bins[segment_map->header->bin];
2214  segment_map->header->freed = false;
2215  area->control->segment_bins[segment_map->header->bin] = new_index;
2216  if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2217  {
2218  dsa_segment_map *next =
2219  get_segment_by_index(area, segment_map->header->next);
2220 
2221  Assert(next->header->bin == segment_map->header->bin);
2222  next->header->prev = new_index;
2223  }
2224 
2225  return segment_map;
2226 }
2227 
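/*
 * [Editorial illustration] Layout of a segment as initialized above:
 *
 *    +------------------------+ <- mapped_address
 *    | dsa_segment_header     |
 *    +------------------------+ <- + MAXALIGN(sizeof(dsa_segment_header))
 *    | FreePageManager        |
 *    +------------------------+ <- + MAXALIGN(sizeof(FreePageManager))
 *    | pagemap[total_pages]   |
 *    +------------------------+ <- metadata_bytes (padded to page boundary)
 *    | usable_pages of 4KB... |
 *    +------------------------+
 */
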
2228 /*
2229  * Check if any segments have been freed by destroy_superblock, so we can
2230  * detach from them in this backend. This function is called by
2231  * dsa_get_address and dsa_free to make sure that a dsa_pointer they have
2232  * received can be resolved to the correct segment.
2233  *
2234  * The danger we want to defend against is that there could be an old segment
2235  * mapped into a given slot in this backend, and the dsa_pointer they have
2236  * might refer to some new segment in the same slot. So those functions must
2237  * be sure to process all instructions to detach from a freed segment that had
2238  * been generated by the time this process received the dsa_pointer, before
2239  * they call get_segment_by_index.
2240  */
2241 static void
2242 check_for_freed_segments(dsa_area *area)
2243 {
2244  size_t freed_segment_counter;
2245 
2246  /*
2247  * Any other process that has freed a segment has incremented
2248  * freed_segment_counter while holding an LWLock, and that must precede
2249  * any backend creating a new segment in the same slot while holding an
2250  * LWLock, and that must precede the creation of any dsa_pointer pointing
2251  * into the new segment which might reach us here, and the caller must
2252  * have sent the dsa_pointer to this process using appropriate memory
2253  * synchronization (some kind of locking or atomic primitive or system
2254  * call). So all we need to do on the reading side is ask for the load of
2255  * freed_segment_counter to follow the caller's load of the dsa_pointer it
2256  * has, and we can be sure to detect any segments that had been freed as
2257  * of the time that the dsa_pointer reached this process.
2258  */
2259  pg_read_barrier();
2260  freed_segment_counter = area->control->freed_segment_counter;
2261  if (unlikely(area->freed_segment_counter != freed_segment_counter))
2262  {
2263  /* Check all currently mapped segments to find what's been freed. */
2264  LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
2265  check_for_freed_segments_locked(area);
2266  LWLockRelease(DSA_AREA_LOCK(area));
2267  }
2268 }
2269 
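/*
 * [Editorial sketch] The ordering argument above as a timeline, with
 * hypothetical backends A (freeing) and B (receiving a dsa_pointer):
 *
 *    A: marks segment freed, ++freed_segment_counter   (holding the lock)
 *    A: a new segment may now reuse the same slot
 *    B: receives a dsa_pointer into the new segment (lock/atomic/syscall)
 *    B: pg_read_barrier(), then reads freed_segment_counter
 *    B: sees the bump, detaches the stale mapping before resolving the slot
 */
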
2270 /*
2271  * Workhorse for check_for_freed_segments(), and also used directly in paths
2272  * where the area lock is already held. This should be called after acquiring
2273  * the lock but before looking up any segment by index number, to make sure we
2274  * unmap any stale segments that might have previously had the same index as a
2275  * current segment.
2276  */
2277 static void
2278 check_for_freed_segments_locked(dsa_area *area)
2279 {
2280  size_t freed_segment_counter;
2281  int i;
2282 
2283  Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2284  freed_segment_counter = area->control->freed_segment_counter;
2285  if (unlikely(area->freed_segment_counter != freed_segment_counter))
2286  {
2287  for (i = 0; i <= area->high_segment_index; ++i)
2288  {
2289  if (area->segment_maps[i].header != NULL &&
2290  area->segment_maps[i].header->freed)
2291  {
2292  dsm_detach(area->segment_maps[i].segment);
2293  area->segment_maps[i].segment = NULL;
2294  area->segment_maps[i].header = NULL;
2295  area->segment_maps[i].mapped_address = NULL;
2296  }
2297  }
2298  area->freed_segment_counter = freed_segment_counter;
2299  }
2300 }
2301 
2302 /*
2303  * Re-bin segment if it's no longer in the appropriate bin.
2304  */
2305 static void
2306 rebin_segment(dsa_area *area, dsa_segment_map *segment_map)
2307 {
2308  size_t new_bin;
2309  dsa_segment_index segment_index;
2310 
2311  new_bin = contiguous_pages_to_segment_bin(fpm_largest(segment_map->fpm));
2312  if (segment_map->header->bin == new_bin)
2313  return;
2314 
2315  /* Remove it from its current bin. */
2316  unlink_segment(area, segment_map);
2317 
2318  /* Push it onto the front of its new bin. */
2319  segment_index = get_segment_index(area, segment_map);
2320  segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2321  segment_map->header->next = area->control->segment_bins[new_bin];
2322  segment_map->header->bin = new_bin;
2323  area->control->segment_bins[new_bin] = segment_index;
2324  if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2325  {
2326  dsa_segment_map *next;
2327 
2328  next = get_segment_by_index(area, segment_map->header->next);
2329  Assert(next->header->bin == new_bin);
2330  next->header->prev = segment_index;
2331  }
2332 }