PostgreSQL Source Code (git master)
freepage.c
1 /*-------------------------------------------------------------------------
2  *
3  * freepage.c
4  * Management of free memory pages.
5  *
6  * The intention of this code is to provide infrastructure for memory
7  * allocators written specifically for PostgreSQL. At least in the case
8  * of dynamic shared memory, we can't simply use malloc() or even
9  * relatively thin wrappers like palloc() which sit on top of it, because
10  * no allocator built into the operating system will deal with relative
11  * pointers. In the future, we may find other cases in which greater
12  * control over our own memory management seems desirable.
13  *
14  * A FreePageManager keeps track of which 4kB pages of memory are currently
15  * unused from the point of view of some higher-level memory allocator.
16  * Unlike a user-facing allocator such as palloc(), a FreePageManager can
17  * only allocate and free in units of whole pages, and freeing an
18  * allocation can only be done given knowledge of its length in pages.
19  *
20  * Since a free page manager has only a fixed amount of dedicated memory,
21  * and since there is no underlying allocator, it uses the free pages
22  * it is given to manage to store its bookkeeping data. It keeps multiple
23  * freelists of runs of pages, sorted by the size of the run; the head of
24  * each freelist is stored in the FreePageManager itself, and the first
25  * page of each run contains a relative pointer to the next run. See
26  * FreePageManagerGetInternal for more details on how the freelists are
27  * managed.
28  *
29  * To avoid memory fragmentation, it's important to consolidate adjacent
30  * spans of pages whenever possible; otherwise, large allocation requests
31  * might not be satisfied even when sufficient contiguous space is
32  * available. Therefore, in addition to the freelists, we maintain an
33  * in-memory btree of free page ranges ordered by page number. If a
34  * range being freed precedes or follows a range that is already free,
35  * the existing range is extended; if it exactly bridges the gap between
36  * free ranges, then the two existing ranges are consolidated with the
37  * newly-freed range to form one great big range of free pages.
38  *
39  * When there is only one range of free pages, the btree is trivial and
40  * is stored within the FreePageManager proper; otherwise, pages are
41  * allocated from the area under management as needed. Even in cases
42  * where memory fragmentation is very severe, only a tiny fraction of
43  * the pages under management are consumed by this btree.
44  *
45  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
46  * Portions Copyright (c) 1994, Regents of the University of California
47  *
48  * IDENTIFICATION
49  * src/backend/utils/mmgr/freepage.c
50  *
51  *-------------------------------------------------------------------------
52  */
53 
54 #include "postgres.h"
55 #include "lib/stringinfo.h"
56 #include "miscadmin.h"
57 
58 #include "utils/freepage.h"
59 #include "utils/relptr.h"
60 
61 
62 /* Magic numbers to identify various page types */
63 #define FREE_PAGE_SPAN_LEADER_MAGIC 0xea4020f0
64 #define FREE_PAGE_LEAF_MAGIC 0x98eae728
65 #define FREE_PAGE_INTERNAL_MAGIC 0x19aa32c9
66 
67 /* Doubly linked list of spans of free pages; stored in first page of span. */
68 struct FreePageSpanLeader
69 {
70  int magic; /* always FREE_PAGE_SPAN_LEADER_MAGIC */
71  Size npages; /* number of pages in span */
72  RelptrFreePageSpanLeader prev;
73  RelptrFreePageSpanLeader next;
74 };
75 
76 /* Common header for btree leaf and internal pages. */
77 typedef struct FreePageBtreeHeader
78 {
79  int magic; /* FREE_PAGE_LEAF_MAGIC or
80  * FREE_PAGE_INTERNAL_MAGIC */
81  Size nused; /* number of items used */
82  RelptrFreePageBtree parent; /* uplink */
83 } FreePageBtreeHeader;
84 
85 /* Internal key; points to next level of btree. */
86 typedef struct FreePageBtreeInternalKey
87 {
88  Size first_page; /* low bound for keys on child page */
89  RelptrFreePageBtree child; /* downlink */
90 } FreePageBtreeInternalKey;
91 
92 /* Leaf key; no payload data. */
93 typedef struct FreePageBtreeLeafKey
94 {
95  Size first_page; /* first page in span */
96  Size npages; /* number of pages in span */
97 } FreePageBtreeLeafKey;
98 
99 /* Work out how many keys will fit on a page. */
100 #define FPM_ITEMS_PER_INTERNAL_PAGE \
101  ((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \
102  sizeof(FreePageBtreeInternalKey))
103 #define FPM_ITEMS_PER_LEAF_PAGE \
104  ((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \
105  sizeof(FreePageBtreeLeafKey))
106 
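/*
 * Worked example (illustrative note, not from the original sources): on a
 * common 64-bit build, sizeof(FreePageBtreeHeader) is 24 bytes (4-byte
 * magic plus 4 bytes of padding, an 8-byte nused, and an 8-byte relative
 * parent pointer), and both key types are 16 bytes. With FPM_PAGE_SIZE
 * being 4096, each btree page then holds (4096 - 24) / 16 = 254 keys of
 * either kind. The exact figures are platform-dependent.
 */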
107 /* A btree page of either sort */
108 struct FreePageBtree
109 {
110  FreePageBtreeHeader hdr;
111  union
112  {
113   FreePageBtreeInternalKey internal_key[FPM_ITEMS_PER_INTERNAL_PAGE];
114   FreePageBtreeLeafKey leaf_key[FPM_ITEMS_PER_LEAF_PAGE];
115  } u;
116 };
117 
118 /* Results of a btree search */
119 typedef struct FreePageBtreeSearchResult
120 {
121  FreePageBtree *page;
122  Size index;
123  bool found;
124  unsigned split_pages;
125 } FreePageBtreeSearchResult;
126 
127 /* Helper functions */
128 static void FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm,
129  FreePageBtree *btp);
130 static Size FreePageBtreeCleanup(FreePageManager *fpm);
131 static FreePageBtree *FreePageBtreeFindLeftSibling(char *base,
132  FreePageBtree *btp);
133 static FreePageBtree *FreePageBtreeFindRightSibling(char *base,
134  FreePageBtree *btp);
135 static Size FreePageBtreeFirstKey(FreePageBtree *btp);
136 static FreePageBtree *FreePageBtreeGetRecycled(FreePageManager *fpm);
137 static void FreePageBtreeInsertInternal(char *base, FreePageBtree *btp,
138  Size index, Size first_page, FreePageBtree *child);
139 static void FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index,
140  Size first_page, Size npages);
141 static void FreePageBtreeRecycle(FreePageManager *fpm, Size pageno);
142 static void FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp,
143  Size index);
144 static void FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp);
145 static void FreePageBtreeSearch(FreePageManager *fpm, Size first_page,
146  FreePageBtreeSearchResult *result);
147 static Size FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page);
148 static Size FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page);
149 static FreePageBtree *FreePageBtreeSplitPage(FreePageManager *fpm,
150  FreePageBtree *btp);
151 static void FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp);
152 static void FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp,
153  FreePageBtree *parent, int level, StringInfo buf);
154 static void FreePageManagerDumpSpans(FreePageManager *fpm,
155  FreePageSpanLeader *span, Size expected_pages,
156  StringInfo buf);
157 static bool FreePageManagerGetInternal(FreePageManager *fpm, Size npages,
158  Size *first_page);
159 static Size FreePageManagerPutInternal(FreePageManager *fpm, Size first_page,
160  Size npages, bool soft);
161 static void FreePagePopSpanLeader(FreePageManager *fpm, Size pageno);
162 static void FreePagePushSpanLeader(FreePageManager *fpm, Size first_page,
163  Size npages);
164 static Size FreePageManagerLargestContiguous(FreePageManager *fpm);
165 static void FreePageManagerUpdateLargest(FreePageManager *fpm);
166 
167 #ifdef FPM_EXTRA_ASSERTS
168 static Size sum_free_pages(FreePageManager *fpm);
169 #endif
170 
171 /*
172  * Initialize a new, empty free page manager.
173  *
174  * 'fpm' should reference caller-provided memory large enough to contain a
175  * FreePageManager. We'll initialize it here.
176  *
177  * 'base' is the address to which all pointers are relative. When managing
178  * a dynamic shared memory segment, it should normally be the base of the
179  * segment. When managing backend-private memory, it can be either NULL or,
180  * if managing a single contiguous extent of memory, the start of that extent.
181  */
182 void
183 FreePageManagerInitialize(FreePageManager *fpm, char *base)
184 {
185  Size f;
186 
187  relptr_store(base, fpm->self, fpm);
188  relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
189  relptr_store(base, fpm->btree_recycle, (FreePageSpanLeader *) NULL);
190  fpm->btree_depth = 0;
191  fpm->btree_recycle_count = 0;
192  fpm->singleton_first_page = 0;
193  fpm->singleton_npages = 0;
194  fpm->contiguous_pages = 0;
195  fpm->contiguous_pages_dirty = true;
196 #ifdef FPM_EXTRA_ASSERTS
197  fpm->free_pages = 0;
198 #endif
199 
200  for (f = 0; f < FPM_NUM_FREELISTS; f++)
201  relptr_store(base, fpm->freelist[f], (FreePageSpanLeader *) NULL);
202 }
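/*
 * Example (a minimal sketch, not from the original sources; the helper name
 * and sizes are hypothetical): one way to set up a FreePageManager over a
 * backend-private extent, loosely modeled on how dsa.c lays out a segment.
 * The metadata lives at the start of the extent and every whole page after
 * it is handed to the manager.
 */
static FreePageManager *
example_create_fpm(Size npages)
{
	char	   *base = palloc(npages * FPM_PAGE_SIZE);
	FreePageManager *fpm = (FreePageManager *) base;
	Size		metadata_pages;

	/* Round the FreePageManager's own footprint up to whole pages. */
	metadata_pages = (sizeof(FreePageManager) + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE;

	/* Pointers are stored relative to the start of the extent. */
	FreePageManagerInitialize(fpm, base);

	/* Everything after the metadata starts out free. */
	FreePageManagerPut(fpm, metadata_pages, npages - metadata_pages);

	return fpm;
}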
203 
204 /*
205  * Allocate a run of pages of the given length from the free page manager.
206  * The return value indicates whether we were able to satisfy the request;
207  * if true, the first page of the allocation is stored in *first_page.
208  */
209 bool
210 FreePageManagerGet(FreePageManager *fpm, Size npages, Size *first_page)
211 {
212  bool result;
213  Size contiguous_pages;
214 
215  result = FreePageManagerGetInternal(fpm, npages, first_page);
216 
217  /*
218  * It's a bit counterintuitive, but allocating pages can actually create
219  * opportunities for cleanup that create larger ranges. We might pull a
220  * key out of the btree that enables the item at the head of the btree
221  * recycle list to be inserted; and then if there are more items behind it
222  * one of those might cause two currently-separated ranges to merge,
223  * creating a single range of contiguous pages larger than any that
224  * existed previously. It might be worth trying to improve the cleanup
225  * algorithm to avoid such corner cases, but for now we just notice the
226  * condition and do the appropriate reporting.
227  */
228  contiguous_pages = FreePageBtreeCleanup(fpm);
229  if (fpm->contiguous_pages < contiguous_pages)
230  fpm->contiguous_pages = contiguous_pages;
231 
232  /*
233  * FreePageManagerGetInternal may have set contiguous_pages_dirty.
234  * Recompute contiguous_pages if so.
235  */
236  FreePageManagerUpdateLargest(fpm);
237 
238 #ifdef FPM_EXTRA_ASSERTS
239  if (result)
240  {
241  Assert(fpm->free_pages >= npages);
242  fpm->free_pages -= npages;
243  }
244  Assert(fpm->free_pages == sum_free_pages(fpm));
245  Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm));
246 #endif
247  return result;
248 }
249 
250 #ifdef FPM_EXTRA_ASSERTS
251 static void
252 sum_free_pages_recurse(FreePageManager *fpm, FreePageBtree *btp, Size *sum)
253 {
254  char *base = fpm_segment_base(fpm);
255 
256  Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC ||
257  btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
258  ++*sum;
259  if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
260  {
261  Size index;
262 
263 
264  for (index = 0; index < btp->hdr.nused; ++index)
265  {
266  FreePageBtree *child;
267 
268  child = relptr_access(base, btp->u.internal_key[index].child);
269  sum_free_pages_recurse(fpm, child, sum);
270  }
271  }
272 }
273 static Size
274 sum_free_pages(FreePageManager *fpm)
275 {
276  FreePageSpanLeader *recycle;
277  char *base = fpm_segment_base(fpm);
278  Size sum = 0;
279  int list;
280 
281  /* Count the spans by scanning the freelists. */
282  for (list = 0; list < FPM_NUM_FREELISTS; ++list)
283  {
284 
285  if (!relptr_is_null(fpm->freelist[list]))
286  {
287  FreePageSpanLeader *candidate =
288  relptr_access(base, fpm->freelist[list]);
289 
290  do
291  {
292  sum += candidate->npages;
293  candidate = relptr_access(base, candidate->next);
294  } while (candidate != NULL);
295  }
296  }
297 
298  /* Count btree internal pages. */
299  if (fpm->btree_depth > 0)
300  {
301  FreePageBtree *root = relptr_access(base, fpm->btree_root);
302 
303  sum_free_pages_recurse(fpm, root, &sum);
304  }
305 
306  /* Count the recycle list. */
307  for (recycle = relptr_access(base, fpm->btree_recycle);
308  recycle != NULL;
309  recycle = relptr_access(base, recycle->next))
310  {
311  Assert(recycle->npages == 1);
312  ++sum;
313  }
314 
315  return sum;
316 }
317 #endif
318 
319 /*
320  * Compute the size of the largest run of pages that the user could
321  * successfully get.
322  */
323 static Size
324 FreePageManagerLargestContiguous(FreePageManager *fpm)
325 {
326  char *base;
327  Size largest;
328 
329  base = fpm_segment_base(fpm);
330  largest = 0;
331  if (!relptr_is_null(fpm->freelist[FPM_NUM_FREELISTS - 1]))
332  {
333  FreePageSpanLeader *candidate;
334 
335  candidate = relptr_access(base, fpm->freelist[FPM_NUM_FREELISTS - 1]);
336  do
337  {
338  if (candidate->npages > largest)
339  largest = candidate->npages;
340  candidate = relptr_access(base, candidate->next);
341  } while (candidate != NULL);
342  }
343  else
344  {
345  Size f = FPM_NUM_FREELISTS - 1;
346 
347  do
348  {
349  --f;
350  if (!relptr_is_null(fpm->freelist[f]))
351  {
352  largest = f + 1;
353  break;
354  }
355  } while (f > 0);
356  }
357 
358  return largest;
359 }
360 
361 /*
362  * Recompute the size of the largest run of pages that the user could
363  * successfully get, if it has been marked dirty.
364  */
365 static void
366 FreePageManagerUpdateLargest(FreePageManager *fpm)
367 {
368  if (!fpm->contiguous_pages_dirty)
369  return;
370 
371  fpm->contiguous_pages = FreePageManagerLargestContiguous(fpm);
372  fpm->contiguous_pages_dirty = false;
373 }
374 
375 /*
376  * Transfer a run of pages to the free page manager.
377  */
378 void
379 FreePageManagerPut(FreePageManager *fpm, Size first_page, Size npages)
380 {
381  Size contiguous_pages;
382 
383  Assert(npages > 0);
384 
385  /* Record the new pages. */
386  contiguous_pages =
387  FreePageManagerPutInternal(fpm, first_page, npages, false);
388 
389  /*
390  * If the new range we inserted into the page manager was contiguous with
391  * an existing range, it may have opened up cleanup opportunities.
392  */
393  if (contiguous_pages > npages)
394  {
395  Size cleanup_contiguous_pages;
396 
397  cleanup_contiguous_pages = FreePageBtreeCleanup(fpm);
398  if (cleanup_contiguous_pages > contiguous_pages)
399  contiguous_pages = cleanup_contiguous_pages;
400  }
401 
402  /* See if we now have a new largest chunk. */
403  if (fpm->contiguous_pages < contiguous_pages)
404  fpm->contiguous_pages = contiguous_pages;
405 
406  /*
407  * The earlier call to FreePageManagerPutInternal may have set
408  * contiguous_pages_dirty if it needed to allocate internal pages, so
409  * recompute contiguous_pages if necessary.
410  */
411  FreePageManagerUpdateLargest(fpm);
412 
413 #ifdef FPM_EXTRA_ASSERTS
414  fpm->free_pages += npages;
415  Assert(fpm->free_pages == sum_free_pages(fpm));
416  Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm));
417 #endif
418 }
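/*
 * Example (a minimal sketch, not from the original sources; the helper name
 * is hypothetical): a Get/Put round trip. A request can fail even when
 * enough pages are free in total, if no run of the requested length is
 * contiguous, so the result must be checked. Freeing requires remembering
 * the allocation's length in pages.
 */
static void
example_get_and_put(FreePageManager *fpm)
{
	Size		first_page;

	if (FreePageManagerGet(fpm, 4, &first_page))
	{
		/* ... use pages first_page .. first_page + 3 here ... */

		FreePageManagerPut(fpm, first_page, 4);
	}
}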
419 
420 /*
421  * Produce a debugging dump of the state of a free page manager.
422  */
423 char *
424 FreePageManagerDump(FreePageManager *fpm)
425 {
426  char *base = fpm_segment_base(fpm);
427  StringInfoData buf;
428  FreePageSpanLeader *recycle;
429  bool dumped_any_freelist = false;
430  Size f;
431 
432  /* Initialize output buffer. */
433  initStringInfo(&buf);
434 
435  /* Dump general stuff. */
436  appendStringInfo(&buf, "metadata: self %zu max contiguous pages = %zu\n",
437  relptr_offset(fpm->self), fpm->contiguous_pages);
438 
439  /* Dump btree. */
440  if (fpm->btree_depth > 0)
441  {
442  FreePageBtree *root;
443 
444  appendStringInfo(&buf, "btree depth %u:\n", fpm->btree_depth);
445  root = relptr_access(base, fpm->btree_root);
446  FreePageManagerDumpBtree(fpm, root, NULL, 0, &buf);
447  }
448  else if (fpm->singleton_npages > 0)
449  {
450  appendStringInfo(&buf, "singleton: %zu(%zu)\n",
451  fpm->singleton_first_page, fpm->singleton_npages);
452  }
453 
454  /* Dump btree recycle list. */
455  recycle = relptr_access(base, fpm->btree_recycle);
456  if (recycle != NULL)
457  {
458  appendStringInfoString(&buf, "btree recycle:");
459  FreePageManagerDumpSpans(fpm, recycle, 1, &buf);
460  }
461 
462  /* Dump free lists. */
463  for (f = 0; f < FPM_NUM_FREELISTS; ++f)
464  {
465  FreePageSpanLeader *span;
466 
467  if (relptr_is_null(fpm->freelist[f]))
468  continue;
469  if (!dumped_any_freelist)
470  {
471  appendStringInfoString(&buf, "freelists:\n");
472  dumped_any_freelist = true;
473  }
474  appendStringInfo(&buf, " %zu:", f + 1);
475  span = relptr_access(base, fpm->freelist[f]);
476  FreePageManagerDumpSpans(fpm, span, f + 1, &buf);
477  }
478 
479  /* And return result to caller. */
480  return buf.data;
481 }
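/*
 * Example (a minimal sketch, not from the original sources; the helper name
 * is hypothetical): the dump is returned as a palloc'd string built with
 * the StringInfo machinery, so a debugging caller can log it and then free
 * it.
 */
static void
example_dump(FreePageManager *fpm)
{
	char	   *s = FreePageManagerDump(fpm);

	elog(DEBUG1, "free page manager state:\n%s", s);
	pfree(s);
}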
482 
483 
484 /*
485  * The first_page value stored at index zero in any non-root page must match
486  * the first_page value stored in its parent at the index which points to that
487  * page. So when the value stored at index zero in a btree page changes, we've
488  * got to walk up the tree adjusting ancestor keys until we reach an ancestor
489  * where that key isn't index zero. This function should be called after
490  * updating the first key on the target page; it will propagate the change
491  * upward as far as needed.
492  *
493  * We assume here that the first key on the page has not changed enough to
494  * require changes in the ordering of keys on its ancestor pages. Thus,
495  * if we search the parent page for the first key greater than or equal to
496  * the first key on the current page, the downlink to this page will be either
497  * the exact index returned by the search (if the first key decreased)
498  * or one less (if the first key increased).
499  */
500 static void
501 FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm, FreePageBtree *btp)
502 {
503  char *base = fpm_segment_base(fpm);
504  Size first_page;
505  FreePageBtree *parent;
506  FreePageBtree *child;
507 
508  /* This might be either a leaf or an internal page. */
509  Assert(btp->hdr.nused > 0);
510  if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
511  {
512  Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE);
513  first_page = btp->u.leaf_key[0].first_page;
514  }
515  else
516  {
517  Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
518  Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE);
519  first_page = btp->u.internal_key[0].first_page;
520  }
521  child = btp;
522 
523  /* Loop until we find an ancestor that does not require adjustment. */
524  for (;;)
525  {
526  Size s;
527 
528  parent = relptr_access(base, child->hdr.parent);
529  if (parent == NULL)
530  break;
531  s = FreePageBtreeSearchInternal(parent, first_page);
532 
533  /* Key is either at index s or index s-1; figure out which. */
534  if (s >= parent->hdr.nused)
535  {
536  Assert(s == parent->hdr.nused);
537  --s;
538  }
539  else
540  {
541  FreePageBtree *check;
542 
543  check = relptr_access(base, parent->u.internal_key[s].child);
544  if (check != child)
545  {
546  Assert(s > 0);
547  --s;
548  }
549  }
550 
551 #ifdef USE_ASSERT_CHECKING
552  /* Debugging double-check. */
553  {
554  FreePageBtree *check;
555 
556  check = relptr_access(base, parent->u.internal_key[s].child);
557  Assert(s < parent->hdr.nused);
558  Assert(child == check);
559  }
560 #endif
561 
562  /* Update the parent key. */
563  parent->u.internal_key[s].first_page = first_page;
564 
565  /*
566  * If this is the first key in the parent, go up another level; else
567  * done.
568  */
569  if (s > 0)
570  break;
571  child = parent;
572  }
573 }
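/*
 * Illustration (added note, not from the original sources): if the leftmost
 * leaf's first key changes from 10 to 12, the parent's copy of that key
 * must change too, and so on upward while the adjusted key stays at index
 * zero:
 *
 *     internal: [10|40]              internal: [12|40]
 *              /      \       ==>             /      \
 *     leaf: [10,...] [40,...]        leaf: [12,...] [40,...]
 */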
574 
575 /*
576  * Attempt to reclaim space from the free-page btree. The return value is
577  * the largest range of contiguous pages created by the cleanup operation.
578  */
579 static Size
580 FreePageBtreeCleanup(FreePageManager *fpm)
581 {
582  char *base = fpm_segment_base(fpm);
583  Size max_contiguous_pages = 0;
584 
585  /* Attempt to shrink the depth of the btree. */
586  while (!relptr_is_null(fpm->btree_root))
587  {
588  FreePageBtree *root = relptr_access(base, fpm->btree_root);
589 
589 
590  /* If the root contains only one key, reduce depth by one. */
591  if (root->hdr.nused == 1)
592  {
593  /* Shrink depth of tree by one. */
594  Assert(fpm->btree_depth > 0);
595  --fpm->btree_depth;
596  if (root->hdr.magic == FREE_PAGE_LEAF_MAGIC)
597  {
598  /* If root is a leaf, convert only entry to singleton range. */
599  relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
600  fpm->singleton_first_page = root->u.leaf_key[0].first_page;
601  fpm->singleton_npages = root->u.leaf_key[0].npages;
602  }
603  else
604  {
605  FreePageBtree *newroot;
606 
607  /* If root is an internal page, make only child the root. */
608  Assert(root->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
609  relptr_copy(fpm->btree_root, root->u.internal_key[0].child);
610  newroot = relptr_access(base, fpm->btree_root);
611  relptr_store(base, newroot->hdr.parent, (FreePageBtree *) NULL);
612  }
613  FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, root));
614  }
615  else if (root->hdr.nused == 2 &&
616  root->hdr.magic == FREE_PAGE_LEAF_MAGIC)
617  {
618  Size end_of_first;
619  Size start_of_second;
620 
621  end_of_first = root->u.leaf_key[0].first_page +
622  root->u.leaf_key[0].npages;
623  start_of_second = root->u.leaf_key[1].first_page;
624 
625  if (end_of_first + 1 == start_of_second)
626  {
627  Size root_page = fpm_pointer_to_page(base, root);
628 
629  if (end_of_first == root_page)
630  {
631  FreePagePopSpanLeader(fpm, root->u.leaf_key[0].first_page);
632  FreePagePopSpanLeader(fpm, root->u.leaf_key[1].first_page);
633  fpm->singleton_first_page = root->u.leaf_key[0].first_page;
634  fpm->singleton_npages = root->u.leaf_key[0].npages +
635  root->u.leaf_key[1].npages + 1;
636  fpm->btree_depth = 0;
637  relptr_store(base, fpm->btree_root,
638  (FreePageBtree *) NULL);
639  FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
640  fpm->singleton_npages);
641  Assert(max_contiguous_pages == 0);
642  max_contiguous_pages = fpm->singleton_npages;
643  }
644  }
645 
646  /* Whether it worked or not, it's time to stop. */
647  break;
648  }
649  else
650  {
651  /* Nothing more to do. Stop. */
652  break;
653  }
654  }
655 
656  /*
657  * Attempt to free recycled btree pages. We skip this if releasing the
658  * recycled page would require a btree page split, because the page we're
659  * trying to recycle would be consumed by the split, which would be
660  * counterproductive.
661  *
662  * We also currently only ever attempt to recycle the first page on the
663  * list; that could be made more aggressive, but it's not clear that the
664  * complexity would be worthwhile.
665  */
666  while (fpm->btree_recycle_count > 0)
667  {
668  FreePageBtree *btp;
669  Size first_page;
670  Size contiguous_pages;
671 
672  btp = FreePageBtreeGetRecycled(fpm);
673  first_page = fpm_pointer_to_page(base, btp);
674  contiguous_pages = FreePageManagerPutInternal(fpm, first_page, 1, true);
675  if (contiguous_pages == 0)
676  {
677  FreePageBtreeRecycle(fpm, first_page);
678  break;
679  }
680  else
681  {
682  if (contiguous_pages > max_contiguous_pages)
683  max_contiguous_pages = contiguous_pages;
684  }
685  }
686 
687  return max_contiguous_pages;
688 }
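/*
 * Illustration (added note, not from the original sources): the soft put
 * above is what makes recycling safe. If the recycled btree page is
 * adjacent to an existing free span, the insertion consolidates with it at
 * no cost; if inserting it would itself force a btree page split, the soft
 * call returns 0 and the page is simply placed back on the recycle list.
 */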
689 
690 /*
691  * Consider consolidating the given page with its left or right sibling,
692  * if it's fairly empty.
693  */
694 static void
695 FreePageBtreeConsolidate(FreePageManager *fpm, FreePageBtree *btp)
696 {
697  char *base = fpm_segment_base(fpm);
698  FreePageBtree *np;
699  Size max;
700 
701  /*
702  * We only try to consolidate pages that are less than a third full. We
703  * could be more aggressive about this, but that might risk performing
704  * consolidation only to end up splitting again shortly thereafter. Since
705  * the btree should be very small compared to the space under management,
706  * our goal isn't so much to ensure that it always occupies the absolutely
707  * smallest possible number of pages as to reclaim pages before things get
708  * too egregiously out of hand.
709  */
710  if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
711  max = FPM_ITEMS_PER_LEAF_PAGE;
712  else
713  {
714  Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
715  max = FPM_ITEMS_PER_INTERNAL_PAGE;
716  }
717  if (btp->hdr.nused >= max / 3)
718  return;
719 
720  /*
721  * If we can fit our right sibling's keys onto this page, consolidate.
722  */
723  np = FreePageBtreeFindRightSibling(base, btp);
724  if (np != NULL && btp->hdr.nused + np->hdr.nused <= max)
725  {
726  if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
727  {
728  memcpy(&btp->u.leaf_key[btp->hdr.nused], &np->u.leaf_key[0],
729  sizeof(FreePageBtreeLeafKey) * np->hdr.nused);
730  btp->hdr.nused += np->hdr.nused;
731  }
732  else
733  {
734  memcpy(&btp->u.internal_key[btp->hdr.nused], &np->u.internal_key[0],
735  sizeof(FreePageBtreeInternalKey) * np->hdr.nused);
736  btp->hdr.nused += np->hdr.nused;
737  FreePageBtreeUpdateParentPointers(base, btp);
738  }
739  FreePageBtreeRemovePage(fpm, np);
740  return;
741  }
742 
743  /*
744  * If we can fit our keys onto our left sibling's page, consolidate. In
745  * this case, we move our keys onto the other page rather than vice versa,
746  * to avoid having to adjust ancestor keys.
747  */
748  np = FreePageBtreeFindLeftSibling(base, btp);
749  if (np != NULL && btp->hdr.nused + np->hdr.nused <= max)
750  {
751  if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
752  {
753  memcpy(&np->u.leaf_key[np->hdr.nused], &btp->u.leaf_key[0],
754  sizeof(FreePageBtreeLeafKey) * btp->hdr.nused);
755  np->hdr.nused += btp->hdr.nused;
756  }
757  else
758  {
759  memcpy(&np->u.internal_key[np->hdr.nused], &btp->u.internal_key[0],
760  sizeof(FreePageBtreeInternalKey) * btp->hdr.nused);
761  np->hdr.nused += btp->hdr.nused;
762  FreePageBtreeUpdateParentPointers(base, np);
763  }
764  FreePageBtreeRemovePage(fpm, btp);
765  return;
766  }
767 }
768 
769 /*
770  * Find the passed page's left sibling; that is, the page at the same level
771  * of the tree whose keyspace immediately precedes ours.
772  */
773 static FreePageBtree *
774 FreePageBtreeFindLeftSibling(char *base, FreePageBtree *btp)
775 {
776  FreePageBtree *p = btp;
777  int levels = 0;
778 
779  /* Move up until we can move left. */
780  for (;;)
781  {
782  Size first_page;
783  Size index;
784 
785  first_page = FreePageBtreeFirstKey(p);
786  p = relptr_access(base, p->hdr.parent);
787 
788  if (p == NULL)
789  return NULL; /* we were passed the leftmost page */
790 
791  index = FreePageBtreeSearchInternal(p, first_page);
792  if (index > 0)
793  {
794  Assert(p->u.internal_key[index].first_page == first_page);
795  p = relptr_access(base, p->u.internal_key[index - 1].child);
796  break;
797  }
798  Assert(index == 0);
799  ++levels;
800  }
801 
802  /* Descend right. */
803  while (levels > 0)
804  {
805  Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
806  p = relptr_access(base, p->u.internal_key[p->hdr.nused - 1].child);
807  --levels;
808  }
809  Assert(p->hdr.magic == btp->hdr.magic);
810 
811  return p;
812 }
813 
814 /*
815  * Find the passed page's right sibling; that is, the page at the same level
816  * of the tree whose keyspace immediately follows ours.
817  */
818 static FreePageBtree *
819 FreePageBtreeFindRightSibling(char *base, FreePageBtree *btp)
820 {
821  FreePageBtree *p = btp;
822  int levels = 0;
823 
824  /* Move up until we can move right. */
825  for (;;)
826  {
827  Size first_page;
828  Size index;
829 
830  first_page = FreePageBtreeFirstKey(p);
831  p = relptr_access(base, p->hdr.parent);
832 
833  if (p == NULL)
834  return NULL; /* we were passed the rightmost page */
835 
836  index = FreePageBtreeSearchInternal(p, first_page);
837  if (index < p->hdr.nused - 1)
838  {
839  Assert(p->u.internal_key[index].first_page == first_page);
840  p = relptr_access(base, p->u.internal_key[index + 1].child);
841  break;
842  }
843  Assert(index == p->hdr.nused - 1);
844  ++levels;
845  }
846 
847  /* Descend left. */
848  while (levels > 0)
849  {
850  Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
851  p = relptr_access(base, p->u.internal_key[0].child);
852  --levels;
853  }
854  Assert(p->hdr.magic == btp->hdr.magic);
855 
856  return p;
857 }
858 
859 /*
860  * Get the first key on a btree page.
861  */
862 static Size
863 FreePageBtreeFirstKey(FreePageBtree *btp)
864 {
865  Assert(btp->hdr.nused > 0);
866 
867  if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
868  return btp->u.leaf_key[0].first_page;
869  else
870  {
871  Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
872  return btp->u.internal_key[0].first_page;
873  }
874 }
875 
876 /*
877  * Get a page from the btree recycle list for use as a btree page.
878  */
879 static FreePageBtree *
880 FreePageBtreeGetRecycled(FreePageManager *fpm)
881 {
882  char *base = fpm_segment_base(fpm);
883  FreePageSpanLeader *victim = relptr_access(base, fpm->btree_recycle);
884  FreePageSpanLeader *newhead;
885 
886  Assert(victim != NULL);
887  newhead = relptr_access(base, victim->next);
888  if (newhead != NULL)
889  relptr_copy(newhead->prev, victim->prev);
890  relptr_store(base, fpm->btree_recycle, newhead);
891  Assert(fpm_pointer_is_page_aligned(base, victim));
892  fpm->btree_recycle_count--;
893  return (FreePageBtree *) victim;
894 }
895 
896 /*
897  * Insert an item into an internal page.
898  */
899 static void
900 FreePageBtreeInsertInternal(char *base, FreePageBtree *btp, Size index,
901  Size first_page, FreePageBtree *child)
902 {
903  Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
904  Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE);
905  Assert(index <= btp->hdr.nused);
906  memmove(&btp->u.internal_key[index + 1], &btp->u.internal_key[index],
907  sizeof(FreePageBtreeInternalKey) * (btp->hdr.nused - index));
908  btp->u.internal_key[index].first_page = first_page;
909  relptr_store(base, btp->u.internal_key[index].child, child);
910  ++btp->hdr.nused;
911 }
912 
913 /*
914  * Insert an item into a leaf page.
915  */
916 static void
917 FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index, Size first_page,
918  Size npages)
919 {
920  Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
921  Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE);
922  Assert(index <= btp->hdr.nused);
923  memmove(&btp->u.leaf_key[index + 1], &btp->u.leaf_key[index],
924  sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index));
925  btp->u.leaf_key[index].first_page = first_page;
926  btp->u.leaf_key[index].npages = npages;
927  ++btp->hdr.nused;
928 }
929 
930 /*
931  * Put a page on the btree recycle list.
932  */
933 static void
934 FreePageBtreeRecycle(FreePageManager *fpm, Size pageno)
935 {
936  char *base = fpm_segment_base(fpm);
937  FreePageSpanLeader *head = relptr_access(base, fpm->btree_recycle);
938  FreePageSpanLeader *span;
939 
940  span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno);
941  span->magic = FREE_PAGE_SPAN_LEADER_MAGIC;
942  span->npages = 1;
943  relptr_store(base, span->next, head);
944  relptr_store(base, span->prev, (FreePageSpanLeader *) NULL);
945  if (head != NULL)
946  relptr_store(base, head->prev, span);
947  relptr_store(base, fpm->btree_recycle, span);
948  fpm->btree_recycle_count++;
949 }
950 
951 /*
952  * Remove an item from the btree at the given position on the given page.
953  */
954 static void
955 FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp, Size index)
956 {
957  Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
958  Assert(index < btp->hdr.nused);
959 
960  /* When last item is removed, extirpate entire page from btree. */
961  if (btp->hdr.nused == 1)
962  {
963  FreePageBtreeRemovePage(fpm, btp);
964  return;
965  }
966 
967  /* Physically remove the key from the page. */
968  --btp->hdr.nused;
969  if (index < btp->hdr.nused)
970  memmove(&btp->u.leaf_key[index], &btp->u.leaf_key[index + 1],
971  sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index));
972 
973  /* If we just removed the first key, adjust ancestor keys. */
974  if (index == 0)
975  FreePageBtreeAdjustAncestorKeys(fpm, btp);
976 
977  /* Consider whether to consolidate this page with a sibling. */
978  FreePageBtreeConsolidate(fpm, btp);
979 }
980 
981 /*
982  * Remove a page from the btree. Caller is responsible for having relocated
983  * any keys from this page that are still wanted. The page is placed on the
984  * recycled list.
985  */
986 static void
987 FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp)
988 {
989  char *base = fpm_segment_base(fpm);
990  FreePageBtree *parent;
991  Size index;
992  Size first_page;
993 
994  for (;;)
995  {
996  /* Find parent page. */
997  parent = relptr_access(base, btp->hdr.parent);
998  if (parent == NULL)
999  {
1000  /* We are removing the root page. */
1001  relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
1002  fpm->btree_depth = 0;
1003  Assert(fpm->singleton_first_page == 0);
1004  Assert(fpm->singleton_npages == 0);
1005  return;
1006  }
1007 
1008  /*
1009  * If the parent contains only one item, we need to remove it as well.
1010  */
1011  if (parent->hdr.nused > 1)
1012  break;
1013  FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp));
1014  btp = parent;
1015  }
1016 
1017  /* Find and remove the downlink. */
1018  first_page = FreePageBtreeFirstKey(btp);
1019  if (parent->hdr.magic == FREE_PAGE_LEAF_MAGIC)
1020  {
1021  index = FreePageBtreeSearchLeaf(parent, first_page);
1022  Assert(index < parent->hdr.nused);
1023  if (index < parent->hdr.nused - 1)
1024  memmove(&parent->u.leaf_key[index],
1025  &parent->u.leaf_key[index + 1],
1026  sizeof(FreePageBtreeLeafKey)
1027  * (parent->hdr.nused - index - 1));
1028  }
1029  else
1030  {
1031  index = FreePageBtreeSearchInternal(parent, first_page);
1032  Assert(index < parent->hdr.nused);
1033  if (index < parent->hdr.nused - 1)
1034  memmove(&parent->u.internal_key[index],
1035  &parent->u.internal_key[index + 1],
1036  sizeof(FreePageBtreeInternalKey)
1037  * (parent->hdr.nused - index - 1));
1038  }
1039  parent->hdr.nused--;
1040  Assert(parent->hdr.nused > 0);
1041 
1042  /* Recycle the page. */
1043  FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp));
1044 
1045  /* Adjust ancestor keys if needed. */
1046  if (index == 0)
1047  FreePageBtreeAdjustAncestorKeys(fpm, parent);
1048 
1049  /* Consider whether to consolidate the parent with a sibling. */
1050  FreePageBtreeConsolidate(fpm, parent);
1051 }
1052 
1053 /*
1054  * Search the btree for an entry for the given first page and initialize
1055  * *result with the results of the search. result->page and result->index
1056  * indicate either the position of an exact match or the position at which
1057  * the new key should be inserted. result->found is true for an exact match,
1058  * otherwise false. result->split_pages will contain the number of additional
1059  * btree pages that will be needed when performing a split to insert a key.
1060  * Except as described above, the contents of fields in the result object are
1061  * undefined on return.
1062  */
1063 static void
1064 FreePageBtreeSearch(FreePageManager *fpm, Size first_page,
1065  FreePageBtreeSearchResult *result)
1066 {
1067  char *base = fpm_segment_base(fpm);
1068  FreePageBtree *btp = relptr_access(base, fpm->btree_root);
1069  Size index;
1070 
1071  result->split_pages = 1;
1072 
1073  /* If the btree is empty, there's nothing to find. */
1074  if (btp == NULL)
1075  {
1076  result->page = NULL;
1077  result->found = false;
1078  return;
1079  }
1080 
1081  /* Descend until we hit a leaf. */
1082  while (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
1083  {
1084  FreePageBtree *child;
1085  bool found_exact;
1086 
1087  index = FreePageBtreeSearchInternal(btp, first_page);
1088  found_exact = index < btp->hdr.nused &&
1089  btp->u.internal_key[index].first_page == first_page;
1090 
1091  /*
1092  * If we found an exact match we descend directly. Otherwise, we
1093  * descend into the child to the left if possible so that we can find
1094  * the insertion point at that child's high end.
1095  */
1096  if (!found_exact && index > 0)
1097  --index;
1098 
1099  /* Track required split depth for leaf insert. */
1100  if (btp->hdr.nused >= FPM_ITEMS_PER_INTERNAL_PAGE)
1101  {
1102  Assert(btp->hdr.nused == FPM_ITEMS_PER_INTERNAL_PAGE);
1103  result->split_pages++;
1104  }
1105  else
1106  result->split_pages = 0;
1107 
1108  /* Descend to appropriate child page. */
1109  Assert(index < btp->hdr.nused);
1110  child = relptr_access(base, btp->u.internal_key[index].child);
1111  Assert(relptr_access(base, child->hdr.parent) == btp);
1112  btp = child;
1113  }
1114 
1115  /* Track required split depth for leaf insert. */
1116  if (btp->hdr.nused >= FPM_ITEMS_PER_LEAF_PAGE)
1117  {
1118  Assert(btp->hdr.nused == FPM_ITEMS_PER_LEAF_PAGE);
1119  result->split_pages++;
1120  }
1121  else
1122  result->split_pages = 0;
1123 
1124  /* Search leaf page. */
1125  index = FreePageBtreeSearchLeaf(btp, first_page);
1126 
1127  /* Assemble results. */
1128  result->page = btp;
1129  result->index = index;
1130  result->found = index < btp->hdr.nused &&
1131  first_page == btp->u.leaf_key[index].first_page;
1132 }
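/*
 * Example (a minimal sketch, not from the original sources; the helper name
 * is hypothetical): interpreting a search result, mirroring the way the
 * insertion and removal code below uses the fields.
 */
static bool
example_page_starts_free_span(FreePageManager *fpm, Size first_page)
{
	FreePageBtreeSearchResult result;

	FreePageBtreeSearch(fpm, first_page, &result);

	/* result.page is NULL when the btree has not been initialized. */
	return result.page != NULL && result.found;
}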
1133 
1134 /*
1135  * Search an internal page for the first key greater than or equal to a given
1136  * page number. Returns the index of that key, or one greater than the number
1137  * of keys on the page if none.
1138  */
1139 static Size
1140 FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page)
1141 {
1142  Size low = 0;
1143  Size high = btp->hdr.nused;
1144 
1144 
1145  Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
1146  Assert(high > 0 && high <= FPM_ITEMS_PER_INTERNAL_PAGE);
1147 
1148  while (low < high)
1149  {
1150  Size mid = (low + high) / 2;
1151  Size val = btp->u.internal_key[mid].first_page;
1152 
1153  if (first_page == val)
1154  return mid;
1155  else if (first_page < val)
1156  high = mid;
1157  else
1158  low = mid + 1;
1159  }
1160 
1161  return low;
1162 }
1163 
1164 /*
1165  * Search a leaf page for the first key greater than or equal to a given
1166  * page number. Returns the index of that key, or one greater than the number
1167  * of keys on the page if none.
1168  */
1169 static Size
1170 FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page)
1171 {
1172  Size low = 0;
1173  Size high = btp->hdr.nused;
1174 
1174 
1175  Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
1176  Assert(high > 0 && high <= FPM_ITEMS_PER_LEAF_PAGE);
1177 
1178  while (low < high)
1179  {
1180  Size mid = (low + high) / 2;
1181  Size val = btp->u.leaf_key[mid].first_page;
1182 
1183  if (first_page == val)
1184  return mid;
1185  else if (first_page < val)
1186  high = mid;
1187  else
1188  low = mid + 1;
1189  }
1190 
1191  return low;
1192 }
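/*
 * Example (added note, not from the original sources): both searches return
 * the lower bound of first_page. For a leaf whose keys start at pages
 * {10, 20, 30}, FreePageBtreeSearchLeaf returns 0 for an exact match on 10,
 * 1 for any first_page from 11 through 20, and 3 (== nused, one past the
 * last key) for anything above 30.
 */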
1193 
1194 /*
1195  * Allocate a new btree page and move half the keys from the provided page
1196  * to the new page. Caller is responsible for making sure that there's a
1197  * page available from fpm->btree_recycle. Returns a pointer to the new page,
1198  * to which caller must add a downlink.
1199  */
1200 static FreePageBtree *
1201 FreePageBtreeSplitPage(FreePageManager *fpm, FreePageBtree *btp)
1202 {
1203  FreePageBtree *newsibling;
1204 
1205  newsibling = FreePageBtreeGetRecycled(fpm);
1206  newsibling->hdr.magic = btp->hdr.magic;
1207  newsibling->hdr.nused = btp->hdr.nused / 2;
1208  relptr_copy(newsibling->hdr.parent, btp->hdr.parent);
1209  btp->hdr.nused -= newsibling->hdr.nused;
1210 
1211  if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
1212  memcpy(&newsibling->u.leaf_key,
1213  &btp->u.leaf_key[btp->hdr.nused],
1214  sizeof(FreePageBtreeLeafKey) * newsibling->hdr.nused);
1215  else
1216  {
1217  Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
1218  memcpy(&newsibling->u.internal_key,
1219  &btp->u.internal_key[btp->hdr.nused],
1220  sizeof(FreePageBtreeInternalKey) * newsibling->hdr.nused);
1221  FreePageBtreeUpdateParentPointers(fpm_segment_base(fpm), newsibling);
1222  }
1223 
1224  return newsibling;
1225 }
1226 
1227 /*
1228  * When internal pages are split or merged, the parent pointers of their
1229  * children must be updated.
1230  */
1231 static void
1232 FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp)
1233 {
1234  Size i;
1235 
1236  Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
1237  for (i = 0; i < btp->hdr.nused; ++i)
1238  {
1239  FreePageBtree *child;
1240 
1241  child = relptr_access(base, btp->u.internal_key[i].child);
1242  relptr_store(base, child->hdr.parent, btp);
1243  }
1244 }
1245 
1246 /*
1247  * Debugging dump of btree data.
1248  */
1249 static void
1250 FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp,
1251  FreePageBtree *parent, int level, StringInfo buf)
1252 {
1253  char *base = fpm_segment_base(fpm);
1254  Size pageno = fpm_pointer_to_page(base, btp);
1255  Size index;
1256  FreePageBtree *check_parent;
1257 
1258  check_stack_depth();
1259  check_parent = relptr_access(base, btp->hdr.parent);
1260  appendStringInfo(buf, " %zu@%d %c", pageno, level,
1261  btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC ? 'i' : 'l');
1262  if (parent != check_parent)
1263  appendStringInfo(buf, " [actual parent %zu, expected %zu]",
1264  fpm_pointer_to_page(base, check_parent),
1265  fpm_pointer_to_page(base, parent));
1266  appendStringInfoChar(buf, ':');
1267  for (index = 0; index < btp->hdr.nused; ++index)
1268  {
1269  if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
1270  appendStringInfo(buf, " %zu->%zu",
1271  btp->u.internal_key[index].first_page,
1272  relptr_offset(btp->u.internal_key[index].child) / FPM_PAGE_SIZE);
1273  else
1274  appendStringInfo(buf, " %zu(%zu)",
1275  btp->u.leaf_key[index].first_page,
1276  btp->u.leaf_key[index].npages);
1277  }
1278  appendStringInfoChar(buf, '\n');
1279 
1280  if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
1281  {
1282  for (index = 0; index < btp->hdr.nused; ++index)
1283  {
1284  FreePageBtree *child;
1285 
1286  child = relptr_access(base, btp->u.internal_key[index].child);
1287  FreePageManagerDumpBtree(fpm, child, btp, level + 1, buf);
1288  }
1289  }
1290 }
1291 
1292 /*
1293  * Debugging dump of free-span data.
1294  */
1295 static void
1296 FreePageManagerDumpSpans(FreePageManager *fpm, FreePageSpanLeader *span,
1297  Size expected_pages, StringInfo buf)
1298 {
1299  char *base = fpm_segment_base(fpm);
1300 
1301  while (span != NULL)
1302  {
1303  if (span->npages != expected_pages)
1304  appendStringInfo(buf, " %zu(%zu)", fpm_pointer_to_page(base, span),
1305  span->npages);
1306  else
1307  appendStringInfo(buf, " %zu", fpm_pointer_to_page(base, span));
1308  span = relptr_access(base, span->next);
1309  }
1310 
1311  appendStringInfoChar(buf, '\n');
1312 }
1313 
1314 /*
1315  * This function allocates a run of pages of the given length from the free
1316  * page manager.
1317  */
1318 static bool
1319 FreePageManagerGetInternal(FreePageManager *fpm, Size npages, Size *first_page)
1320 {
1321  char *base = fpm_segment_base(fpm);
1322  FreePageSpanLeader *victim = NULL;
1323  FreePageSpanLeader *prev;
1324  FreePageSpanLeader *next;
1325  FreePageBtreeSearchResult result;
1326  Size victim_page = 0; /* placate compiler */
1327  Size f;
1328 
1329  /*
1330  * Search for a free span.
1331  *
1332  * Right now, we use a simple best-fit policy here, but it's possible for
1333  * this to result in memory fragmentation if we're repeatedly asked to
1334  * allocate chunks just a little smaller than what we have available.
1335  * Hopefully, this is unlikely, because we expect most requests to be
1336  * single pages or superblock-sized chunks -- but no policy can be optimal
1337  * under all circumstances unless it has knowledge of future allocation
1338  * patterns.
1339  */
1340  for (f = Min(npages, FPM_NUM_FREELISTS) - 1; f < FPM_NUM_FREELISTS; ++f)
1341  {
1342  /* Skip empty freelists. */
1343  if (relptr_is_null(fpm->freelist[f]))
1344  continue;
1345 
1346  /*
1347  * All of the freelists except the last one contain only items of a
1348  * single size, so we just take the first one. But the final free
1349  * list contains everything too big for any of the other lists, so we
1350  * need to search the list.
1351  */
1352  if (f < FPM_NUM_FREELISTS - 1)
1353  victim = relptr_access(base, fpm->freelist[f]);
1354  else
1355  {
1356  FreePageSpanLeader *candidate;
1357 
1358  candidate = relptr_access(base, fpm->freelist[f]);
1359  do
1360  {
1361  if (candidate->npages >= npages && (victim == NULL ||
1362  victim->npages > candidate->npages))
1363  {
1364  victim = candidate;
1365  if (victim->npages == npages)
1366  break;
1367  }
1368  candidate = relptr_access(base, candidate->next);
1369  } while (candidate != NULL);
1370  }
1371  break;
1372  }
1373 
1374  /* If we didn't find an allocatable span, return failure. */
1375  if (victim == NULL)
1376  return false;
1377 
1378  /* Remove span from free list. */
1379  Assert(victim->magic == FREE_PAGE_SPAN_LEADER_MAGIC);
1380  prev = relptr_access(base, victim->prev);
1381  next = relptr_access(base, victim->next);
1382  if (prev != NULL)
1383  relptr_copy(prev->next, victim->next);
1384  else
1385  relptr_copy(fpm->freelist[f], victim->next);
1386  if (next != NULL)
1387  relptr_copy(next->prev, victim->prev);
1388  victim_page = fpm_pointer_to_page(base, victim);
1389 
1390  /* Decide whether we might be invalidating contiguous_pages. */
1391  if (f == FPM_NUM_FREELISTS - 1 &&
1392  victim->npages == fpm->contiguous_pages)
1393  {
1394  /*
1395  * The victim span came from the oversized freelist, and had the same
1396  * size as the longest span. There may or may not be another one of
1397  * the same size, so contiguous_pages must be recomputed just to be
1398  * safe.
1399  */
1400  fpm->contiguous_pages_dirty = true;
1401  }
1402  else if (f + 1 == fpm->contiguous_pages &&
1403  relptr_is_null(fpm->freelist[f]))
1404  {
1405  /*
1406  * The victim span came from a fixed sized freelist, and it was the
1407  * list for spans of the same size as the current longest span, and
1408  * the list is now empty after removing the victim. So
1409  * contiguous_pages must be recomputed without a doubt.
1410  */
1411  fpm->contiguous_pages_dirty = true;
1412  }
1413 
1414  /*
1415  * If we haven't initialized the btree yet, the victim must be the single
1416  * span stored within the FreePageManager itself. Otherwise, we need to
1417  * update the btree.
1418  */
1419  if (relptr_is_null(fpm->btree_root))
1420  {
1421  Assert(victim_page == fpm->singleton_first_page);
1422  Assert(victim->npages == fpm->singleton_npages);
1423  Assert(victim->npages >= npages);
1424  fpm->singleton_first_page += npages;
1425  fpm->singleton_npages -= npages;
1426  if (fpm->singleton_npages > 0)
1427  FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
1428  fpm->singleton_npages);
1429  }
1430  else
1431  {
1432  /*
1433  * If the span we found is exactly the right size, remove it from the
1434  * btree completely. Otherwise, adjust the btree entry to reflect the
1435  * still-unallocated portion of the span, and put that portion on the
1436  * appropriate free list.
1437  */
1438  FreePageBtreeSearch(fpm, victim_page, &result);
1439  Assert(result.found);
1440  if (victim->npages == npages)
1441  FreePageBtreeRemove(fpm, result.page, result.index);
1442  else
1443  {
1444  FreePageBtreeLeafKey *key;
1445 
1446  /* Adjust btree to reflect remaining pages. */
1447  Assert(victim->npages > npages);
1448  key = &result.page->u.leaf_key[result.index];
1449  Assert(key->npages == victim->npages);
1450  key->first_page += npages;
1451  key->npages -= npages;
1452  if (result.index == 0)
1453  FreePageBtreeAdjustAncestorKeys(fpm, result.page);
1454 
1455  /* Put the unallocated pages back on the appropriate free list. */
1456  FreePagePushSpanLeader(fpm, victim_page + npages,
1457  victim->npages - npages);
1458  }
1459  }
1460 
1461  /* Return results to caller. */
1462  *first_page = fpm_pointer_to_page(base, victim);
1463  return true;
1464 }
1465 
1466 /*
1467  * Put a range of pages into the btree and freelists, consolidating it with
1468  * existing free spans just before and/or after it. If 'soft' is true,
1469  * only perform the insertion if it can be done without allocating new btree
1470  * pages; if false, do it always. Returns 0 if the soft flag caused the
1471  * insertion to be skipped, or otherwise the size of the contiguous span
1472  * created by the insertion. This may be larger than npages if we're able
1473  * to consolidate with an adjacent range.
1474  */
1475 static Size
1476 FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, Size npages,
1477  bool soft)
1478 {
1479  char *base = fpm_segment_base(fpm);
1480  FreePageBtreeSearchResult result;
1481  FreePageBtreeLeafKey *prevkey = NULL;
1482  FreePageBtreeLeafKey *nextkey = NULL;
1483  FreePageBtree *np;
1484  Size nindex;
1485 
1486  Assert(npages > 0);
1487 
1488  /* We can store a single free span without initializing the btree. */
1489  if (fpm->btree_depth == 0)
1490  {
1491  if (fpm->singleton_npages == 0)
1492  {
1493  /* Don't have a span yet; store this one. */
1494  fpm->singleton_first_page = first_page;
1495  fpm->singleton_npages = npages;
1496  FreePagePushSpanLeader(fpm, first_page, npages);
1497  return fpm->singleton_npages;
1498  }
1499  else if (fpm->singleton_first_page + fpm->singleton_npages ==
1500  first_page)
1501  {
1502  /* New span immediately follows sole existing span. */
1503  fpm->singleton_npages += npages;
1504  FreePagePopSpanLeader(fpm, fpm->singleton_first_page);
1505  FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
1506  fpm->singleton_npages);
1507  return fpm->singleton_npages;
1508  }
1509  else if (first_page + npages == fpm->singleton_first_page)
1510  {
1511  /* New span immediately precedes sole existing span. */
1512  FreePagePopSpanLeader(fpm, fpm->singleton_first_page);
1513  fpm->singleton_first_page = first_page;
1514  fpm->singleton_npages += npages;
1515  FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
1516  fpm->singleton_npages);
1517  return fpm->singleton_npages;
1518  }
1519  else
1520  {
1521  /* Not contiguous; we need to initialize the btree. */
1522  Size root_page;
1523  FreePageBtree *root;
1524 
1525  if (!relptr_is_null(fpm->btree_recycle))
1526  root = FreePageBtreeGetRecycled(fpm);
1527  else if (soft)
1528  return 0; /* Should not allocate if soft. */
1529  else if (FreePageManagerGetInternal(fpm, 1, &root_page))
1530  root = (FreePageBtree *) fpm_page_to_pointer(base, root_page);
1531  else
1532  {
1533  /* We'd better be able to get a page from the existing range. */
1534  elog(FATAL, "free page manager btree is corrupt");
1535  }
1536 
1537  /* Create the btree and move the preexisting range into it. */
1538  root->hdr.magic = FREE_PAGE_LEAF_MAGIC;
1539  root->hdr.nused = 1;
1540  relptr_store(base, root->hdr.parent, (FreePageBtree *) NULL);
1541  root->u.leaf_key[0].first_page = fpm->singleton_first_page;
1542  root->u.leaf_key[0].npages = fpm->singleton_npages;
1543  relptr_store(base, fpm->btree_root, root);
1544  fpm->singleton_first_page = 0;
1545  fpm->singleton_npages = 0;
1546  fpm->btree_depth = 1;
1547 
1548  /*
1549  * Corner case: it may be that the btree root took the very last
1550  * free page. In that case, the sole btree entry covers a zero
1551  * page run, which is invalid. Overwrite it with the entry we're
1552  * trying to insert and get out.
1553  */
1554  if (root->u.leaf_key[0].npages == 0)
1555  {
1556  root->u.leaf_key[0].first_page = first_page;
1557  root->u.leaf_key[0].npages = npages;
1558  FreePagePushSpanLeader(fpm, first_page, npages);
1559  return npages;
1560  }
1561 
1562  /* Fall through to insert the new key. */
1563  }
1564  }
1565 
1566  /* Search the btree. */
1567  FreePageBtreeSearch(fpm, first_page, &result);
1568  Assert(!result.found);
1569  if (result.index > 0)
1570  prevkey = &result.page->u.leaf_key[result.index - 1];
1571  if (result.index < result.page->hdr.nused)
1572  {
1573  np = result.page;
1574  nindex = result.index;
1575  nextkey = &result.page->u.leaf_key[result.index];
1576  }
1577  else
1578  {
1579  np = FreePageBtreeFindRightSibling(base, result.page);
1580  nindex = 0;
1581  if (np != NULL)
1582  nextkey = &np->u.leaf_key[0];
1583  }
1584 
1585  /* Consolidate with the previous entry if possible. */
1586  if (prevkey != NULL && prevkey->first_page + prevkey->npages >= first_page)
1587  {
1588  bool remove_next = false;
1589  Size result;
1590 
1591  Assert(prevkey->first_page + prevkey->npages == first_page);
1592  prevkey->npages = (first_page - prevkey->first_page) + npages;
1593 
1594  /* Check whether we can *also* consolidate with the following entry. */
1595  if (nextkey != NULL &&
1596  prevkey->first_page + prevkey->npages >= nextkey->first_page)
1597  {
1598  Assert(prevkey->first_page + prevkey->npages ==
1599  nextkey->first_page);
1600  prevkey->npages = (nextkey->first_page - prevkey->first_page)
1601  + nextkey->npages;
1602  FreePagePopSpanLeader(fpm, nextkey->first_page);
1603  remove_next = true;
1604  }
1605 
1606  /* Put the span on the correct freelist and save size. */
1607  FreePagePopSpanLeader(fpm, prevkey->first_page);
1608  FreePagePushSpanLeader(fpm, prevkey->first_page, prevkey->npages);
1609  result = prevkey->npages;
1610 
1611  /*
1612  * If we consolidated with both the preceding and following entries,
1613  * we must remove the following entry. We do this last, because
1614  * removing an element from the btree may invalidate pointers we hold
1615  * into the current data structure.
1616  *
1617  * NB: The btree is technically in an invalid state at this point
1618  * because we've already updated prevkey to cover the same key space
1619  * as nextkey. FreePageBtreeRemove() shouldn't notice that, though.
1620  */
1621  if (remove_next)
1622  FreePageBtreeRemove(fpm, np, nindex);
1623 
1624  return result;
1625  }
1626 
1627  /* Consolidate with the next entry if possible. */
1628  if (nextkey != NULL && first_page + npages >= nextkey->first_page)
1629  {
1630  Size newpages;
1631 
1632  /* Compute new size for span. */
1633  Assert(first_page + npages == nextkey->first_page);
1634  newpages = (nextkey->first_page - first_page) + nextkey->npages;
1635 
1636  /* Put span on correct free list. */
1637  FreePagePopSpanLeader(fpm, nextkey->first_page);
1638  FreePagePushSpanLeader(fpm, first_page, newpages);
1639 
1640  /* Update key in place. */
1641  nextkey->first_page = first_page;
1642  nextkey->npages = newpages;
1643 
1644  /* If reducing first key on page, ancestors might need adjustment. */
1645  if (nindex == 0)
1647 
1646  FreePageBtreeAdjustAncestorKeys(fpm, np);
1647 
1649  }
1650 
1651  /* Split leaf page and as many of its ancestors as necessary. */
1652  if (result.split_pages > 0)
1653  {
1654  /*
1655  * NB: We could consider various coping strategies here to avoid a
1656  * split; most obviously, if np != result.page, we could target that
1657  * page instead. More complicated shuffling strategies could be
1658  * possible as well; basically, unless every single leaf page is 100%
1659  * full, we can jam this key in there if we try hard enough. It's
1660  * unlikely that trying that hard is worthwhile, but it's possible we
1661  * might need to make more than no effort. For now, we just do the
1662  * easy thing, which is nothing.
1663  */
1664 
1665  /* If this is a soft insert, it's time to give up. */
1666  if (soft)
1667  return 0;
1668 
1669  /* Check whether we need to allocate more btree pages to split. */
1670  if (result.split_pages > fpm->btree_recycle_count)
1671  {
1672  Size pages_needed;
1673  Size recycle_page;
1674  Size i;
1675 
1676  /*
1677  * Allocate the required number of pages and split each one in
1678  * turn. This should never fail, because if we've got enough
1679  * spans of free pages kicking around that we need additional
1680  * storage space just to remember them all, then we should
1681  * certainly have enough to expand the btree, which should only
1682  * ever use a tiny number of pages compared to the number under
1683  * management. If it does, something's badly screwed up.
1684  */
1685  pages_needed = result.split_pages - fpm->btree_recycle_count;
1686  for (i = 0; i < pages_needed; ++i)
1687  {
1688  if (!FreePageManagerGetInternal(fpm, 1, &recycle_page))
1689  elog(FATAL, "free page manager btree is corrupt");
1690  FreePageBtreeRecycle(fpm, recycle_page);
1691  }
1692 
1693  /*
1694  * The act of allocating pages to recycle may have invalidated the
1695  * results of our previous btree research, so repeat it. (We could
1696  * recheck whether any of our split-avoidance strategies that were
1697  * not viable before now are, but it hardly seems worthwhile, so
1698  * we don't bother. Consolidation can't be possible now if it
1699  * wasn't previously.)
1700  */
1701  FreePageBtreeSearch(fpm, first_page, &result);
1702 
1703  /*
1704  * The act of allocating pages for use in constructing our btree
1705  * should never cause any page to become more full, so the new
1706  * split depth should be no greater than the old one, and perhaps
1707  * less if we fortuitously allocated a chunk that freed up a slot
1708  * on the page we need to update.
1709  */
1710  Assert(result.split_pages <= fpm->btree_recycle_count);
1711  }
1712 
1713  /* If we still need to perform a split, do it. */
1714  if (result.split_pages > 0)
1715  {
1716  FreePageBtree *split_target = result.page;
1717  FreePageBtree *child = NULL;
1718  Size key = first_page;
1719 
1720  for (;;)
1721  {
1722  FreePageBtree *newsibling;
1723  FreePageBtree *parent;
1724 
1725  /* Identify parent page, which must receive downlink. */
1726  parent = relptr_access(base, split_target->hdr.parent);
1727 
1728  /* Split the page - downlink not added yet. */
1729  newsibling = FreePageBtreeSplitPage(fpm, split_target);
1730 
1731  /*
1732  * At this point in the loop, we're always carrying a pending
1733  * insertion. On the first pass, it's the actual key we're
1734  * trying to insert; on subsequent passes, it's the downlink
1735  * that needs to be added as a result of the split performed
1736  * during the previous loop iteration. Since we've just split
1737  * the page, there's definitely room on one of the two
1738  * resulting pages.
1739  */
1740  if (child == NULL)
1741  {
1742  Size index;
1743  FreePageBtree *insert_into;
1744 
1745  insert_into = key < newsibling->u.leaf_key[0].first_page ?
1746  split_target : newsibling;
1747  index = FreePageBtreeSearchLeaf(insert_into, key);
1748  FreePageBtreeInsertLeaf(insert_into, index, key, npages);
1749  if (index == 0 && insert_into == split_target)
1750  FreePageBtreeAdjustAncestorKeys(fpm, split_target);
1751  }
1752  else
1753  {
1754  Size index;
1755  FreePageBtree *insert_into;
1756 
1757  insert_into =
1758  key < newsibling->u.internal_key[0].first_page ?
1759  split_target : newsibling;
1760  index = FreePageBtreeSearchInternal(insert_into, key);
1761  FreePageBtreeInsertInternal(base, insert_into, index,
1762  key, child);
1763  relptr_store(base, child->hdr.parent, insert_into);
1764  if (index == 0 && insert_into == split_target)
1765  FreePageBtreeAdjustAncestorKeys(fpm, split_target);
1766  }
1767 
1768  /* If the page we just split has no parent, split the root. */
1769  if (parent == NULL)
1770  {
1771  FreePageBtree *newroot;
1772 
1773  newroot = FreePageBtreeGetRecycled(fpm);
1774  newroot->hdr.magic = FREE_PAGE_INTERNAL_MAGIC;
1775  newroot->hdr.nused = 2;
1776  relptr_store(base, newroot->hdr.parent,
1777  (FreePageBtree *) NULL);
1778  newroot->u.internal_key[0].first_page =
1779  FreePageBtreeFirstKey(split_target);
1780  relptr_store(base, newroot->u.internal_key[0].child,
1781  split_target);
1782  relptr_store(base, split_target->hdr.parent, newroot);
1783  newroot->u.internal_key[1].first_page =
1784  FreePageBtreeFirstKey(newsibling);
1785  relptr_store(base, newroot->u.internal_key[1].child,
1786  newsibling);
1787  relptr_store(base, newsibling->hdr.parent, newroot);
1788  relptr_store(base, fpm->btree_root, newroot);
1789  fpm->btree_depth++;
1790 
1791  break;
1792  }
1793 
1794  /* If the parent page isn't full, insert the downlink. */
1795  key = newsibling->u.internal_key[0].first_page;
1796  if (parent->hdr.nused < FPM_ITEMS_PER_INTERNAL_PAGE)
1797  {
1798  Size index;
1799 
1800  index = FreePageBtreeSearchInternal(parent, key);
1801  FreePageBtreeInsertInternal(base, parent, index,
1802  key, newsibling);
1803  relptr_store(base, newsibling->hdr.parent, parent);
1804  if (index == 0)
1805  FreePageBtreeAdjustAncestorKeys(fpm, parent);
1806  break;
1807  }
1808 
1809  /* The parent also needs to be split, so loop around. */
1810  child = newsibling;
1811  split_target = parent;
1812  }
1813 
1814  /*
1815  * The loop above did the insert, so just need to update the free
1816  * list, and we're done.
1817  */
1818  FreePagePushSpanLeader(fpm, first_page, npages);
1819 
1820  return npages;
1821  }
1822  }
1823 
1824  /* Physically add the key to the page. */
1825  Assert(result.page->hdr.magic == FREE_PAGE_LEAF_MAGIC);
1826  FreePageBtreeInsertLeaf(result.page, result.index, first_page, npages);
1827 
1828  /* If new first key on page, ancestors might need adjustment. */
1829  if (result.index == 0)
1830  FreePageBtreeAdjustAncestorKeys(fpm, result.page);
1831 
1832  /* Put it on the free list. */
1833  FreePagePushSpanLeader(fpm, first_page, npages);
1834 
1835  return npages;
1836 }
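/*
 * Illustration (added note, not from the original sources): suppose spans
 * [100,109] and [120,129] are already free and this function is called with
 * first_page = 110, npages = 10. The previous-entry check extends [100,109]
 * to cover the new pages, the follow-on check then absorbs [120,129], and
 * the function returns 30, the size of the merged span, rather than the 10
 * pages actually inserted.
 */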
1837 
1838 /*
1839  * Remove a FreePageSpanLeader from the linked-list that contains it, either
1840  * because we're changing the size of the span, or because we're allocating it.
1841  */
1842 static void
1843 FreePagePopSpanLeader(FreePageManager *fpm, Size pageno)
1844 {
1845  char *base = fpm_segment_base(fpm);
1846  FreePageSpanLeader *span;
1847  FreePageSpanLeader *next;
1848  FreePageSpanLeader *prev;
1849 
1850  span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno);
1851 
1852  next = relptr_access(base, span->next);
1853  prev = relptr_access(base, span->prev);
1854  if (next != NULL)
1855  relptr_copy(next->prev, span->prev);
1856  if (prev != NULL)
1857  relptr_copy(prev->next, span->next);
1858  else
1859  {
1860  Size f = Min(span->npages, FPM_NUM_FREELISTS) - 1;
1861 
1862  Assert(relptr_offset(fpm->freelist[f]) == pageno * FPM_PAGE_SIZE);
1863  relptr_copy(fpm->freelist[f], span->next);
1864  }
1865 }
1866 
1867 /*
1868  * Initialize a new FreePageSpanLeader and put it on the appropriate free list.
1869  */
1870 static void
1871 FreePagePushSpanLeader(FreePageManager *fpm, Size first_page, Size npages)
1872 {
1873  char *base = fpm_segment_base(fpm);
1874  Size f = Min(npages, FPM_NUM_FREELISTS) - 1;
1875  FreePageSpanLeader *head = relptr_access(base, fpm->freelist[f]);
1876  FreePageSpanLeader *span;
1877 
1878  span = (FreePageSpanLeader *) fpm_page_to_pointer(base, first_page);
1879  span->magic = FREE_PAGE_SPAN_LEADER_MAGIC;
1880  span->npages = npages;
1881  relptr_store(base, span->next, head);
1882  relptr_store(base, span->prev, (FreePageSpanLeader *) NULL);
1883  if (head != NULL)
1884  relptr_store(base, head->prev, span);
1885  relptr_store(base, fpm->freelist[f], span);
1886 }
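/*
 * Example (added note, not from the original sources): the bucketing rule
 * f = Min(npages, FPM_NUM_FREELISTS) - 1 means freelist 0 holds 1-page
 * spans, freelist 1 holds 2-page spans, and so on, while the final freelist
 * collects every span of FPM_NUM_FREELISTS pages or more. That is why
 * FreePageManagerGetInternal can take the first entry from any fixed-size
 * list but must scan the last one for a best fit.
 */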