PostgreSQL Source Code  git master
freespace.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * freespace.c
4  * POSTGRES free space map for quickly finding free space in relations
5  *
6  *
7  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  * src/backend/storage/freespace/freespace.c
12  *
13  *
14  * NOTES:
15  *
16  * Free Space Map keeps track of the amount of free space on pages, and
17  * allows quickly searching for a page with enough free space. The FSM is
18  * stored in a dedicated relation fork of all heap relations, and those
19  * index access methods that need it (see also indexfsm.c). See README for
20  * more information.
21  *
22  *-------------------------------------------------------------------------
23  */
24 #include "postgres.h"
25 
26 #include "access/htup_details.h"
27 #include "access/xloginsert.h"
28 #include "access/xlogutils.h"
29 #include "miscadmin.h"
30 #include "storage/freespace.h"
31 #include "storage/fsm_internals.h"
32 #include "storage/smgr.h"
33 #include "utils/rel.h"
34 
35 
36 /*
37  * We use just one byte to store the amount of free space on a page, so we
38  * divide the amount of free space a page can have into 256 different
39  * categories. The highest category, 255, represents a page with at least
40  * MaxFSMRequestSize bytes of free space, and the second highest category
41  * represents the range from 254 * FSM_CAT_STEP, inclusive, to
42  * MaxFSMRequestSize, exclusive.
43  *
44  * MaxFSMRequestSize depends on the architecture and BLCKSZ, but assuming
45  * default 8k BLCKSZ, and that MaxFSMRequestSize is 8164 bytes, the
46  * categories look like this:
47  *
48  *
49  * Range Category
50  * 0 - 31 0
51  * 32 - 63 1
52  * ... ... ...
53  * 8096 - 8127 253
54  * 8128 - 8163 254
55  * 8164 - 8192 255
56  *
57  * The reason that MaxFSMRequestSize is special is that if MaxFSMRequestSize
58  * isn't equal to a range boundary, a page with exactly MaxFSMRequestSize
59  * bytes of free space wouldn't satisfy a request for MaxFSMRequestSize
60  * bytes. If there isn't more than MaxFSMRequestSize bytes of free space on a
61  * completely empty page, that would mean that we could never satisfy a
62  * request of exactly MaxFSMRequestSize bytes.
63  */
64 #define FSM_CATEGORIES 256
65 #define FSM_CAT_STEP (BLCKSZ / FSM_CATEGORIES)
66 #define MaxFSMRequestSize MaxHeapTupleSize
67 
68 /*
69  * Depth of the on-disk tree. We need to be able to address 2^32-1 blocks,
70  * and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise,
71  * 256 is the smallest number that satisfies X^4 >= 2^32-1. In practice,
72  * this means that 4096 bytes is the smallest BLCKSZ that we can get away
73  * with a 3-level tree, and 512 is the smallest we support.
74  */
75 #define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4)
76 
77 #define FSM_ROOT_LEVEL (FSM_TREE_DEPTH - 1)
78 #define FSM_BOTTOM_LEVEL 0
79 
80 /*
81  * The internal FSM routines work on a logical addressing scheme. Each
82  * level of the tree can be thought of as a separately addressable file.
83  */
84 typedef struct
85 {
86  int level; /* level */
87  int logpageno; /* page number within the level */
88 } FSMAddress;
89 
90 /* Address of the root page. */
92 
93 /* functions to navigate the tree */
94 static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot);
95 static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot);
96 static FSMAddress fsm_get_location(BlockNumber heapblk, uint16 *slot);
99 
100 static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend);
101 static Buffer fsm_extend(Relation rel, BlockNumber fsm_nblocks);
102 
103 /* functions to convert amount of free space to a FSM category */
104 static uint8 fsm_space_avail_to_cat(Size avail);
105 static uint8 fsm_space_needed_to_cat(Size needed);
106 static Size fsm_space_cat_to_avail(uint8 cat);
107 
108 /* workhorse functions for various operations */
109 static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
110  uint8 newValue, uint8 minValue);
111 static BlockNumber fsm_search(Relation rel, uint8 min_cat);
112 static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
114  bool *eof_p);
115 static bool fsm_does_block_exist(Relation rel, BlockNumber blknumber);
116 
117 
118 /******** Public API ********/
119 
120 /*
121  * GetPageWithFreeSpace - try to find a page in the given relation with
122  * at least the specified amount of free space.
123  *
124  * If successful, return the block number; if not, return InvalidBlockNumber.
125  *
126  * The caller must be prepared for the possibility that the returned page
127  * will turn out to have too little space available by the time the caller
128  * gets a lock on it. In that case, the caller should report the actual
129  * amount of free space available on that page and then try again (see
130  * RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned,
131  * extend the relation.
132  *
133  * This can trigger FSM updates if any FSM entry is found to point to a block
134  * past the end of the relation.
135  */
138 {
139  uint8 min_cat = fsm_space_needed_to_cat(spaceNeeded);
140 
141  return fsm_search(rel, min_cat);
142 }
143 
144 /*
145  * RecordAndGetPageWithFreeSpace - update info about a page and try again.
146  *
147  * We provide this combo form to save some locking overhead, compared to
148  * separate RecordPageWithFreeSpace + GetPageWithFreeSpace calls. There's
149  * also some effort to return a page close to the old page; if there's a
150  * page with enough free space on the same FSM page where the old one page
151  * is located, it is preferred.
152  */
155  Size oldSpaceAvail, Size spaceNeeded)
156 {
157  int old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
158  int search_cat = fsm_space_needed_to_cat(spaceNeeded);
159  FSMAddress addr;
160  uint16 slot;
161  int search_slot;
162 
163  /* Get the location of the FSM byte representing the heap block */
164  addr = fsm_get_location(oldPage, &slot);
165 
166  search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat);
167 
168  /*
169  * If fsm_set_and_search found a suitable new block, return that.
170  * Otherwise, search as usual.
171  */
172  if (search_slot != -1)
173  {
174  BlockNumber blknum = fsm_get_heap_blk(addr, search_slot);
175 
176  /*
177  * Check that the blknum is actually in the relation. Don't try to
178  * update the FSM in that case, just fall back to the other case
179  */
180  if (fsm_does_block_exist(rel, blknum))
181  return blknum;
182  }
183  return fsm_search(rel, search_cat);
184 }
185 
186 /*
187  * RecordPageWithFreeSpace - update info about a page.
188  *
189  * Note that if the new spaceAvail value is higher than the old value stored
190  * in the FSM, the space might not become visible to searchers until the next
191  * FreeSpaceMapVacuum call, which updates the upper level pages.
192  */
193 void
195 {
196  int new_cat = fsm_space_avail_to_cat(spaceAvail);
197  FSMAddress addr;
198  uint16 slot;
199 
200  /* Get the location of the FSM byte representing the heap block */
201  addr = fsm_get_location(heapBlk, &slot);
202 
203  fsm_set_and_search(rel, addr, slot, new_cat, 0);
204 }
205 
206 /*
207  * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in
208  * WAL replay
209  */
210 void
212  Size spaceAvail)
213 {
214  int new_cat = fsm_space_avail_to_cat(spaceAvail);
215  FSMAddress addr;
216  uint16 slot;
217  BlockNumber blkno;
218  Buffer buf;
219  Page page;
220 
221  /* Get the location of the FSM byte representing the heap block */
222  addr = fsm_get_location(heapBlk, &slot);
223  blkno = fsm_logical_to_physical(addr);
224 
225  /* If the page doesn't exist already, extend */
226  buf = XLogReadBufferExtended(rlocator, FSM_FORKNUM, blkno,
229 
230  page = BufferGetPage(buf);
231  if (PageIsNew(page))
232  PageInit(page, BLCKSZ, 0);
233 
234  if (fsm_set_avail(page, slot, new_cat))
235  MarkBufferDirtyHint(buf, false);
237 }
238 
239 /*
240  * GetRecordedFreeSpace - return the amount of free space on a particular page,
241  * according to the FSM.
242  */
243 Size
245 {
246  FSMAddress addr;
247  uint16 slot;
248  Buffer buf;
249  uint8 cat;
250 
251  /* Get the location of the FSM byte representing the heap block */
252  addr = fsm_get_location(heapBlk, &slot);
253 
254  buf = fsm_readbuf(rel, addr, false);
255  if (!BufferIsValid(buf))
256  return 0;
257  cat = fsm_get_avail(BufferGetPage(buf), slot);
259 
260  return fsm_space_cat_to_avail(cat);
261 }
262 
263 /*
264  * FreeSpaceMapPrepareTruncateRel - prepare for truncation of a relation.
265  *
266  * nblocks is the new size of the heap.
267  *
268  * Return the number of blocks of new FSM.
269  * If it's InvalidBlockNumber, there is nothing to truncate;
270  * otherwise the caller is responsible for calling smgrtruncate()
271  * to truncate the FSM pages, and FreeSpaceMapVacuumRange()
272  * to update upper-level pages in the FSM.
273  */
276 {
277  BlockNumber new_nfsmblocks;
278  FSMAddress first_removed_address;
279  uint16 first_removed_slot;
280  Buffer buf;
281 
282  /*
283  * If no FSM has been created yet for this relation, there's nothing to
284  * truncate.
285  */
287  return InvalidBlockNumber;
288 
289  /* Get the location in the FSM of the first removed heap block */
290  first_removed_address = fsm_get_location(nblocks, &first_removed_slot);
291 
292  /*
293  * Zero out the tail of the last remaining FSM page. If the slot
294  * representing the first removed heap block is at a page boundary, as the
295  * first slot on the FSM page that first_removed_address points to, we can
296  * just truncate that page altogether.
297  */
298  if (first_removed_slot > 0)
299  {
300  buf = fsm_readbuf(rel, first_removed_address, false);
301  if (!BufferIsValid(buf))
302  return InvalidBlockNumber; /* nothing to do; the FSM was already
303  * smaller */
305 
306  /* NO EREPORT(ERROR) from here till changes are logged */
308 
309  fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
310 
311  /*
312  * This change is non-critical, because fsm_does_block_exist() would
313  * stop us from returning a truncated-away block. However, since this
314  * may remove up to SlotsPerFSMPage slots, it's nice to avoid the cost
315  * of that many fsm_does_block_exist() rejections. Use a full
316  * MarkBufferDirty(), not MarkBufferDirtyHint().
317  */
319 
320  /*
321  * WAL-log like MarkBufferDirtyHint() might have done, just to avoid
322  * differing from the rest of the file in this respect. This is
323  * optional; see README mention of full page images. XXX consider
324  * XLogSaveBufferForHint() for even closer similarity.
325  *
326  * A higher-level operation calls us at WAL replay. If we crash
327  * before the XLOG_SMGR_TRUNCATE flushes to disk, main fork length has
328  * not changed, and our fork remains valid. If we crash after that
329  * flush, redo will return here.
330  */
332  log_newpage_buffer(buf, false);
333 
335 
337 
338  new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
339  }
340  else
341  {
342  new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
343  if (smgrnblocks(RelationGetSmgr(rel), FSM_FORKNUM) <= new_nfsmblocks)
344  return InvalidBlockNumber; /* nothing to do; the FSM was already
345  * smaller */
346  }
347 
348  return new_nfsmblocks;
349 }
350 
351 /*
352  * FreeSpaceMapVacuum - update upper-level pages in the rel's FSM
353  *
354  * We assume that the bottom-level pages have already been updated with
355  * new free-space information.
356  */
357 void
359 {
360  bool dummy;
361 
362  /* Recursively scan the tree, starting at the root */
363  (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS,
365  &dummy);
366 }
367 
368 /*
369  * FreeSpaceMapVacuumRange - update upper-level pages in the rel's FSM
370  *
371  * As above, but assume that only heap pages between start and end-1 inclusive
372  * have new free-space information, so update only the upper-level slots
373  * covering that block range. end == InvalidBlockNumber is equivalent to
374  * "all the rest of the relation".
375  */
376 void
378 {
379  bool dummy;
380 
381  /* Recursively scan the tree, starting at the root */
382  if (end > start)
383  (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, start, end, &dummy);
384 }
385 
386 /******** Internal routines ********/
387 
388 /*
389  * Return category corresponding x bytes of free space
390  */
391 static uint8
393 {
394  int cat;
395 
396  Assert(avail < BLCKSZ);
397 
398  if (avail >= MaxFSMRequestSize)
399  return 255;
400 
401  cat = avail / FSM_CAT_STEP;
402 
403  /*
404  * The highest category, 255, is reserved for MaxFSMRequestSize bytes or
405  * more.
406  */
407  if (cat > 254)
408  cat = 254;
409 
410  return (uint8) cat;
411 }
412 
413 /*
414  * Return the lower bound of the range of free space represented by given
415  * category.
416  */
417 static Size
419 {
420  /* The highest category represents exactly MaxFSMRequestSize bytes. */
421  if (cat == 255)
422  return MaxFSMRequestSize;
423  else
424  return cat * FSM_CAT_STEP;
425 }
426 
427 /*
428  * Which category does a page need to have, to accommodate x bytes of data?
429  * While fsm_space_avail_to_cat() rounds down, this needs to round up.
430  */
431 static uint8
433 {
434  int cat;
435 
436  /* Can't ask for more space than the highest category represents */
437  if (needed > MaxFSMRequestSize)
438  elog(ERROR, "invalid FSM request size %zu", needed);
439 
440  if (needed == 0)
441  return 1;
442 
443  cat = (needed + FSM_CAT_STEP - 1) / FSM_CAT_STEP;
444 
445  if (cat > 255)
446  cat = 255;
447 
448  return (uint8) cat;
449 }
450 
451 /*
452  * Returns the physical block number of a FSM page
453  */
454 static BlockNumber
456 {
457  BlockNumber pages;
458  int leafno;
459  int l;
460 
461  /*
462  * Calculate the logical page number of the first leaf page below the
463  * given page.
464  */
465  leafno = addr.logpageno;
466  for (l = 0; l < addr.level; l++)
467  leafno *= SlotsPerFSMPage;
468 
469  /* Count upper level nodes required to address the leaf page */
470  pages = 0;
471  for (l = 0; l < FSM_TREE_DEPTH; l++)
472  {
473  pages += leafno + 1;
474  leafno /= SlotsPerFSMPage;
475  }
476 
477  /*
478  * If the page we were asked for wasn't at the bottom level, subtract the
479  * additional lower level pages we counted above.
480  */
481  pages -= addr.level;
482 
483  /* Turn the page count into 0-based block number */
484  return pages - 1;
485 }
486 
487 /*
488  * Return the FSM location corresponding to given heap block.
489  */
490 static FSMAddress
492 {
493  FSMAddress addr;
494 
495  addr.level = FSM_BOTTOM_LEVEL;
496  addr.logpageno = heapblk / SlotsPerFSMPage;
497  *slot = heapblk % SlotsPerFSMPage;
498 
499  return addr;
500 }
501 
502 /*
503  * Return the heap block number corresponding to given location in the FSM.
504  */
505 static BlockNumber
507 {
508  Assert(addr.level == FSM_BOTTOM_LEVEL);
509  return ((unsigned int) addr.logpageno) * SlotsPerFSMPage + slot;
510 }
511 
512 /*
513  * Given a logical address of a child page, get the logical page number of
514  * the parent, and the slot within the parent corresponding to the child.
515  */
516 static FSMAddress
518 {
519  FSMAddress parent;
520 
521  Assert(child.level < FSM_ROOT_LEVEL);
522 
523  parent.level = child.level + 1;
524  parent.logpageno = child.logpageno / SlotsPerFSMPage;
525  *slot = child.logpageno % SlotsPerFSMPage;
526 
527  return parent;
528 }
529 
530 /*
531  * Given a logical address of a parent page and a slot number, get the
532  * logical address of the corresponding child page.
533  */
534 static FSMAddress
536 {
537  FSMAddress child;
538 
539  Assert(parent.level > FSM_BOTTOM_LEVEL);
540 
541  child.level = parent.level - 1;
542  child.logpageno = parent.logpageno * SlotsPerFSMPage + slot;
543 
544  return child;
545 }
546 
547 /*
548  * Read a FSM page.
549  *
550  * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
551  * true, the FSM file is extended.
552  */
553 static Buffer
554 fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
555 {
556  BlockNumber blkno = fsm_logical_to_physical(addr);
557  Buffer buf;
558  SMgrRelation reln = RelationGetSmgr(rel);
559 
560  /*
561  * If we haven't cached the size of the FSM yet, check it first. Also
562  * recheck if the requested block seems to be past end, since our cached
563  * value might be stale. (We send smgr inval messages on truncation, but
564  * not on extension.)
565  */
567  blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM])
568  {
569  /* Invalidate the cache so smgrnblocks asks the kernel. */
571  if (smgrexists(reln, FSM_FORKNUM))
572  smgrnblocks(reln, FSM_FORKNUM);
573  else
574  reln->smgr_cached_nblocks[FSM_FORKNUM] = 0;
575  }
576 
577  /*
578  * For reading we use ZERO_ON_ERROR mode, and initialize the page if
579  * necessary. The FSM information is not accurate anyway, so it's better
580  * to clear corrupt pages than error out. Since the FSM changes are not
581  * WAL-logged, the so-called torn page problem on crash can lead to pages
582  * with corrupt headers, for example.
583  *
584  * We use the same path below to initialize pages when extending the
585  * relation, as a concurrent extension can end up with vm_extend()
586  * returning an already-initialized page.
587  */
588  if (blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM])
589  {
590  if (extend)
591  buf = fsm_extend(rel, blkno + 1);
592  else
593  return InvalidBuffer;
594  }
595  else
596  buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL);
597 
598  /*
599  * Initializing the page when needed is trickier than it looks, because of
600  * the possibility of multiple backends doing this concurrently, and our
601  * desire to not uselessly take the buffer lock in the normal path where
602  * the page is OK. We must take the lock to initialize the page, so
603  * recheck page newness after we have the lock, in case someone else
604  * already did it. Also, because we initially check PageIsNew with no
605  * lock, it's possible to fall through and return the buffer while someone
606  * else is still initializing the page (i.e., we might see pd_upper as set
607  * but other page header fields are still zeroes). This is harmless for
608  * callers that will take a buffer lock themselves, but some callers
609  * inspect the page without any lock at all. The latter is OK only so
610  * long as it doesn't depend on the page header having correct contents.
611  * Current usage is safe because PageGetContents() does not require that.
612  */
614  {
617  PageInit(BufferGetPage(buf), BLCKSZ, 0);
619  }
620  return buf;
621 }
622 
623 /*
624  * Ensure that the FSM fork is at least fsm_nblocks long, extending
625  * it if necessary with empty pages. And by empty, I mean pages filled
626  * with zeros, meaning there's no free space.
627  */
628 static Buffer
629 fsm_extend(Relation rel, BlockNumber fsm_nblocks)
630 {
631  return ExtendBufferedRelTo(BMR_REL(rel), FSM_FORKNUM, NULL,
634  fsm_nblocks,
636 }
637 
638 /*
639  * Set value in given FSM page and slot.
640  *
641  * If minValue > 0, the updated page is also searched for a page with at
642  * least minValue of free space. If one is found, its slot number is
643  * returned, -1 otherwise.
644  */
645 static int
647  uint8 newValue, uint8 minValue)
648 {
649  Buffer buf;
650  Page page;
651  int newslot = -1;
652 
653  buf = fsm_readbuf(rel, addr, true);
655 
656  page = BufferGetPage(buf);
657 
658  if (fsm_set_avail(page, slot, newValue))
659  MarkBufferDirtyHint(buf, false);
660 
661  if (minValue != 0)
662  {
663  /* Search while we still hold the lock */
664  newslot = fsm_search_avail(buf, minValue,
665  addr.level == FSM_BOTTOM_LEVEL,
666  true);
667  }
668 
670 
671  return newslot;
672 }
673 
674 /*
675  * Search the tree for a heap page with at least min_cat of free space
676  */
677 static BlockNumber
678 fsm_search(Relation rel, uint8 min_cat)
679 {
680  int restarts = 0;
682 
683  for (;;)
684  {
685  int slot;
686  Buffer buf;
687  uint8 max_avail = 0;
688 
689  /* Read the FSM page. */
690  buf = fsm_readbuf(rel, addr, false);
691 
692  /* Search within the page */
693  if (BufferIsValid(buf))
694  {
696  slot = fsm_search_avail(buf, min_cat,
697  (addr.level == FSM_BOTTOM_LEVEL),
698  false);
699  if (slot == -1)
700  {
701  max_avail = fsm_get_max_avail(BufferGetPage(buf));
703  }
704  else
705  {
706  /* Keep the pin for possible update below */
708  }
709  }
710  else
711  slot = -1;
712 
713  if (slot != -1)
714  {
715  /*
716  * Descend the tree, or return the found block if we're at the
717  * bottom.
718  */
719  if (addr.level == FSM_BOTTOM_LEVEL)
720  {
721  BlockNumber blkno = fsm_get_heap_blk(addr, slot);
722  Page page;
723 
724  if (fsm_does_block_exist(rel, blkno))
725  {
727  return blkno;
728  }
729 
730  /*
731  * Block is past the end of the relation. Update FSM, and
732  * restart from root. The usual "advancenext" behavior is
733  * pessimal for this rare scenario, since every later slot is
734  * unusable in the same way. We could zero all affected slots
735  * on the same FSM page, but don't bet on the benefits of that
736  * optimization justifying its compiled code bulk.
737  */
738  page = BufferGetPage(buf);
740  fsm_set_avail(page, slot, 0);
741  MarkBufferDirtyHint(buf, false);
743  if (restarts++ > 10000) /* same rationale as below */
744  return InvalidBlockNumber;
745  addr = FSM_ROOT_ADDRESS;
746  }
747  else
748  {
750  }
751  addr = fsm_get_child(addr, slot);
752  }
753  else if (addr.level == FSM_ROOT_LEVEL)
754  {
755  /*
756  * At the root, failure means there's no page with enough free
757  * space in the FSM. Give up.
758  */
759  return InvalidBlockNumber;
760  }
761  else
762  {
763  uint16 parentslot;
764  FSMAddress parent;
765 
766  /*
767  * At lower level, failure can happen if the value in the upper-
768  * level node didn't reflect the value on the lower page. Update
769  * the upper node, to avoid falling into the same trap again, and
770  * start over.
771  *
772  * There's a race condition here, if another backend updates this
773  * page right after we release it, and gets the lock on the parent
774  * page before us. We'll then update the parent page with the now
775  * stale information we had. It's OK, because it should happen
776  * rarely, and will be fixed by the next vacuum.
777  */
778  parent = fsm_get_parent(addr, &parentslot);
779  fsm_set_and_search(rel, parent, parentslot, max_avail, 0);
780 
781  /*
782  * If the upper pages are badly out of date, we might need to loop
783  * quite a few times, updating them as we go. Any inconsistencies
784  * should eventually be corrected and the loop should end. Looping
785  * indefinitely is nevertheless scary, so provide an emergency
786  * valve.
787  */
788  if (restarts++ > 10000)
789  return InvalidBlockNumber;
790 
791  /* Start search all over from the root */
792  addr = FSM_ROOT_ADDRESS;
793  }
794  }
795 }
796 
797 
798 /*
799  * Recursive guts of FreeSpaceMapVacuum
800  *
801  * Examine the FSM page indicated by addr, as well as its children, updating
802  * upper-level nodes that cover the heap block range from start to end-1.
803  * (It's okay if end is beyond the actual end of the map.)
804  * Return the maximum freespace value on this page.
805  *
806  * If addr is past the end of the FSM, set *eof_p to true and return 0.
807  *
808  * This traverses the tree in depth-first order. The tree is stored
809  * physically in depth-first order, so this should be pretty I/O efficient.
810  */
811 static uint8
814  bool *eof_p)
815 {
816  Buffer buf;
817  Page page;
818  uint8 max_avail;
819 
820  /* Read the page if it exists, or return EOF */
821  buf = fsm_readbuf(rel, addr, false);
822  if (!BufferIsValid(buf))
823  {
824  *eof_p = true;
825  return 0;
826  }
827  else
828  *eof_p = false;
829 
830  page = BufferGetPage(buf);
831 
832  /*
833  * If we're above the bottom level, recurse into children, and fix the
834  * information stored about them at this level.
835  */
836  if (addr.level > FSM_BOTTOM_LEVEL)
837  {
838  FSMAddress fsm_start,
839  fsm_end;
840  uint16 fsm_start_slot,
841  fsm_end_slot;
842  int slot,
843  start_slot,
844  end_slot;
845  bool eof = false;
846 
847  /*
848  * Compute the range of slots we need to update on this page, given
849  * the requested range of heap blocks to consider. The first slot to
850  * update is the one covering the "start" block, and the last slot is
851  * the one covering "end - 1". (Some of this work will be duplicated
852  * in each recursive call, but it's cheap enough to not worry about.)
853  */
854  fsm_start = fsm_get_location(start, &fsm_start_slot);
855  fsm_end = fsm_get_location(end - 1, &fsm_end_slot);
856 
857  while (fsm_start.level < addr.level)
858  {
859  fsm_start = fsm_get_parent(fsm_start, &fsm_start_slot);
860  fsm_end = fsm_get_parent(fsm_end, &fsm_end_slot);
861  }
862  Assert(fsm_start.level == addr.level);
863 
864  if (fsm_start.logpageno == addr.logpageno)
865  start_slot = fsm_start_slot;
866  else if (fsm_start.logpageno > addr.logpageno)
867  start_slot = SlotsPerFSMPage; /* shouldn't get here... */
868  else
869  start_slot = 0;
870 
871  if (fsm_end.logpageno == addr.logpageno)
872  end_slot = fsm_end_slot;
873  else if (fsm_end.logpageno > addr.logpageno)
874  end_slot = SlotsPerFSMPage - 1;
875  else
876  end_slot = -1; /* shouldn't get here... */
877 
878  for (slot = start_slot; slot <= end_slot; slot++)
879  {
880  int child_avail;
881 
883 
884  /* After we hit end-of-file, just clear the rest of the slots */
885  if (!eof)
886  child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot),
887  start, end,
888  &eof);
889  else
890  child_avail = 0;
891 
892  /* Update information about the child */
893  if (fsm_get_avail(page, slot) != child_avail)
894  {
896  fsm_set_avail(page, slot, child_avail);
897  MarkBufferDirtyHint(buf, false);
899  }
900  }
901  }
902 
903  /* Now get the maximum value on the page, to return to caller */
904  max_avail = fsm_get_max_avail(page);
905 
906  /*
907  * Reset the next slot pointer. This encourages the use of low-numbered
908  * pages, increasing the chances that a later vacuum can truncate the
909  * relation. We don't bother with a lock here, nor with marking the page
910  * dirty if it wasn't already, since this is just a hint.
911  */
912  ((FSMPage) PageGetContents(page))->fp_next_slot = 0;
913 
915 
916  return max_avail;
917 }
918 
919 
920 /*
921  * Check whether a block number is past the end of the relation. This can
922  * happen after WAL replay, if the FSM reached disk but newly-extended pages
923  * it refers to did not.
924  */
925 static bool
927 {
928  SMgrRelation smgr = RelationGetSmgr(rel);
929 
930  /*
931  * If below the cached nblocks, the block surely exists. Otherwise, we
932  * face a trade-off. We opt to compare to a fresh nblocks, incurring
933  * lseek() overhead. The alternative would be to assume the block does
934  * not exist, but that would cause FSM to set zero space available for
935  * blocks that main fork extension just recorded.
936  */
938  blknumber < smgr->smgr_cached_nblocks[MAIN_FORKNUM]) ||
939  blknumber < RelationGetNumberOfBlocks(rel));
940 }
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
Buffer ExtendBufferedRelTo(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, BlockNumber extend_to, ReadBufferMode mode)
Definition: bufmgr.c:910
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4906
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4923
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2514
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5140
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:4970
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:793
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:189
#define BUFFER_LOCK_SHARE
Definition: bufmgr.h:190
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:273
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:400
@ EB_CLEAR_SIZE_CACHE
Definition: bufmgr.h:89
@ EB_CREATE_FORK_IF_NEEDED
Definition: bufmgr.h:83
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:191
@ RBM_ZERO_ON_ERROR
Definition: bufmgr.h:50
#define BMR_REL(p_rel)
Definition: bufmgr.h:107
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:351
void PageInit(Page page, Size pageSize, Size specialSize)
Definition: bufpage.c:42
static char * PageGetContents(Page page)
Definition: bufpage.h:257
Pointer Page
Definition: bufpage.h:81
static bool PageIsNew(Page page)
Definition: bufpage.h:233
unsigned short uint16
Definition: c.h:505
#define Assert(condition)
Definition: c.h:858
unsigned char uint8
Definition: c.h:504
size_t Size
Definition: c.h:605
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define MaxFSMRequestSize
Definition: freespace.c:66
static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, BlockNumber start, BlockNumber end, bool *eof_p)
Definition: freespace.c:812
static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot)
Definition: freespace.c:535
static const FSMAddress FSM_ROOT_ADDRESS
Definition: freespace.c:91
void FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end)
Definition: freespace.c:377
static bool fsm_does_block_exist(Relation rel, BlockNumber blknumber)
Definition: freespace.c:926
static uint8 fsm_space_needed_to_cat(Size needed)
Definition: freespace.c:432
#define FSM_BOTTOM_LEVEL
Definition: freespace.c:78
static uint8 fsm_space_avail_to_cat(Size avail)
Definition: freespace.c:392
static Size fsm_space_cat_to_avail(uint8 cat)
Definition: freespace.c:418
static BlockNumber fsm_logical_to_physical(FSMAddress addr)
Definition: freespace.c:455
static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot)
Definition: freespace.c:517
BlockNumber RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage, Size oldSpaceAvail, Size spaceNeeded)
Definition: freespace.c:154
static FSMAddress fsm_get_location(BlockNumber heapblk, uint16 *slot)
Definition: freespace.c:491
static Buffer fsm_extend(Relation rel, BlockNumber fsm_nblocks)
Definition: freespace.c:629
static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
Definition: freespace.c:554
void FreeSpaceMapVacuum(Relation rel)
Definition: freespace.c:358
Size GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk)
Definition: freespace.c:244
static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot, uint8 newValue, uint8 minValue)
Definition: freespace.c:646
#define FSM_ROOT_LEVEL
Definition: freespace.c:77
void RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
Definition: freespace.c:194
static BlockNumber fsm_search(Relation rel, uint8 min_cat)
Definition: freespace.c:678
void XLogRecordPageWithFreeSpace(RelFileLocator rlocator, BlockNumber heapBlk, Size spaceAvail)
Definition: freespace.c:211
BlockNumber GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
Definition: freespace.c:137
static BlockNumber fsm_get_heap_blk(FSMAddress addr, uint16 slot)
Definition: freespace.c:506
#define FSM_CAT_STEP
Definition: freespace.c:65
BlockNumber FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks)
Definition: freespace.c:275
#define FSM_TREE_DEPTH
Definition: freespace.c:75
FSMPageData * FSMPage
Definition: fsm_internals.h:45
#define SlotsPerFSMPage
Definition: fsm_internals.h:61
uint8 fsm_get_avail(Page page, int slot)
Definition: fsmpage.c:122
uint8 fsm_get_max_avail(Page page)
Definition: fsmpage.c:138
bool fsm_set_avail(Page page, int slot, uint8 value)
Definition: fsmpage.c:63
bool fsm_truncate_avail(Page page, int nslots)
Definition: fsmpage.c:313
int fsm_search_avail(Buffer buf, uint8 minvalue, bool advancenext, bool exclusive_lock_held)
Definition: fsmpage.c:158
return str start
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
static char * buf
Definition: pg_test_fsync.c:73
static SMgrRelation RelationGetSmgr(Relation rel)
Definition: rel.h:567
#define RelationNeedsWAL(relation)
Definition: rel.h:628
@ FSM_FORKNUM
Definition: relpath.h:59
@ MAIN_FORKNUM
Definition: relpath.h:58
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:655
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:398
int logpageno
Definition: freespace.c:87
int level
Definition: freespace.c:86
BlockNumber smgr_cached_nblocks[MAX_FORKNUM+1]
Definition: smgr.h:46
#define XLogHintBitIsNeeded()
Definition: xlog.h:120
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)
Definition: xloginsert.c:1237
Buffer XLogReadBufferExtended(RelFileLocator rlocator, ForkNumber forknum, BlockNumber blkno, ReadBufferMode mode, Buffer recent_buffer)
Definition: xlogutils.c:471
bool InRecovery
Definition: xlogutils.c:50