PostgreSQL Source Code  git master
tidstore.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * tidstore.c
4  * TID (ItemPointerData) storage implementation.
5  *
6  * TidStore is a in-memory data structure to store TIDs (ItemPointerData).
7  * Internally it uses a radix tree as the storage for TIDs. The key is the
8  * BlockNumber and the value is a bitmap of offsets, BlocktableEntry.
9  *
10  * TidStore can be shared among parallel worker processes by using
11  * TidStoreCreateShared(). Other backends can attach to the shared TidStore
12  * by TidStoreAttach().
13  *
14  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
15  * Portions Copyright (c) 1994, Regents of the University of California
16  *
17  * IDENTIFICATION
18  * src/backend/access/common/tidstore.c
19  *
20  *-------------------------------------------------------------------------
21  */
22 #include "postgres.h"
23 
24 #include "access/tidstore.h"
25 #include "miscadmin.h"
26 #include "nodes/bitmapset.h"
27 #include "storage/lwlock.h"
28 #include "utils/dsa.h"
29 
30 
31 #define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
32 #define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)
33 
34 /* number of active words for a page: */
35 #define WORDS_PER_PAGE(n) ((n) / BITS_PER_BITMAPWORD + 1)
36 
37 /* number of offsets we can store in the header of a BlocktableEntry */
38 #define NUM_FULL_OFFSETS ((sizeof(uintptr_t) - sizeof(uint8) - sizeof(int8)) / sizeof(OffsetNumber))
39 
40 /*
41  * This is named similarly to PagetableEntry in tidbitmap.c
42  * because the two have a similar function.
43  */
44 typedef struct BlocktableEntry
45 {
46  struct
47  {
48 #ifndef WORDS_BIGENDIAN
49  /*
50  * We need to position this member to reserve space for the backing
51  * radix tree to tag the lowest bit when struct 'header' is stored
52  * inside a pointer or DSA pointer.
53  */
55 
57 #endif
58 
59  /*
60  * We can store a small number of offsets here to avoid wasting space
61  * with a sparse bitmap.
62  */
64 
65 #ifdef WORDS_BIGENDIAN
66  int8 nwords;
67  uint8 flags;
68 #endif
69  } header;
70 
71  /*
72  * We don't expect any padding space here, but to be cautious, code
73  * creating new entries should zero out space up to 'words'.
74  */
75 
78 
79 /*
80  * The type of 'nwords' limits the max number of words in the 'words' array.
81  * This computes the max offset we can actually store in the bitmap. In
82  * practice, it's almost always the same as MaxOffsetNumber.
83  */
84 #define MAX_OFFSET_IN_BITMAP Min(BITS_PER_BITMAPWORD * PG_INT8_MAX - 1, MaxOffsetNumber)
85 
86 #define MaxBlocktableEntrySize \
87  offsetof(BlocktableEntry, words) + \
88  (sizeof(bitmapword) * WORDS_PER_PAGE(MAX_OFFSET_IN_BITMAP))
89 
90 #define RT_PREFIX local_ts
91 #define RT_SCOPE static
92 #define RT_DECLARE
93 #define RT_DEFINE
94 #define RT_VALUE_TYPE BlocktableEntry
95 #define RT_VARLEN_VALUE_SIZE(page) \
96  (offsetof(BlocktableEntry, words) + \
97  sizeof(bitmapword) * (page)->header.nwords)
98 #define RT_RUNTIME_EMBEDDABLE_VALUE
99 #include "lib/radixtree.h"
100 
101 #define RT_PREFIX shared_ts
102 #define RT_SHMEM
103 #define RT_SCOPE static
104 #define RT_DECLARE
105 #define RT_DEFINE
106 #define RT_VALUE_TYPE BlocktableEntry
107 #define RT_VARLEN_VALUE_SIZE(page) \
108  (offsetof(BlocktableEntry, words) + \
109  sizeof(bitmapword) * (page)->header.nwords)
110 #define RT_RUNTIME_EMBEDDABLE_VALUE
111 #include "lib/radixtree.h"
112 
113 /* Per-backend state for a TidStore */
114 struct TidStore
115 {
116  /* MemoryContext where the TidStore is allocated */
118 
119  /* MemoryContext that the radix tree uses */
121 
122  /* Storage for TIDs. Use either one depending on TidStoreIsShared() */
123  union
124  {
125  local_ts_radix_tree *local;
126  shared_ts_radix_tree *shared;
127  } tree;
128 
129  /* DSA area for TidStore if using shared memory */
131 };
132 #define TidStoreIsShared(ts) ((ts)->area != NULL)
133 
134 /* Iterator for TidStore */
136 {
138 
139  /* iterator of radix tree. Use either one depending on TidStoreIsShared() */
140  union
141  {
142  shared_ts_iter *shared;
143  local_ts_iter *local;
145 
146  /* output for the caller */
148 };
149 
150 static void tidstore_iter_extract_tids(TidStoreIter *iter, BlockNumber blkno,
151  BlocktableEntry *page);
152 
153 /*
154  * Create a TidStore. The TidStore will live in the memory context that is
155  * CurrentMemoryContext at the time of this call. The TID storage, backed
156  * by a radix tree, will live in its child memory context, rt_context.
157  *
158  * "max_bytes" is not an internally-enforced limit; it is used only as a
159  * hint to cap the memory block size of the memory context for TID storage.
160  * This reduces space wastage due to over-allocation. If the caller wants to
161  * monitor memory usage, it must compare its limit with the value reported
162  * by TidStoreMemoryUsage().
163  */
164 TidStore *
165 TidStoreCreateLocal(size_t max_bytes, bool insert_only)
166 {
167  TidStore *ts;
168  size_t initBlockSize = ALLOCSET_DEFAULT_INITSIZE;
169  size_t minContextSize = ALLOCSET_DEFAULT_MINSIZE;
170  size_t maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE;
171 
172  ts = palloc0(sizeof(TidStore));
174 
175  /* choose the maxBlockSize to be no larger than 1/16 of max_bytes */
176  while (16 * maxBlockSize > max_bytes)
177  maxBlockSize >>= 1;
178 
179  if (maxBlockSize < ALLOCSET_DEFAULT_INITSIZE)
180  maxBlockSize = ALLOCSET_DEFAULT_INITSIZE;
181 
182  /* Create a memory context for the TID storage */
183  if (insert_only)
184  {
186  "TID storage",
187  minContextSize,
188  initBlockSize,
189  maxBlockSize);
190  }
191  else
192  {
194  "TID storage",
195  minContextSize,
196  initBlockSize,
197  maxBlockSize);
198  }
199 
200  ts->tree.local = local_ts_create(ts->rt_context);
201 
202  return ts;
203 }
204 
205 /*
206  * Similar to TidStoreCreateLocal() but create a shared TidStore on a
207  * DSA area. The TID storage will live in the DSA area, and the memory
208  * context rt_context will have only meta data of the radix tree.
209  *
210  * The returned object is allocated in backend-local memory.
211  */
212 TidStore *
213 TidStoreCreateShared(size_t max_bytes, int tranche_id)
214 {
215  TidStore *ts;
216  dsa_area *area;
217  size_t dsa_init_size = DSA_DEFAULT_INIT_SEGMENT_SIZE;
218  size_t dsa_max_size = DSA_MAX_SEGMENT_SIZE;
219 
220  ts = palloc0(sizeof(TidStore));
222 
224  "TID storage meta data",
226 
227  /*
228  * Choose the initial and maximum DSA segment sizes to be no longer than
229  * 1/8 of max_bytes.
230  */
231  while (8 * dsa_max_size > max_bytes)
232  dsa_max_size >>= 1;
233 
234  if (dsa_max_size < DSA_MIN_SEGMENT_SIZE)
235  dsa_max_size = DSA_MIN_SEGMENT_SIZE;
236 
237  if (dsa_init_size > dsa_max_size)
238  dsa_init_size = dsa_max_size;
239 
240  area = dsa_create_ext(tranche_id, dsa_init_size, dsa_max_size);
241  ts->tree.shared = shared_ts_create(ts->rt_context, area,
242  tranche_id);
243  ts->area = area;
244 
245  return ts;
246 }
247 
248 /*
249  * Attach to the shared TidStore. 'area_handle' is the DSA handle where
250  * the TidStore is created. 'handle' is the dsa_pointer returned by
251  * TidStoreGetHandle(). The returned object is allocated in backend-local
252  * memory using the CurrentMemoryContext.
253  */
254 TidStore *
256 {
257  TidStore *ts;
258  dsa_area *area;
259 
260  Assert(area_handle != DSA_HANDLE_INVALID);
261  Assert(DsaPointerIsValid(handle));
262 
263  /* create per-backend state */
264  ts = palloc0(sizeof(TidStore));
265 
266  area = dsa_attach(area_handle);
267 
268  /* Find the shared the shared radix tree */
269  ts->tree.shared = shared_ts_attach(area, handle);
270  ts->area = area;
271 
272  return ts;
273 }
274 
275 /*
276  * Detach from a TidStore. This also detaches from radix tree and frees
277  * the backend-local resources.
278  */
279 void
281 {
283 
284  shared_ts_detach(ts->tree.shared);
285  dsa_detach(ts->area);
286 
287  pfree(ts);
288 }
289 
290 /*
291  * Lock support functions.
292  *
293  * We can use the radix tree's lock for shared TidStore as the data we
294  * need to protect is only the shared radix tree.
295  */
296 
297 void
299 {
300  if (TidStoreIsShared(ts))
301  shared_ts_lock_exclusive(ts->tree.shared);
302 }
303 
304 void
306 {
307  if (TidStoreIsShared(ts))
308  shared_ts_lock_share(ts->tree.shared);
309 }
310 
311 void
313 {
314  if (TidStoreIsShared(ts))
315  shared_ts_unlock(ts->tree.shared);
316 }
317 
318 /*
319  * Destroy a TidStore, returning all memory.
320  *
321  * Note that the caller must be certain that no other backend will attempt to
322  * access the TidStore before calling this function. Other backend must
323  * explicitly call TidStoreDetach() to free up backend-local memory associated
324  * with the TidStore. The backend that calls TidStoreDestroy() must not call
325  * TidStoreDetach().
326  */
327 void
329 {
330  /* Destroy underlying radix tree */
331  if (TidStoreIsShared(ts))
332  {
333  shared_ts_free(ts->tree.shared);
334 
335  dsa_detach(ts->area);
336  }
337  else
338  local_ts_free(ts->tree.local);
339 
341 
342  pfree(ts);
343 }
344 
345 /*
346  * Create or replace an entry for the given block and array of offsets.
347  *
348  * NB: This function is designed and optimized for vacuum's heap scanning
349  * phase, so has some limitations:
350  *
351  * - The offset numbers "offsets" must be sorted in ascending order.
352  * - If the block number already exists, the entry will be replaced --
353  * there is no way to add or remove offsets from an entry.
354  */
355 void
357  int num_offsets)
358 {
359  union
360  {
362  BlocktableEntry force_align_entry;
363  } data;
364  BlocktableEntry *page = (BlocktableEntry *) data.data;
366  int wordnum;
367  int next_word_threshold;
368  int idx = 0;
369 
370  Assert(num_offsets > 0);
371 
372  /* Check if the given offset numbers are ordered */
373  for (int i = 1; i < num_offsets; i++)
374  Assert(offsets[i] > offsets[i - 1]);
375 
376  memset(page, 0, offsetof(BlocktableEntry, words));
377 
378  if (num_offsets <= NUM_FULL_OFFSETS)
379  {
380  for (int i = 0; i < num_offsets; i++)
381  {
382  OffsetNumber off = offsets[i];
383 
384  /* safety check to ensure we don't overrun bit array bounds */
385  if (off == InvalidOffsetNumber || off > MAX_OFFSET_IN_BITMAP)
386  elog(ERROR, "tuple offset out of range: %u", off);
387 
388  page->header.full_offsets[i] = off;
389  }
390 
391  page->header.nwords = 0;
392  }
393  else
394  {
395  for (wordnum = 0, next_word_threshold = BITS_PER_BITMAPWORD;
396  wordnum <= WORDNUM(offsets[num_offsets - 1]);
397  wordnum++, next_word_threshold += BITS_PER_BITMAPWORD)
398  {
399  word = 0;
400 
401  while (idx < num_offsets)
402  {
403  OffsetNumber off = offsets[idx];
404 
405  /* safety check to ensure we don't overrun bit array bounds */
406  if (off == InvalidOffsetNumber || off > MAX_OFFSET_IN_BITMAP)
407  elog(ERROR, "tuple offset out of range: %u", off);
408 
409  if (off >= next_word_threshold)
410  break;
411 
412  word |= ((bitmapword) 1 << BITNUM(off));
413  idx++;
414  }
415 
416  /* write out offset bitmap for this wordnum */
417  page->words[wordnum] = word;
418  }
419 
420  page->header.nwords = wordnum;
421  Assert(page->header.nwords == WORDS_PER_PAGE(offsets[num_offsets - 1]));
422  }
423 
424  if (TidStoreIsShared(ts))
425  shared_ts_set(ts->tree.shared, blkno, page);
426  else
427  local_ts_set(ts->tree.local, blkno, page);
428 }
429 
430 /* Return true if the given TID is present in the TidStore */
431 bool
433 {
434  int wordnum;
435  int bitnum;
436  BlocktableEntry *page;
439 
440  if (TidStoreIsShared(ts))
441  page = shared_ts_find(ts->tree.shared, blk);
442  else
443  page = local_ts_find(ts->tree.local, blk);
444 
445  /* no entry for the blk */
446  if (page == NULL)
447  return false;
448 
449  if (page->header.nwords == 0)
450  {
451  /* we have offsets in the header */
452  for (int i = 0; i < NUM_FULL_OFFSETS; i++)
453  {
454  if (page->header.full_offsets[i] == off)
455  return true;
456  }
457  return false;
458  }
459  else
460  {
461  wordnum = WORDNUM(off);
462  bitnum = BITNUM(off);
463 
464  /* no bitmap for the off */
465  if (wordnum >= page->header.nwords)
466  return false;
467 
468  return (page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0;
469  }
470 }
471 
472 /*
473  * Prepare to iterate through a TidStore.
474  *
475  * The TidStoreIter struct is created in the caller's memory context, and it
476  * will be freed in TidStoreEndIterate.
477  *
478  * The caller is responsible for locking TidStore until the iteration is
479  * finished.
480  */
481 TidStoreIter *
483 {
484  TidStoreIter *iter;
485 
486  iter = palloc0(sizeof(TidStoreIter));
487  iter->ts = ts;
488 
489  /*
490  * We start with an array large enough to contain at least the offsets
491  * from one completely full bitmap element.
492  */
494  iter->output.offsets = palloc(sizeof(OffsetNumber) * iter->output.max_offset);
495 
496  if (TidStoreIsShared(ts))
497  iter->tree_iter.shared = shared_ts_begin_iterate(ts->tree.shared);
498  else
499  iter->tree_iter.local = local_ts_begin_iterate(ts->tree.local);
500 
501  return iter;
502 }
503 
504 
505 /*
506  * Scan the TidStore and return the TIDs of the next block. The offsets in
507  * each iteration result are ordered, as are the block numbers over all
508  * iterations.
509  */
512 {
513  uint64 key;
514  BlocktableEntry *page;
515 
516  if (TidStoreIsShared(iter->ts))
517  page = shared_ts_iterate_next(iter->tree_iter.shared, &key);
518  else
519  page = local_ts_iterate_next(iter->tree_iter.local, &key);
520 
521  if (page == NULL)
522  return NULL;
523 
524  /* Collect TIDs from the key-value pair */
526 
527  return &(iter->output);
528 }
529 
530 /*
531  * Finish the iteration on TidStore.
532  *
533  * The caller is responsible for releasing any locks.
534  */
535 void
537 {
538  if (TidStoreIsShared(iter->ts))
539  shared_ts_end_iterate(iter->tree_iter.shared);
540  else
541  local_ts_end_iterate(iter->tree_iter.local);
542 
543  pfree(iter->output.offsets);
544  pfree(iter);
545 }
546 
547 /*
548  * Return the memory usage of TidStore.
549  */
550 size_t
552 {
553  if (TidStoreIsShared(ts))
554  return shared_ts_memory_usage(ts->tree.shared);
555  else
556  return local_ts_memory_usage(ts->tree.local);
557 }
558 
559 /*
560  * Return the DSA area where the TidStore lives.
561  */
562 dsa_area *
564 {
566 
567  return ts->area;
568 }
569 
572 {
574 
575  return (dsa_pointer) shared_ts_get_handle(ts->tree.shared);
576 }
577 
578 /* Extract TIDs from the given key-value pair */
579 static void
581  BlocktableEntry *page)
582 {
583  TidStoreIterResult *result = (&iter->output);
584  int wordnum;
585 
586  result->num_offsets = 0;
587  result->blkno = blkno;
588 
589  if (page->header.nwords == 0)
590  {
591  /* we have offsets in the header */
592  for (int i = 0; i < NUM_FULL_OFFSETS; i++)
593  {
595  result->offsets[result->num_offsets++] = page->header.full_offsets[i];
596  }
597  }
598  else
599  {
600  for (wordnum = 0; wordnum < page->header.nwords; wordnum++)
601  {
602  bitmapword w = page->words[wordnum];
603  int off = wordnum * BITS_PER_BITMAPWORD;
604 
605  /* Make sure there is enough space to add offsets */
606  if ((result->num_offsets + BITS_PER_BITMAPWORD) > result->max_offset)
607  {
608  result->max_offset *= 2;
609  result->offsets = repalloc(result->offsets,
610  sizeof(OffsetNumber) * result->max_offset);
611  }
612 
613  while (w != 0)
614  {
615  if (w & 1)
616  result->offsets[result->num_offsets++] = (OffsetNumber) off;
617  off++;
618  w >>= 1;
619  }
620  }
621  }
622 }
Datum idx(PG_FUNCTION_ARGS)
Definition: _int_op.c:259
uint32 bitmapword
Definition: bitmapset.h:44
#define BITS_PER_BITMAPWORD
Definition: bitmapset.h:43
uint32 BlockNumber
Definition: block.h:31
MemoryContext BumpContextCreate(MemoryContext parent, const char *name, Size minContextSize, Size initBlockSize, Size maxBlockSize)
Definition: bump.c:131
signed char int8
Definition: c.h:492
#define Assert(condition)
Definition: c.h:858
#define FLEXIBLE_ARRAY_MEMBER
Definition: c.h:398
unsigned char uint8
Definition: c.h:504
dsa_area * dsa_attach(dsa_handle handle)
Definition: dsa.c:510
void dsa_detach(dsa_area *area)
Definition: dsa.c:1952
dsa_area * dsa_create_ext(int tranche_id, size_t init_segment_size, size_t max_segment_size)
Definition: dsa.c:421
uint64 dsa_pointer
Definition: dsa.h:62
#define DSA_MIN_SEGMENT_SIZE
Definition: dsa.h:100
dsm_handle dsa_handle
Definition: dsa.h:136
#define DSA_DEFAULT_INIT_SEGMENT_SIZE
Definition: dsa.h:97
#define DSA_HANDLE_INVALID
Definition: dsa.h:139
#define DsaPointerIsValid(x)
Definition: dsa.h:106
#define DSA_MAX_SEGMENT_SIZE
Definition: dsa.h:103
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:224
int i
Definition: isn.c:73
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
void pfree(void *pointer)
Definition: mcxt.c:1520
void * palloc0(Size size)
Definition: mcxt.c:1346
MemoryContext CurrentMemoryContext
Definition: mcxt.c:143
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1540
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:454
void * palloc(Size size)
Definition: mcxt.c:1316
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_DEFAULT_MAXSIZE
Definition: memutils.h:159
#define ALLOCSET_DEFAULT_MINSIZE
Definition: memutils.h:157
#define ALLOCSET_SMALL_SIZES
Definition: memutils.h:170
#define ALLOCSET_DEFAULT_INITSIZE
Definition: memutils.h:158
#define InvalidOffsetNumber
Definition: off.h:26
uint16 OffsetNumber
Definition: off.h:24
const void * data
static void word(struct vars *v, int dir, struct state *lp, struct state *rp)
Definition: regcomp.c:1474
bitmapword words[FLEXIBLE_ARRAY_MEMBER]
Definition: tidstore.c:76
struct BlocktableEntry::@12 header
OffsetNumber full_offsets[NUM_FULL_OFFSETS]
Definition: tidstore.c:63
BlockNumber blkno
Definition: tidstore.h:26
OffsetNumber * offsets
Definition: tidstore.h:29
shared_ts_iter * shared
Definition: tidstore.c:142
local_ts_iter * local
Definition: tidstore.c:143
TidStore * ts
Definition: tidstore.c:137
TidStoreIterResult output
Definition: tidstore.c:147
union TidStoreIter::@14 tree_iter
dsa_area * area
Definition: tidstore.c:130
MemoryContext context
Definition: tidstore.c:117
MemoryContext rt_context
Definition: tidstore.c:120
shared_ts_radix_tree * shared
Definition: tidstore.c:126
local_ts_radix_tree * local
Definition: tidstore.c:125
union TidStore::@13 tree
Definition: dsa.c:348
TidStore * TidStoreAttach(dsa_handle area_handle, dsa_pointer handle)
Definition: tidstore.c:255
#define NUM_FULL_OFFSETS
Definition: tidstore.c:38
#define WORDNUM(x)
Definition: tidstore.c:31
void TidStoreEndIterate(TidStoreIter *iter)
Definition: tidstore.c:536
void TidStoreDetach(TidStore *ts)
Definition: tidstore.c:280
TidStoreIterResult * TidStoreIterateNext(TidStoreIter *iter)
Definition: tidstore.c:511
void TidStoreLockShare(TidStore *ts)
Definition: tidstore.c:305
TidStore * TidStoreCreateShared(size_t max_bytes, int tranche_id)
Definition: tidstore.c:213
void TidStoreDestroy(TidStore *ts)
Definition: tidstore.c:328
dsa_area * TidStoreGetDSA(TidStore *ts)
Definition: tidstore.c:563
static void tidstore_iter_extract_tids(TidStoreIter *iter, BlockNumber blkno, BlocktableEntry *page)
Definition: tidstore.c:580
#define BITNUM(x)
Definition: tidstore.c:32
void TidStoreUnlock(TidStore *ts)
Definition: tidstore.c:312
TidStore * TidStoreCreateLocal(size_t max_bytes, bool insert_only)
Definition: tidstore.c:165
#define TidStoreIsShared(ts)
Definition: tidstore.c:132
#define WORDS_PER_PAGE(n)
Definition: tidstore.c:35
bool TidStoreIsMember(TidStore *ts, ItemPointer tid)
Definition: tidstore.c:432
TidStoreIter * TidStoreBeginIterate(TidStore *ts)
Definition: tidstore.c:482
struct BlocktableEntry BlocktableEntry
#define MAX_OFFSET_IN_BITMAP
Definition: tidstore.c:84
void TidStoreLockExclusive(TidStore *ts)
Definition: tidstore.c:298
dsa_pointer TidStoreGetHandle(TidStore *ts)
Definition: tidstore.c:571
#define MaxBlocktableEntrySize
Definition: tidstore.c:86
void TidStoreSetBlockOffsets(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets, int num_offsets)
Definition: tidstore.c:356
size_t TidStoreMemoryUsage(TidStore *ts)
Definition: tidstore.c:551