PostgreSQL Source Code  git master
tidstore.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * tidstore.c
4  * TID (ItemPointerData) storage implementation.
5  *
6  * TidStore is a in-memory data structure to store TIDs (ItemPointerData).
7  * Internally it uses a radix tree as the storage for TIDs. The key is the
8  * BlockNumber and the value is a bitmap of offsets, BlocktableEntry.
9  *
10  * TidStore can be shared among parallel worker processes by using
11  * TidStoreCreateShared(). Other backends can attach to the shared TidStore
12  * by TidStoreAttach().
13  *
14  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
15  * Portions Copyright (c) 1994, Regents of the University of California
16  *
17  * IDENTIFICATION
18  * src/backend/access/common/tidstore.c
19  *
20  *-------------------------------------------------------------------------
21  */
22 #include "postgres.h"
23 
24 #include "access/tidstore.h"
25 #include "miscadmin.h"
26 #include "nodes/bitmapset.h"
27 #include "storage/lwlock.h"
28 #include "utils/dsa.h"
29 
30 
31 #define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
32 #define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)
33 
34 /* number of active words for a page: */
35 #define WORDS_PER_PAGE(n) ((n) / BITS_PER_BITMAPWORD + 1)
36 
37 /* number of offsets we can store in the header of a BlocktableEntry */
38 #define NUM_FULL_OFFSETS ((sizeof(uintptr_t) - sizeof(uint8) - sizeof(int8)) / sizeof(OffsetNumber))
39 
40 /*
41  * This is named similarly to PagetableEntry in tidbitmap.c
42  * because the two have a similar function.
43  */
44 typedef struct BlocktableEntry
45 {
46  struct
47  {
48 #ifndef WORDS_BIGENDIAN
49  /*
50  * We need to position this member to reserve space for the backing
51  * radix tree to tag the lowest bit when struct 'header' is stored
52  * inside a pointer or DSA pointer.
53  */
55 
57 #endif
58 
59  /*
60  * We can store a small number of offsets here to avoid wasting space
61  * with a sparse bitmap.
62  */
64 
65 #ifdef WORDS_BIGENDIAN
66  int8 nwords;
67  uint8 flags;
68 #endif
69  } header;
70 
71  /*
72  * We don't expect any padding space here, but to be cautious, code
73  * creating new entries should zero out space up to 'words'.
74  */
75 
78 
79 /*
80  * The type of 'nwords' limits the max number of words in the 'words' array.
81  * This computes the max offset we can actually store in the bitmap. In
82  * practice, it's almost always the same as MaxOffsetNumber.
83  */
84 #define MAX_OFFSET_IN_BITMAP Min(BITS_PER_BITMAPWORD * PG_INT8_MAX - 1, MaxOffsetNumber)
85 
86 #define MaxBlocktableEntrySize \
87  offsetof(BlocktableEntry, words) + \
88  (sizeof(bitmapword) * WORDS_PER_PAGE(MAX_OFFSET_IN_BITMAP))
89 
90 #define RT_PREFIX local_ts
91 #define RT_SCOPE static
92 #define RT_DECLARE
93 #define RT_DEFINE
94 #define RT_VALUE_TYPE BlocktableEntry
95 #define RT_VARLEN_VALUE_SIZE(page) \
96  (offsetof(BlocktableEntry, words) + \
97  sizeof(bitmapword) * (page)->header.nwords)
98 #define RT_RUNTIME_EMBEDDABLE_VALUE
99 #include "lib/radixtree.h"
100 
101 #define RT_PREFIX shared_ts
102 #define RT_SHMEM
103 #define RT_SCOPE static
104 #define RT_DECLARE
105 #define RT_DEFINE
106 #define RT_VALUE_TYPE BlocktableEntry
107 #define RT_VARLEN_VALUE_SIZE(page) \
108  (offsetof(BlocktableEntry, words) + \
109  sizeof(bitmapword) * (page)->header.nwords)
110 #define RT_RUNTIME_EMBEDDABLE_VALUE
111 #include "lib/radixtree.h"
112 
113 /* Per-backend state for a TidStore */
114 struct TidStore
115 {
116  /* MemoryContext where the TidStore is allocated */
118 
119  /* MemoryContext that the radix tree uses */
121 
122  /* Storage for TIDs. Use either one depending on TidStoreIsShared() */
123  union
124  {
125  local_ts_radix_tree *local;
126  shared_ts_radix_tree *shared;
127  } tree;
128 
129  /* DSA area for TidStore if using shared memory */
131 };
132 #define TidStoreIsShared(ts) ((ts)->area != NULL)
133 
134 /* Iterator for TidStore */
136 {
138 
139  /* iterator of radix tree. Use either one depending on TidStoreIsShared() */
140  union
141  {
142  shared_ts_iter *shared;
143  local_ts_iter *local;
145 
146  /* output for the caller */
148 };
149 
150 /*
151  * Create a TidStore. The TidStore will live in the memory context that is
152  * CurrentMemoryContext at the time of this call. The TID storage, backed
153  * by a radix tree, will live in its child memory context, rt_context.
154  *
155  * "max_bytes" is not an internally-enforced limit; it is used only as a
156  * hint to cap the memory block size of the memory context for TID storage.
157  * This reduces space wastage due to over-allocation. If the caller wants to
158  * monitor memory usage, it must compare its limit with the value reported
159  * by TidStoreMemoryUsage().
160  */
161 TidStore *
162 TidStoreCreateLocal(size_t max_bytes, bool insert_only)
163 {
164  TidStore *ts;
165  size_t initBlockSize = ALLOCSET_DEFAULT_INITSIZE;
166  size_t minContextSize = ALLOCSET_DEFAULT_MINSIZE;
167  size_t maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE;
168 
169  ts = palloc0(sizeof(TidStore));
171 
172  /* choose the maxBlockSize to be no larger than 1/16 of max_bytes */
173  while (16 * maxBlockSize > max_bytes)
174  maxBlockSize >>= 1;
175 
176  if (maxBlockSize < ALLOCSET_DEFAULT_INITSIZE)
177  maxBlockSize = ALLOCSET_DEFAULT_INITSIZE;
178 
179  /* Create a memory context for the TID storage */
180  if (insert_only)
181  {
183  "TID storage",
184  minContextSize,
185  initBlockSize,
186  maxBlockSize);
187  }
188  else
189  {
191  "TID storage",
192  minContextSize,
193  initBlockSize,
194  maxBlockSize);
195  }
196 
197  ts->tree.local = local_ts_create(ts->rt_context);
198 
199  return ts;
200 }
201 
202 /*
203  * Similar to TidStoreCreateLocal() but create a shared TidStore on a
204  * DSA area. The TID storage will live in the DSA area, and the memory
205  * context rt_context will have only meta data of the radix tree.
206  *
207  * The returned object is allocated in backend-local memory.
208  */
209 TidStore *
210 TidStoreCreateShared(size_t max_bytes, int tranche_id)
211 {
212  TidStore *ts;
213  dsa_area *area;
214  size_t dsa_init_size = DSA_DEFAULT_INIT_SEGMENT_SIZE;
215  size_t dsa_max_size = DSA_MAX_SEGMENT_SIZE;
216 
217  ts = palloc0(sizeof(TidStore));
219 
221  "TID storage meta data",
223 
224  /*
225  * Choose the initial and maximum DSA segment sizes to be no longer than
226  * 1/8 of max_bytes.
227  */
228  while (8 * dsa_max_size > max_bytes)
229  dsa_max_size >>= 1;
230 
231  if (dsa_max_size < DSA_MIN_SEGMENT_SIZE)
232  dsa_max_size = DSA_MIN_SEGMENT_SIZE;
233 
234  if (dsa_init_size > dsa_max_size)
235  dsa_init_size = dsa_max_size;
236 
237  area = dsa_create_ext(tranche_id, dsa_init_size, dsa_max_size);
238  ts->tree.shared = shared_ts_create(ts->rt_context, area,
239  tranche_id);
240  ts->area = area;
241 
242  return ts;
243 }
244 
245 /*
246  * Attach to the shared TidStore. 'area_handle' is the DSA handle where
247  * the TidStore is created. 'handle' is the dsa_pointer returned by
248  * TidStoreGetHandle(). The returned object is allocated in backend-local
249  * memory using the CurrentMemoryContext.
250  */
251 TidStore *
253 {
254  TidStore *ts;
255  dsa_area *area;
256 
257  Assert(area_handle != DSA_HANDLE_INVALID);
258  Assert(DsaPointerIsValid(handle));
259 
260  /* create per-backend state */
261  ts = palloc0(sizeof(TidStore));
262 
263  area = dsa_attach(area_handle);
264 
265  /* Find the shared the shared radix tree */
266  ts->tree.shared = shared_ts_attach(area, handle);
267  ts->area = area;
268 
269  return ts;
270 }
271 
272 /*
273  * Detach from a TidStore. This also detaches from radix tree and frees
274  * the backend-local resources.
275  */
276 void
278 {
280 
281  shared_ts_detach(ts->tree.shared);
282  dsa_detach(ts->area);
283 
284  pfree(ts);
285 }
286 
287 /*
288  * Lock support functions.
289  *
290  * We can use the radix tree's lock for shared TidStore as the data we
291  * need to protect is only the shared radix tree.
292  */
293 
294 void
296 {
297  if (TidStoreIsShared(ts))
298  shared_ts_lock_exclusive(ts->tree.shared);
299 }
300 
301 void
303 {
304  if (TidStoreIsShared(ts))
305  shared_ts_lock_share(ts->tree.shared);
306 }
307 
308 void
310 {
311  if (TidStoreIsShared(ts))
312  shared_ts_unlock(ts->tree.shared);
313 }
314 
315 /*
316  * Destroy a TidStore, returning all memory.
317  *
318  * Note that the caller must be certain that no other backend will attempt to
319  * access the TidStore before calling this function. Other backend must
320  * explicitly call TidStoreDetach() to free up backend-local memory associated
321  * with the TidStore. The backend that calls TidStoreDestroy() must not call
322  * TidStoreDetach().
323  */
324 void
326 {
327  /* Destroy underlying radix tree */
328  if (TidStoreIsShared(ts))
329  {
330  shared_ts_free(ts->tree.shared);
331 
332  dsa_detach(ts->area);
333  }
334  else
335  local_ts_free(ts->tree.local);
336 
338 
339  pfree(ts);
340 }
341 
342 /*
343  * Create or replace an entry for the given block and array of offsets.
344  *
345  * NB: This function is designed and optimized for vacuum's heap scanning
346  * phase, so has some limitations:
347  *
348  * - The offset numbers "offsets" must be sorted in ascending order.
349  * - If the block number already exists, the entry will be replaced --
350  * there is no way to add or remove offsets from an entry.
351  */
352 void
354  int num_offsets)
355 {
356  union
357  {
359  BlocktableEntry force_align_entry;
360  } data;
361  BlocktableEntry *page = (BlocktableEntry *) data.data;
363  int wordnum;
364  int next_word_threshold;
365  int idx = 0;
366 
367  Assert(num_offsets > 0);
368 
369  /* Check if the given offset numbers are ordered */
370  for (int i = 1; i < num_offsets; i++)
371  Assert(offsets[i] > offsets[i - 1]);
372 
373  memset(page, 0, offsetof(BlocktableEntry, words));
374 
375  if (num_offsets <= NUM_FULL_OFFSETS)
376  {
377  for (int i = 0; i < num_offsets; i++)
378  {
379  OffsetNumber off = offsets[i];
380 
381  /* safety check to ensure we don't overrun bit array bounds */
382  if (off == InvalidOffsetNumber || off > MAX_OFFSET_IN_BITMAP)
383  elog(ERROR, "tuple offset out of range: %u", off);
384 
385  page->header.full_offsets[i] = off;
386  }
387 
388  page->header.nwords = 0;
389  }
390  else
391  {
392  for (wordnum = 0, next_word_threshold = BITS_PER_BITMAPWORD;
393  wordnum <= WORDNUM(offsets[num_offsets - 1]);
394  wordnum++, next_word_threshold += BITS_PER_BITMAPWORD)
395  {
396  word = 0;
397 
398  while (idx < num_offsets)
399  {
400  OffsetNumber off = offsets[idx];
401 
402  /* safety check to ensure we don't overrun bit array bounds */
403  if (off == InvalidOffsetNumber || off > MAX_OFFSET_IN_BITMAP)
404  elog(ERROR, "tuple offset out of range: %u", off);
405 
406  if (off >= next_word_threshold)
407  break;
408 
409  word |= ((bitmapword) 1 << BITNUM(off));
410  idx++;
411  }
412 
413  /* write out offset bitmap for this wordnum */
414  page->words[wordnum] = word;
415  }
416 
417  page->header.nwords = wordnum;
418  Assert(page->header.nwords == WORDS_PER_PAGE(offsets[num_offsets - 1]));
419  }
420 
421  if (TidStoreIsShared(ts))
422  shared_ts_set(ts->tree.shared, blkno, page);
423  else
424  local_ts_set(ts->tree.local, blkno, page);
425 }
426 
427 /* Return true if the given TID is present in the TidStore */
428 bool
430 {
431  int wordnum;
432  int bitnum;
433  BlocktableEntry *page;
436 
437  if (TidStoreIsShared(ts))
438  page = shared_ts_find(ts->tree.shared, blk);
439  else
440  page = local_ts_find(ts->tree.local, blk);
441 
442  /* no entry for the blk */
443  if (page == NULL)
444  return false;
445 
446  if (page->header.nwords == 0)
447  {
448  /* we have offsets in the header */
449  for (int i = 0; i < NUM_FULL_OFFSETS; i++)
450  {
451  if (page->header.full_offsets[i] == off)
452  return true;
453  }
454  return false;
455  }
456  else
457  {
458  wordnum = WORDNUM(off);
459  bitnum = BITNUM(off);
460 
461  /* no bitmap for the off */
462  if (wordnum >= page->header.nwords)
463  return false;
464 
465  return (page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0;
466  }
467 }
468 
469 /*
470  * Prepare to iterate through a TidStore.
471  *
472  * The TidStoreIter struct is created in the caller's memory context, and it
473  * will be freed in TidStoreEndIterate.
474  *
475  * The caller is responsible for locking TidStore until the iteration is
476  * finished.
477  */
478 TidStoreIter *
480 {
481  TidStoreIter *iter;
482 
483  iter = palloc0(sizeof(TidStoreIter));
484  iter->ts = ts;
485 
486  if (TidStoreIsShared(ts))
487  iter->tree_iter.shared = shared_ts_begin_iterate(ts->tree.shared);
488  else
489  iter->tree_iter.local = local_ts_begin_iterate(ts->tree.local);
490 
491  return iter;
492 }
493 
494 
495 /*
496  * Return a result that contains the next block number and that can be used to
497  * obtain the set of offsets by calling TidStoreGetBlockOffsets(). The result
498  * is copyable.
499  */
502 {
503  uint64 key;
504  BlocktableEntry *page;
505 
506  if (TidStoreIsShared(iter->ts))
507  page = shared_ts_iterate_next(iter->tree_iter.shared, &key);
508  else
509  page = local_ts_iterate_next(iter->tree_iter.local, &key);
510 
511  if (page == NULL)
512  return NULL;
513 
514  iter->output.blkno = key;
515  iter->output.internal_page = page;
516 
517  return &(iter->output);
518 }
519 
520 /*
521  * Finish the iteration on TidStore.
522  *
523  * The caller is responsible for releasing any locks.
524  */
525 void
527 {
528  if (TidStoreIsShared(iter->ts))
529  shared_ts_end_iterate(iter->tree_iter.shared);
530  else
531  local_ts_end_iterate(iter->tree_iter.local);
532 
533  pfree(iter);
534 }
535 
536 /*
537  * Return the memory usage of TidStore.
538  */
539 size_t
541 {
542  if (TidStoreIsShared(ts))
543  return shared_ts_memory_usage(ts->tree.shared);
544  else
545  return local_ts_memory_usage(ts->tree.local);
546 }
547 
548 /*
549  * Return the DSA area where the TidStore lives.
550  */
551 dsa_area *
553 {
555 
556  return ts->area;
557 }
558 
561 {
563 
564  return (dsa_pointer) shared_ts_get_handle(ts->tree.shared);
565 }
566 
567 /*
568  * Given a TidStoreIterResult returned by TidStoreIterateNext(), extract the
569  * offset numbers. Returns the number of offsets filled in, if <=
570  * max_offsets. Otherwise, fills in as much as it can in the given space, and
571  * returns the size of the buffer that would be needed.
572  */
573 int
575  OffsetNumber *offsets,
576  int max_offsets)
577 {
578  BlocktableEntry *page = result->internal_page;
579  int num_offsets = 0;
580  int wordnum;
581 
582  if (page->header.nwords == 0)
583  {
584  /* we have offsets in the header */
585  for (int i = 0; i < NUM_FULL_OFFSETS; i++)
586  {
588  {
589  if (num_offsets < max_offsets)
590  offsets[num_offsets] = page->header.full_offsets[i];
591  num_offsets++;
592  }
593  }
594  }
595  else
596  {
597  for (wordnum = 0; wordnum < page->header.nwords; wordnum++)
598  {
599  bitmapword w = page->words[wordnum];
600  int off = wordnum * BITS_PER_BITMAPWORD;
601 
602  while (w != 0)
603  {
604  if (w & 1)
605  {
606  if (num_offsets < max_offsets)
607  offsets[num_offsets] = (OffsetNumber) off;
608  num_offsets++;
609  }
610  off++;
611  w >>= 1;
612  }
613  }
614  }
615 
616  return num_offsets;
617 }
Datum idx(PG_FUNCTION_ARGS)
Definition: _int_op.c:259
uint32 bitmapword
Definition: bitmapset.h:44
#define BITS_PER_BITMAPWORD
Definition: bitmapset.h:43
uint32 BlockNumber
Definition: block.h:31
MemoryContext BumpContextCreate(MemoryContext parent, const char *name, Size minContextSize, Size initBlockSize, Size maxBlockSize)
Definition: bump.c:131
signed char int8
Definition: c.h:492
#define Assert(condition)
Definition: c.h:858
#define FLEXIBLE_ARRAY_MEMBER
Definition: c.h:398
unsigned char uint8
Definition: c.h:504
dsa_area * dsa_attach(dsa_handle handle)
Definition: dsa.c:510
void dsa_detach(dsa_area *area)
Definition: dsa.c:1952
dsa_area * dsa_create_ext(int tranche_id, size_t init_segment_size, size_t max_segment_size)
Definition: dsa.c:421
uint64 dsa_pointer
Definition: dsa.h:62
#define DSA_MIN_SEGMENT_SIZE
Definition: dsa.h:100
dsm_handle dsa_handle
Definition: dsa.h:136
#define DSA_DEFAULT_INIT_SEGMENT_SIZE
Definition: dsa.h:97
#define DSA_HANDLE_INVALID
Definition: dsa.h:139
#define DsaPointerIsValid(x)
Definition: dsa.h:106
#define DSA_MAX_SEGMENT_SIZE
Definition: dsa.h:103
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:224
int i
Definition: isn.c:73
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc0(Size size)
Definition: mcxt.c:1347
MemoryContext CurrentMemoryContext
Definition: mcxt.c:143
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:454
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_DEFAULT_MAXSIZE
Definition: memutils.h:159
#define ALLOCSET_DEFAULT_MINSIZE
Definition: memutils.h:157
#define ALLOCSET_SMALL_SIZES
Definition: memutils.h:170
#define ALLOCSET_DEFAULT_INITSIZE
Definition: memutils.h:158
#define InvalidOffsetNumber
Definition: off.h:26
uint16 OffsetNumber
Definition: off.h:24
const void * data
static void word(struct vars *v, int dir, struct state *lp, struct state *rp)
Definition: regcomp.c:1474
bitmapword words[FLEXIBLE_ARRAY_MEMBER]
Definition: tidstore.c:76
struct BlocktableEntry::@12 header
OffsetNumber full_offsets[NUM_FULL_OFFSETS]
Definition: tidstore.c:63
BlockNumber blkno
Definition: tidstore.h:29
void * internal_page
Definition: tidstore.h:30
shared_ts_iter * shared
Definition: tidstore.c:142
local_ts_iter * local
Definition: tidstore.c:143
TidStore * ts
Definition: tidstore.c:137
TidStoreIterResult output
Definition: tidstore.c:147
union TidStoreIter::@14 tree_iter
dsa_area * area
Definition: tidstore.c:130
MemoryContext context
Definition: tidstore.c:117
MemoryContext rt_context
Definition: tidstore.c:120
shared_ts_radix_tree * shared
Definition: tidstore.c:126
local_ts_radix_tree * local
Definition: tidstore.c:125
union TidStore::@13 tree
Definition: dsa.c:348
TidStore * TidStoreAttach(dsa_handle area_handle, dsa_pointer handle)
Definition: tidstore.c:252
#define NUM_FULL_OFFSETS
Definition: tidstore.c:38
#define WORDNUM(x)
Definition: tidstore.c:31
void TidStoreEndIterate(TidStoreIter *iter)
Definition: tidstore.c:526
void TidStoreDetach(TidStore *ts)
Definition: tidstore.c:277
TidStoreIterResult * TidStoreIterateNext(TidStoreIter *iter)
Definition: tidstore.c:501
void TidStoreLockShare(TidStore *ts)
Definition: tidstore.c:302
TidStore * TidStoreCreateShared(size_t max_bytes, int tranche_id)
Definition: tidstore.c:210
void TidStoreDestroy(TidStore *ts)
Definition: tidstore.c:325
dsa_area * TidStoreGetDSA(TidStore *ts)
Definition: tidstore.c:552
#define BITNUM(x)
Definition: tidstore.c:32
void TidStoreUnlock(TidStore *ts)
Definition: tidstore.c:309
TidStore * TidStoreCreateLocal(size_t max_bytes, bool insert_only)
Definition: tidstore.c:162
#define TidStoreIsShared(ts)
Definition: tidstore.c:132
#define WORDS_PER_PAGE(n)
Definition: tidstore.c:35
bool TidStoreIsMember(TidStore *ts, ItemPointer tid)
Definition: tidstore.c:429
int TidStoreGetBlockOffsets(TidStoreIterResult *result, OffsetNumber *offsets, int max_offsets)
Definition: tidstore.c:574
TidStoreIter * TidStoreBeginIterate(TidStore *ts)
Definition: tidstore.c:479
struct BlocktableEntry BlocktableEntry
#define MAX_OFFSET_IN_BITMAP
Definition: tidstore.c:84
void TidStoreLockExclusive(TidStore *ts)
Definition: tidstore.c:295
dsa_pointer TidStoreGetHandle(TidStore *ts)
Definition: tidstore.c:560
#define MaxBlocktableEntrySize
Definition: tidstore.c:86
void TidStoreSetBlockOffsets(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets, int num_offsets)
Definition: tidstore.c:353
size_t TidStoreMemoryUsage(TidStore *ts)
Definition: tidstore.c:540