PostgreSQL Source Code git master
tidstore.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * tidstore.c
4 * TID (ItemPointerData) storage implementation.
5 *
6 * TidStore is a in-memory data structure to store TIDs (ItemPointerData).
7 * Internally it uses a radix tree as the storage for TIDs. The key is the
8 * BlockNumber and the value is a bitmap of offsets, BlocktableEntry.
9 *
10 * TidStore can be shared among parallel worker processes by using
11 * TidStoreCreateShared(). Other backends can attach to the shared TidStore
12 * by TidStoreAttach().
13 *
14 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
15 * Portions Copyright (c) 1994, Regents of the University of California
16 *
17 * IDENTIFICATION
18 * src/backend/access/common/tidstore.c
19 *
20 *-------------------------------------------------------------------------
21 */
22#include "postgres.h"
23
24#include "access/tidstore.h"
25#include "miscadmin.h"
26#include "nodes/bitmapset.h"
27#include "storage/lwlock.h"
28#include "utils/dsa.h"
29
30
31#define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
32#define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)
33
34/* number of active words for a page: */
35#define WORDS_PER_PAGE(n) ((n) / BITS_PER_BITMAPWORD + 1)
36
37/* number of offsets we can store in the header of a BlocktableEntry */
38#define NUM_FULL_OFFSETS ((sizeof(uintptr_t) - sizeof(uint8) - sizeof(int8)) / sizeof(OffsetNumber))
39
40/*
41 * This is named similarly to PagetableEntry in tidbitmap.c
42 * because the two have a similar function.
43 */
44typedef struct BlocktableEntry
45{
46 struct
47 {
48#ifndef WORDS_BIGENDIAN
49 /*
50 * We need to position this member to reserve space for the backing
51 * radix tree to tag the lowest bit when struct 'header' is stored
52 * inside a pointer or DSA pointer.
53 */
55
57#endif
58
59 /*
60 * We can store a small number of offsets here to avoid wasting space
61 * with a sparse bitmap.
62 */
64
65#ifdef WORDS_BIGENDIAN
68#endif
70
71 /*
72 * We don't expect any padding space here, but to be cautious, code
73 * creating new entries should zero out space up to 'words'.
74 */
75
78
79/*
80 * The type of 'nwords' limits the max number of words in the 'words' array.
81 * This computes the max offset we can actually store in the bitmap. In
82 * practice, it's almost always the same as MaxOffsetNumber.
83 */
84#define MAX_OFFSET_IN_BITMAP Min(BITS_PER_BITMAPWORD * PG_INT8_MAX - 1, MaxOffsetNumber)
85
86#define MaxBlocktableEntrySize \
87 offsetof(BlocktableEntry, words) + \
88 (sizeof(bitmapword) * WORDS_PER_PAGE(MAX_OFFSET_IN_BITMAP))
89
90#define RT_PREFIX local_ts
91#define RT_SCOPE static
92#define RT_DECLARE
93#define RT_DEFINE
94#define RT_VALUE_TYPE BlocktableEntry
95#define RT_VARLEN_VALUE_SIZE(page) \
96 (offsetof(BlocktableEntry, words) + \
97 sizeof(bitmapword) * (page)->header.nwords)
98#define RT_RUNTIME_EMBEDDABLE_VALUE
99#include "lib/radixtree.h"
100
101#define RT_PREFIX shared_ts
102#define RT_SHMEM
103#define RT_SCOPE static
104#define RT_DECLARE
105#define RT_DEFINE
106#define RT_VALUE_TYPE BlocktableEntry
107#define RT_VARLEN_VALUE_SIZE(page) \
108 (offsetof(BlocktableEntry, words) + \
109 sizeof(bitmapword) * (page)->header.nwords)
110#define RT_RUNTIME_EMBEDDABLE_VALUE
111#include "lib/radixtree.h"
112
113/* Per-backend state for a TidStore */
115{
116 /*
117 * MemoryContext for the radix tree when using local memory, NULL for
118 * shared memory
119 */
121
122 /* Storage for TIDs. Use either one depending on TidStoreIsShared() */
123 union
124 {
125 local_ts_radix_tree *local;
126 shared_ts_radix_tree *shared;
128
129 /* DSA area for TidStore if using shared memory */
131};
132#define TidStoreIsShared(ts) ((ts)->area != NULL)
133
134/* Iterator for TidStore */
136{
138
139 /* iterator of radix tree. Use either one depending on TidStoreIsShared() */
140 union
141 {
142 shared_ts_iter *shared;
143 local_ts_iter *local;
145
146 /* output for the caller */
148};
149
150/*
151 * Create a TidStore. The TidStore will live in the memory context that is
152 * CurrentMemoryContext at the time of this call. The TID storage, backed
153 * by a radix tree, will live in its child memory context, rt_context.
154 *
155 * "max_bytes" is not an internally-enforced limit; it is used only as a
156 * hint to cap the memory block size of the memory context for TID storage.
157 * This reduces space wastage due to over-allocation. If the caller wants to
158 * monitor memory usage, it must compare its limit with the value reported
159 * by TidStoreMemoryUsage().
160 */
161TidStore *
162TidStoreCreateLocal(size_t max_bytes, bool insert_only)
163{
164 TidStore *ts;
165 size_t initBlockSize = ALLOCSET_DEFAULT_INITSIZE;
166 size_t minContextSize = ALLOCSET_DEFAULT_MINSIZE;
167 size_t maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE;
168
169 ts = palloc0(sizeof(TidStore));
170
171 /* choose the maxBlockSize to be no larger than 1/16 of max_bytes */
172 while (16 * maxBlockSize > max_bytes)
173 maxBlockSize >>= 1;
174
175 if (maxBlockSize < ALLOCSET_DEFAULT_INITSIZE)
176 maxBlockSize = ALLOCSET_DEFAULT_INITSIZE;
177
178 /* Create a memory context for the TID storage */
179 if (insert_only)
180 {
182 "TID storage",
183 minContextSize,
184 initBlockSize,
185 maxBlockSize);
186 }
187 else
188 {
190 "TID storage",
191 minContextSize,
192 initBlockSize,
193 maxBlockSize);
194 }
195
196 ts->tree.local = local_ts_create(ts->rt_context);
197
198 return ts;
199}
200
201/*
202 * Similar to TidStoreCreateLocal() but create a shared TidStore on a
203 * DSA area.
204 *
205 * The returned object is allocated in backend-local memory.
206 */
207TidStore *
208TidStoreCreateShared(size_t max_bytes, int tranche_id)
209{
210 TidStore *ts;
211 dsa_area *area;
212 size_t dsa_init_size = DSA_DEFAULT_INIT_SEGMENT_SIZE;
213 size_t dsa_max_size = DSA_MAX_SEGMENT_SIZE;
214
215 ts = palloc0(sizeof(TidStore));
216
217 /*
218 * Choose the initial and maximum DSA segment sizes to be no longer than
219 * 1/8 of max_bytes.
220 */
221 while (8 * dsa_max_size > max_bytes)
222 dsa_max_size >>= 1;
223
224 if (dsa_max_size < DSA_MIN_SEGMENT_SIZE)
225 dsa_max_size = DSA_MIN_SEGMENT_SIZE;
226
227 if (dsa_init_size > dsa_max_size)
228 dsa_init_size = dsa_max_size;
229
230 area = dsa_create_ext(tranche_id, dsa_init_size, dsa_max_size);
231 ts->tree.shared = shared_ts_create(area, tranche_id);
232 ts->area = area;
233
234 return ts;
235}
236
237/*
238 * Attach to the shared TidStore. 'area_handle' is the DSA handle where
239 * the TidStore is created. 'handle' is the dsa_pointer returned by
240 * TidStoreGetHandle(). The returned object is allocated in backend-local
241 * memory using the CurrentMemoryContext.
242 */
243TidStore *
245{
246 TidStore *ts;
247 dsa_area *area;
248
249 Assert(area_handle != DSA_HANDLE_INVALID);
250 Assert(DsaPointerIsValid(handle));
251
252 /* create per-backend state */
253 ts = palloc0(sizeof(TidStore));
254
255 area = dsa_attach(area_handle);
256
257 /* Find the shared the shared radix tree */
258 ts->tree.shared = shared_ts_attach(area, handle);
259 ts->area = area;
260
261 return ts;
262}
263
264/*
265 * Detach from a TidStore. This also detaches from radix tree and frees
266 * the backend-local resources.
267 */
268void
270{
272
273 shared_ts_detach(ts->tree.shared);
274 dsa_detach(ts->area);
275
276 pfree(ts);
277}
278
279/*
280 * Lock support functions.
281 *
282 * We can use the radix tree's lock for shared TidStore as the data we
283 * need to protect is only the shared radix tree.
284 */
285
286void
288{
289 if (TidStoreIsShared(ts))
290 shared_ts_lock_exclusive(ts->tree.shared);
291}
292
293void
295{
296 if (TidStoreIsShared(ts))
297 shared_ts_lock_share(ts->tree.shared);
298}
299
300void
302{
303 if (TidStoreIsShared(ts))
304 shared_ts_unlock(ts->tree.shared);
305}
306
307/*
308 * Destroy a TidStore, returning all memory.
309 *
310 * Note that the caller must be certain that no other backend will attempt to
311 * access the TidStore before calling this function. Other backend must
312 * explicitly call TidStoreDetach() to free up backend-local memory associated
313 * with the TidStore. The backend that calls TidStoreDestroy() must not call
314 * TidStoreDetach().
315 */
316void
318{
319 /* Destroy underlying radix tree */
320 if (TidStoreIsShared(ts))
321 {
322 shared_ts_free(ts->tree.shared);
323 dsa_detach(ts->area);
324 }
325 else
326 {
327 local_ts_free(ts->tree.local);
329 }
330
331 pfree(ts);
332}
333
334/*
335 * Create or replace an entry for the given block and array of offsets.
336 *
337 * NB: This function is designed and optimized for vacuum's heap scanning
338 * phase, so has some limitations:
339 *
340 * - The offset numbers "offsets" must be sorted in ascending order.
341 * - If the block number already exists, the entry will be replaced --
342 * there is no way to add or remove offsets from an entry.
343 */
344void
346 int num_offsets)
347{
348 union
349 {
351 BlocktableEntry force_align_entry;
352 } data;
353 BlocktableEntry *page = (BlocktableEntry *) data.data;
355 int wordnum;
356 int next_word_threshold;
357 int idx = 0;
358
359 Assert(num_offsets > 0);
360
361 /* Check if the given offset numbers are ordered */
362 for (int i = 1; i < num_offsets; i++)
363 Assert(offsets[i] > offsets[i - 1]);
364
365 memset(page, 0, offsetof(BlocktableEntry, words));
366
367 if (num_offsets <= NUM_FULL_OFFSETS)
368 {
369 for (int i = 0; i < num_offsets; i++)
370 {
371 OffsetNumber off = offsets[i];
372
373 /* safety check to ensure we don't overrun bit array bounds */
374 if (off == InvalidOffsetNumber || off > MAX_OFFSET_IN_BITMAP)
375 elog(ERROR, "tuple offset out of range: %u", off);
376
377 page->header.full_offsets[i] = off;
378 }
379
380 page->header.nwords = 0;
381 }
382 else
383 {
384 for (wordnum = 0, next_word_threshold = BITS_PER_BITMAPWORD;
385 wordnum <= WORDNUM(offsets[num_offsets - 1]);
386 wordnum++, next_word_threshold += BITS_PER_BITMAPWORD)
387 {
388 word = 0;
389
390 while (idx < num_offsets)
391 {
392 OffsetNumber off = offsets[idx];
393
394 /* safety check to ensure we don't overrun bit array bounds */
395 if (off == InvalidOffsetNumber || off > MAX_OFFSET_IN_BITMAP)
396 elog(ERROR, "tuple offset out of range: %u", off);
397
398 if (off >= next_word_threshold)
399 break;
400
401 word |= ((bitmapword) 1 << BITNUM(off));
402 idx++;
403 }
404
405 /* write out offset bitmap for this wordnum */
406 page->words[wordnum] = word;
407 }
408
409 page->header.nwords = wordnum;
410 Assert(page->header.nwords == WORDS_PER_PAGE(offsets[num_offsets - 1]));
411 }
412
413 if (TidStoreIsShared(ts))
414 shared_ts_set(ts->tree.shared, blkno, page);
415 else
416 local_ts_set(ts->tree.local, blkno, page);
417}
418
419/* Return true if the given TID is present in the TidStore */
420bool
422{
423 int wordnum;
424 int bitnum;
425 BlocktableEntry *page;
428
429 if (TidStoreIsShared(ts))
430 page = shared_ts_find(ts->tree.shared, blk);
431 else
432 page = local_ts_find(ts->tree.local, blk);
433
434 /* no entry for the blk */
435 if (page == NULL)
436 return false;
437
438 if (page->header.nwords == 0)
439 {
440 /* we have offsets in the header */
441 for (int i = 0; i < NUM_FULL_OFFSETS; i++)
442 {
443 if (page->header.full_offsets[i] == off)
444 return true;
445 }
446 return false;
447 }
448 else
449 {
450 wordnum = WORDNUM(off);
451 bitnum = BITNUM(off);
452
453 /* no bitmap for the off */
454 if (wordnum >= page->header.nwords)
455 return false;
456
457 return (page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0;
458 }
459}
460
461/*
462 * Prepare to iterate through a TidStore.
463 *
464 * The TidStoreIter struct is created in the caller's memory context, and it
465 * will be freed in TidStoreEndIterate.
466 *
467 * The caller is responsible for locking TidStore until the iteration is
468 * finished.
469 */
472{
473 TidStoreIter *iter;
474
475 iter = palloc0(sizeof(TidStoreIter));
476 iter->ts = ts;
477
478 if (TidStoreIsShared(ts))
479 iter->tree_iter.shared = shared_ts_begin_iterate(ts->tree.shared);
480 else
481 iter->tree_iter.local = local_ts_begin_iterate(ts->tree.local);
482
483 return iter;
484}
485
486
487/*
488 * Return a result that contains the next block number and that can be used to
489 * obtain the set of offsets by calling TidStoreGetBlockOffsets(). The result
490 * is copyable.
491 */
494{
495 uint64 key;
496 BlocktableEntry *page;
497
498 if (TidStoreIsShared(iter->ts))
499 page = shared_ts_iterate_next(iter->tree_iter.shared, &key);
500 else
501 page = local_ts_iterate_next(iter->tree_iter.local, &key);
502
503 if (page == NULL)
504 return NULL;
505
506 iter->output.blkno = key;
507 iter->output.internal_page = page;
508
509 return &(iter->output);
510}
511
512/*
513 * Finish the iteration on TidStore.
514 *
515 * The caller is responsible for releasing any locks.
516 */
517void
519{
520 if (TidStoreIsShared(iter->ts))
521 shared_ts_end_iterate(iter->tree_iter.shared);
522 else
523 local_ts_end_iterate(iter->tree_iter.local);
524
525 pfree(iter);
526}
527
528/*
529 * Return the memory usage of TidStore.
530 */
531size_t
533{
534 if (TidStoreIsShared(ts))
535 return shared_ts_memory_usage(ts->tree.shared);
536 else
537 return local_ts_memory_usage(ts->tree.local);
538}
539
540/*
541 * Return the DSA area where the TidStore lives.
542 */
543dsa_area *
545{
547
548 return ts->area;
549}
550
553{
555
556 return (dsa_pointer) shared_ts_get_handle(ts->tree.shared);
557}
558
559/*
560 * Given a TidStoreIterResult returned by TidStoreIterateNext(), extract the
561 * offset numbers. Returns the number of offsets filled in, if <=
562 * max_offsets. Otherwise, fills in as much as it can in the given space, and
563 * returns the size of the buffer that would be needed.
564 */
565int
567 OffsetNumber *offsets,
568 int max_offsets)
569{
570 BlocktableEntry *page = result->internal_page;
571 int num_offsets = 0;
572 int wordnum;
573
574 if (page->header.nwords == 0)
575 {
576 /* we have offsets in the header */
577 for (int i = 0; i < NUM_FULL_OFFSETS; i++)
578 {
580 {
581 if (num_offsets < max_offsets)
582 offsets[num_offsets] = page->header.full_offsets[i];
583 num_offsets++;
584 }
585 }
586 }
587 else
588 {
589 for (wordnum = 0; wordnum < page->header.nwords; wordnum++)
590 {
591 bitmapword w = page->words[wordnum];
592 int off = wordnum * BITS_PER_BITMAPWORD;
593
594 while (w != 0)
595 {
596 if (w & 1)
597 {
598 if (num_offsets < max_offsets)
599 offsets[num_offsets] = (OffsetNumber) off;
600 num_offsets++;
601 }
602 off++;
603 w >>= 1;
604 }
605 }
606 }
607
608 return num_offsets;
609}
Datum idx(PG_FUNCTION_ARGS)
Definition: _int_op.c:259
uint32 bitmapword
Definition: bitmapset.h:44
#define BITS_PER_BITMAPWORD
Definition: bitmapset.h:43
uint32 BlockNumber
Definition: block.h:31
MemoryContext BumpContextCreate(MemoryContext parent, const char *name, Size minContextSize, Size initBlockSize, Size maxBlockSize)
Definition: bump.c:131
uint8_t uint8
Definition: c.h:486
#define Assert(condition)
Definition: c.h:815
#define FLEXIBLE_ARRAY_MEMBER
Definition: c.h:420
int8_t int8
Definition: c.h:482
uint64_t uint64
Definition: c.h:489
dsa_area * dsa_attach(dsa_handle handle)
Definition: dsa.c:510
dsa_area * dsa_create_ext(int tranche_id, size_t init_segment_size, size_t max_segment_size)
Definition: dsa.c:421
void dsa_detach(dsa_area *area)
Definition: dsa.c:1952
uint64 dsa_pointer
Definition: dsa.h:62
#define DSA_MIN_SEGMENT_SIZE
Definition: dsa.h:100
dsm_handle dsa_handle
Definition: dsa.h:136
#define DSA_DEFAULT_INIT_SEGMENT_SIZE
Definition: dsa.h:97
#define DSA_HANDLE_INVALID
Definition: dsa.h:139
#define DsaPointerIsValid(x)
Definition: dsa.h:106
#define DSA_MAX_SEGMENT_SIZE
Definition: dsa.h:103
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
int i
Definition: isn.c:72
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc0(Size size)
Definition: mcxt.c:1347
MemoryContext CurrentMemoryContext
Definition: mcxt.c:143
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:454
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_DEFAULT_MAXSIZE
Definition: memutils.h:159
#define ALLOCSET_DEFAULT_MINSIZE
Definition: memutils.h:157
#define ALLOCSET_DEFAULT_INITSIZE
Definition: memutils.h:158
#define InvalidOffsetNumber
Definition: off.h:26
uint16 OffsetNumber
Definition: off.h:24
const void * data
static void word(struct vars *v, int dir, struct state *lp, struct state *rp)
Definition: regcomp.c:1476
bitmapword words[FLEXIBLE_ARRAY_MEMBER]
Definition: tidstore.c:76
struct BlocktableEntry::@12 header
OffsetNumber full_offsets[NUM_FULL_OFFSETS]
Definition: tidstore.c:63
BlockNumber blkno
Definition: tidstore.h:29
void * internal_page
Definition: tidstore.h:30
shared_ts_iter * shared
Definition: tidstore.c:142
local_ts_iter * local
Definition: tidstore.c:143
TidStore * ts
Definition: tidstore.c:137
TidStoreIterResult output
Definition: tidstore.c:147
union TidStoreIter::@14 tree_iter
dsa_area * area
Definition: tidstore.c:130
MemoryContext rt_context
Definition: tidstore.c:120
shared_ts_radix_tree * shared
Definition: tidstore.c:126
local_ts_radix_tree * local
Definition: tidstore.c:125
union TidStore::@13 tree
Definition: dsa.c:348
dsa_area * TidStoreGetDSA(TidStore *ts)
Definition: tidstore.c:544
#define NUM_FULL_OFFSETS
Definition: tidstore.c:38
#define WORDNUM(x)
Definition: tidstore.c:31
TidStoreIter * TidStoreBeginIterate(TidStore *ts)
Definition: tidstore.c:471
void TidStoreEndIterate(TidStoreIter *iter)
Definition: tidstore.c:518
void TidStoreDetach(TidStore *ts)
Definition: tidstore.c:269
TidStoreIterResult * TidStoreIterateNext(TidStoreIter *iter)
Definition: tidstore.c:493
void TidStoreLockShare(TidStore *ts)
Definition: tidstore.c:294
TidStore * TidStoreCreateLocal(size_t max_bytes, bool insert_only)
Definition: tidstore.c:162
void TidStoreDestroy(TidStore *ts)
Definition: tidstore.c:317
#define BITNUM(x)
Definition: tidstore.c:32
void TidStoreUnlock(TidStore *ts)
Definition: tidstore.c:301
#define TidStoreIsShared(ts)
Definition: tidstore.c:132
#define WORDS_PER_PAGE(n)
Definition: tidstore.c:35
TidStore * TidStoreAttach(dsa_handle area_handle, dsa_pointer handle)
Definition: tidstore.c:244
bool TidStoreIsMember(TidStore *ts, ItemPointer tid)
Definition: tidstore.c:421
int TidStoreGetBlockOffsets(TidStoreIterResult *result, OffsetNumber *offsets, int max_offsets)
Definition: tidstore.c:566
struct BlocktableEntry BlocktableEntry
#define MAX_OFFSET_IN_BITMAP
Definition: tidstore.c:84
void TidStoreLockExclusive(TidStore *ts)
Definition: tidstore.c:287
dsa_pointer TidStoreGetHandle(TidStore *ts)
Definition: tidstore.c:552
#define MaxBlocktableEntrySize
Definition: tidstore.c:86
void TidStoreSetBlockOffsets(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets, int num_offsets)
Definition: tidstore.c:345
size_t TidStoreMemoryUsage(TidStore *ts)
Definition: tidstore.c:532
TidStore * TidStoreCreateShared(size_t max_bytes, int tranche_id)
Definition: tidstore.c:208