PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
tidbitmap.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * tidbitmap.c
4 * PostgreSQL tuple-id (TID) bitmap package
5 *
6 * This module provides bitmap data structures that are spiritually
7 * similar to Bitmapsets, but are specially adapted to store sets of
8 * tuple identifiers (TIDs), or ItemPointers. In particular, the division
9 * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
10 * Also, since we wish to be able to store very large tuple sets in
11 * memory with this data structure, we support "lossy" storage, in which
12 * we no longer remember individual tuple offsets on a page but only the
13 * fact that a particular page needs to be visited.
14 *
15 * The "lossy" storage uses one bit per disk page, so at the standard 8K
16 * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
17 * of memory. People pushing around tables of that size should have a
18 * couple of Mb to spare, so we don't worry about providing a second level
19 * of lossiness. In theory we could fall back to page ranges at some
20 * point, but for now that seems useless complexity.
21 *
22 * We also support the notion of candidate matches, or rechecking. This
23 * means we know that a search need visit only some tuples on a page,
24 * but we are not certain that all of those tuples are real matches.
25 * So the eventual heap scan must recheck the quals for these tuples only,
26 * rather than rechecking the quals for all tuples on the page as in the
27 * lossy-bitmap case. Rechecking can be specified when TIDs are inserted
28 * into a bitmap, and it can also happen internally when we AND a lossy
29 * and a non-lossy page.
30 *
31 *
32 * Copyright (c) 2003-2025, PostgreSQL Global Development Group
33 *
34 * IDENTIFICATION
35 * src/backend/nodes/tidbitmap.c
36 *
37 *-------------------------------------------------------------------------
38 */
39#include "postgres.h"
40
41#include <limits.h>
42
43#include "common/hashfn.h"
44#include "common/int.h"
45#include "nodes/bitmapset.h"
46#include "nodes/tidbitmap.h"
47#include "storage/lwlock.h"
48#include "utils/dsa.h"
49
50/*
51 * When we have to switch over to lossy storage, we use a data structure
52 * with one bit per page, where all pages having the same number DIV
53 * PAGES_PER_CHUNK are aggregated into one chunk. When a chunk is present
54 * and has the bit set for a given page, there must not be a per-page entry
55 * for that page in the page table.
56 *
57 * We actually store both exact pages and lossy chunks in the same hash
58 * table, using identical data structures. (This is because the memory
59 * management for hashtables doesn't easily/efficiently allow space to be
60 * transferred easily from one hashtable to another.) Therefore it's best
61 * if PAGES_PER_CHUNK is the same as TBM_MAX_TUPLES_PER_PAGE, or at least not
62 * too different. But we also want PAGES_PER_CHUNK to be a power of 2 to
63 * avoid expensive integer remainder operations. So, define it like this:
64 */
#define PAGES_PER_CHUNK (BLCKSZ / 32)

/* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */

#define WORDNUM(x)	((x) / BITS_PER_BITMAPWORD)
#define BITNUM(x)	((x) % BITS_PER_BITMAPWORD)

/* number of active words for an exact page: */
#define WORDS_PER_PAGE	((TBM_MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
/* number of active words for a lossy chunk: */
#define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
76
77/*
78 * The hashtable entries are represented by this data structure. For
79 * an exact page, blockno is the page number and bit k of the bitmap
80 * represents tuple offset k+1. For a lossy chunk, blockno is the first
81 * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
82 * bit k represents page blockno+k. Note that it is not possible to
83 * have exact storage for the first page of a chunk if we are using
84 * lossy storage for any page in the chunk's range, since the same
85 * hashtable entry has to serve both purposes.
86 *
87 * recheck is used only on exact pages --- it indicates that although
88 * only the stated tuples need be checked, the full index qual condition
89 * must be checked for each (ie, these are candidate matches).
90 */
91typedef struct PagetableEntry
92{
93 BlockNumber blockno; /* page number (hashtable key) */
94 char status; /* hash entry status */
95 bool ischunk; /* T = lossy storage, F = exact */
96 bool recheck; /* should the tuples be rechecked? */
99
100/*
101 * Holds array of pagetable entries.
102 */
103typedef struct PTEntryArray
104{
105 pg_atomic_uint32 refcount; /* no. of iterator attached */
108
109/*
110 * We want to avoid the overhead of creating the hashtable, which is
111 * comparatively large, when not necessary. Particularly when we are using a
112 * bitmap scan on the inside of a nestloop join: a bitmap may well live only
113 * long enough to accumulate one entry in such cases. We therefore avoid
114 * creating an actual hashtable until we need two pagetable entries. When
115 * just one pagetable entry is needed, we store it in a fixed field of
116 * TIDBitMap. (NOTE: we don't get rid of the hashtable if the bitmap later
117 * shrinks down to zero or one page again. So, status can be TBM_HASH even
118 * when nentries is zero or one.)
119 */
typedef enum
{
	TBM_EMPTY,					/* no hashtable, nentries == 0 */
	TBM_ONE_PAGE,				/* entry1 contains the single entry */
	TBM_HASH,					/* pagetable is valid, entry1 is not */
} TBMStatus;
126
127/*
128 * Current iterating state of the TBM.
129 */
typedef enum
{
	TBM_NOT_ITERATING,			/* not yet converted to page and chunk array */
	TBM_ITERATING_PRIVATE,		/* converted to local page and chunk array */
	TBM_ITERATING_SHARED,		/* converted to shared page and chunk array */
} TBMIteratingState;
137/*
138 * Here is the representation for a whole TIDBitMap:
139 */
141{
142 NodeTag type; /* to make it a valid Node */
143 MemoryContext mcxt; /* memory context containing me */
144 TBMStatus status; /* see codes above */
145 struct pagetable_hash *pagetable; /* hash table of PagetableEntry's */
146 int nentries; /* number of entries in pagetable */
147 int maxentries; /* limit on same to meet maxbytes */
148 int npages; /* number of exact entries in pagetable */
149 int nchunks; /* number of lossy entries in pagetable */
150 TBMIteratingState iterating; /* tbm_begin_iterate called? */
151 uint32 lossify_start; /* offset to start lossifying hashtable at */
152 PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */
153 /* these are valid when iterating is true: */
154 PagetableEntry **spages; /* sorted exact-page list, or NULL */
155 PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */
156 dsa_pointer dsapagetable; /* dsa_pointer to the element array */
157 dsa_pointer dsapagetableold; /* dsa_pointer to the old element array */
158 dsa_pointer ptpages; /* dsa_pointer to the page array */
159 dsa_pointer ptchunks; /* dsa_pointer to the chunk array */
160 dsa_area *dsa; /* reference to per-query dsa area */
161};
162
163/*
164 * When iterating over a backend-local bitmap in sorted order, a
165 * TBMPrivateIterator is used to track our progress. There can be several
166 * iterators scanning the same bitmap concurrently. Note that the bitmap
167 * becomes read-only as soon as any iterator is created.
168 */
170{
171 TIDBitmap *tbm; /* TIDBitmap we're iterating over */
172 int spageptr; /* next spages index */
173 int schunkptr; /* next schunks index */
174 int schunkbit; /* next bit to check in current schunk */
175};
176
177/*
178 * Holds the shared members of the iterator so that multiple processes
179 * can jointly iterate.
180 */
182{
183 int nentries; /* number of entries in pagetable */
184 int maxentries; /* limit on same to meet maxbytes */
185 int npages; /* number of exact entries in pagetable */
186 int nchunks; /* number of lossy entries in pagetable */
187 dsa_pointer pagetable; /* dsa pointers to head of pagetable data */
188 dsa_pointer spages; /* dsa pointer to page array */
189 dsa_pointer schunks; /* dsa pointer to chunk array */
190 LWLock lock; /* lock to protect below members */
191 int spageptr; /* next spages index */
192 int schunkptr; /* next schunks index */
193 int schunkbit; /* next bit to check in current schunk */
195
196/*
197 * pagetable iteration array.
198 */
199typedef struct PTIterationArray
200{
201 pg_atomic_uint32 refcount; /* no. of iterator attached */
202 int index[FLEXIBLE_ARRAY_MEMBER]; /* index array */
204
205/*
206 * same as TBMPrivateIterator, but it is used for joint iteration, therefore
207 * this also holds a reference to the shared state.
208 */
210{
211 TBMSharedIteratorState *state; /* shared state */
212 PTEntryArray *ptbase; /* pagetable element array */
213 PTIterationArray *ptpages; /* sorted exact page index list */
214 PTIterationArray *ptchunks; /* sorted lossy page index list */
215};
216
217/* Local function prototypes */
218static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
219static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
220 const TIDBitmap *b);
221static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
222 BlockNumber pageno);
224static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
225static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
226static void tbm_lossify(TIDBitmap *tbm);
227static int tbm_comparator(const void *left, const void *right);
228static int tbm_shared_comparator(const void *left, const void *right,
229 void *arg);
230
231/* define hashtable mapping block numbers to PagetableEntry's */
232#define SH_USE_NONDEFAULT_ALLOCATOR
233#define SH_PREFIX pagetable
234#define SH_ELEMENT_TYPE PagetableEntry
235#define SH_KEY_TYPE BlockNumber
236#define SH_KEY blockno
237#define SH_HASH_KEY(tb, key) murmurhash32(key)
238#define SH_EQUAL(tb, a, b) a == b
239#define SH_SCOPE static inline
240#define SH_DEFINE
241#define SH_DECLARE
242#include "lib/simplehash.h"
243
244
245/*
246 * tbm_create - create an initially-empty bitmap
247 *
248 * The bitmap will live in the memory context that is CurrentMemoryContext
249 * at the time of this call. It will be limited to (approximately) maxbytes
250 * total memory consumption. If the DSA passed to this function is not NULL
251 * then the memory for storing elements of the underlying page table will
252 * be allocated from the DSA.
253 */
254TIDBitmap *
255tbm_create(Size maxbytes, dsa_area *dsa)
256{
257 TIDBitmap *tbm;
258
259 /* Create the TIDBitmap struct and zero all its fields */
260 tbm = makeNode(TIDBitmap);
261
263 tbm->status = TBM_EMPTY;
264
265 tbm->maxentries = tbm_calculate_entries(maxbytes);
266 tbm->lossify_start = 0;
267 tbm->dsa = dsa;
272
273 return tbm;
274}
275
276/*
277 * Actually create the hashtable. Since this is a moderately expensive
278 * proposition, we don't do it until we have to.
279 */
280static void
282{
283 Assert(tbm->status != TBM_HASH);
284 Assert(tbm->pagetable == NULL);
285
286 tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);
287
288 /* If entry1 is valid, push it into the hashtable */
289 if (tbm->status == TBM_ONE_PAGE)
290 {
291 PagetableEntry *page;
292 bool found;
293 char oldstatus;
294
295 page = pagetable_insert(tbm->pagetable,
296 tbm->entry1.blockno,
297 &found);
298 Assert(!found);
299 oldstatus = page->status;
300 memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
301 page->status = oldstatus;
302 }
303
304 tbm->status = TBM_HASH;
305}
306
307/*
308 * tbm_free - free a TIDBitmap
309 */
310void
312{
313 if (tbm->pagetable)
314 pagetable_destroy(tbm->pagetable);
315 if (tbm->spages)
316 pfree(tbm->spages);
317 if (tbm->schunks)
318 pfree(tbm->schunks);
319 pfree(tbm);
320}
321
322/*
323 * tbm_free_shared_area - free shared state
324 *
325 * Free shared iterator state, Also free shared pagetable and iterator arrays
326 * memory if they are not referred by any of the shared iterator i.e recount
327 * is becomes 0.
328 */
329void
331{
332 TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
333 PTEntryArray *ptbase;
334 PTIterationArray *ptpages;
335 PTIterationArray *ptchunks;
336
337 if (DsaPointerIsValid(istate->pagetable))
338 {
339 ptbase = dsa_get_address(dsa, istate->pagetable);
340 if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
341 dsa_free(dsa, istate->pagetable);
342 }
343 if (DsaPointerIsValid(istate->spages))
344 {
345 ptpages = dsa_get_address(dsa, istate->spages);
346 if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
347 dsa_free(dsa, istate->spages);
348 }
349 if (DsaPointerIsValid(istate->schunks))
350 {
351 ptchunks = dsa_get_address(dsa, istate->schunks);
352 if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
353 dsa_free(dsa, istate->schunks);
354 }
355
356 dsa_free(dsa, dp);
357}
358
359/*
360 * tbm_add_tuples - add some tuple IDs to a TIDBitmap
361 *
362 * If recheck is true, then the recheck flag will be set in the
363 * TBMIterateResult when any of these tuples are reported out.
364 */
365void
366tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
367 bool recheck)
368{
370 PagetableEntry *page = NULL; /* only valid when currblk is valid */
371 int i;
372
374 for (i = 0; i < ntids; i++)
375 {
378 int wordnum,
379 bitnum;
380
381 /* safety check to ensure we don't overrun bit array bounds */
382 if (off < 1 || off > TBM_MAX_TUPLES_PER_PAGE)
383 elog(ERROR, "tuple offset out of range: %u", off);
384
385 /*
386 * Look up target page unless we already did. This saves cycles when
387 * the input includes consecutive tuples on the same page, which is
388 * common enough to justify an extra test here.
389 */
390 if (blk != currblk)
391 {
392 if (tbm_page_is_lossy(tbm, blk))
393 page = NULL; /* remember page is lossy */
394 else
395 page = tbm_get_pageentry(tbm, blk);
396 currblk = blk;
397 }
398
399 if (page == NULL)
400 continue; /* whole page is already marked */
401
402 if (page->ischunk)
403 {
404 /* The page is a lossy chunk header, set bit for itself */
405 wordnum = bitnum = 0;
406 }
407 else
408 {
409 /* Page is exact, so set bit for individual tuple */
410 wordnum = WORDNUM(off - 1);
411 bitnum = BITNUM(off - 1);
412 }
413 page->words[wordnum] |= ((bitmapword) 1 << bitnum);
414 page->recheck |= recheck;
415
416 if (tbm->nentries > tbm->maxentries)
417 {
418 tbm_lossify(tbm);
419 /* Page could have been converted to lossy, so force new lookup */
420 currblk = InvalidBlockNumber;
421 }
422 }
423}
424
425/*
426 * tbm_add_page - add a whole page to a TIDBitmap
427 *
428 * This causes the whole page to be reported (with the recheck flag)
429 * when the TIDBitmap is scanned.
430 */
431void
433{
434 /* Enter the page in the bitmap, or mark it lossy if already present */
435 tbm_mark_page_lossy(tbm, pageno);
436 /* If we went over the memory limit, lossify some more pages */
437 if (tbm->nentries > tbm->maxentries)
438 tbm_lossify(tbm);
439}
440
441/*
442 * tbm_union - set union
443 *
444 * a is modified in-place, b is not changed
445 */
446void
448{
449 Assert(!a->iterating);
450 /* Nothing to do if b is empty */
451 if (b->nentries == 0)
452 return;
453 /* Scan through chunks and pages in b, merge into a */
454 if (b->status == TBM_ONE_PAGE)
455 tbm_union_page(a, &b->entry1);
456 else
457 {
458 pagetable_iterator i;
459 PagetableEntry *bpage;
460
461 Assert(b->status == TBM_HASH);
462 pagetable_start_iterate(b->pagetable, &i);
463 while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
464 tbm_union_page(a, bpage);
465 }
466}
467
468/* Process one page of b during a union op */
469static void
471{
472 PagetableEntry *apage;
473 int wordnum;
474
475 if (bpage->ischunk)
476 {
477 /* Scan b's chunk, mark each indicated page lossy in a */
478 for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
479 {
480 bitmapword w = bpage->words[wordnum];
481
482 if (w != 0)
483 {
484 BlockNumber pg;
485
486 pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
487 while (w != 0)
488 {
489 if (w & 1)
491 pg++;
492 w >>= 1;
493 }
494 }
495 }
496 }
497 else if (tbm_page_is_lossy(a, bpage->blockno))
498 {
499 /* page is already lossy in a, nothing to do */
500 return;
501 }
502 else
503 {
504 apage = tbm_get_pageentry(a, bpage->blockno);
505 if (apage->ischunk)
506 {
507 /* The page is a lossy chunk header, set bit for itself */
508 apage->words[0] |= ((bitmapword) 1 << 0);
509 }
510 else
511 {
512 /* Both pages are exact, merge at the bit level */
513 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
514 apage->words[wordnum] |= bpage->words[wordnum];
515 apage->recheck |= bpage->recheck;
516 }
517 }
518
519 if (a->nentries > a->maxentries)
520 tbm_lossify(a);
521}
522
523/*
524 * tbm_intersect - set intersection
525 *
526 * a is modified in-place, b is not changed
527 */
528void
530{
531 Assert(!a->iterating);
532 /* Nothing to do if a is empty */
533 if (a->nentries == 0)
534 return;
535 /* Scan through chunks and pages in a, try to match to b */
536 if (a->status == TBM_ONE_PAGE)
537 {
538 if (tbm_intersect_page(a, &a->entry1, b))
539 {
540 /* Page is now empty, remove it from a */
541 Assert(!a->entry1.ischunk);
542 a->npages--;
543 a->nentries--;
544 Assert(a->nentries == 0);
545 a->status = TBM_EMPTY;
546 }
547 }
548 else
549 {
550 pagetable_iterator i;
551 PagetableEntry *apage;
552
553 Assert(a->status == TBM_HASH);
554 pagetable_start_iterate(a->pagetable, &i);
555 while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
556 {
557 if (tbm_intersect_page(a, apage, b))
558 {
559 /* Page or chunk is now empty, remove it from a */
560 if (apage->ischunk)
561 a->nchunks--;
562 else
563 a->npages--;
564 a->nentries--;
565 if (!pagetable_delete(a->pagetable, apage->blockno))
566 elog(ERROR, "hash table corrupted");
567 }
568 }
569 }
570}
571
572/*
573 * Process one page of a during an intersection op
574 *
575 * Returns true if apage is now empty and should be deleted from a
576 */
577static bool
579{
580 const PagetableEntry *bpage;
581 int wordnum;
582
583 if (apage->ischunk)
584 {
585 /* Scan each bit in chunk, try to clear */
586 bool candelete = true;
587
588 for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
589 {
590 bitmapword w = apage->words[wordnum];
591
592 if (w != 0)
593 {
594 bitmapword neww = w;
595 BlockNumber pg;
596 int bitnum;
597
598 pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
599 bitnum = 0;
600 while (w != 0)
601 {
602 if (w & 1)
603 {
604 if (!tbm_page_is_lossy(b, pg) &&
605 tbm_find_pageentry(b, pg) == NULL)
606 {
607 /* Page is not in b at all, lose lossy bit */
608 neww &= ~((bitmapword) 1 << bitnum);
609 }
610 }
611 pg++;
612 bitnum++;
613 w >>= 1;
614 }
615 apage->words[wordnum] = neww;
616 if (neww != 0)
617 candelete = false;
618 }
619 }
620 return candelete;
621 }
622 else if (tbm_page_is_lossy(b, apage->blockno))
623 {
624 /*
625 * Some of the tuples in 'a' might not satisfy the quals for 'b', but
626 * because the page 'b' is lossy, we don't know which ones. Therefore
627 * we mark 'a' as requiring rechecks, to indicate that at most those
628 * tuples set in 'a' are matches.
629 */
630 apage->recheck = true;
631 return false;
632 }
633 else
634 {
635 bool candelete = true;
636
637 bpage = tbm_find_pageentry(b, apage->blockno);
638 if (bpage != NULL)
639 {
640 /* Both pages are exact, merge at the bit level */
641 Assert(!bpage->ischunk);
642 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
643 {
644 apage->words[wordnum] &= bpage->words[wordnum];
645 if (apage->words[wordnum] != 0)
646 candelete = false;
647 }
648 apage->recheck |= bpage->recheck;
649 }
650 /* If there is no matching b page, we can just delete the a page */
651 return candelete;
652 }
653}
654
655/*
656 * tbm_is_empty - is a TIDBitmap completely empty?
657 */
658bool
660{
661 return (tbm->nentries == 0);
662}
663
664/*
665 * tbm_begin_private_iterate - prepare to iterate through a TIDBitmap
666 *
667 * The TBMPrivateIterator struct is created in the caller's memory context.
668 * For a clean shutdown of the iteration, call tbm_end_private_iterate; but
669 * it's okay to just allow the memory context to be released, too. It is
670 * caller's responsibility not to touch the TBMPrivateIterator anymore once
671 * the TIDBitmap is freed.
672 *
673 * NB: after this is called, it is no longer allowed to modify the contents
674 * of the bitmap. However, you can call this multiple times to scan the
675 * contents repeatedly, including parallel scans.
676 */
679{
680 TBMPrivateIterator *iterator;
681
683
684 /*
685 * Create the TBMPrivateIterator struct, with enough trailing space to
686 * serve the needs of the TBMIterateResult sub-struct.
687 */
688 iterator = (TBMPrivateIterator *) palloc(sizeof(TBMPrivateIterator));
689 iterator->tbm = tbm;
690
691 /*
692 * Initialize iteration pointers.
693 */
694 iterator->spageptr = 0;
695 iterator->schunkptr = 0;
696 iterator->schunkbit = 0;
697
698 /*
699 * If we have a hashtable, create and fill the sorted page lists, unless
700 * we already did that for a previous iterator. Note that the lists are
701 * attached to the bitmap not the iterator, so they can be used by more
702 * than one iterator.
703 */
704 if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
705 {
706 pagetable_iterator i;
707 PagetableEntry *page;
708 int npages;
709 int nchunks;
710
711 if (!tbm->spages && tbm->npages > 0)
712 tbm->spages = (PagetableEntry **)
714 tbm->npages * sizeof(PagetableEntry *));
715 if (!tbm->schunks && tbm->nchunks > 0)
716 tbm->schunks = (PagetableEntry **)
718 tbm->nchunks * sizeof(PagetableEntry *));
719
720 npages = nchunks = 0;
721 pagetable_start_iterate(tbm->pagetable, &i);
722 while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
723 {
724 if (page->ischunk)
725 tbm->schunks[nchunks++] = page;
726 else
727 tbm->spages[npages++] = page;
728 }
729 Assert(npages == tbm->npages);
730 Assert(nchunks == tbm->nchunks);
731 if (npages > 1)
732 qsort(tbm->spages, npages, sizeof(PagetableEntry *),
734 if (nchunks > 1)
735 qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
737 }
738
740
741 return iterator;
742}
743
744/*
745 * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
746 *
747 * The necessary shared state will be allocated from the DSA passed to
748 * tbm_create, so that multiple processes can attach to it and iterate jointly.
749 *
750 * This will convert the pagetable hash into page and chunk array of the index
751 * into pagetable array.
752 */
755{
756 dsa_pointer dp;
758 PTEntryArray *ptbase = NULL;
759 PTIterationArray *ptpages = NULL;
760 PTIterationArray *ptchunks = NULL;
761
762 Assert(tbm->dsa != NULL);
764
765 /*
766 * Allocate TBMSharedIteratorState from DSA to hold the shared members and
767 * lock, this will also be used by multiple worker for shared iterate.
768 */
769 dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
770 istate = dsa_get_address(tbm->dsa, dp);
771
772 /*
773 * If we're not already iterating, create and fill the sorted page lists.
774 * (If we are, the sorted page lists are already stored in the TIDBitmap,
775 * and we can just reuse them.)
776 */
777 if (tbm->iterating == TBM_NOT_ITERATING)
778 {
779 pagetable_iterator i;
780 PagetableEntry *page;
781 int idx;
782 int npages;
783 int nchunks;
784
785 /*
786 * Allocate the page and chunk array memory from the DSA to share
787 * across multiple processes.
788 */
789 if (tbm->npages)
790 {
791 tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
792 tbm->npages * sizeof(int));
793 ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
794 pg_atomic_init_u32(&ptpages->refcount, 0);
795 }
796 if (tbm->nchunks)
797 {
798 tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
799 tbm->nchunks * sizeof(int));
800 ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
801 pg_atomic_init_u32(&ptchunks->refcount, 0);
802 }
803
804 /*
805 * If TBM status is TBM_HASH then iterate over the pagetable and
806 * convert it to page and chunk arrays. But if it's in the
807 * TBM_ONE_PAGE mode then directly allocate the space for one entry
808 * from the DSA.
809 */
810 npages = nchunks = 0;
811 if (tbm->status == TBM_HASH)
812 {
813 ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
814
815 pagetable_start_iterate(tbm->pagetable, &i);
816 while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
817 {
818 idx = page - ptbase->ptentry;
819 if (page->ischunk)
820 ptchunks->index[nchunks++] = idx;
821 else
822 ptpages->index[npages++] = idx;
823 }
824
825 Assert(npages == tbm->npages);
826 Assert(nchunks == tbm->nchunks);
827 }
828 else if (tbm->status == TBM_ONE_PAGE)
829 {
830 /*
831 * In one page mode allocate the space for one pagetable entry,
832 * initialize it, and directly store its index (i.e. 0) in the
833 * page array.
834 */
835 tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
836 sizeof(PagetableEntry));
837 ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
838 memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry));
839 ptpages->index[0] = 0;
840 }
841
842 if (ptbase != NULL)
843 pg_atomic_init_u32(&ptbase->refcount, 0);
844 if (npages > 1)
845 qsort_arg(ptpages->index, npages, sizeof(int),
847 if (nchunks > 1)
848 qsort_arg(ptchunks->index, nchunks, sizeof(int),
850 }
851
852 /*
853 * Store the TBM members in the shared state so that we can share them
854 * across multiple processes.
855 */
856 istate->nentries = tbm->nentries;
857 istate->maxentries = tbm->maxentries;
858 istate->npages = tbm->npages;
859 istate->nchunks = tbm->nchunks;
860 istate->pagetable = tbm->dsapagetable;
861 istate->spages = tbm->ptpages;
862 istate->schunks = tbm->ptchunks;
863
864 ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
865 ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
866 ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
867
868 /*
869 * For every shared iterator referring to pagetable and iterator array,
870 * increase the refcount by 1 so that while freeing the shared iterator we
871 * don't free pagetable and iterator array until its refcount becomes 0.
872 */
873 if (ptbase != NULL)
875 if (ptpages != NULL)
876 pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
877 if (ptchunks != NULL)
878 pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);
879
880 /* Initialize the iterator lock */
882
883 /* Initialize the shared iterator state */
884 istate->schunkbit = 0;
885 istate->schunkptr = 0;
886 istate->spageptr = 0;
887
889
890 return dp;
891}
892
893/*
894 * tbm_extract_page_tuple - extract the tuple offsets from a page
895 *
896 * Returns the number of offsets it filled in if <= max_offsets. Otherwise,
897 * fills in as many offsets as fit and returns the total number of offsets in
898 * the page.
899 */
900int
902 OffsetNumber *offsets,
903 uint32 max_offsets)
904{
905 PagetableEntry *page = iteritem->internal_page;
906 int wordnum;
907 int ntuples = 0;
908
909 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
910 {
911 bitmapword w = page->words[wordnum];
912
913 if (w != 0)
914 {
915 int off = wordnum * BITS_PER_BITMAPWORD + 1;
916
917 while (w != 0)
918 {
919 if (w & 1)
920 {
921 if (ntuples < max_offsets)
922 offsets[ntuples] = (OffsetNumber) off;
923 ntuples++;
924 }
925 off++;
926 w >>= 1;
927 }
928 }
929 }
930
931 return ntuples;
932}
933
934/*
935 * tbm_advance_schunkbit - Advance the schunkbit
936 */
937static inline void
938tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
939{
940 int schunkbit = *schunkbitp;
941
942 while (schunkbit < PAGES_PER_CHUNK)
943 {
944 int wordnum = WORDNUM(schunkbit);
945 int bitnum = BITNUM(schunkbit);
946
947 if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
948 break;
949 schunkbit++;
950 }
951
952 *schunkbitp = schunkbit;
953}
954
955/*
956 * tbm_private_iterate - scan through next page of a TIDBitmap
957 *
958 * Caller must pass in a TBMIterateResult to be filled.
959 *
960 * Pages are guaranteed to be delivered in numerical order.
961 *
962 * Returns false when there are no more pages to scan and true otherwise. When
963 * there are no more pages to scan, tbmres->blockno is set to
964 * InvalidBlockNumber.
965 *
966 * If lossy is true, then the bitmap is "lossy" and failed to remember
967 * the exact tuples to look at on this page --- the caller must examine all
968 * tuples on the page and check if they meet the intended condition. If lossy
969 * is false, the caller must later extract the tuple offsets from the page
970 * pointed to by internal_page with tbm_extract_page_tuple.
971 *
972 * If tbmres->recheck is true, only the indicated tuples need be examined, but
973 * the condition must be rechecked anyway. (For ease of testing, recheck is
974 * always set true when lossy is true.)
975 */
976bool
978{
979 TIDBitmap *tbm = iterator->tbm;
980
982
983 /*
984 * If lossy chunk pages remain, make sure we've advanced schunkptr/
985 * schunkbit to the next set bit.
986 */
987 while (iterator->schunkptr < tbm->nchunks)
988 {
989 PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
990 int schunkbit = iterator->schunkbit;
991
992 tbm_advance_schunkbit(chunk, &schunkbit);
993 if (schunkbit < PAGES_PER_CHUNK)
994 {
995 iterator->schunkbit = schunkbit;
996 break;
997 }
998 /* advance to next chunk */
999 iterator->schunkptr++;
1000 iterator->schunkbit = 0;
1001 }
1002
1003 /*
1004 * If both chunk and per-page data remain, must output the numerically
1005 * earlier page.
1006 */
1007 if (iterator->schunkptr < tbm->nchunks)
1008 {
1009 PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
1010 BlockNumber chunk_blockno;
1011
1012 chunk_blockno = chunk->blockno + iterator->schunkbit;
1013 if (iterator->spageptr >= tbm->npages ||
1014 chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
1015 {
1016 /* Return a lossy page indicator from the chunk */
1017 tbmres->blockno = chunk_blockno;
1018 tbmres->lossy = true;
1019 tbmres->recheck = true;
1020 tbmres->internal_page = NULL;
1021 iterator->schunkbit++;
1022 return true;
1023 }
1024 }
1025
1026 if (iterator->spageptr < tbm->npages)
1027 {
1028 PagetableEntry *page;
1029
1030 /* In TBM_ONE_PAGE state, we don't allocate an spages[] array */
1031 if (tbm->status == TBM_ONE_PAGE)
1032 page = &tbm->entry1;
1033 else
1034 page = tbm->spages[iterator->spageptr];
1035
1036 tbmres->internal_page = page;
1037 tbmres->blockno = page->blockno;
1038 tbmres->lossy = false;
1039 tbmres->recheck = page->recheck;
1040 iterator->spageptr++;
1041 return true;
1042 }
1043
1044 /* Nothing more in the bitmap */
1045 tbmres->blockno = InvalidBlockNumber;
1046 return false;
1047}
1048
1049/*
1050 * tbm_shared_iterate - scan through next page of a TIDBitmap
1051 *
1052 * As above, but this will iterate using an iterator which is shared
1053 * across multiple processes. We need to acquire the iterator LWLock,
1054 * before accessing the shared members.
1055 */
1056bool
1058{
1059 TBMSharedIteratorState *istate = iterator->state;
1060 PagetableEntry *ptbase = NULL;
1061 int *idxpages = NULL;
1062 int *idxchunks = NULL;
1063
1064 if (iterator->ptbase != NULL)
1065 ptbase = iterator->ptbase->ptentry;
1066 if (iterator->ptpages != NULL)
1067 idxpages = iterator->ptpages->index;
1068 if (iterator->ptchunks != NULL)
1069 idxchunks = iterator->ptchunks->index;
1070
1071 /* Acquire the LWLock before accessing the shared members */
1072 LWLockAcquire(&istate->lock, LW_EXCLUSIVE);
1073
1074 /*
1075 * If lossy chunk pages remain, make sure we've advanced schunkptr/
1076 * schunkbit to the next set bit.
1077 */
1078 while (istate->schunkptr < istate->nchunks)
1079 {
1080 PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
1081 int schunkbit = istate->schunkbit;
1082
1083 tbm_advance_schunkbit(chunk, &schunkbit);
1084 if (schunkbit < PAGES_PER_CHUNK)
1085 {
1086 istate->schunkbit = schunkbit;
1087 break;
1088 }
1089 /* advance to next chunk */
1090 istate->schunkptr++;
1091 istate->schunkbit = 0;
1092 }
1093
1094 /*
1095 * If both chunk and per-page data remain, must output the numerically
1096 * earlier page.
1097 */
1098 if (istate->schunkptr < istate->nchunks)
1099 {
1100 PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
1101 BlockNumber chunk_blockno;
1102
1103 chunk_blockno = chunk->blockno + istate->schunkbit;
1104
1105 if (istate->spageptr >= istate->npages ||
1106 chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
1107 {
1108 /* Return a lossy page indicator from the chunk */
1109 tbmres->blockno = chunk_blockno;
1110 tbmres->lossy = true;
1111 tbmres->recheck = true;
1112 tbmres->internal_page = NULL;
1113 istate->schunkbit++;
1114
1115 LWLockRelease(&istate->lock);
1116 return true;
1117 }
1118 }
1119
1120 if (istate->spageptr < istate->npages)
1121 {
1122 PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
1123
1124 tbmres->internal_page = page;
1125 tbmres->blockno = page->blockno;
1126 tbmres->lossy = false;
1127 tbmres->recheck = page->recheck;
1128 istate->spageptr++;
1129
1130 LWLockRelease(&istate->lock);
1131
1132 return true;
1133 }
1134
1135 LWLockRelease(&istate->lock);
1136
1137 /* Nothing more in the bitmap */
1138 tbmres->blockno = InvalidBlockNumber;
1139 return false;
1140}
1141
1142/*
1143 * tbm_end_private_iterate - finish an iteration over a TIDBitmap
1144 *
1145 * Currently this is just a pfree, but it might do more someday. (For
1146 * instance, it could be useful to count open iterators and allow the
1147 * bitmap to return to read/write status when there are no more iterators.)
1148 */
1149void
1151{
1152 pfree(iterator);
1153}
1154
1155/*
1156 * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
1157 *
1158 * This doesn't free any of the shared state associated with the iterator,
1159 * just our backend-private state.
1160 */
1161void
1163{
1164 pfree(iterator);
1165}
1166
1167/*
1168 * tbm_find_pageentry - find a PagetableEntry for the pageno
1169 *
1170 * Returns NULL if there is no non-lossy entry for the pageno.
1171 */
1172static const PagetableEntry *
1174{
1175 const PagetableEntry *page;
1176
1177 if (tbm->nentries == 0) /* in case pagetable doesn't exist */
1178 return NULL;
1179
1180 if (tbm->status == TBM_ONE_PAGE)
1181 {
1182 page = &tbm->entry1;
1183 if (page->blockno != pageno)
1184 return NULL;
1185 Assert(!page->ischunk);
1186 return page;
1187 }
1188
1189 page = pagetable_lookup(tbm->pagetable, pageno);
1190 if (page == NULL)
1191 return NULL;
1192 if (page->ischunk)
1193 return NULL; /* don't want a lossy chunk header */
1194 return page;
1195}
1196
1197/*
1198 * tbm_get_pageentry - find or create a PagetableEntry for the pageno
1199 *
1200 * If new, the entry is marked as an exact (non-chunk) entry.
1201 *
1202 * This may cause the table to exceed the desired memory size. It is
1203 * up to the caller to call tbm_lossify() at the next safe point if so.
1204 */
1205static PagetableEntry *
1207{
1208 PagetableEntry *page;
1209 bool found;
1210
1211 if (tbm->status == TBM_EMPTY)
1212 {
1213 /* Use the fixed slot */
1214 page = &tbm->entry1;
1215 found = false;
1216 tbm->status = TBM_ONE_PAGE;
1217 }
1218 else
1219 {
1220 if (tbm->status == TBM_ONE_PAGE)
1221 {
1222 page = &tbm->entry1;
1223 if (page->blockno == pageno)
1224 return page;
1225 /* Time to switch from one page to a hashtable */
1227 }
1228
1229 /* Look up or create an entry */
1230 page = pagetable_insert(tbm->pagetable, pageno, &found);
1231 }
1232
1233 /* Initialize it if not present before */
1234 if (!found)
1235 {
1236 char oldstatus = page->status;
1237
1238 MemSet(page, 0, sizeof(PagetableEntry));
1239 page->status = oldstatus;
1240 page->blockno = pageno;
1241 /* must count it too */
1242 tbm->nentries++;
1243 tbm->npages++;
1244 }
1245
1246 return page;
1247}
1248
1249/*
1250 * tbm_page_is_lossy - is the page marked as lossily stored?
1251 */
1252static bool
1254{
1255 PagetableEntry *page;
1256 BlockNumber chunk_pageno;
1257 int bitno;
1258
1259 /* we can skip the lookup if there are no lossy chunks */
1260 if (tbm->nchunks == 0)
1261 return false;
1262 Assert(tbm->status == TBM_HASH);
1263
1264 bitno = pageno % PAGES_PER_CHUNK;
1265 chunk_pageno = pageno - bitno;
1266
1267 page = pagetable_lookup(tbm->pagetable, chunk_pageno);
1268
1269 if (page != NULL && page->ischunk)
1270 {
1271 int wordnum = WORDNUM(bitno);
1272 int bitnum = BITNUM(bitno);
1273
1274 if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
1275 return true;
1276 }
1277 return false;
1278}
1279
1280/*
1281 * tbm_mark_page_lossy - mark the page number as lossily stored
1282 *
1283 * This may cause the table to exceed the desired memory size. It is
1284 * up to the caller to call tbm_lossify() at the next safe point if so.
1285 */
1286static void
1288{
1289 PagetableEntry *page;
1290 bool found;
1291 BlockNumber chunk_pageno;
1292 int bitno;
1293 int wordnum;
1294 int bitnum;
1295
1296 /* We force the bitmap into hashtable mode whenever it's lossy */
1297 if (tbm->status != TBM_HASH)
1299
1300 bitno = pageno % PAGES_PER_CHUNK;
1301 chunk_pageno = pageno - bitno;
1302
1303 /*
1304 * Remove any extant non-lossy entry for the page. If the page is its own
1305 * chunk header, however, we skip this and handle the case below.
1306 */
1307 if (bitno != 0)
1308 {
1309 if (pagetable_delete(tbm->pagetable, pageno))
1310 {
1311 /* It was present, so adjust counts */
1312 tbm->nentries--;
1313 tbm->npages--; /* assume it must have been non-lossy */
1314 }
1315 }
1316
1317 /* Look up or create entry for chunk-header page */
1318 page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);
1319
1320 /* Initialize it if not present before */
1321 if (!found)
1322 {
1323 char oldstatus = page->status;
1324
1325 MemSet(page, 0, sizeof(PagetableEntry));
1326 page->status = oldstatus;
1327 page->blockno = chunk_pageno;
1328 page->ischunk = true;
1329 /* must count it too */
1330 tbm->nentries++;
1331 tbm->nchunks++;
1332 }
1333 else if (!page->ischunk)
1334 {
1335 char oldstatus = page->status;
1336
1337 /* chunk header page was formerly non-lossy, make it lossy */
1338 MemSet(page, 0, sizeof(PagetableEntry));
1339 page->status = oldstatus;
1340 page->blockno = chunk_pageno;
1341 page->ischunk = true;
1342 /* we assume it had some tuple bit(s) set, so mark it lossy */
1343 page->words[0] = ((bitmapword) 1 << 0);
1344 /* adjust counts */
1345 tbm->nchunks++;
1346 tbm->npages--;
1347 }
1348
1349 /* Now set the original target page's bit */
1350 wordnum = WORDNUM(bitno);
1351 bitnum = BITNUM(bitno);
1352 page->words[wordnum] |= ((bitmapword) 1 << bitnum);
1353}
1354
1355/*
1356 * tbm_lossify - lose some information to get back under the memory limit
1357 */
1358static void
1360{
1361 pagetable_iterator i;
1362 PagetableEntry *page;
1363
1364 /*
1365 * XXX Really stupid implementation: this just lossifies pages in
1366 * essentially random order. We should be paying some attention to the
1367 * number of bits set in each page, instead.
1368 *
1369 * Since we are called as soon as nentries exceeds maxentries, we should
1370 * push nentries down to significantly less than maxentries, or else we'll
1371 * just end up doing this again very soon. We shoot for maxentries/2.
1372 */
1374 Assert(tbm->status == TBM_HASH);
1375
1376 pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
1377 while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
1378 {
1379 if (page->ischunk)
1380 continue; /* already a chunk header */
1381
1382 /*
1383 * If the page would become a chunk header, we won't save anything by
1384 * converting it to lossy, so skip it.
1385 */
1386 if ((page->blockno % PAGES_PER_CHUNK) == 0)
1387 continue;
1388
1389 /* This does the dirty work ... */
1390 tbm_mark_page_lossy(tbm, page->blockno);
1391
1392 if (tbm->nentries <= tbm->maxentries / 2)
1393 {
1394 /*
1395 * We have made enough room. Remember where to start lossifying
1396 * next round, so we evenly iterate over the hashtable.
1397 */
1398 tbm->lossify_start = i.cur;
1399 break;
1400 }
1401
1402 /*
1403 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
1404 * hashtable and may have deleted the non-lossy chunk. We can
1405 * continue the same hash table scan, since failure to visit one
1406 * element or visiting the newly inserted element, isn't fatal.
1407 */
1408 }
1409
1410 /*
1411 * With a big bitmap and small work_mem, it's possible that we cannot get
1412 * under maxentries. Again, if that happens, we'd end up uselessly
1413 * calling tbm_lossify over and over. To prevent this from becoming a
1414 * performance sink, force maxentries up to at least double the current
1415 * number of entries. (In essence, we're admitting inability to fit
1416 * within work_mem when we do this.) Note that this test will not fire if
1417 * we broke out of the loop early; and if we didn't, the current number of
1418 * entries is simply not reducible any further.
1419 */
1420 if (tbm->nentries > tbm->maxentries / 2)
1421 tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
1422}
1423
1424/*
1425 * qsort comparator to handle PagetableEntry pointers.
1426 */
1427static int
1428tbm_comparator(const void *left, const void *right)
1429{
1430 BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
1431 BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
1432
1433 return pg_cmp_u32(l, r);
1434}
1435
1436/*
1437 * As above, but this will get index into PagetableEntry array. Therefore,
1438 * it needs to get actual PagetableEntry using the index before comparing the
1439 * blockno.
1440 */
1441static int
1442tbm_shared_comparator(const void *left, const void *right, void *arg)
1443{
1444 PagetableEntry *base = (PagetableEntry *) arg;
1445 PagetableEntry *lpage = &base[*(int *) left];
1446 PagetableEntry *rpage = &base[*(int *) right];
1447
1448 if (lpage->blockno < rpage->blockno)
1449 return -1;
1450 else if (lpage->blockno > rpage->blockno)
1451 return 1;
1452 return 0;
1453}
1454
1455/*
1456 * tbm_attach_shared_iterate
1457 *
1458 * Allocate a backend-private iterator and attach the shared iterator state
1459 * to it so that multiple processed can iterate jointly.
1460 *
1461 * We also converts the DSA pointers to local pointers and store them into
1462 * our private iterator.
1463 */
1466{
1467 TBMSharedIterator *iterator;
1468 TBMSharedIteratorState *istate;
1469
1470 /*
1471 * Create the TBMSharedIterator struct, with enough trailing space to
1472 * serve the needs of the TBMIterateResult sub-struct.
1473 */
1474 iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator));
1475
1476 istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);
1477
1478 iterator->state = istate;
1479
1480 iterator->ptbase = dsa_get_address(dsa, istate->pagetable);
1481
1482 if (istate->npages)
1483 iterator->ptpages = dsa_get_address(dsa, istate->spages);
1484 if (istate->nchunks)
1485 iterator->ptchunks = dsa_get_address(dsa, istate->schunks);
1486
1487 return iterator;
1488}
1489
1490/*
1491 * pagetable_allocate
1492 *
1493 * Callback function for allocating the memory for hashtable elements.
1494 * Allocate memory for hashtable elements, using DSA if available.
1495 */
1496static inline void *
1497pagetable_allocate(pagetable_hash *pagetable, Size size)
1498{
1499 TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1500 PTEntryArray *ptbase;
1501
1502 if (tbm->dsa == NULL)
1503 return MemoryContextAllocExtended(pagetable->ctx, size,
1505
1506 /*
1507 * Save the dsapagetable reference in dsapagetableold before allocating
1508 * new memory so that pagetable_free can free the old entry.
1509 */
1510 tbm->dsapagetableold = tbm->dsapagetable;
1512 sizeof(PTEntryArray) + size,
1514 ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
1515
1516 return ptbase->ptentry;
1517}
1518
1519/*
1520 * pagetable_free
1521 *
1522 * Callback function for freeing hash table elements.
1523 */
1524static inline void
1525pagetable_free(pagetable_hash *pagetable, void *pointer)
1526{
1527 TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1528
1529 /* pfree the input pointer if DSA is not available */
1530 if (tbm->dsa == NULL)
1531 pfree(pointer);
1532 else if (DsaPointerIsValid(tbm->dsapagetableold))
1533 {
1534 dsa_free(tbm->dsa, tbm->dsapagetableold);
1536 }
1537}
1538
1539/*
1540 * tbm_calculate_entries
1541 *
1542 * Estimate number of hashtable entries we can have within maxbytes.
1543 */
1544int
1546{
1547 Size nbuckets;
1548
1549 /*
1550 * Estimate number of hashtable entries we can have within maxbytes. This
1551 * estimates the hash cost as sizeof(PagetableEntry), which is good enough
1552 * for our purpose. Also count an extra Pointer per entry for the arrays
1553 * created during iteration readout.
1554 */
1555 nbuckets = maxbytes /
1556 (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
1557 nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */
1558 nbuckets = Max(nbuckets, 16); /* sanity limit */
1559
1560 return (int) nbuckets;
1561}
1562
1563/*
1564 * Create a shared or private bitmap iterator and start iteration.
1565 *
1566 * `tbm` is only used to create the private iterator and dsa and dsp are only
1567 * used to create the shared iterator.
1568 *
1569 * Before invoking tbm_begin_iterate() to create a shared iterator, one
1570 * process must already have invoked tbm_prepare_shared_iterate() to create
1571 * and set up the TBMSharedIteratorState.
1572 */
1575{
1576 TBMIterator iterator = {0};
1577
1578 /* Allocate a private iterator and attach the shared state to it */
1579 if (DsaPointerIsValid(dsp))
1580 {
1581 iterator.shared = true;
1582 iterator.i.shared_iterator = tbm_attach_shared_iterate(dsa, dsp);
1583 }
1584 else
1585 {
1586 iterator.shared = false;
1588 }
1589
1590 return iterator;
1591}
1592
1593/*
1594 * Clean up shared or private bitmap iterator.
1595 */
1596void
1598{
1599 Assert(iterator && !tbm_exhausted(iterator));
1600
1601 if (iterator->shared)
1603 else
1605
1606 *iterator = (TBMIterator)
1607 {
1608 0
1609 };
1610}
1611
1612/*
1613 * Populate the next TBMIterateResult using the shared or private bitmap
1614 * iterator. Returns false when there is nothing more to scan.
1615 */
1616bool
1618{
1619 Assert(iterator);
1620 Assert(tbmres);
1621
1622 if (iterator->shared)
1623 return tbm_shared_iterate(iterator->i.shared_iterator, tbmres);
1624 else
1625 return tbm_private_iterate(iterator->i.private_iterator, tbmres);
1626}
Datum idx(PG_FUNCTION_ARGS)
Definition: _int_op.c:262
static uint32 pg_atomic_sub_fetch_u32(volatile pg_atomic_uint32 *ptr, int32 sub_)
Definition: atomics.h:439
static void pg_atomic_init_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:221
static uint32 pg_atomic_add_fetch_u32(volatile pg_atomic_uint32 *ptr, int32 add_)
Definition: atomics.h:424
uint32 bitmapword
Definition: bitmapset.h:44
#define BITS_PER_BITMAPWORD
Definition: bitmapset.h:43
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
#define Min(x, y)
Definition: c.h:975
#define Max(x, y)
Definition: c.h:969
char * Pointer
Definition: c.h:493
#define FLEXIBLE_ARRAY_MEMBER
Definition: c.h:434
uint32_t uint32
Definition: c.h:502
#define MemSet(start, val, len)
Definition: c.h:991
size_t Size
Definition: c.h:576
void * dsa_get_address(dsa_area *area, dsa_pointer dp)
Definition: dsa.c:942
dsa_pointer dsa_allocate_extended(dsa_area *area, size_t size, int flags)
Definition: dsa.c:671
void dsa_free(dsa_area *area, dsa_pointer dp)
Definition: dsa.c:826
#define dsa_allocate0(area, size)
Definition: dsa.h:113
uint64 dsa_pointer
Definition: dsa.h:62
#define dsa_allocate(area, size)
Definition: dsa.h:109
#define InvalidDsaPointer
Definition: dsa.h:78
#define DsaPointerIsValid(x)
Definition: dsa.h:106
#define DSA_ALLOC_HUGE
Definition: dsa.h:73
#define DSA_ALLOC_ZERO
Definition: dsa.h:75
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define MCXT_ALLOC_ZERO
Definition: fe_memutils.h:30
#define MCXT_ALLOC_HUGE
Definition: fe_memutils.h:28
Assert(PointerIsAligned(start, uint64))
static int pg_cmp_u32(uint32 a, uint32 b)
Definition: int.h:652
int b
Definition: isn.c:74
int a
Definition: isn.c:73
int i
Definition: isn.c:77
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1182
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1902
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition: lwlock.c:721
@ LWTRANCHE_SHARED_TIDBITMAP
Definition: lwlock.h:203
@ LW_EXCLUSIVE
Definition: lwlock.h:114
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1256
void pfree(void *pointer)
Definition: mcxt.c:2147
void * palloc0(Size size)
Definition: mcxt.c:1970
void * palloc(Size size)
Definition: mcxt.c:1940
MemoryContext CurrentMemoryContext
Definition: mcxt.c:159
void * MemoryContextAllocExtended(MemoryContext context, Size size, int flags)
Definition: mcxt.c:1313
NodeTag
Definition: nodes.h:27
#define makeNode(_type_)
Definition: nodes.h:161
uint16 OffsetNumber
Definition: off.h:24
void * arg
void qsort_arg(void *base, size_t nel, size_t elsize, qsort_arg_comparator cmp, void *arg)
#define qsort(a, b, c, d)
Definition: port.h:479
Definition: lwlock.h:42
pg_atomic_uint32 refcount
Definition: tidbitmap.c:105
PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER]
Definition: tidbitmap.c:106
pg_atomic_uint32 refcount
Definition: tidbitmap.c:201
int index[FLEXIBLE_ARRAY_MEMBER]
Definition: tidbitmap.c:202
bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)]
Definition: tidbitmap.c:97
BlockNumber blockno
Definition: tidbitmap.c:93
void * internal_page
Definition: tidbitmap.h:79
BlockNumber blockno
Definition: tidbitmap.h:64
union TBMIterator::@109 i
TBMSharedIterator * shared_iterator
Definition: tidbitmap.h:57
TBMPrivateIterator * private_iterator
Definition: tidbitmap.h:56
bool shared
Definition: tidbitmap.h:53
TIDBitmap * tbm
Definition: tidbitmap.c:171
dsa_pointer pagetable
Definition: tidbitmap.c:187
dsa_pointer spages
Definition: tidbitmap.c:188
dsa_pointer schunks
Definition: tidbitmap.c:189
TBMSharedIteratorState * state
Definition: tidbitmap.c:211
PTEntryArray * ptbase
Definition: tidbitmap.c:212
PTIterationArray * ptchunks
Definition: tidbitmap.c:214
PTIterationArray * ptpages
Definition: tidbitmap.c:213
dsa_pointer ptpages
Definition: tidbitmap.c:158
TBMIteratingState iterating
Definition: tidbitmap.c:150
int nentries
Definition: tidbitmap.c:146
dsa_pointer dsapagetableold
Definition: tidbitmap.c:157
struct pagetable_hash * pagetable
Definition: tidbitmap.c:145
PagetableEntry ** schunks
Definition: tidbitmap.c:155
MemoryContext mcxt
Definition: tidbitmap.c:143
int npages
Definition: tidbitmap.c:148
int maxentries
Definition: tidbitmap.c:147
uint32 lossify_start
Definition: tidbitmap.c:151
int nchunks
Definition: tidbitmap.c:149
NodeTag type
Definition: tidbitmap.c:142
dsa_pointer ptchunks
Definition: tidbitmap.c:159
TBMStatus status
Definition: tidbitmap.c:144
PagetableEntry entry1
Definition: tidbitmap.c:152
dsa_area * dsa
Definition: tidbitmap.c:160
dsa_pointer dsapagetable
Definition: tidbitmap.c:156
PagetableEntry ** spages
Definition: tidbitmap.c:154
Definition: dsa.c:348
Definition: type.h:96
void tbm_free(TIDBitmap *tbm)
Definition: tidbitmap.c:311
struct PTIterationArray PTIterationArray
bool tbm_iterate(TBMIterator *iterator, TBMIterateResult *tbmres)
Definition: tidbitmap.c:1617
bool tbm_shared_iterate(TBMSharedIterator *iterator, TBMIterateResult *tbmres)
Definition: tidbitmap.c:1057
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, bool recheck)
Definition: tidbitmap.c:366
#define WORDNUM(x)
Definition: tidbitmap.c:69
static void tbm_lossify(TIDBitmap *tbm)
Definition: tidbitmap.c:1359
bool tbm_is_empty(const TIDBitmap *tbm)
Definition: tidbitmap.c:659
void tbm_end_iterate(TBMIterator *iterator)
Definition: tidbitmap.c:1597
#define WORDS_PER_PAGE
Definition: tidbitmap.c:73
TBMIteratingState
Definition: tidbitmap.c:131
@ TBM_ITERATING_SHARED
Definition: tidbitmap.c:134
@ TBM_NOT_ITERATING
Definition: tidbitmap.c:132
@ TBM_ITERATING_PRIVATE
Definition: tidbitmap.c:133
void tbm_end_shared_iterate(TBMSharedIterator *iterator)
Definition: tidbitmap.c:1162
static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:1253
static void * pagetable_allocate(pagetable_hash *pagetable, Size size)
Definition: tidbitmap.c:1497
dsa_pointer tbm_prepare_shared_iterate(TIDBitmap *tbm)
Definition: tidbitmap.c:754
void tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
Definition: tidbitmap.c:529
void tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
Definition: tidbitmap.c:330
static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:1287
static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
Definition: tidbitmap.c:470
#define BITNUM(x)
Definition: tidbitmap.c:70
static const PagetableEntry * tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:1173
bool tbm_private_iterate(TBMPrivateIterator *iterator, TBMIterateResult *tbmres)
Definition: tidbitmap.c:977
void tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:432
static void tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
Definition: tidbitmap.c:938
TBMStatus
Definition: tidbitmap.c:121
@ TBM_EMPTY
Definition: tidbitmap.c:122
@ TBM_ONE_PAGE
Definition: tidbitmap.c:123
@ TBM_HASH
Definition: tidbitmap.c:124
TBMSharedIterator * tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
Definition: tidbitmap.c:1465
#define PAGES_PER_CHUNK
Definition: tidbitmap.c:65
TBMIterator tbm_begin_iterate(TIDBitmap *tbm, dsa_area *dsa, dsa_pointer dsp)
Definition: tidbitmap.c:1574
static int tbm_shared_comparator(const void *left, const void *right, void *arg)
Definition: tidbitmap.c:1442
static int tbm_comparator(const void *left, const void *right)
Definition: tidbitmap.c:1428
static void tbm_create_pagetable(TIDBitmap *tbm)
Definition: tidbitmap.c:281
int tbm_extract_page_tuple(TBMIterateResult *iteritem, OffsetNumber *offsets, uint32 max_offsets)
Definition: tidbitmap.c:901
#define WORDS_PER_CHUNK
Definition: tidbitmap.c:75
void tbm_union(TIDBitmap *a, const TIDBitmap *b)
Definition: tidbitmap.c:447
void tbm_end_private_iterate(TBMPrivateIterator *iterator)
Definition: tidbitmap.c:1150
TIDBitmap * tbm_create(Size maxbytes, dsa_area *dsa)
Definition: tidbitmap.c:255
struct PagetableEntry PagetableEntry
TBMPrivateIterator * tbm_begin_private_iterate(TIDBitmap *tbm)
Definition: tidbitmap.c:678
struct PTEntryArray PTEntryArray
int tbm_calculate_entries(Size maxbytes)
Definition: tidbitmap.c:1545
static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
Definition: tidbitmap.c:578
struct TBMSharedIteratorState TBMSharedIteratorState
static void pagetable_free(pagetable_hash *pagetable, void *pointer)
Definition: tidbitmap.c:1525
static PagetableEntry * tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:1206
struct TBMIterator TBMIterator
#define TBM_MAX_TUPLES_PER_PAGE
Definition: tidbitmap.h:35
static bool tbm_exhausted(TBMIterator *iterator)
Definition: tidbitmap.h:119