PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
tidbitmap.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * tidbitmap.c
4  * PostgreSQL tuple-id (TID) bitmap package
5  *
6  * This module provides bitmap data structures that are spiritually
7  * similar to Bitmapsets, but are specially adapted to store sets of
8  * tuple identifiers (TIDs), or ItemPointers. In particular, the division
9  * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
10  * Also, since we wish to be able to store very large tuple sets in
11  * memory with this data structure, we support "lossy" storage, in which
12  * we no longer remember individual tuple offsets on a page but only the
13  * fact that a particular page needs to be visited.
14  *
15  * The "lossy" storage uses one bit per disk page, so at the standard 8K
16  * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
17  * of memory. People pushing around tables of that size should have a
18  * couple of Mb to spare, so we don't worry about providing a second level
19  * of lossiness. In theory we could fall back to page ranges at some
20  * point, but for now that seems useless complexity.
21  *
22  * We also support the notion of candidate matches, or rechecking. This
23  * means we know that a search need visit only some tuples on a page,
24  * but we are not certain that all of those tuples are real matches.
25  * So the eventual heap scan must recheck the quals for these tuples only,
26  * rather than rechecking the quals for all tuples on the page as in the
27  * lossy-bitmap case. Rechecking can be specified when TIDs are inserted
28  * into a bitmap, and it can also happen internally when we AND a lossy
29  * and a non-lossy page.
30  *
31  *
32  * Copyright (c) 2003-2017, PostgreSQL Global Development Group
33  *
34  * IDENTIFICATION
35  * src/backend/nodes/tidbitmap.c
36  *
37  *-------------------------------------------------------------------------
38  */
39 #include "postgres.h"
40 
41 #include <limits.h>
42 
43 #include "access/htup_details.h"
44 #include "nodes/bitmapset.h"
45 #include "nodes/tidbitmap.h"
46 #include "storage/lwlock.h"
47 #include "utils/dsa.h"
48 
/*
 * The maximum number of tuples per page is not large (typically 256 with
 * 8K pages, or 1024 with 32K pages).  So there's not much point in making
 * the per-page bitmaps variable size.  We just legislate that the size
 * is this:
 */
#define MAX_TUPLES_PER_PAGE MaxHeapTuplesPerPage

/*
 * When we have to switch over to lossy storage, we use a data structure
 * with one bit per page, where all pages having the same number DIV
 * PAGES_PER_CHUNK are aggregated into one chunk.  When a chunk is present
 * and has the bit set for a given page, there must not be a per-page entry
 * for that page in the page table.
 *
 * We actually store both exact pages and lossy chunks in the same hash
 * table, using identical data structures.  (This is because the memory
 * management for hashtables doesn't easily/efficiently allow space to be
 * transferred from one hashtable to another.)  Therefore it's best if
 * PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
 * too different.  But we also want PAGES_PER_CHUNK to be a power of 2 to
 * avoid expensive integer remainder operations.  So, define it like this:
 */
#define PAGES_PER_CHUNK (BLCKSZ / 32)

/* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */

/* word index and bit-within-word index of bit "x" in a bitmapword array */
#define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
#define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)

/* number of active words for an exact page: */
#define WORDS_PER_PAGE ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
/* number of active words for a lossy chunk: */
#define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
83 
84 /*
85  * The hashtable entries are represented by this data structure. For
86  * an exact page, blockno is the page number and bit k of the bitmap
87  * represents tuple offset k+1. For a lossy chunk, blockno is the first
88  * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
89  * bit k represents page blockno+k. Note that it is not possible to
90  * have exact storage for the first page of a chunk if we are using
91  * lossy storage for any page in the chunk's range, since the same
92  * hashtable entry has to serve both purposes.
93  *
94  * recheck is used only on exact pages --- it indicates that although
95  * only the stated tuples need be checked, the full index qual condition
96  * must be checked for each (ie, these are candidate matches).
97  */
98 typedef struct PagetableEntry
99 {
100  BlockNumber blockno; /* page number (hashtable key) */
101  char status; /* hash entry status */
102  bool ischunk; /* T = lossy storage, F = exact */
103  bool recheck; /* should the tuples be rechecked? */
106 
/*
 * Holds array of pagetable entries, shared via DSA for parallel bitmap scans.
 */
typedef struct PTEntryArray
{
    pg_atomic_uint32 refcount;  /* number of iterators attached */
    PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER];  /* the entries proper */
} PTEntryArray;
115 
116 /*
117  * We want to avoid the overhead of creating the hashtable, which is
118  * comparatively large, when not necessary. Particularly when we are using a
119  * bitmap scan on the inside of a nestloop join: a bitmap may well live only
120  * long enough to accumulate one entry in such cases. We therefore avoid
121  * creating an actual hashtable until we need two pagetable entries. When
122  * just one pagetable entry is needed, we store it in a fixed field of
123  * TIDBitMap. (NOTE: we don't get rid of the hashtable if the bitmap later
124  * shrinks down to zero or one page again. So, status can be TBM_HASH even
125  * when nentries is zero or one.)
126  */
/* Overall storage state of a TIDBitmap (see rationale comment above) */
typedef enum
{
    TBM_EMPTY,                  /* no hashtable, nentries == 0 */
    TBM_ONE_PAGE,               /* entry1 contains the single entry */
    TBM_HASH                    /* pagetable is valid, entry1 is not */
} TBMStatus;
133 
134 /*
135  * Current iterating state of the TBM.
136  */
/*
 * Current iterating state of the TBM.
 */
typedef enum
{
    TBM_NOT_ITERATING,          /* not yet converted to page and chunk array */
    TBM_ITERATING_PRIVATE,      /* converted to local page and chunk array */
    TBM_ITERATING_SHARED        /* converted to shared page and chunk array */
} TBMIteratingState;

144 /*
145  * Here is the representation for a whole TIDBitMap:
146  */
struct TIDBitmap
{
    NodeTag     type;           /* to make it a valid Node */
    MemoryContext mcxt;         /* memory context containing me */
    TBMStatus   status;         /* see codes above */
    struct pagetable_hash *pagetable;   /* hash table of PagetableEntry's */
    int         nentries;       /* number of entries in pagetable */
    int         maxentries;     /* limit on same to meet maxbytes */
    int         npages;         /* number of exact entries in pagetable */
    int         nchunks;        /* number of lossy entries in pagetable */
    TBMIteratingState iterating;    /* tbm_begin_iterate called? */
    uint32      lossify_start;  /* offset to start lossifying hashtable at */
    PagetableEntry entry1;      /* used when status == TBM_ONE_PAGE */
    /* these are valid when iterating is true: */
    PagetableEntry **spages;    /* sorted exact-page list, or NULL */
    PagetableEntry **schunks;   /* sorted lossy-chunk list, or NULL */
    /* the remaining fields are used only for shared (parallel) iteration: */
    dsa_pointer dsapagetable;   /* dsa_pointer to the element array */
    dsa_pointer dsapagetableold;    /* dsa_pointer to the old element array */
    dsa_pointer ptpages;        /* dsa_pointer to the page array */
    dsa_pointer ptchunks;       /* dsa_pointer to the chunk array */
    dsa_area   *dsa;            /* reference to per-query dsa area */
};
169 
170 /*
171  * When iterating over a bitmap in sorted order, a TBMIterator is used to
172  * track our progress. There can be several iterators scanning the same
173  * bitmap concurrently. Note that the bitmap becomes read-only as soon as
174  * any iterator is created.
175  */
177 {
178  TIDBitmap *tbm; /* TIDBitmap we're iterating over */
179  int spageptr; /* next spages index */
180  int schunkptr; /* next schunks index */
181  int schunkbit; /* next bit to check in current schunk */
182  TBMIterateResult output; /* MUST BE LAST (because variable-size) */
183 };
184 
185 /*
186  * Holds the shared members of the iterator so that multiple processes
187  * can jointly iterate.
188  */
190 {
191  int nentries; /* number of entries in pagetable */
192  int maxentries; /* limit on same to meet maxbytes */
193  int npages; /* number of exact entries in pagetable */
194  int nchunks; /* number of lossy entries in pagetable */
195  dsa_pointer pagetable; /* dsa pointers to head of pagetable data */
196  dsa_pointer spages; /* dsa pointer to page array */
197  dsa_pointer schunks; /* dsa pointer to chunk array */
198  LWLock lock; /* lock to protect below members */
199  int spageptr; /* next spages index */
200  int schunkptr; /* next schunks index */
201  int schunkbit; /* next bit to check in current schunk */
203 
204 /*
205  * pagetable iteration array.
206  */
207 typedef struct PTIterationArray
208 {
209  pg_atomic_uint32 refcount; /* no. of iterator attached */
210  int index[FLEXIBLE_ARRAY_MEMBER]; /* index array */
212 
213 /*
214  * same as TBMIterator, but it is used for joint iteration, therefore this
215  * also holds a reference to the shared state.
216  */
218 {
219  TBMSharedIteratorState *state; /* shared state */
220  PTEntryArray *ptbase; /* pagetable element array */
221  PTIterationArray *ptpages; /* sorted exact page index list */
222  PTIterationArray *ptchunks; /* sorted lossy page index list */
223  TBMIterateResult output; /* MUST BE LAST (because variable-size) */
224 };
225 
226 /* Local function prototypes */
227 static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
228 static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
229  const TIDBitmap *b);
230 static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
231  BlockNumber pageno);
233 static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
234 static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
235 static void tbm_lossify(TIDBitmap *tbm);
236 static int tbm_comparator(const void *left, const void *right);
237 static int tbm_shared_comparator(const void *left, const void *right,
238  void *arg);
239 
240 /*
241  * Simple inline murmur hash implementation for the exact width required, for
242  * performance.
243  */
244 static inline uint32
246 {
247  uint32 h = b;
248 
249  h ^= h >> 16;
250  h *= 0x85ebca6b;
251  h ^= h >> 13;
252  h *= 0xc2b2ae35;
253  h ^= h >> 16;
254  return h;
255 }
256 
/* define hashtable mapping block numbers to PagetableEntry's */
/* (generates the pagetable_create/insert/delete/iterate functions used below) */
#define SH_USE_NONDEFAULT_ALLOCATOR
#define SH_PREFIX pagetable
#define SH_ELEMENT_TYPE PagetableEntry
#define SH_KEY_TYPE BlockNumber
#define SH_KEY blockno
#define SH_HASH_KEY(tb, key) hash_blockno(key)
#define SH_EQUAL(tb, a, b) a == b
#define SH_SCOPE static inline
#define SH_DEFINE
#define SH_DECLARE
#include "lib/simplehash.h"
269 
270 
271 /*
272  * tbm_create - create an initially-empty bitmap
273  *
274  * The bitmap will live in the memory context that is CurrentMemoryContext
275  * at the time of this call. It will be limited to (approximately) maxbytes
276  * total memory consumption. If the DSA passed to this function is not NULL
277  * then the memory for storing elements of the underlying page table will
278  * be allocated from the DSA.
279  */
280 TIDBitmap *
281 tbm_create(long maxbytes, dsa_area *dsa)
282 {
283  TIDBitmap *tbm;
284  long nbuckets;
285 
286  /* Create the TIDBitmap struct and zero all its fields */
287  tbm = makeNode(TIDBitmap);
288 
289  tbm->mcxt = CurrentMemoryContext;
290  tbm->status = TBM_EMPTY;
291 
292  /*
293  * Estimate number of hashtable entries we can have within maxbytes. This
294  * estimates the hash cost as sizeof(PagetableEntry), which is good enough
295  * for our purpose. Also count an extra Pointer per entry for the arrays
296  * created during iteration readout.
297  */
298  nbuckets = maxbytes /
299  (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
300  nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */
301  nbuckets = Max(nbuckets, 16); /* sanity limit */
302  tbm->maxentries = (int) nbuckets;
303  tbm->lossify_start = 0;
304  tbm->dsa = dsa;
307  tbm->ptpages = InvalidDsaPointer;
309 
310  return tbm;
311 }
312 
313 /*
314  * Actually create the hashtable. Since this is a moderately expensive
315  * proposition, we don't do it until we have to.
316  */
317 static void
319 {
320  Assert(tbm->status != TBM_HASH);
321  Assert(tbm->pagetable == NULL);
322 
323  tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);
324 
325  /* If entry1 is valid, push it into the hashtable */
326  if (tbm->status == TBM_ONE_PAGE)
327  {
328  PagetableEntry *page;
329  bool found;
330  char oldstatus;
331 
332  page = pagetable_insert(tbm->pagetable,
333  tbm->entry1.blockno,
334  &found);
335  Assert(!found);
336  oldstatus = page->status;
337  memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
338  page->status = oldstatus;
339  }
340 
341  tbm->status = TBM_HASH;
342 }
343 
344 /*
345  * tbm_free - free a TIDBitmap
346  */
347 void
349 {
350  if (tbm->pagetable)
351  pagetable_destroy(tbm->pagetable);
352  if (tbm->spages)
353  pfree(tbm->spages);
354  if (tbm->schunks)
355  pfree(tbm->schunks);
356  pfree(tbm);
357 }
358 
359 /*
360  * tbm_free_shared_area - free shared state
361  *
 * Free the shared iterator state.  Also free the shared pagetable and
 * iterator arrays if they are no longer referenced by any shared
 * iterator, i.e. their refcount drops to 0.
365  */
366 void
368 {
369  TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
370  PTEntryArray *ptbase;
371  PTIterationArray *ptpages;
372  PTIterationArray *ptchunks;
373 
374  if (DsaPointerIsValid(istate->pagetable))
375  {
376  ptbase = dsa_get_address(dsa, istate->pagetable);
377  if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
378  dsa_free(dsa, istate->pagetable);
379  }
380  if (DsaPointerIsValid(istate->spages))
381  {
382  ptpages = dsa_get_address(dsa, istate->spages);
383  if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
384  dsa_free(dsa, istate->spages);
385  }
386  if (DsaPointerIsValid(istate->schunks))
387  {
388  ptchunks = dsa_get_address(dsa, istate->schunks);
389  if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
390  dsa_free(dsa, istate->schunks);
391  }
392 
393  dsa_free(dsa, dp);
394 }
395 
396 /*
397  * tbm_add_tuples - add some tuple IDs to a TIDBitmap
398  *
399  * If recheck is true, then the recheck flag will be set in the
400  * TBMIterateResult when any of these tuples are reported out.
401  */
402 void
403 tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
404  bool recheck)
405 {
407  PagetableEntry *page = NULL; /* only valid when currblk is valid */
408  int i;
409 
411  for (i = 0; i < ntids; i++)
412  {
413  BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
415  int wordnum,
416  bitnum;
417 
418  /* safety check to ensure we don't overrun bit array bounds */
419  if (off < 1 || off > MAX_TUPLES_PER_PAGE)
420  elog(ERROR, "tuple offset out of range: %u", off);
421 
422  /*
423  * Look up target page unless we already did. This saves cycles when
424  * the input includes consecutive tuples on the same page, which is
425  * common enough to justify an extra test here.
426  */
427  if (blk != currblk)
428  {
429  if (tbm_page_is_lossy(tbm, blk))
430  page = NULL; /* remember page is lossy */
431  else
432  page = tbm_get_pageentry(tbm, blk);
433  currblk = blk;
434  }
435 
436  if (page == NULL)
437  continue; /* whole page is already marked */
438 
439  if (page->ischunk)
440  {
441  /* The page is a lossy chunk header, set bit for itself */
442  wordnum = bitnum = 0;
443  }
444  else
445  {
446  /* Page is exact, so set bit for individual tuple */
447  wordnum = WORDNUM(off - 1);
448  bitnum = BITNUM(off - 1);
449  }
450  page->words[wordnum] |= ((bitmapword) 1 << bitnum);
451  page->recheck |= recheck;
452 
453  if (tbm->nentries > tbm->maxentries)
454  {
455  tbm_lossify(tbm);
456  /* Page could have been converted to lossy, so force new lookup */
457  currblk = InvalidBlockNumber;
458  }
459  }
460 }
461 
462 /*
463  * tbm_add_page - add a whole page to a TIDBitmap
464  *
465  * This causes the whole page to be reported (with the recheck flag)
466  * when the TIDBitmap is scanned.
467  */
468 void
470 {
471  /* Enter the page in the bitmap, or mark it lossy if already present */
472  tbm_mark_page_lossy(tbm, pageno);
473  /* If we went over the memory limit, lossify some more pages */
474  if (tbm->nentries > tbm->maxentries)
475  tbm_lossify(tbm);
476 }
477 
478 /*
479  * tbm_union - set union
480  *
481  * a is modified in-place, b is not changed
482  */
483 void
485 {
486  Assert(!a->iterating);
487  /* Nothing to do if b is empty */
488  if (b->nentries == 0)
489  return;
490  /* Scan through chunks and pages in b, merge into a */
491  if (b->status == TBM_ONE_PAGE)
492  tbm_union_page(a, &b->entry1);
493  else
494  {
495  pagetable_iterator i;
496  PagetableEntry *bpage;
497 
498  Assert(b->status == TBM_HASH);
499  pagetable_start_iterate(b->pagetable, &i);
500  while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
501  tbm_union_page(a, bpage);
502  }
503 }
504 
505 /* Process one page of b during a union op */
506 static void
508 {
509  PagetableEntry *apage;
510  int wordnum;
511 
512  if (bpage->ischunk)
513  {
514  /* Scan b's chunk, mark each indicated page lossy in a */
515  for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
516  {
517  bitmapword w = bpage->words[wordnum];
518 
519  if (w != 0)
520  {
521  BlockNumber pg;
522 
523  pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
524  while (w != 0)
525  {
526  if (w & 1)
527  tbm_mark_page_lossy(a, pg);
528  pg++;
529  w >>= 1;
530  }
531  }
532  }
533  }
534  else if (tbm_page_is_lossy(a, bpage->blockno))
535  {
536  /* page is already lossy in a, nothing to do */
537  return;
538  }
539  else
540  {
541  apage = tbm_get_pageentry(a, bpage->blockno);
542  if (apage->ischunk)
543  {
544  /* The page is a lossy chunk header, set bit for itself */
545  apage->words[0] |= ((bitmapword) 1 << 0);
546  }
547  else
548  {
549  /* Both pages are exact, merge at the bit level */
550  for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
551  apage->words[wordnum] |= bpage->words[wordnum];
552  apage->recheck |= bpage->recheck;
553  }
554  }
555 
556  if (a->nentries > a->maxentries)
557  tbm_lossify(a);
558 }
559 
560 /*
561  * tbm_intersect - set intersection
562  *
563  * a is modified in-place, b is not changed
564  */
565 void
567 {
568  Assert(!a->iterating);
569  /* Nothing to do if a is empty */
570  if (a->nentries == 0)
571  return;
572  /* Scan through chunks and pages in a, try to match to b */
573  if (a->status == TBM_ONE_PAGE)
574  {
575  if (tbm_intersect_page(a, &a->entry1, b))
576  {
577  /* Page is now empty, remove it from a */
578  Assert(!a->entry1.ischunk);
579  a->npages--;
580  a->nentries--;
581  Assert(a->nentries == 0);
582  a->status = TBM_EMPTY;
583  }
584  }
585  else
586  {
587  pagetable_iterator i;
588  PagetableEntry *apage;
589 
590  Assert(a->status == TBM_HASH);
591  pagetable_start_iterate(a->pagetable, &i);
592  while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
593  {
594  if (tbm_intersect_page(a, apage, b))
595  {
596  /* Page or chunk is now empty, remove it from a */
597  if (apage->ischunk)
598  a->nchunks--;
599  else
600  a->npages--;
601  a->nentries--;
602  if (!pagetable_delete(a->pagetable, apage->blockno))
603  elog(ERROR, "hash table corrupted");
604  }
605  }
606  }
607 }
608 
609 /*
610  * Process one page of a during an intersection op
611  *
612  * Returns TRUE if apage is now empty and should be deleted from a
613  */
614 static bool
616 {
617  const PagetableEntry *bpage;
618  int wordnum;
619 
620  if (apage->ischunk)
621  {
622  /* Scan each bit in chunk, try to clear */
623  bool candelete = true;
624 
625  for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
626  {
627  bitmapword w = apage->words[wordnum];
628 
629  if (w != 0)
630  {
631  bitmapword neww = w;
632  BlockNumber pg;
633  int bitnum;
634 
635  pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
636  bitnum = 0;
637  while (w != 0)
638  {
639  if (w & 1)
640  {
641  if (!tbm_page_is_lossy(b, pg) &&
642  tbm_find_pageentry(b, pg) == NULL)
643  {
644  /* Page is not in b at all, lose lossy bit */
645  neww &= ~((bitmapword) 1 << bitnum);
646  }
647  }
648  pg++;
649  bitnum++;
650  w >>= 1;
651  }
652  apage->words[wordnum] = neww;
653  if (neww != 0)
654  candelete = false;
655  }
656  }
657  return candelete;
658  }
659  else if (tbm_page_is_lossy(b, apage->blockno))
660  {
661  /*
662  * Some of the tuples in 'a' might not satisfy the quals for 'b', but
663  * because the page 'b' is lossy, we don't know which ones. Therefore
664  * we mark 'a' as requiring rechecks, to indicate that at most those
665  * tuples set in 'a' are matches.
666  */
667  apage->recheck = true;
668  return false;
669  }
670  else
671  {
672  bool candelete = true;
673 
674  bpage = tbm_find_pageentry(b, apage->blockno);
675  if (bpage != NULL)
676  {
677  /* Both pages are exact, merge at the bit level */
678  Assert(!bpage->ischunk);
679  for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
680  {
681  apage->words[wordnum] &= bpage->words[wordnum];
682  if (apage->words[wordnum] != 0)
683  candelete = false;
684  }
685  apage->recheck |= bpage->recheck;
686  }
687  /* If there is no matching b page, we can just delete the a page */
688  return candelete;
689  }
690 }
691 
692 /*
693  * tbm_is_empty - is a TIDBitmap completely empty?
694  */
695 bool
697 {
698  return (tbm->nentries == 0);
699 }
700 
701 /*
702  * tbm_begin_iterate - prepare to iterate through a TIDBitmap
703  *
704  * The TBMIterator struct is created in the caller's memory context.
705  * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
706  * okay to just allow the memory context to be released, too. It is caller's
707  * responsibility not to touch the TBMIterator anymore once the TIDBitmap
708  * is freed.
709  *
710  * NB: after this is called, it is no longer allowed to modify the contents
711  * of the bitmap. However, you can call this multiple times to scan the
712  * contents repeatedly, including parallel scans.
713  */
714 TBMIterator *
716 {
717  TBMIterator *iterator;
718 
720 
721  /*
722  * Create the TBMIterator struct, with enough trailing space to serve the
723  * needs of the TBMIterateResult sub-struct.
724  */
725  iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
727  iterator->tbm = tbm;
728 
729  /*
730  * Initialize iteration pointers.
731  */
732  iterator->spageptr = 0;
733  iterator->schunkptr = 0;
734  iterator->schunkbit = 0;
735 
736  /*
737  * If we have a hashtable, create and fill the sorted page lists, unless
738  * we already did that for a previous iterator. Note that the lists are
739  * attached to the bitmap not the iterator, so they can be used by more
740  * than one iterator.
741  */
742  if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
743  {
744  pagetable_iterator i;
745  PagetableEntry *page;
746  int npages;
747  int nchunks;
748 
749  if (!tbm->spages && tbm->npages > 0)
750  tbm->spages = (PagetableEntry **)
752  tbm->npages * sizeof(PagetableEntry *));
753  if (!tbm->schunks && tbm->nchunks > 0)
754  tbm->schunks = (PagetableEntry **)
756  tbm->nchunks * sizeof(PagetableEntry *));
757 
758  npages = nchunks = 0;
759  pagetable_start_iterate(tbm->pagetable, &i);
760  while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
761  {
762  if (page->ischunk)
763  tbm->schunks[nchunks++] = page;
764  else
765  tbm->spages[npages++] = page;
766  }
767  Assert(npages == tbm->npages);
768  Assert(nchunks == tbm->nchunks);
769  if (npages > 1)
770  qsort(tbm->spages, npages, sizeof(PagetableEntry *),
772  if (nchunks > 1)
773  qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
775  }
776 
778 
779  return iterator;
780 }
781 
782 /*
783  * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
784  *
785  * The necessary shared state will be allocated from the DSA passed to
786  * tbm_create, so that multiple processes can attach to it and iterate jointly.
787  *
788  * This will convert the pagetable hash into page and chunk array of the index
789  * into pagetable array.
790  */
793 {
794  dsa_pointer dp;
795  TBMSharedIteratorState *istate;
796  PTEntryArray *ptbase = NULL;
797  PTIterationArray *ptpages = NULL;
798  PTIterationArray *ptchunks = NULL;
799 
800  Assert(tbm->dsa != NULL);
802 
803  /*
804  * Allocate TBMSharedIteratorState from DSA to hold the shared members and
805  * lock, this will also be used by multiple worker for shared iterate.
806  */
807  dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
808  istate = dsa_get_address(tbm->dsa, dp);
809 
810  /*
811  * If we're not already iterating, create and fill the sorted page lists.
812  * (If we are, the sorted page lists are already stored in the TIDBitmap,
813  * and we can just reuse them.)
814  */
815  if (tbm->iterating == TBM_NOT_ITERATING)
816  {
817  pagetable_iterator i;
818  PagetableEntry *page;
819  int idx;
820  int npages;
821  int nchunks;
822 
823  /*
824  * Allocate the page and chunk array memory from the DSA to share
825  * across multiple processes.
826  */
827  if (tbm->npages)
828  {
829  tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
830  tbm->npages * sizeof(int));
831  ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
832  pg_atomic_init_u32(&ptpages->refcount, 0);
833  }
834  if (tbm->nchunks)
835  {
836  tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
837  tbm->nchunks * sizeof(int));
838  ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
839  pg_atomic_init_u32(&ptchunks->refcount, 0);
840  }
841 
842  /*
843  * If TBM status is TBM_HASH then iterate over the pagetable and
844  * convert it to page and chunk arrays. But if it's in the
845  * TBM_ONE_PAGE mode then directly allocate the space for one entry
846  * from the DSA.
847  */
848  npages = nchunks = 0;
849  if (tbm->status == TBM_HASH)
850  {
851  ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
852 
853  pagetable_start_iterate(tbm->pagetable, &i);
854  while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
855  {
856  idx = page - ptbase->ptentry;
857  if (page->ischunk)
858  ptchunks->index[nchunks++] = idx;
859  else
860  ptpages->index[npages++] = idx;
861  }
862 
863  Assert(npages == tbm->npages);
864  Assert(nchunks == tbm->nchunks);
865  }
866  else if (tbm->status == TBM_ONE_PAGE)
867  {
868  /*
869  * In one page mode allocate the space for one pagetable entry,
870  * initialize it, and directly store its index (i.e. 0) in the
871  * page array.
872  */
873  tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
874  sizeof(PagetableEntry));
875  ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
876  memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry));
877  ptpages->index[0] = 0;
878  }
879 
880  if (ptbase != NULL)
881  pg_atomic_init_u32(&ptbase->refcount, 0);
882  if (npages > 1)
883  qsort_arg((void *) (ptpages->index), npages, sizeof(int),
884  tbm_shared_comparator, (void *) ptbase->ptentry);
885  if (nchunks > 1)
886  qsort_arg((void *) (ptchunks->index), nchunks, sizeof(int),
887  tbm_shared_comparator, (void *) ptbase->ptentry);
888  }
889 
890  /*
891  * Store the TBM members in the shared state so that we can share them
892  * across multiple processes.
893  */
894  istate->nentries = tbm->nentries;
895  istate->maxentries = tbm->maxentries;
896  istate->npages = tbm->npages;
897  istate->nchunks = tbm->nchunks;
898  istate->pagetable = tbm->dsapagetable;
899  istate->spages = tbm->ptpages;
900  istate->schunks = tbm->ptchunks;
901 
902  ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
903  ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
904  ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
905 
906  /*
907  * For every shared iterator, referring to pagetable and iterator array,
908  * increase the refcount by 1 so that while freeing the shared iterator we
909  * don't free pagetable and iterator array until its refcount becomes 0.
910  */
911  if (ptbase != NULL)
912  pg_atomic_add_fetch_u32(&ptbase->refcount, 1);
913  if (ptpages != NULL)
914  pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
915  if (ptchunks != NULL)
916  pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);
917 
918  /* Initialize the iterator lock */
920 
921  /* Initialize the shared iterator state */
922  istate->schunkbit = 0;
923  istate->schunkptr = 0;
924  istate->spageptr = 0;
925 
927 
928  return dp;
929 }
930 
931 /*
932  * tbm_extract_page_tuple - extract the tuple offsets from a page
933  *
934  * The extracted offsets are stored into TBMIterateResult.
935  */
936 static inline int
938 {
939  int wordnum;
940  int ntuples = 0;
941 
942  for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
943  {
944  bitmapword w = page->words[wordnum];
945 
946  if (w != 0)
947  {
948  int off = wordnum * BITS_PER_BITMAPWORD + 1;
949 
950  while (w != 0)
951  {
952  if (w & 1)
953  output->offsets[ntuples++] = (OffsetNumber) off;
954  off++;
955  w >>= 1;
956  }
957  }
958  }
959 
960  return ntuples;
961 }
962 
963 /*
964  * tbm_advance_schunkbit - Advance the chunkbit
965  */
966 static inline void
967 tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
968 {
969  int schunkbit = *schunkbitp;
970 
971  while (schunkbit < PAGES_PER_CHUNK)
972  {
973  int wordnum = WORDNUM(schunkbit);
974  int bitnum = BITNUM(schunkbit);
975 
976  if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
977  break;
978  schunkbit++;
979  }
980 
981  *schunkbitp = schunkbit;
982 }
983 
984 /*
985  * tbm_iterate - scan through next page of a TIDBitmap
986  *
987  * Returns a TBMIterateResult representing one page, or NULL if there are
988  * no more pages to scan. Pages are guaranteed to be delivered in numerical
989  * order. If result->ntuples < 0, then the bitmap is "lossy" and failed to
990  * remember the exact tuples to look at on this page --- the caller must
991  * examine all tuples on the page and check if they meet the intended
992  * condition. If result->recheck is true, only the indicated tuples need
993  * be examined, but the condition must be rechecked anyway. (For ease of
994  * testing, recheck is always set true when ntuples < 0.)
995  */
998 {
999  TIDBitmap *tbm = iterator->tbm;
1000  TBMIterateResult *output = &(iterator->output);
1001 
1003 
1004  /*
1005  * If lossy chunk pages remain, make sure we've advanced schunkptr/
1006  * schunkbit to the next set bit.
1007  */
1008  while (iterator->schunkptr < tbm->nchunks)
1009  {
1010  PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
1011  int schunkbit = iterator->schunkbit;
1012 
1013  tbm_advance_schunkbit(chunk, &schunkbit);
1014  if (schunkbit < PAGES_PER_CHUNK)
1015  {
1016  iterator->schunkbit = schunkbit;
1017  break;
1018  }
1019  /* advance to next chunk */
1020  iterator->schunkptr++;
1021  iterator->schunkbit = 0;
1022  }
1023 
1024  /*
1025  * If both chunk and per-page data remain, must output the numerically
1026  * earlier page.
1027  */
1028  if (iterator->schunkptr < tbm->nchunks)
1029  {
1030  PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
1031  BlockNumber chunk_blockno;
1032 
1033  chunk_blockno = chunk->blockno + iterator->schunkbit;
1034  if (iterator->spageptr >= tbm->npages ||
1035  chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
1036  {
1037  /* Return a lossy page indicator from the chunk */
1038  output->blockno = chunk_blockno;
1039  output->ntuples = -1;
1040  output->recheck = true;
1041  iterator->schunkbit++;
1042  return output;
1043  }
1044  }
1045 
1046  if (iterator->spageptr < tbm->npages)
1047  {
1048  PagetableEntry *page;
1049  int ntuples;
1050 
1051  /* In ONE_PAGE state, we don't allocate an spages[] array */
1052  if (tbm->status == TBM_ONE_PAGE)
1053  page = &tbm->entry1;
1054  else
1055  page = tbm->spages[iterator->spageptr];
1056 
1057  /* scan bitmap to extract individual offset numbers */
1058  ntuples = tbm_extract_page_tuple(page, output);
1059  output->blockno = page->blockno;
1060  output->ntuples = ntuples;
1061  output->recheck = page->recheck;
1062  iterator->spageptr++;
1063  return output;
1064  }
1065 
1066  /* Nothing more in the bitmap */
1067  return NULL;
1068 }
1069 
1070 /*
1071  * tbm_shared_iterate - scan through next page of a TIDBitmap
1072  *
1073  * As above, but this will iterate using an iterator which is shared
1074  * across multiple processes. We need to acquire the iterator LWLock,
1075  * before accessing the shared members.
1076  */
1079 {
1080  TBMIterateResult *output = &iterator->output;
1081  TBMSharedIteratorState *istate = iterator->state;
1082  PagetableEntry *ptbase = NULL;
1083  int *idxpages = NULL;
1084  int *idxchunks = NULL;
1085 
1086  if (iterator->ptbase != NULL)
1087  ptbase = iterator->ptbase->ptentry;
1088  if (iterator->ptpages != NULL)
1089  idxpages = iterator->ptpages->index;
1090  if (iterator->ptchunks != NULL)
1091  idxchunks = iterator->ptchunks->index;
1092 
1093  /* Acquire the LWLock before accessing the shared members */
1094  LWLockAcquire(&istate->lock, LW_EXCLUSIVE);
1095 
1096  /*
1097  * If lossy chunk pages remain, make sure we've advanced schunkptr/
1098  * schunkbit to the next set bit.
1099  */
1100  while (istate->schunkptr < istate->nchunks)
1101  {
1102  PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
1103  int schunkbit = istate->schunkbit;
1104 
1105  tbm_advance_schunkbit(chunk, &schunkbit);
1106  if (schunkbit < PAGES_PER_CHUNK)
1107  {
1108  istate->schunkbit = schunkbit;
1109  break;
1110  }
1111  /* advance to next chunk */
1112  istate->schunkptr++;
1113  istate->schunkbit = 0;
1114  }
1115 
1116  /*
1117  * If both chunk and per-page data remain, must output the numerically
1118  * earlier page.
1119  */
1120  if (istate->schunkptr < istate->nchunks)
1121  {
1122  PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
1123  BlockNumber chunk_blockno;
1124 
1125  chunk_blockno = chunk->blockno + istate->schunkbit;
1126 
1127  if (istate->spageptr >= istate->npages ||
1128  chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
1129  {
1130  /* Return a lossy page indicator from the chunk */
1131  output->blockno = chunk_blockno;
1132  output->ntuples = -1;
1133  output->recheck = true;
1134  istate->schunkbit++;
1135 
1136  LWLockRelease(&istate->lock);
1137  return output;
1138  }
1139  }
1140 
1141  if (istate->spageptr < istate->npages)
1142  {
1143  PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
1144  int ntuples;
1145 
1146  /* scan bitmap to extract individual offset numbers */
1147  ntuples = tbm_extract_page_tuple(page, output);
1148  output->blockno = page->blockno;
1149  output->ntuples = ntuples;
1150  output->recheck = page->recheck;
1151  istate->spageptr++;
1152 
1153  LWLockRelease(&istate->lock);
1154 
1155  return output;
1156  }
1157 
1158  LWLockRelease(&istate->lock);
1159 
1160  /* Nothing more in the bitmap */
1161  return NULL;
1162 }
1163 
1164 /*
1165  * tbm_end_iterate - finish an iteration over a TIDBitmap
1166  *
1167  * Currently this is just a pfree, but it might do more someday. (For
1168  * instance, it could be useful to count open iterators and allow the
1169  * bitmap to return to read/write status when there are no more iterators.)
1170  */
1171 void
1173 {
1174  pfree(iterator);
1175 }
1176 
1177 /*
1178  * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
1179  *
1180  * This doesn't free any of the shared state associated with the iterator,
1181  * just our backend-private state.
1182  */
1183 void
1185 {
1186  pfree(iterator);
1187 }
1188 
1189 /*
1190  * tbm_find_pageentry - find a PagetableEntry for the pageno
1191  *
1192  * Returns NULL if there is no non-lossy entry for the pageno.
1193  */
1194 static const PagetableEntry *
1196 {
1197  const PagetableEntry *page;
1198 
1199  if (tbm->nentries == 0) /* in case pagetable doesn't exist */
1200  return NULL;
1201 
1202  if (tbm->status == TBM_ONE_PAGE)
1203  {
1204  page = &tbm->entry1;
1205  if (page->blockno != pageno)
1206  return NULL;
1207  Assert(!page->ischunk);
1208  return page;
1209  }
1210 
1211  page = pagetable_lookup(tbm->pagetable, pageno);
1212  if (page == NULL)
1213  return NULL;
1214  if (page->ischunk)
1215  return NULL; /* don't want a lossy chunk header */
1216  return page;
1217 }
1218 
1219 /*
1220  * tbm_get_pageentry - find or create a PagetableEntry for the pageno
1221  *
1222  * If new, the entry is marked as an exact (non-chunk) entry.
1223  *
1224  * This may cause the table to exceed the desired memory size. It is
1225  * up to the caller to call tbm_lossify() at the next safe point if so.
1226  */
1227 static PagetableEntry *
1229 {
1230  PagetableEntry *page;
1231  bool found;
1232 
1233  if (tbm->status == TBM_EMPTY)
1234  {
1235  /* Use the fixed slot */
1236  page = &tbm->entry1;
1237  found = false;
1238  tbm->status = TBM_ONE_PAGE;
1239  }
1240  else
1241  {
1242  if (tbm->status == TBM_ONE_PAGE)
1243  {
1244  page = &tbm->entry1;
1245  if (page->blockno == pageno)
1246  return page;
1247  /* Time to switch from one page to a hashtable */
1248  tbm_create_pagetable(tbm);
1249  }
1250 
1251  /* Look up or create an entry */
1252  page = pagetable_insert(tbm->pagetable, pageno, &found);
1253  }
1254 
1255  /* Initialize it if not present before */
1256  if (!found)
1257  {
1258  char oldstatus = page->status;
1259 
1260  MemSet(page, 0, sizeof(PagetableEntry));
1261  page->status = oldstatus;
1262  page->blockno = pageno;
1263  /* must count it too */
1264  tbm->nentries++;
1265  tbm->npages++;
1266  }
1267 
1268  return page;
1269 }
1270 
1271 /*
1272  * tbm_page_is_lossy - is the page marked as lossily stored?
1273  */
1274 static bool
1276 {
1277  PagetableEntry *page;
1278  BlockNumber chunk_pageno;
1279  int bitno;
1280 
1281  /* we can skip the lookup if there are no lossy chunks */
1282  if (tbm->nchunks == 0)
1283  return false;
1284  Assert(tbm->status == TBM_HASH);
1285 
1286  bitno = pageno % PAGES_PER_CHUNK;
1287  chunk_pageno = pageno - bitno;
1288 
1289  page = pagetable_lookup(tbm->pagetable, chunk_pageno);
1290 
1291  if (page != NULL && page->ischunk)
1292  {
1293  int wordnum = WORDNUM(bitno);
1294  int bitnum = BITNUM(bitno);
1295 
1296  if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
1297  return true;
1298  }
1299  return false;
1300 }
1301 
1302 /*
1303  * tbm_mark_page_lossy - mark the page number as lossily stored
1304  *
1305  * This may cause the table to exceed the desired memory size. It is
1306  * up to the caller to call tbm_lossify() at the next safe point if so.
1307  */
1308 static void
1310 {
1311  PagetableEntry *page;
1312  bool found;
1313  BlockNumber chunk_pageno;
1314  int bitno;
1315  int wordnum;
1316  int bitnum;
1317 
1318  /* We force the bitmap into hashtable mode whenever it's lossy */
1319  if (tbm->status != TBM_HASH)
1320  tbm_create_pagetable(tbm);
1321 
1322  bitno = pageno % PAGES_PER_CHUNK;
1323  chunk_pageno = pageno - bitno;
1324 
1325  /*
1326  * Remove any extant non-lossy entry for the page. If the page is its own
1327  * chunk header, however, we skip this and handle the case below.
1328  */
1329  if (bitno != 0)
1330  {
1331  if (pagetable_delete(tbm->pagetable, pageno))
1332  {
1333  /* It was present, so adjust counts */
1334  tbm->nentries--;
1335  tbm->npages--; /* assume it must have been non-lossy */
1336  }
1337  }
1338 
1339  /* Look up or create entry for chunk-header page */
1340  page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);
1341 
1342  /* Initialize it if not present before */
1343  if (!found)
1344  {
1345  char oldstatus = page->status;
1346 
1347  MemSet(page, 0, sizeof(PagetableEntry));
1348  page->status = oldstatus;
1349  page->blockno = chunk_pageno;
1350  page->ischunk = true;
1351  /* must count it too */
1352  tbm->nentries++;
1353  tbm->nchunks++;
1354  }
1355  else if (!page->ischunk)
1356  {
1357  char oldstatus = page->status;
1358 
1359  /* chunk header page was formerly non-lossy, make it lossy */
1360  MemSet(page, 0, sizeof(PagetableEntry));
1361  page->status = oldstatus;
1362  page->blockno = chunk_pageno;
1363  page->ischunk = true;
1364  /* we assume it had some tuple bit(s) set, so mark it lossy */
1365  page->words[0] = ((bitmapword) 1 << 0);
1366  /* adjust counts */
1367  tbm->nchunks++;
1368  tbm->npages--;
1369  }
1370 
1371  /* Now set the original target page's bit */
1372  wordnum = WORDNUM(bitno);
1373  bitnum = BITNUM(bitno);
1374  page->words[wordnum] |= ((bitmapword) 1 << bitnum);
1375 }
1376 
1377 /*
1378  * tbm_lossify - lose some information to get back under the memory limit
1379  */
1380 static void
1382 {
1383  pagetable_iterator i;
1384  PagetableEntry *page;
1385 
1386  /*
1387  * XXX Really stupid implementation: this just lossifies pages in
1388  * essentially random order. We should be paying some attention to the
1389  * number of bits set in each page, instead.
1390  *
1391  * Since we are called as soon as nentries exceeds maxentries, we should
1392  * push nentries down to significantly less than maxentries, or else we'll
1393  * just end up doing this again very soon. We shoot for maxentries/2.
1394  */
1396  Assert(tbm->status == TBM_HASH);
1397 
1398  pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
1399  while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
1400  {
1401  if (page->ischunk)
1402  continue; /* already a chunk header */
1403 
1404  /*
1405  * If the page would become a chunk header, we won't save anything by
1406  * converting it to lossy, so skip it.
1407  */
1408  if ((page->blockno % PAGES_PER_CHUNK) == 0)
1409  continue;
1410 
1411  /* This does the dirty work ... */
1412  tbm_mark_page_lossy(tbm, page->blockno);
1413 
1414  if (tbm->nentries <= tbm->maxentries / 2)
1415  {
1416  /*
1417  * We have made enough room. Remember where to start lossifying
1418  * next round, so we evenly iterate over the hashtable.
1419  */
1420  tbm->lossify_start = i.cur;
1421  break;
1422  }
1423 
1424  /*
1425  * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
1426  * hashtable and may have deleted the non-lossy chunk. We can
1427  * continue the same hash table scan, since failure to visit one
1428  * element or visiting the newly inserted element, isn't fatal.
1429  */
1430  }
1431 
1432  /*
1433  * With a big bitmap and small work_mem, it's possible that we cannot get
1434  * under maxentries. Again, if that happens, we'd end up uselessly
1435  * calling tbm_lossify over and over. To prevent this from becoming a
1436  * performance sink, force maxentries up to at least double the current
1437  * number of entries. (In essence, we're admitting inability to fit
1438  * within work_mem when we do this.) Note that this test will not fire if
1439  * we broke out of the loop early; and if we didn't, the current number of
1440  * entries is simply not reducible any further.
1441  */
1442  if (tbm->nentries > tbm->maxentries / 2)
1443  tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
1444 }
1445 
1446 /*
1447  * qsort comparator to handle PagetableEntry pointers.
1448  */
1449 static int
1450 tbm_comparator(const void *left, const void *right)
1451 {
1452  BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
1453  BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
1454 
1455  if (l < r)
1456  return -1;
1457  else if (l > r)
1458  return 1;
1459  return 0;
1460 }
1461 
1462 /*
1463  * As above, but this will get index into PagetableEntry array. Therefore,
1464  * it needs to get actual PagetableEntry using the index before comparing the
1465  * blockno.
1466  */
1467 static int
1468 tbm_shared_comparator(const void *left, const void *right, void *arg)
1469 {
1470  PagetableEntry *base = (PagetableEntry *) arg;
1471  PagetableEntry *lpage = &base[*(int *) left];
1472  PagetableEntry *rpage = &base[*(int *) right];
1473 
1474  if (lpage->blockno < rpage->blockno)
1475  return -1;
1476  else if (lpage->blockno > rpage->blockno)
1477  return 1;
1478  return 0;
1479 }
1480 
1481 /*
1482  * tbm_attach_shared_iterate
1483  *
1484  * Allocate a backend-private iterator and attach the shared iterator state
1485  * to it so that multiple processed can iterate jointly.
1486  *
1487  * We also converts the DSA pointers to local pointers and store them into
1488  * our private iterator.
1489  */
1492 {
1493  TBMSharedIterator *iterator;
1494  TBMSharedIteratorState *istate;
1495 
1496  /*
1497  * Create the TBMSharedIterator struct, with enough trailing space to
1498  * serve the needs of the TBMIterateResult sub-struct.
1499  */
1500  iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) +
1501  MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
1502 
1503  istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);
1504 
1505  iterator->state = istate;
1506 
1507  iterator->ptbase = dsa_get_address(dsa, istate->pagetable);
1508 
1509  if (istate->npages)
1510  iterator->ptpages = dsa_get_address(dsa, istate->spages);
1511  if (istate->nchunks)
1512  iterator->ptchunks = dsa_get_address(dsa, istate->schunks);
1513 
1514  return iterator;
1515 }
1516 
1517 /*
1518  * pagetable_allocate
1519  *
1520  * Callback function for allocating the memory for hashtable elements.
1521  * Allocate memory for hashtable elements, using DSA if available.
1522  */
1523 static inline void *
1524 pagetable_allocate(pagetable_hash *pagetable, Size size)
1525 {
1526  TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1527  PTEntryArray *ptbase;
1528 
1529  if (tbm->dsa == NULL)
1530  return MemoryContextAllocExtended(pagetable->ctx, size,
1532 
1533  /*
1534  * Save the dsapagetable reference in dsapagetableold before allocating
1535  * new memory so that pagetable_free can free the old entry.
1536  */
1537  tbm->dsapagetableold = tbm->dsapagetable;
1539  sizeof(PTEntryArray) + size,
1541  ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
1542 
1543  return ptbase->ptentry;
1544 }
1545 
1546 /*
1547  * pagetable_free
1548  *
1549  * Callback function for freeing hash table elements.
1550  */
1551 static inline void
1552 pagetable_free(pagetable_hash *pagetable, void *pointer)
1553 {
1554  TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1555 
1556  /* pfree the input pointer if DSA is not available */
1557  if (tbm->dsa == NULL)
1558  pfree(pointer);
1559  else if (DsaPointerIsValid(tbm->dsapagetableold))
1560  {
1561  dsa_free(tbm->dsa, tbm->dsapagetableold);
1563  }
1564 }
static int tbm_comparator(const void *left, const void *right)
Definition: tidbitmap.c:1450
static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
Definition: tidbitmap.c:615
void tbm_end_iterate(TBMIterator *iterator)
Definition: tidbitmap.c:1172
Definition: lwlock.h:32
void tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
Definition: tidbitmap.c:566
struct PTIterationArray PTIterationArray
#define InvalidDsaPointer
Definition: dsa.h:78
#define MCXT_ALLOC_HUGE
Definition: fe_memutils.h:16
dsa_pointer tbm_prepare_shared_iterate(TIDBitmap *tbm)
Definition: tidbitmap.c:792
dsa_pointer dsa_allocate_extended(dsa_area *area, Size size, int flags)
Definition: dsa.c:664
void tbm_union(TIDBitmap *a, const TIDBitmap *b)
Definition: tidbitmap.c:484
dsa_pointer spages
Definition: tidbitmap.c:196
struct pagetable_hash * pagetable
Definition: tidbitmap.c:152
static void output(uint64 loop_count)
TBMSharedIteratorState * state
Definition: tidbitmap.c:219
void * MemoryContextAllocExtended(MemoryContext context, Size size, int flags)
Definition: mcxt.c:813
#define BITNUM(x)
Definition: tidbitmap.c:77
TIDBitmap * tbm_create(long maxbytes, dsa_area *dsa)
Definition: tidbitmap.c:281
static uint32 pg_atomic_sub_fetch_u32(volatile pg_atomic_uint32 *ptr, int32 sub_)
Definition: atomics.h:412
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, bool recheck)
Definition: tidbitmap.c:403
#define Min(x, y)
Definition: c.h:807
uint32 lossify_start
Definition: tidbitmap.c:158
static int tbm_shared_comparator(const void *left, const void *right, void *arg)
Definition: tidbitmap.c:1468
#define DSA_ALLOC_ZERO
Definition: dsa.h:75
dsa_pointer schunks
Definition: tidbitmap.c:197
PTEntryArray * ptbase
Definition: tidbitmap.c:220
#define MemSet(start, val, len)
Definition: c.h:858
static uint32 pg_atomic_add_fetch_u32(volatile pg_atomic_uint32 *ptr, int32 add_)
Definition: atomics.h:397
Datum idx(PG_FUNCTION_ARGS)
Definition: _int_op.c:264
uint32 BlockNumber
Definition: block.h:31
#define BITS_PER_BITMAPWORD
Definition: bitmapset.h:33
PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER]
Definition: tidbitmap.c:113
static void tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
Definition: tidbitmap.c:967
NodeTag
Definition: nodes.h:26
static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
Definition: tidbitmap.c:507
#define WORDS_PER_CHUNK
Definition: tidbitmap.c:82
uint64 dsa_pointer
Definition: dsa.h:62
uint32 bitmapword
Definition: bitmapset.h:34
dsa_pointer dsapagetable
Definition: tidbitmap.c:163
TBMStatus
Definition: tidbitmap.c:127
#define WORDS_PER_PAGE
Definition: tidbitmap.c:80
uint16 OffsetNumber
Definition: off.h:24
MemoryContext mcxt
Definition: tidbitmap.c:150
Definition: type.h:89
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1715
PagetableEntry entry1
Definition: tidbitmap.c:159
BlockNumber blockno
Definition: tidbitmap.h:42
void * dsa_get_address(dsa_area *area, dsa_pointer dp)
Definition: dsa.c:924
void pfree(void *pointer)
Definition: mcxt.c:950
char * Pointer
Definition: c.h:245
struct TBMSharedIteratorState TBMSharedIteratorState
dsa_pointer ptpages
Definition: tidbitmap.c:165
#define ERROR
Definition: elog.h:43
int schunkptr
Definition: tidbitmap.c:180
BlockNumber blockno
Definition: tidbitmap.c:100
static int tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
Definition: tidbitmap.c:937
int spageptr
Definition: tidbitmap.c:179
#define dsa_allocate0(area, size)
Definition: dsa.h:88
void tbm_free(TIDBitmap *tbm)
Definition: tidbitmap.c:348
OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]
Definition: tidbitmap.h:46
TBMIterateResult output
Definition: tidbitmap.c:223
static const PagetableEntry * tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:1195
dsa_pointer ptchunks
Definition: tidbitmap.c:166
int maxentries
Definition: tidbitmap.c:154
unsigned int uint32
Definition: c.h:268
MemoryContext CurrentMemoryContext
Definition: mcxt.c:37
void tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
Definition: tidbitmap.c:367
static void pagetable_free(pagetable_hash *pagetable, void *pointer)
Definition: tidbitmap.c:1552
NodeTag type
Definition: tidbitmap.c:149
#define WORDNUM(x)
Definition: tidbitmap.c:76
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition: lwlock.c:667
void qsort_arg(void *base, size_t nel, size_t elsize, qsort_arg_comparator cmp, void *arg)
Definition: qsort_arg.c:113
TIDBitmap * tbm
Definition: tidbitmap.c:178
dsa_pointer pagetable
Definition: tidbitmap.c:195
void * palloc0(Size size)
Definition: mcxt.c:878
PTIterationArray * ptpages
Definition: tidbitmap.c:221
bool tbm_is_empty(const TIDBitmap *tbm)
Definition: tidbitmap.c:696
pg_atomic_uint32 refcount
Definition: tidbitmap.c:112
int nchunks
Definition: tidbitmap.c:156
PagetableEntry ** spages
Definition: tidbitmap.c:161
#define PAGES_PER_CHUNK
Definition: tidbitmap.c:72
TBMIterateResult * tbm_iterate(TBMIterator *iterator)
Definition: tidbitmap.c:997
bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)]
Definition: tidbitmap.c:104
int npages
Definition: tidbitmap.c:155
#define Max(x, y)
Definition: c.h:801
#define DSA_ALLOC_HUGE
Definition: dsa.h:73
#define makeNode(_type_)
Definition: nodes.h:557
static uint32 hash_blockno(BlockNumber b)
Definition: tidbitmap.c:245
#define NULL
Definition: c.h:229
#define Assert(condition)
Definition: c.h:676
PagetableEntry ** schunks
Definition: tidbitmap.c:162
#define MCXT_ALLOC_ZERO
Definition: fe_memutils.h:19
TBMIterateResult output
Definition: tidbitmap.c:182
size_t Size
Definition: c.h:356
TBMIterator * tbm_begin_iterate(TIDBitmap *tbm)
Definition: tidbitmap.c:715
TBMIterateResult * tbm_shared_iterate(TBMSharedIterator *iterator)
Definition: tidbitmap.c:1078
#define InvalidBlockNumber
Definition: block.h:33
TBMStatus status
Definition: tidbitmap.c:151
#define MAX_TUPLES_PER_PAGE
Definition: tidbitmap.c:55
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1111
#define ItemPointerGetOffsetNumber(pointer)
Definition: itemptr.h:95
static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:1275
dsa_pointer dsapagetableold
Definition: tidbitmap.c:164
struct PagetableEntry PagetableEntry
#define DsaPointerIsValid(x)
Definition: dsa.h:81
void tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:469
Definition: dsa.c:354
void dsa_free(dsa_area *area, dsa_pointer dp)
Definition: dsa.c:812
void * palloc(Size size)
Definition: mcxt.c:849
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:707
pg_atomic_uint32 refcount
Definition: tidbitmap.c:209
int i
static void tbm_lossify(TIDBitmap *tbm)
Definition: tidbitmap.c:1381
void * arg
static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:1309
int schunkbit
Definition: tidbitmap.c:181
static PagetableEntry * tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:1228
TBMIteratingState iterating
Definition: tidbitmap.c:157
static void pg_atomic_init_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:234
#define elog
Definition: elog.h:219
#define ItemPointerGetBlockNumber(pointer)
Definition: itemptr.h:76
#define qsort(a, b, c, d)
Definition: port.h:443
TBMIteratingState
Definition: tidbitmap.c:137
static void * pagetable_allocate(pagetable_hash *pagetable, Size size)
Definition: tidbitmap.c:1524
int nentries
Definition: tidbitmap.c:153
dsa_area * dsa
Definition: tidbitmap.c:167
struct PTEntryArray PTEntryArray
TBMSharedIterator * tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
Definition: tidbitmap.c:1491
int index[FLEXIBLE_ARRAY_MEMBER]
Definition: tidbitmap.c:210
static void tbm_create_pagetable(TIDBitmap *tbm)
Definition: tidbitmap.c:318
PTIterationArray * ptchunks
Definition: tidbitmap.c:222
#define dsa_allocate(area, size)
Definition: dsa.h:84
void tbm_end_shared_iterate(TBMSharedIterator *iterator)
Definition: tidbitmap.c:1184