PostgreSQL Source Code  git master
tidbitmap.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * tidbitmap.c
4  * PostgreSQL tuple-id (TID) bitmap package
5  *
6  * This module provides bitmap data structures that are spiritually
7  * similar to Bitmapsets, but are specially adapted to store sets of
8  * tuple identifiers (TIDs), or ItemPointers. In particular, the division
9  * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
10  * Also, since we wish to be able to store very large tuple sets in
11  * memory with this data structure, we support "lossy" storage, in which
12  * we no longer remember individual tuple offsets on a page but only the
13  * fact that a particular page needs to be visited.
14  *
15  * The "lossy" storage uses one bit per disk page, so at the standard 8K
16  * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
17  * of memory. People pushing around tables of that size should have a
18  * couple of Mb to spare, so we don't worry about providing a second level
19  * of lossiness. In theory we could fall back to page ranges at some
20  * point, but for now that seems useless complexity.
21  *
22  * We also support the notion of candidate matches, or rechecking. This
23  * means we know that a search need visit only some tuples on a page,
24  * but we are not certain that all of those tuples are real matches.
25  * So the eventual heap scan must recheck the quals for these tuples only,
26  * rather than rechecking the quals for all tuples on the page as in the
27  * lossy-bitmap case. Rechecking can be specified when TIDs are inserted
28  * into a bitmap, and it can also happen internally when we AND a lossy
29  * and a non-lossy page.
30  *
31  *
32  * Copyright (c) 2003-2017, PostgreSQL Global Development Group
33  *
34  * IDENTIFICATION
35  * src/backend/nodes/tidbitmap.c
36  *
37  *-------------------------------------------------------------------------
38  */
39 #include "postgres.h"
40 
41 #include <limits.h>
42 
43 #include "access/htup_details.h"
44 #include "nodes/bitmapset.h"
45 #include "nodes/tidbitmap.h"
46 #include "storage/lwlock.h"
47 #include "utils/dsa.h"
48 #include "utils/hashutils.h"
49 
/*
 * The maximum number of tuples per page is not large (typically 256 with
 * 8K pages, or 1024 with 32K pages).  So there's not much point in making
 * the per-page bitmaps variable size.  We just legislate that the size
 * is this:
 */
#define MAX_TUPLES_PER_PAGE  MaxHeapTuplesPerPage

/*
 * When we have to switch over to lossy storage, we use a data structure
 * with one bit per page, where all pages having the same number DIV
 * PAGES_PER_CHUNK are aggregated into one chunk.  When a chunk is present
 * and has the bit set for a given page, there must not be a per-page entry
 * for that page in the page table.
 *
 * We actually store both exact pages and lossy chunks in the same hash
 * table, using identical data structures.  (This is because the memory
 * management for hashtables doesn't easily/efficiently allow space to be
 * transferred easily from one hashtable to another.)  Therefore it's best
 * if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
 * too different.  But we also want PAGES_PER_CHUNK to be a power of 2 to
 * avoid expensive integer remainder operations.  So, define it like this:
 */
#define PAGES_PER_CHUNK  (BLCKSZ / 32)

/* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */

#define WORDNUM(x)	((x) / BITS_PER_BITMAPWORD)
#define BITNUM(x)	((x) % BITS_PER_BITMAPWORD)

/* number of active words for an exact page: */
#define WORDS_PER_PAGE	((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
/* number of active words for a lossy chunk: */
#define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
85 /*
86  * The hashtable entries are represented by this data structure. For
87  * an exact page, blockno is the page number and bit k of the bitmap
88  * represents tuple offset k+1. For a lossy chunk, blockno is the first
89  * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
90  * bit k represents page blockno+k. Note that it is not possible to
91  * have exact storage for the first page of a chunk if we are using
92  * lossy storage for any page in the chunk's range, since the same
93  * hashtable entry has to serve both purposes.
94  *
95  * recheck is used only on exact pages --- it indicates that although
96  * only the stated tuples need be checked, the full index qual condition
97  * must be checked for each (ie, these are candidate matches).
98  */
99 typedef struct PagetableEntry
100 {
101  BlockNumber blockno; /* page number (hashtable key) */
102  char status; /* hash entry status */
103  bool ischunk; /* T = lossy storage, F = exact */
104  bool recheck; /* should the tuples be rechecked? */
107 
108 /*
109  * Holds array of pagetable entries.
110  */
111 typedef struct PTEntryArray
112 {
113  pg_atomic_uint32 refcount; /* no. of iterator attached */
114  PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER];
115 } PTEntryArray;
116 
/*
 * We want to avoid the overhead of creating the hashtable, which is
 * comparatively large, when not necessary.  Particularly when we are using a
 * bitmap scan on the inside of a nestloop join: a bitmap may well live only
 * long enough to accumulate one entry in such cases.  We therefore avoid
 * creating an actual hashtable until we need two pagetable entries.  When
 * just one pagetable entry is needed, we store it in a fixed field of
 * TIDBitMap.  (NOTE: we don't get rid of the hashtable if the bitmap later
 * shrinks down to zero or one page again.  So, status can be TBM_HASH even
 * when nentries is zero or one.)
 */
typedef enum
{
	TBM_EMPTY,					/* no hashtable, nentries == 0 */
	TBM_ONE_PAGE,				/* entry1 contains the single entry */
	TBM_HASH					/* pagetable is valid, entry1 is not */
} TBMStatus;
134 
/*
 * Current iterating state of the TBM.
 */
typedef enum
{
	TBM_NOT_ITERATING,			/* not yet converted to page and chunk array */
	TBM_ITERATING_PRIVATE,		/* converted to local page and chunk array */
	TBM_ITERATING_SHARED		/* converted to shared page and chunk array */
} TBMIteratingState;
145 /*
146  * Here is the representation for a whole TIDBitMap:
147  */
148 struct TIDBitmap
149 {
150  NodeTag type; /* to make it a valid Node */
151  MemoryContext mcxt; /* memory context containing me */
152  TBMStatus status; /* see codes above */
153  struct pagetable_hash *pagetable; /* hash table of PagetableEntry's */
154  int nentries; /* number of entries in pagetable */
155  int maxentries; /* limit on same to meet maxbytes */
156  int npages; /* number of exact entries in pagetable */
157  int nchunks; /* number of lossy entries in pagetable */
158  TBMIteratingState iterating; /* tbm_begin_iterate called? */
159  uint32 lossify_start; /* offset to start lossifying hashtable at */
160  PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */
161  /* these are valid when iterating is true: */
162  PagetableEntry **spages; /* sorted exact-page list, or NULL */
163  PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */
164  dsa_pointer dsapagetable; /* dsa_pointer to the element array */
165  dsa_pointer dsapagetableold; /* dsa_pointer to the old element array */
166  dsa_pointer ptpages; /* dsa_pointer to the page array */
167  dsa_pointer ptchunks; /* dsa_pointer to the chunk array */
168  dsa_area *dsa; /* reference to per-query dsa area */
169 };
170 
171 /*
172  * When iterating over a bitmap in sorted order, a TBMIterator is used to
173  * track our progress. There can be several iterators scanning the same
174  * bitmap concurrently. Note that the bitmap becomes read-only as soon as
175  * any iterator is created.
176  */
178 {
179  TIDBitmap *tbm; /* TIDBitmap we're iterating over */
180  int spageptr; /* next spages index */
181  int schunkptr; /* next schunks index */
182  int schunkbit; /* next bit to check in current schunk */
183  TBMIterateResult output; /* MUST BE LAST (because variable-size) */
184 };
185 
186 /*
187  * Holds the shared members of the iterator so that multiple processes
188  * can jointly iterate.
189  */
191 {
192  int nentries; /* number of entries in pagetable */
193  int maxentries; /* limit on same to meet maxbytes */
194  int npages; /* number of exact entries in pagetable */
195  int nchunks; /* number of lossy entries in pagetable */
196  dsa_pointer pagetable; /* dsa pointers to head of pagetable data */
197  dsa_pointer spages; /* dsa pointer to page array */
198  dsa_pointer schunks; /* dsa pointer to chunk array */
199  LWLock lock; /* lock to protect below members */
200  int spageptr; /* next spages index */
201  int schunkptr; /* next schunks index */
202  int schunkbit; /* next bit to check in current schunk */
204 
205 /*
206  * pagetable iteration array.
207  */
208 typedef struct PTIterationArray
209 {
210  pg_atomic_uint32 refcount; /* no. of iterator attached */
211  int index[FLEXIBLE_ARRAY_MEMBER]; /* index array */
213 
214 /*
215  * same as TBMIterator, but it is used for joint iteration, therefore this
216  * also holds a reference to the shared state.
217  */
219 {
220  TBMSharedIteratorState *state; /* shared state */
221  PTEntryArray *ptbase; /* pagetable element array */
222  PTIterationArray *ptpages; /* sorted exact page index list */
223  PTIterationArray *ptchunks; /* sorted lossy page index list */
224  TBMIterateResult output; /* MUST BE LAST (because variable-size) */
225 };
226 
227 /* Local function prototypes */
228 static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
229 static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
230  const TIDBitmap *b);
231 static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
232  BlockNumber pageno);
234 static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
235 static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
236 static void tbm_lossify(TIDBitmap *tbm);
237 static int tbm_comparator(const void *left, const void *right);
238 static int tbm_shared_comparator(const void *left, const void *right,
239  void *arg);
240 
241 /* define hashtable mapping block numbers to PagetableEntry's */
242 #define SH_USE_NONDEFAULT_ALLOCATOR
243 #define SH_PREFIX pagetable
244 #define SH_ELEMENT_TYPE PagetableEntry
245 #define SH_KEY_TYPE BlockNumber
246 #define SH_KEY blockno
247 #define SH_HASH_KEY(tb, key) murmurhash32(key)
248 #define SH_EQUAL(tb, a, b) a == b
249 #define SH_SCOPE static inline
250 #define SH_DEFINE
251 #define SH_DECLARE
252 #include "lib/simplehash.h"
253 
254 
255 /*
256  * tbm_create - create an initially-empty bitmap
257  *
258  * The bitmap will live in the memory context that is CurrentMemoryContext
259  * at the time of this call. It will be limited to (approximately) maxbytes
260  * total memory consumption. If the DSA passed to this function is not NULL
261  * then the memory for storing elements of the underlying page table will
262  * be allocated from the DSA.
263  */
264 TIDBitmap *
265 tbm_create(long maxbytes, dsa_area *dsa)
266 {
267  TIDBitmap *tbm;
268 
269  /* Create the TIDBitmap struct and zero all its fields */
270  tbm = makeNode(TIDBitmap);
271 
272  tbm->mcxt = CurrentMemoryContext;
273  tbm->status = TBM_EMPTY;
274 
275  tbm->maxentries = (int) tbm_calculate_entries(maxbytes);
276  tbm->lossify_start = 0;
277  tbm->dsa = dsa;
280  tbm->ptpages = InvalidDsaPointer;
282 
283  return tbm;
284 }
285 
286 /*
287  * Actually create the hashtable. Since this is a moderately expensive
288  * proposition, we don't do it until we have to.
289  */
290 static void
292 {
293  Assert(tbm->status != TBM_HASH);
294  Assert(tbm->pagetable == NULL);
295 
296  tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);
297 
298  /* If entry1 is valid, push it into the hashtable */
299  if (tbm->status == TBM_ONE_PAGE)
300  {
301  PagetableEntry *page;
302  bool found;
303  char oldstatus;
304 
305  page = pagetable_insert(tbm->pagetable,
306  tbm->entry1.blockno,
307  &found);
308  Assert(!found);
309  oldstatus = page->status;
310  memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
311  page->status = oldstatus;
312  }
313 
314  tbm->status = TBM_HASH;
315 }
316 
317 /*
318  * tbm_free - free a TIDBitmap
319  */
320 void
322 {
323  if (tbm->pagetable)
324  pagetable_destroy(tbm->pagetable);
325  if (tbm->spages)
326  pfree(tbm->spages);
327  if (tbm->schunks)
328  pfree(tbm->schunks);
329  pfree(tbm);
330 }
331 
332 /*
333  * tbm_free_shared_area - free shared state
334  *
335  * Free shared iterator state, Also free shared pagetable and iterator arrays
336  * memory if they are not referred by any of the shared iterator i.e recount
337  * is becomes 0.
338  */
339 void
341 {
342  TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
343  PTEntryArray *ptbase;
344  PTIterationArray *ptpages;
345  PTIterationArray *ptchunks;
346 
347  if (DsaPointerIsValid(istate->pagetable))
348  {
349  ptbase = dsa_get_address(dsa, istate->pagetable);
350  if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
351  dsa_free(dsa, istate->pagetable);
352  }
353  if (DsaPointerIsValid(istate->spages))
354  {
355  ptpages = dsa_get_address(dsa, istate->spages);
356  if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
357  dsa_free(dsa, istate->spages);
358  }
359  if (DsaPointerIsValid(istate->schunks))
360  {
361  ptchunks = dsa_get_address(dsa, istate->schunks);
362  if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
363  dsa_free(dsa, istate->schunks);
364  }
365 
366  dsa_free(dsa, dp);
367 }
368 
369 /*
370  * tbm_add_tuples - add some tuple IDs to a TIDBitmap
371  *
372  * If recheck is true, then the recheck flag will be set in the
373  * TBMIterateResult when any of these tuples are reported out.
374  */
375 void
376 tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
377  bool recheck)
378 {
380  PagetableEntry *page = NULL; /* only valid when currblk is valid */
381  int i;
382 
384  for (i = 0; i < ntids; i++)
385  {
386  BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
388  int wordnum,
389  bitnum;
390 
391  /* safety check to ensure we don't overrun bit array bounds */
392  if (off < 1 || off > MAX_TUPLES_PER_PAGE)
393  elog(ERROR, "tuple offset out of range: %u", off);
394 
395  /*
396  * Look up target page unless we already did. This saves cycles when
397  * the input includes consecutive tuples on the same page, which is
398  * common enough to justify an extra test here.
399  */
400  if (blk != currblk)
401  {
402  if (tbm_page_is_lossy(tbm, blk))
403  page = NULL; /* remember page is lossy */
404  else
405  page = tbm_get_pageentry(tbm, blk);
406  currblk = blk;
407  }
408 
409  if (page == NULL)
410  continue; /* whole page is already marked */
411 
412  if (page->ischunk)
413  {
414  /* The page is a lossy chunk header, set bit for itself */
415  wordnum = bitnum = 0;
416  }
417  else
418  {
419  /* Page is exact, so set bit for individual tuple */
420  wordnum = WORDNUM(off - 1);
421  bitnum = BITNUM(off - 1);
422  }
423  page->words[wordnum] |= ((bitmapword) 1 << bitnum);
424  page->recheck |= recheck;
425 
426  if (tbm->nentries > tbm->maxentries)
427  {
428  tbm_lossify(tbm);
429  /* Page could have been converted to lossy, so force new lookup */
430  currblk = InvalidBlockNumber;
431  }
432  }
433 }
434 
435 /*
436  * tbm_add_page - add a whole page to a TIDBitmap
437  *
438  * This causes the whole page to be reported (with the recheck flag)
439  * when the TIDBitmap is scanned.
440  */
441 void
443 {
444  /* Enter the page in the bitmap, or mark it lossy if already present */
445  tbm_mark_page_lossy(tbm, pageno);
446  /* If we went over the memory limit, lossify some more pages */
447  if (tbm->nentries > tbm->maxentries)
448  tbm_lossify(tbm);
449 }
450 
451 /*
452  * tbm_union - set union
453  *
454  * a is modified in-place, b is not changed
455  */
456 void
458 {
459  Assert(!a->iterating);
460  /* Nothing to do if b is empty */
461  if (b->nentries == 0)
462  return;
463  /* Scan through chunks and pages in b, merge into a */
464  if (b->status == TBM_ONE_PAGE)
465  tbm_union_page(a, &b->entry1);
466  else
467  {
468  pagetable_iterator i;
469  PagetableEntry *bpage;
470 
471  Assert(b->status == TBM_HASH);
472  pagetable_start_iterate(b->pagetable, &i);
473  while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
474  tbm_union_page(a, bpage);
475  }
476 }
477 
478 /* Process one page of b during a union op */
479 static void
481 {
482  PagetableEntry *apage;
483  int wordnum;
484 
485  if (bpage->ischunk)
486  {
487  /* Scan b's chunk, mark each indicated page lossy in a */
488  for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
489  {
490  bitmapword w = bpage->words[wordnum];
491 
492  if (w != 0)
493  {
494  BlockNumber pg;
495 
496  pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
497  while (w != 0)
498  {
499  if (w & 1)
500  tbm_mark_page_lossy(a, pg);
501  pg++;
502  w >>= 1;
503  }
504  }
505  }
506  }
507  else if (tbm_page_is_lossy(a, bpage->blockno))
508  {
509  /* page is already lossy in a, nothing to do */
510  return;
511  }
512  else
513  {
514  apage = tbm_get_pageentry(a, bpage->blockno);
515  if (apage->ischunk)
516  {
517  /* The page is a lossy chunk header, set bit for itself */
518  apage->words[0] |= ((bitmapword) 1 << 0);
519  }
520  else
521  {
522  /* Both pages are exact, merge at the bit level */
523  for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
524  apage->words[wordnum] |= bpage->words[wordnum];
525  apage->recheck |= bpage->recheck;
526  }
527  }
528 
529  if (a->nentries > a->maxentries)
530  tbm_lossify(a);
531 }
532 
533 /*
534  * tbm_intersect - set intersection
535  *
536  * a is modified in-place, b is not changed
537  */
538 void
540 {
541  Assert(!a->iterating);
542  /* Nothing to do if a is empty */
543  if (a->nentries == 0)
544  return;
545  /* Scan through chunks and pages in a, try to match to b */
546  if (a->status == TBM_ONE_PAGE)
547  {
548  if (tbm_intersect_page(a, &a->entry1, b))
549  {
550  /* Page is now empty, remove it from a */
551  Assert(!a->entry1.ischunk);
552  a->npages--;
553  a->nentries--;
554  Assert(a->nentries == 0);
555  a->status = TBM_EMPTY;
556  }
557  }
558  else
559  {
560  pagetable_iterator i;
561  PagetableEntry *apage;
562 
563  Assert(a->status == TBM_HASH);
564  pagetable_start_iterate(a->pagetable, &i);
565  while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
566  {
567  if (tbm_intersect_page(a, apage, b))
568  {
569  /* Page or chunk is now empty, remove it from a */
570  if (apage->ischunk)
571  a->nchunks--;
572  else
573  a->npages--;
574  a->nentries--;
575  if (!pagetable_delete(a->pagetable, apage->blockno))
576  elog(ERROR, "hash table corrupted");
577  }
578  }
579  }
580 }
581 
582 /*
583  * Process one page of a during an intersection op
584  *
585  * Returns true if apage is now empty and should be deleted from a
586  */
587 static bool
589 {
590  const PagetableEntry *bpage;
591  int wordnum;
592 
593  if (apage->ischunk)
594  {
595  /* Scan each bit in chunk, try to clear */
596  bool candelete = true;
597 
598  for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
599  {
600  bitmapword w = apage->words[wordnum];
601 
602  if (w != 0)
603  {
604  bitmapword neww = w;
605  BlockNumber pg;
606  int bitnum;
607 
608  pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
609  bitnum = 0;
610  while (w != 0)
611  {
612  if (w & 1)
613  {
614  if (!tbm_page_is_lossy(b, pg) &&
615  tbm_find_pageentry(b, pg) == NULL)
616  {
617  /* Page is not in b at all, lose lossy bit */
618  neww &= ~((bitmapword) 1 << bitnum);
619  }
620  }
621  pg++;
622  bitnum++;
623  w >>= 1;
624  }
625  apage->words[wordnum] = neww;
626  if (neww != 0)
627  candelete = false;
628  }
629  }
630  return candelete;
631  }
632  else if (tbm_page_is_lossy(b, apage->blockno))
633  {
634  /*
635  * Some of the tuples in 'a' might not satisfy the quals for 'b', but
636  * because the page 'b' is lossy, we don't know which ones. Therefore
637  * we mark 'a' as requiring rechecks, to indicate that at most those
638  * tuples set in 'a' are matches.
639  */
640  apage->recheck = true;
641  return false;
642  }
643  else
644  {
645  bool candelete = true;
646 
647  bpage = tbm_find_pageentry(b, apage->blockno);
648  if (bpage != NULL)
649  {
650  /* Both pages are exact, merge at the bit level */
651  Assert(!bpage->ischunk);
652  for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
653  {
654  apage->words[wordnum] &= bpage->words[wordnum];
655  if (apage->words[wordnum] != 0)
656  candelete = false;
657  }
658  apage->recheck |= bpage->recheck;
659  }
660  /* If there is no matching b page, we can just delete the a page */
661  return candelete;
662  }
663 }
664 
665 /*
666  * tbm_is_empty - is a TIDBitmap completely empty?
667  */
668 bool
670 {
671  return (tbm->nentries == 0);
672 }
673 
674 /*
675  * tbm_begin_iterate - prepare to iterate through a TIDBitmap
676  *
677  * The TBMIterator struct is created in the caller's memory context.
678  * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
679  * okay to just allow the memory context to be released, too. It is caller's
680  * responsibility not to touch the TBMIterator anymore once the TIDBitmap
681  * is freed.
682  *
683  * NB: after this is called, it is no longer allowed to modify the contents
684  * of the bitmap. However, you can call this multiple times to scan the
685  * contents repeatedly, including parallel scans.
686  */
687 TBMIterator *
689 {
690  TBMIterator *iterator;
691 
693 
694  /*
695  * Create the TBMIterator struct, with enough trailing space to serve the
696  * needs of the TBMIterateResult sub-struct.
697  */
698  iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
700  iterator->tbm = tbm;
701 
702  /*
703  * Initialize iteration pointers.
704  */
705  iterator->spageptr = 0;
706  iterator->schunkptr = 0;
707  iterator->schunkbit = 0;
708 
709  /*
710  * If we have a hashtable, create and fill the sorted page lists, unless
711  * we already did that for a previous iterator. Note that the lists are
712  * attached to the bitmap not the iterator, so they can be used by more
713  * than one iterator.
714  */
715  if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
716  {
717  pagetable_iterator i;
718  PagetableEntry *page;
719  int npages;
720  int nchunks;
721 
722  if (!tbm->spages && tbm->npages > 0)
723  tbm->spages = (PagetableEntry **)
725  tbm->npages * sizeof(PagetableEntry *));
726  if (!tbm->schunks && tbm->nchunks > 0)
727  tbm->schunks = (PagetableEntry **)
729  tbm->nchunks * sizeof(PagetableEntry *));
730 
731  npages = nchunks = 0;
732  pagetable_start_iterate(tbm->pagetable, &i);
733  while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
734  {
735  if (page->ischunk)
736  tbm->schunks[nchunks++] = page;
737  else
738  tbm->spages[npages++] = page;
739  }
740  Assert(npages == tbm->npages);
741  Assert(nchunks == tbm->nchunks);
742  if (npages > 1)
743  qsort(tbm->spages, npages, sizeof(PagetableEntry *),
745  if (nchunks > 1)
746  qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
748  }
749 
751 
752  return iterator;
753 }
754 
755 /*
756  * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
757  *
758  * The necessary shared state will be allocated from the DSA passed to
759  * tbm_create, so that multiple processes can attach to it and iterate jointly.
760  *
761  * This will convert the pagetable hash into page and chunk array of the index
762  * into pagetable array.
763  */
766 {
767  dsa_pointer dp;
768  TBMSharedIteratorState *istate;
769  PTEntryArray *ptbase = NULL;
770  PTIterationArray *ptpages = NULL;
771  PTIterationArray *ptchunks = NULL;
772 
773  Assert(tbm->dsa != NULL);
775 
776  /*
777  * Allocate TBMSharedIteratorState from DSA to hold the shared members and
778  * lock, this will also be used by multiple worker for shared iterate.
779  */
780  dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
781  istate = dsa_get_address(tbm->dsa, dp);
782 
783  /*
784  * If we're not already iterating, create and fill the sorted page lists.
785  * (If we are, the sorted page lists are already stored in the TIDBitmap,
786  * and we can just reuse them.)
787  */
788  if (tbm->iterating == TBM_NOT_ITERATING)
789  {
790  pagetable_iterator i;
791  PagetableEntry *page;
792  int idx;
793  int npages;
794  int nchunks;
795 
796  /*
797  * Allocate the page and chunk array memory from the DSA to share
798  * across multiple processes.
799  */
800  if (tbm->npages)
801  {
802  tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
803  tbm->npages * sizeof(int));
804  ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
805  pg_atomic_init_u32(&ptpages->refcount, 0);
806  }
807  if (tbm->nchunks)
808  {
809  tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
810  tbm->nchunks * sizeof(int));
811  ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
812  pg_atomic_init_u32(&ptchunks->refcount, 0);
813  }
814 
815  /*
816  * If TBM status is TBM_HASH then iterate over the pagetable and
817  * convert it to page and chunk arrays. But if it's in the
818  * TBM_ONE_PAGE mode then directly allocate the space for one entry
819  * from the DSA.
820  */
821  npages = nchunks = 0;
822  if (tbm->status == TBM_HASH)
823  {
824  ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
825 
826  pagetable_start_iterate(tbm->pagetable, &i);
827  while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
828  {
829  idx = page - ptbase->ptentry;
830  if (page->ischunk)
831  ptchunks->index[nchunks++] = idx;
832  else
833  ptpages->index[npages++] = idx;
834  }
835 
836  Assert(npages == tbm->npages);
837  Assert(nchunks == tbm->nchunks);
838  }
839  else if (tbm->status == TBM_ONE_PAGE)
840  {
841  /*
842  * In one page mode allocate the space for one pagetable entry,
843  * initialize it, and directly store its index (i.e. 0) in the
844  * page array.
845  */
846  tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
847  sizeof(PagetableEntry));
848  ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
849  memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry));
850  ptpages->index[0] = 0;
851  }
852 
853  if (ptbase != NULL)
854  pg_atomic_init_u32(&ptbase->refcount, 0);
855  if (npages > 1)
856  qsort_arg((void *) (ptpages->index), npages, sizeof(int),
857  tbm_shared_comparator, (void *) ptbase->ptentry);
858  if (nchunks > 1)
859  qsort_arg((void *) (ptchunks->index), nchunks, sizeof(int),
860  tbm_shared_comparator, (void *) ptbase->ptentry);
861  }
862 
863  /*
864  * Store the TBM members in the shared state so that we can share them
865  * across multiple processes.
866  */
867  istate->nentries = tbm->nentries;
868  istate->maxentries = tbm->maxentries;
869  istate->npages = tbm->npages;
870  istate->nchunks = tbm->nchunks;
871  istate->pagetable = tbm->dsapagetable;
872  istate->spages = tbm->ptpages;
873  istate->schunks = tbm->ptchunks;
874 
875  ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
876  ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
877  ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
878 
879  /*
880  * For every shared iterator, referring to pagetable and iterator array,
881  * increase the refcount by 1 so that while freeing the shared iterator we
882  * don't free pagetable and iterator array until its refcount becomes 0.
883  */
884  if (ptbase != NULL)
885  pg_atomic_add_fetch_u32(&ptbase->refcount, 1);
886  if (ptpages != NULL)
887  pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
888  if (ptchunks != NULL)
889  pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);
890 
891  /* Initialize the iterator lock */
893 
894  /* Initialize the shared iterator state */
895  istate->schunkbit = 0;
896  istate->schunkptr = 0;
897  istate->spageptr = 0;
898 
900 
901  return dp;
902 }
903 
904 /*
905  * tbm_extract_page_tuple - extract the tuple offsets from a page
906  *
907  * The extracted offsets are stored into TBMIterateResult.
908  */
909 static inline int
911 {
912  int wordnum;
913  int ntuples = 0;
914 
915  for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
916  {
917  bitmapword w = page->words[wordnum];
918 
919  if (w != 0)
920  {
921  int off = wordnum * BITS_PER_BITMAPWORD + 1;
922 
923  while (w != 0)
924  {
925  if (w & 1)
926  output->offsets[ntuples++] = (OffsetNumber) off;
927  off++;
928  w >>= 1;
929  }
930  }
931  }
932 
933  return ntuples;
934 }
935 
936 /*
937  * tbm_advance_schunkbit - Advance the chunkbit
938  */
939 static inline void
940 tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
941 {
942  int schunkbit = *schunkbitp;
943 
944  while (schunkbit < PAGES_PER_CHUNK)
945  {
946  int wordnum = WORDNUM(schunkbit);
947  int bitnum = BITNUM(schunkbit);
948 
949  if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
950  break;
951  schunkbit++;
952  }
953 
954  *schunkbitp = schunkbit;
955 }
956 
957 /*
958  * tbm_iterate - scan through next page of a TIDBitmap
959  *
960  * Returns a TBMIterateResult representing one page, or NULL if there are
961  * no more pages to scan. Pages are guaranteed to be delivered in numerical
962  * order. If result->ntuples < 0, then the bitmap is "lossy" and failed to
963  * remember the exact tuples to look at on this page --- the caller must
964  * examine all tuples on the page and check if they meet the intended
965  * condition. If result->recheck is true, only the indicated tuples need
966  * be examined, but the condition must be rechecked anyway. (For ease of
967  * testing, recheck is always set true when ntuples < 0.)
968  */
971 {
972  TIDBitmap *tbm = iterator->tbm;
973  TBMIterateResult *output = &(iterator->output);
974 
976 
977  /*
978  * If lossy chunk pages remain, make sure we've advanced schunkptr/
979  * schunkbit to the next set bit.
980  */
981  while (iterator->schunkptr < tbm->nchunks)
982  {
983  PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
984  int schunkbit = iterator->schunkbit;
985 
986  tbm_advance_schunkbit(chunk, &schunkbit);
987  if (schunkbit < PAGES_PER_CHUNK)
988  {
989  iterator->schunkbit = schunkbit;
990  break;
991  }
992  /* advance to next chunk */
993  iterator->schunkptr++;
994  iterator->schunkbit = 0;
995  }
996 
997  /*
998  * If both chunk and per-page data remain, must output the numerically
999  * earlier page.
1000  */
1001  if (iterator->schunkptr < tbm->nchunks)
1002  {
1003  PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
1004  BlockNumber chunk_blockno;
1005 
1006  chunk_blockno = chunk->blockno + iterator->schunkbit;
1007  if (iterator->spageptr >= tbm->npages ||
1008  chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
1009  {
1010  /* Return a lossy page indicator from the chunk */
1011  output->blockno = chunk_blockno;
1012  output->ntuples = -1;
1013  output->recheck = true;
1014  iterator->schunkbit++;
1015  return output;
1016  }
1017  }
1018 
1019  if (iterator->spageptr < tbm->npages)
1020  {
1021  PagetableEntry *page;
1022  int ntuples;
1023 
1024  /* In ONE_PAGE state, we don't allocate an spages[] array */
1025  if (tbm->status == TBM_ONE_PAGE)
1026  page = &tbm->entry1;
1027  else
1028  page = tbm->spages[iterator->spageptr];
1029 
1030  /* scan bitmap to extract individual offset numbers */
1031  ntuples = tbm_extract_page_tuple(page, output);
1032  output->blockno = page->blockno;
1033  output->ntuples = ntuples;
1034  output->recheck = page->recheck;
1035  iterator->spageptr++;
1036  return output;
1037  }
1038 
1039  /* Nothing more in the bitmap */
1040  return NULL;
1041 }
1042 
1043 /*
1044  * tbm_shared_iterate - scan through next page of a TIDBitmap
1045  *
1046  * As above, but this will iterate using an iterator which is shared
1047  * across multiple processes. We need to acquire the iterator LWLock,
1048  * before accessing the shared members.
1049  */
1052 {
1053  TBMIterateResult *output = &iterator->output;
1054  TBMSharedIteratorState *istate = iterator->state;
1055  PagetableEntry *ptbase = NULL;
1056  int *idxpages = NULL;
1057  int *idxchunks = NULL;
1058 
1059  if (iterator->ptbase != NULL)
1060  ptbase = iterator->ptbase->ptentry;
1061  if (iterator->ptpages != NULL)
1062  idxpages = iterator->ptpages->index;
1063  if (iterator->ptchunks != NULL)
1064  idxchunks = iterator->ptchunks->index;
1065 
1066  /* Acquire the LWLock before accessing the shared members */
1067  LWLockAcquire(&istate->lock, LW_EXCLUSIVE);
1068 
1069  /*
1070  * If lossy chunk pages remain, make sure we've advanced schunkptr/
1071  * schunkbit to the next set bit.
1072  */
1073  while (istate->schunkptr < istate->nchunks)
1074  {
1075  PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
1076  int schunkbit = istate->schunkbit;
1077 
1078  tbm_advance_schunkbit(chunk, &schunkbit);
1079  if (schunkbit < PAGES_PER_CHUNK)
1080  {
1081  istate->schunkbit = schunkbit;
1082  break;
1083  }
1084  /* advance to next chunk */
1085  istate->schunkptr++;
1086  istate->schunkbit = 0;
1087  }
1088 
1089  /*
1090  * If both chunk and per-page data remain, must output the numerically
1091  * earlier page.
1092  */
1093  if (istate->schunkptr < istate->nchunks)
1094  {
1095  PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
1096  BlockNumber chunk_blockno;
1097 
1098  chunk_blockno = chunk->blockno + istate->schunkbit;
1099 
1100  if (istate->spageptr >= istate->npages ||
1101  chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
1102  {
1103  /* Return a lossy page indicator from the chunk */
1104  output->blockno = chunk_blockno;
1105  output->ntuples = -1;
1106  output->recheck = true;
1107  istate->schunkbit++;
1108 
1109  LWLockRelease(&istate->lock);
1110  return output;
1111  }
1112  }
1113 
1114  if (istate->spageptr < istate->npages)
1115  {
1116  PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
1117  int ntuples;
1118 
1119  /* scan bitmap to extract individual offset numbers */
1120  ntuples = tbm_extract_page_tuple(page, output);
1121  output->blockno = page->blockno;
1122  output->ntuples = ntuples;
1123  output->recheck = page->recheck;
1124  istate->spageptr++;
1125 
1126  LWLockRelease(&istate->lock);
1127 
1128  return output;
1129  }
1130 
1131  LWLockRelease(&istate->lock);
1132 
1133  /* Nothing more in the bitmap */
1134  return NULL;
1135 }
1136 
1137 /*
1138  * tbm_end_iterate - finish an iteration over a TIDBitmap
1139  *
1140  * Currently this is just a pfree, but it might do more someday. (For
1141  * instance, it could be useful to count open iterators and allow the
1142  * bitmap to return to read/write status when there are no more iterators.)
1143  */
1144 void
1146 {
1147  pfree(iterator);
1148 }
1149 
1150 /*
1151  * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
1152  *
1153  * This doesn't free any of the shared state associated with the iterator,
1154  * just our backend-private state.
1155  */
1156 void
1158 {
1159  pfree(iterator);
1160 }
1161 
1162 /*
1163  * tbm_find_pageentry - find a PagetableEntry for the pageno
1164  *
1165  * Returns NULL if there is no non-lossy entry for the pageno.
1166  */
1167 static const PagetableEntry *
1169 {
1170  const PagetableEntry *page;
1171 
1172  if (tbm->nentries == 0) /* in case pagetable doesn't exist */
1173  return NULL;
1174 
1175  if (tbm->status == TBM_ONE_PAGE)
1176  {
1177  page = &tbm->entry1;
1178  if (page->blockno != pageno)
1179  return NULL;
1180  Assert(!page->ischunk);
1181  return page;
1182  }
1183 
1184  page = pagetable_lookup(tbm->pagetable, pageno);
1185  if (page == NULL)
1186  return NULL;
1187  if (page->ischunk)
1188  return NULL; /* don't want a lossy chunk header */
1189  return page;
1190 }
1191 
1192 /*
1193  * tbm_get_pageentry - find or create a PagetableEntry for the pageno
1194  *
1195  * If new, the entry is marked as an exact (non-chunk) entry.
1196  *
1197  * This may cause the table to exceed the desired memory size. It is
1198  * up to the caller to call tbm_lossify() at the next safe point if so.
1199  */
1200 static PagetableEntry *
1202 {
1203  PagetableEntry *page;
1204  bool found;
1205 
1206  if (tbm->status == TBM_EMPTY)
1207  {
1208  /* Use the fixed slot */
1209  page = &tbm->entry1;
1210  found = false;
1211  tbm->status = TBM_ONE_PAGE;
1212  }
1213  else
1214  {
1215  if (tbm->status == TBM_ONE_PAGE)
1216  {
1217  page = &tbm->entry1;
1218  if (page->blockno == pageno)
1219  return page;
1220  /* Time to switch from one page to a hashtable */
1221  tbm_create_pagetable(tbm);
1222  }
1223 
1224  /* Look up or create an entry */
1225  page = pagetable_insert(tbm->pagetable, pageno, &found);
1226  }
1227 
1228  /* Initialize it if not present before */
1229  if (!found)
1230  {
1231  char oldstatus = page->status;
1232 
1233  MemSet(page, 0, sizeof(PagetableEntry));
1234  page->status = oldstatus;
1235  page->blockno = pageno;
1236  /* must count it too */
1237  tbm->nentries++;
1238  tbm->npages++;
1239  }
1240 
1241  return page;
1242 }
1243 
1244 /*
1245  * tbm_page_is_lossy - is the page marked as lossily stored?
1246  */
1247 static bool
1249 {
1250  PagetableEntry *page;
1251  BlockNumber chunk_pageno;
1252  int bitno;
1253 
1254  /* we can skip the lookup if there are no lossy chunks */
1255  if (tbm->nchunks == 0)
1256  return false;
1257  Assert(tbm->status == TBM_HASH);
1258 
1259  bitno = pageno % PAGES_PER_CHUNK;
1260  chunk_pageno = pageno - bitno;
1261 
1262  page = pagetable_lookup(tbm->pagetable, chunk_pageno);
1263 
1264  if (page != NULL && page->ischunk)
1265  {
1266  int wordnum = WORDNUM(bitno);
1267  int bitnum = BITNUM(bitno);
1268 
1269  if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
1270  return true;
1271  }
1272  return false;
1273 }
1274 
1275 /*
1276  * tbm_mark_page_lossy - mark the page number as lossily stored
1277  *
1278  * This may cause the table to exceed the desired memory size. It is
1279  * up to the caller to call tbm_lossify() at the next safe point if so.
1280  */
1281 static void
1283 {
1284  PagetableEntry *page;
1285  bool found;
1286  BlockNumber chunk_pageno;
1287  int bitno;
1288  int wordnum;
1289  int bitnum;
1290 
1291  /* We force the bitmap into hashtable mode whenever it's lossy */
1292  if (tbm->status != TBM_HASH)
1293  tbm_create_pagetable(tbm);
1294 
1295  bitno = pageno % PAGES_PER_CHUNK;
1296  chunk_pageno = pageno - bitno;
1297 
1298  /*
1299  * Remove any extant non-lossy entry for the page. If the page is its own
1300  * chunk header, however, we skip this and handle the case below.
1301  */
1302  if (bitno != 0)
1303  {
1304  if (pagetable_delete(tbm->pagetable, pageno))
1305  {
1306  /* It was present, so adjust counts */
1307  tbm->nentries--;
1308  tbm->npages--; /* assume it must have been non-lossy */
1309  }
1310  }
1311 
1312  /* Look up or create entry for chunk-header page */
1313  page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);
1314 
1315  /* Initialize it if not present before */
1316  if (!found)
1317  {
1318  char oldstatus = page->status;
1319 
1320  MemSet(page, 0, sizeof(PagetableEntry));
1321  page->status = oldstatus;
1322  page->blockno = chunk_pageno;
1323  page->ischunk = true;
1324  /* must count it too */
1325  tbm->nentries++;
1326  tbm->nchunks++;
1327  }
1328  else if (!page->ischunk)
1329  {
1330  char oldstatus = page->status;
1331 
1332  /* chunk header page was formerly non-lossy, make it lossy */
1333  MemSet(page, 0, sizeof(PagetableEntry));
1334  page->status = oldstatus;
1335  page->blockno = chunk_pageno;
1336  page->ischunk = true;
1337  /* we assume it had some tuple bit(s) set, so mark it lossy */
1338  page->words[0] = ((bitmapword) 1 << 0);
1339  /* adjust counts */
1340  tbm->nchunks++;
1341  tbm->npages--;
1342  }
1343 
1344  /* Now set the original target page's bit */
1345  wordnum = WORDNUM(bitno);
1346  bitnum = BITNUM(bitno);
1347  page->words[wordnum] |= ((bitmapword) 1 << bitnum);
1348 }
1349 
1350 /*
1351  * tbm_lossify - lose some information to get back under the memory limit
1352  */
1353 static void
1355 {
1356  pagetable_iterator i;
1357  PagetableEntry *page;
1358 
1359  /*
1360  * XXX Really stupid implementation: this just lossifies pages in
1361  * essentially random order. We should be paying some attention to the
1362  * number of bits set in each page, instead.
1363  *
1364  * Since we are called as soon as nentries exceeds maxentries, we should
1365  * push nentries down to significantly less than maxentries, or else we'll
1366  * just end up doing this again very soon. We shoot for maxentries/2.
1367  */
1369  Assert(tbm->status == TBM_HASH);
1370 
1371  pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
1372  while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
1373  {
1374  if (page->ischunk)
1375  continue; /* already a chunk header */
1376 
1377  /*
1378  * If the page would become a chunk header, we won't save anything by
1379  * converting it to lossy, so skip it.
1380  */
1381  if ((page->blockno % PAGES_PER_CHUNK) == 0)
1382  continue;
1383 
1384  /* This does the dirty work ... */
1385  tbm_mark_page_lossy(tbm, page->blockno);
1386 
1387  if (tbm->nentries <= tbm->maxentries / 2)
1388  {
1389  /*
1390  * We have made enough room. Remember where to start lossifying
1391  * next round, so we evenly iterate over the hashtable.
1392  */
1393  tbm->lossify_start = i.cur;
1394  break;
1395  }
1396 
1397  /*
1398  * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
1399  * hashtable and may have deleted the non-lossy chunk. We can
1400  * continue the same hash table scan, since failure to visit one
1401  * element or visiting the newly inserted element, isn't fatal.
1402  */
1403  }
1404 
1405  /*
1406  * With a big bitmap and small work_mem, it's possible that we cannot get
1407  * under maxentries. Again, if that happens, we'd end up uselessly
1408  * calling tbm_lossify over and over. To prevent this from becoming a
1409  * performance sink, force maxentries up to at least double the current
1410  * number of entries. (In essence, we're admitting inability to fit
1411  * within work_mem when we do this.) Note that this test will not fire if
1412  * we broke out of the loop early; and if we didn't, the current number of
1413  * entries is simply not reducible any further.
1414  */
1415  if (tbm->nentries > tbm->maxentries / 2)
1416  tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
1417 }
1418 
1419 /*
1420  * qsort comparator to handle PagetableEntry pointers.
1421  */
1422 static int
1423 tbm_comparator(const void *left, const void *right)
1424 {
1425  BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
1426  BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
1427 
1428  if (l < r)
1429  return -1;
1430  else if (l > r)
1431  return 1;
1432  return 0;
1433 }
1434 
1435 /*
1436  * As above, but this will get index into PagetableEntry array. Therefore,
1437  * it needs to get actual PagetableEntry using the index before comparing the
1438  * blockno.
1439  */
1440 static int
1441 tbm_shared_comparator(const void *left, const void *right, void *arg)
1442 {
1443  PagetableEntry *base = (PagetableEntry *) arg;
1444  PagetableEntry *lpage = &base[*(int *) left];
1445  PagetableEntry *rpage = &base[*(int *) right];
1446 
1447  if (lpage->blockno < rpage->blockno)
1448  return -1;
1449  else if (lpage->blockno > rpage->blockno)
1450  return 1;
1451  return 0;
1452 }
1453 
1454 /*
1455  * tbm_attach_shared_iterate
1456  *
1457  * Allocate a backend-private iterator and attach the shared iterator state
1458  * to it so that multiple processed can iterate jointly.
1459  *
1460  * We also converts the DSA pointers to local pointers and store them into
1461  * our private iterator.
1462  */
1465 {
1466  TBMSharedIterator *iterator;
1467  TBMSharedIteratorState *istate;
1468 
1469  /*
1470  * Create the TBMSharedIterator struct, with enough trailing space to
1471  * serve the needs of the TBMIterateResult sub-struct.
1472  */
1473  iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) +
1474  MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
1475 
1476  istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);
1477 
1478  iterator->state = istate;
1479 
1480  iterator->ptbase = dsa_get_address(dsa, istate->pagetable);
1481 
1482  if (istate->npages)
1483  iterator->ptpages = dsa_get_address(dsa, istate->spages);
1484  if (istate->nchunks)
1485  iterator->ptchunks = dsa_get_address(dsa, istate->schunks);
1486 
1487  return iterator;
1488 }
1489 
1490 /*
1491  * pagetable_allocate
1492  *
1493  * Callback function for allocating the memory for hashtable elements.
1494  * Allocate memory for hashtable elements, using DSA if available.
1495  */
1496 static inline void *
1497 pagetable_allocate(pagetable_hash *pagetable, Size size)
1498 {
1499  TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1500  PTEntryArray *ptbase;
1501 
1502  if (tbm->dsa == NULL)
1503  return MemoryContextAllocExtended(pagetable->ctx, size,
1505 
1506  /*
1507  * Save the dsapagetable reference in dsapagetableold before allocating
1508  * new memory so that pagetable_free can free the old entry.
1509  */
1510  tbm->dsapagetableold = tbm->dsapagetable;
1512  sizeof(PTEntryArray) + size,
1514  ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
1515 
1516  return ptbase->ptentry;
1517 }
1518 
1519 /*
1520  * pagetable_free
1521  *
1522  * Callback function for freeing hash table elements.
1523  */
1524 static inline void
1525 pagetable_free(pagetable_hash *pagetable, void *pointer)
1526 {
1527  TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1528 
1529  /* pfree the input pointer if DSA is not available */
1530  if (tbm->dsa == NULL)
1531  pfree(pointer);
1532  else if (DsaPointerIsValid(tbm->dsapagetableold))
1533  {
1534  dsa_free(tbm->dsa, tbm->dsapagetableold);
1536  }
1537 }
1538 
1539 /*
1540  * tbm_calculate_entries
1541  *
1542  * Estimate number of hashtable entries we can have within maxbytes.
1543  */
1544 long
1545 tbm_calculate_entries(double maxbytes)
1546 {
1547  long nbuckets;
1548 
1549  /*
1550  * Estimate number of hashtable entries we can have within maxbytes. This
1551  * estimates the hash cost as sizeof(PagetableEntry), which is good enough
1552  * for our purpose. Also count an extra Pointer per entry for the arrays
1553  * created during iteration readout.
1554  */
1555  nbuckets = maxbytes /
1556  (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
1557  nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */
1558  nbuckets = Max(nbuckets, 16); /* sanity limit */
1559 
1560  return nbuckets;
1561 }
static int tbm_comparator(const void *left, const void *right)
Definition: tidbitmap.c:1423
static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
Definition: tidbitmap.c:588
void tbm_end_iterate(TBMIterator *iterator)
Definition: tidbitmap.c:1145
Definition: lwlock.h:32
void tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
Definition: tidbitmap.c:539
struct PTIterationArray PTIterationArray
#define InvalidDsaPointer
Definition: dsa.h:78
#define MCXT_ALLOC_HUGE
Definition: fe_memutils.h:16
dsa_pointer tbm_prepare_shared_iterate(TIDBitmap *tbm)
Definition: tidbitmap.c:765
dsa_pointer dsa_allocate_extended(dsa_area *area, Size size, int flags)
Definition: dsa.c:664
void tbm_union(TIDBitmap *a, const TIDBitmap *b)
Definition: tidbitmap.c:457
dsa_pointer spages
Definition: tidbitmap.c:197
struct pagetable_hash * pagetable
Definition: tidbitmap.c:153
static void output(uint64 loop_count)
TBMSharedIteratorState * state
Definition: tidbitmap.c:220
void * MemoryContextAllocExtended(MemoryContext context, Size size, int flags)
Definition: mcxt.c:812
#define BITNUM(x)
Definition: tidbitmap.c:78
TIDBitmap * tbm_create(long maxbytes, dsa_area *dsa)
Definition: tidbitmap.c:265
static uint32 pg_atomic_sub_fetch_u32(volatile pg_atomic_uint32 *ptr, int32 sub_)
Definition: atomics.h:412
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, bool recheck)
Definition: tidbitmap.c:376
#define Min(x, y)
Definition: c.h:802
uint32 lossify_start
Definition: tidbitmap.c:159
static int tbm_shared_comparator(const void *left, const void *right, void *arg)
Definition: tidbitmap.c:1441
#define DSA_ALLOC_ZERO
Definition: dsa.h:75
dsa_pointer schunks
Definition: tidbitmap.c:198
PTEntryArray * ptbase
Definition: tidbitmap.c:221
#define MemSet(start, val, len)
Definition: c.h:853
static uint32 pg_atomic_add_fetch_u32(volatile pg_atomic_uint32 *ptr, int32 add_)
Definition: atomics.h:397
Datum idx(PG_FUNCTION_ARGS)
Definition: _int_op.c:264
uint32 BlockNumber
Definition: block.h:31
#define BITS_PER_BITMAPWORD
Definition: bitmapset.h:33
PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER]
Definition: tidbitmap.c:114
static void tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
Definition: tidbitmap.c:940
NodeTag
Definition: nodes.h:26
static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
Definition: tidbitmap.c:480
#define WORDS_PER_CHUNK
Definition: tidbitmap.c:83
uint64 dsa_pointer
Definition: dsa.h:62
uint32 bitmapword
Definition: bitmapset.h:34
dsa_pointer dsapagetable
Definition: tidbitmap.c:164
TBMStatus
Definition: tidbitmap.c:128
#define WORDS_PER_PAGE
Definition: tidbitmap.c:81
uint16 OffsetNumber
Definition: off.h:24
MemoryContext mcxt
Definition: tidbitmap.c:151
Definition: type.h:89
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1722
PagetableEntry entry1
Definition: tidbitmap.c:160
BlockNumber blockno
Definition: tidbitmap.h:42
void * dsa_get_address(dsa_area *area, dsa_pointer dp)
Definition: dsa.c:924
void pfree(void *pointer)
Definition: mcxt.c:949
char * Pointer
Definition: c.h:273
struct TBMSharedIteratorState TBMSharedIteratorState
dsa_pointer ptpages
Definition: tidbitmap.c:166
#define ERROR
Definition: elog.h:43
int schunkptr
Definition: tidbitmap.c:181
BlockNumber blockno
Definition: tidbitmap.c:101
static int tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
Definition: tidbitmap.c:910
int spageptr
Definition: tidbitmap.c:180
#define dsa_allocate0(area, size)
Definition: dsa.h:88
void tbm_free(TIDBitmap *tbm)
Definition: tidbitmap.c:321
OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]
Definition: tidbitmap.h:46
TBMIterateResult output
Definition: tidbitmap.c:224
static const PagetableEntry * tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:1168
dsa_pointer ptchunks
Definition: tidbitmap.c:167
int maxentries
Definition: tidbitmap.c:155
unsigned int uint32
Definition: c.h:296
MemoryContext CurrentMemoryContext
Definition: mcxt.c:37
void tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
Definition: tidbitmap.c:340
static void pagetable_free(pagetable_hash *pagetable, void *pointer)
Definition: tidbitmap.c:1525
NodeTag type
Definition: tidbitmap.c:150
#define WORDNUM(x)
Definition: tidbitmap.c:77
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition: lwlock.c:674
void qsort_arg(void *base, size_t nel, size_t elsize, qsort_arg_comparator cmp, void *arg)
Definition: qsort_arg.c:113
TIDBitmap * tbm
Definition: tidbitmap.c:179
dsa_pointer pagetable
Definition: tidbitmap.c:196
void * palloc0(Size size)
Definition: mcxt.c:877
PTIterationArray * ptpages
Definition: tidbitmap.c:222
bool tbm_is_empty(const TIDBitmap *tbm)
Definition: tidbitmap.c:669
pg_atomic_uint32 refcount
Definition: tidbitmap.c:113
int nchunks
Definition: tidbitmap.c:157
PagetableEntry ** spages
Definition: tidbitmap.c:162
#define PAGES_PER_CHUNK
Definition: tidbitmap.c:73
TBMIterateResult * tbm_iterate(TBMIterator *iterator)
Definition: tidbitmap.c:970
bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)]
Definition: tidbitmap.c:105
int npages
Definition: tidbitmap.c:156
#define Max(x, y)
Definition: c.h:796
#define DSA_ALLOC_HUGE
Definition: dsa.h:73
#define makeNode(_type_)
Definition: nodes.h:560
#define Assert(condition)
Definition: c.h:670
PagetableEntry ** schunks
Definition: tidbitmap.c:163
#define MCXT_ALLOC_ZERO
Definition: fe_memutils.h:19
TBMIterateResult output
Definition: tidbitmap.c:183
size_t Size
Definition: c.h:404
TBMIterator * tbm_begin_iterate(TIDBitmap *tbm)
Definition: tidbitmap.c:688
TBMIterateResult * tbm_shared_iterate(TBMSharedIterator *iterator)
Definition: tidbitmap.c:1051
#define InvalidBlockNumber
Definition: block.h:33
TBMStatus status
Definition: tidbitmap.c:152
#define MAX_TUPLES_PER_PAGE
Definition: tidbitmap.c:56
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1118
#define ItemPointerGetOffsetNumber(pointer)
Definition: itemptr.h:95
static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:1248
dsa_pointer dsapagetableold
Definition: tidbitmap.c:165
struct PagetableEntry PagetableEntry
#define DsaPointerIsValid(x)
Definition: dsa.h:81
void tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:442
Definition: dsa.c:354
void dsa_free(dsa_area *area, dsa_pointer dp)
Definition: dsa.c:812
void * palloc(Size size)
Definition: mcxt.c:848
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:706
pg_atomic_uint32 refcount
Definition: tidbitmap.c:210
int i
static void tbm_lossify(TIDBitmap *tbm)
Definition: tidbitmap.c:1354
void * arg
static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:1282
int schunkbit
Definition: tidbitmap.c:182
static PagetableEntry * tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:1201
TBMIteratingState iterating
Definition: tidbitmap.c:158
static void pg_atomic_init_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:234
#define elog
Definition: elog.h:219
#define ItemPointerGetBlockNumber(pointer)
Definition: itemptr.h:76
long tbm_calculate_entries(double maxbytes)
Definition: tidbitmap.c:1545
#define qsort(a, b, c, d)
Definition: port.h:408
TBMIteratingState
Definition: tidbitmap.c:138
static void * pagetable_allocate(pagetable_hash *pagetable, Size size)
Definition: tidbitmap.c:1497
int nentries
Definition: tidbitmap.c:154
dsa_area * dsa
Definition: tidbitmap.c:168
struct PTEntryArray PTEntryArray
TBMSharedIterator * tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
Definition: tidbitmap.c:1464
int index[FLEXIBLE_ARRAY_MEMBER]
Definition: tidbitmap.c:211
static void tbm_create_pagetable(TIDBitmap *tbm)
Definition: tidbitmap.c:291
PTIterationArray * ptchunks
Definition: tidbitmap.c:223
#define dsa_allocate(area, size)
Definition: dsa.h:84
void tbm_end_shared_iterate(TBMSharedIterator *iterator)
Definition: tidbitmap.c:1157