PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
tidbitmap.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * tidbitmap.c
4  * PostgreSQL tuple-id (TID) bitmap package
5  *
6  * This module provides bitmap data structures that are spiritually
7  * similar to Bitmapsets, but are specially adapted to store sets of
8  * tuple identifiers (TIDs), or ItemPointers. In particular, the division
9  * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
10  * Also, since we wish to be able to store very large tuple sets in
11  * memory with this data structure, we support "lossy" storage, in which
12  * we no longer remember individual tuple offsets on a page but only the
13  * fact that a particular page needs to be visited.
14  *
15  * The "lossy" storage uses one bit per disk page, so at the standard 8K
16  * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
17  * of memory. People pushing around tables of that size should have a
18  * couple of Mb to spare, so we don't worry about providing a second level
19  * of lossiness. In theory we could fall back to page ranges at some
20  * point, but for now that seems useless complexity.
21  *
22  * We also support the notion of candidate matches, or rechecking. This
23  * means we know that a search need visit only some tuples on a page,
24  * but we are not certain that all of those tuples are real matches.
25  * So the eventual heap scan must recheck the quals for these tuples only,
26  * rather than rechecking the quals for all tuples on the page as in the
27  * lossy-bitmap case. Rechecking can be specified when TIDs are inserted
28  * into a bitmap, and it can also happen internally when we AND a lossy
29  * and a non-lossy page.
30  *
31  *
32  * Copyright (c) 2003-2017, PostgreSQL Global Development Group
33  *
34  * IDENTIFICATION
35  * src/backend/nodes/tidbitmap.c
36  *
37  *-------------------------------------------------------------------------
38  */
39 #include "postgres.h"
40 
41 #include <limits.h>
42 
43 #include "access/htup_details.h"
44 #include "nodes/bitmapset.h"
45 #include "nodes/tidbitmap.h"
46 
47 /*
48  * The maximum number of tuples per page is not large (typically 256 with
49  * 8K pages, or 1024 with 32K pages). So there's not much point in making
50  * the per-page bitmaps variable size. We just legislate that the size
51  * is this:
52  */
53 #define MAX_TUPLES_PER_PAGE MaxHeapTuplesPerPage
54 
55 /*
56  * When we have to switch over to lossy storage, we use a data structure
57  * with one bit per page, where all pages having the same number DIV
58  * PAGES_PER_CHUNK are aggregated into one chunk. When a chunk is present
59  * and has the bit set for a given page, there must not be a per-page entry
60  * for that page in the page table.
61  *
62  * We actually store both exact pages and lossy chunks in the same hash
63  * table, using identical data structures. (This is because the memory
64  * management for hashtables doesn't easily/efficiently allow space to be
65  * transferred easily from one hashtable to another.) Therefore it's best
66  * if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
67  * too different. But we also want PAGES_PER_CHUNK to be a power of 2 to
68  * avoid expensive integer remainder operations. So, define it like this:
69  */
70 #define PAGES_PER_CHUNK (BLCKSZ / 32)
71 
72 /* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */
73 
74 #define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
75 #define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)
76 
77 /* number of active words for an exact page: */
78 #define WORDS_PER_PAGE ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
79 /* number of active words for a lossy chunk: */
80 #define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
81 
82 /*
83  * The hashtable entries are represented by this data structure. For
84  * an exact page, blockno is the page number and bit k of the bitmap
85  * represents tuple offset k+1. For a lossy chunk, blockno is the first
86  * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
87  * bit k represents page blockno+k. Note that it is not possible to
88  * have exact storage for the first page of a chunk if we are using
89  * lossy storage for any page in the chunk's range, since the same
90  * hashtable entry has to serve both purposes.
91  *
92  * recheck is used only on exact pages --- it indicates that although
93  * only the stated tuples need be checked, the full index qual condition
94  * must be checked for each (ie, these are candidate matches).
95  */
96 typedef struct PagetableEntry
97 {
98  BlockNumber blockno; /* page number (hashtable key) */
99  char status; /* hash entry status */
100  bool ischunk; /* T = lossy storage, F = exact */
101  bool recheck; /* should the tuples be rechecked? */
104 
105 /*
106  * We want to avoid the overhead of creating the hashtable, which is
107  * comparatively large, when not necessary. Particularly when we are using a
108  * bitmap scan on the inside of a nestloop join: a bitmap may well live only
109  * long enough to accumulate one entry in such cases. We therefore avoid
110  * creating an actual hashtable until we need two pagetable entries. When
111  * just one pagetable entry is needed, we store it in a fixed field of
112  * TIDBitMap. (NOTE: we don't get rid of the hashtable if the bitmap later
113  * shrinks down to zero or one page again. So, status can be TBM_HASH even
114  * when nentries is zero or one.)
115  */
116 typedef enum
117 {
118  TBM_EMPTY, /* no hashtable, nentries == 0 */
119  TBM_ONE_PAGE, /* entry1 contains the single entry */
120  TBM_HASH /* pagetable is valid, entry1 is not */
121 } TBMStatus;
122 
123 /*
124  * Here is the representation for a whole TIDBitMap:
125  */
126 struct TIDBitmap
127 {
128  NodeTag type; /* to make it a valid Node */
129  MemoryContext mcxt; /* memory context containing me */
130  TBMStatus status; /* see codes above */
131  struct pagetable_hash *pagetable; /* hash table of PagetableEntry's */
132  int nentries; /* number of entries in pagetable */
133  int maxentries; /* limit on same to meet maxbytes */
134  int npages; /* number of exact entries in pagetable */
135  int nchunks; /* number of lossy entries in pagetable */
136  bool iterating; /* tbm_begin_iterate called? */
137  uint32 lossify_start; /* offset to start lossifying hashtable at */
138  PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */
139  /* these are valid when iterating is true: */
140  PagetableEntry **spages; /* sorted exact-page list, or NULL */
141  PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */
142 };
143 
144 /*
145  * When iterating over a bitmap in sorted order, a TBMIterator is used to
146  * track our progress. There can be several iterators scanning the same
147  * bitmap concurrently. Note that the bitmap becomes read-only as soon as
148  * any iterator is created.
149  */
151 {
152  TIDBitmap *tbm; /* TIDBitmap we're iterating over */
153  int spageptr; /* next spages index */
154  int schunkptr; /* next schunks index */
155  int schunkbit; /* next bit to check in current schunk */
156  TBMIterateResult output; /* MUST BE LAST (because variable-size) */
157 };
158 
159 
160 /* Local function prototypes */
161 static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
162 static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
163  const TIDBitmap *b);
164 static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
165  BlockNumber pageno);
167 static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
168 static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
169 static void tbm_lossify(TIDBitmap *tbm);
170 static int tbm_comparator(const void *left, const void *right);
171 
172 /*
173  * Simple inline murmur hash implementation for the exact width required, for
174  * performance.
175  */
176 static inline uint32
178 {
179  uint32 h = b;
180 
181  h ^= h >> 16;
182  h *= 0x85ebca6b;
183  h ^= h >> 13;
184  h *= 0xc2b2ae35;
185  h ^= h >> 16;
186  return h;
187 }
188 
189 /* define hashtable mapping block numbers to PagetableEntry's */
190 #define SH_PREFIX pagetable
191 #define SH_ELEMENT_TYPE PagetableEntry
192 #define SH_KEY_TYPE BlockNumber
193 #define SH_KEY blockno
194 #define SH_HASH_KEY(tb, key) hash_blockno(key)
195 #define SH_EQUAL(tb, a, b) a == b
196 #define SH_SCOPE static inline
197 #define SH_DEFINE
198 #define SH_DECLARE
199 #include "lib/simplehash.h"
200 
201 
202 /*
203  * tbm_create - create an initially-empty bitmap
204  *
205  * The bitmap will live in the memory context that is CurrentMemoryContext
206  * at the time of this call. It will be limited to (approximately) maxbytes
207  * total memory consumption.
208  */
209 TIDBitmap *
210 tbm_create(long maxbytes)
211 {
212  TIDBitmap *tbm;
213  long nbuckets;
214 
215  /* Create the TIDBitmap struct and zero all its fields */
216  tbm = makeNode(TIDBitmap);
217 
218  tbm->mcxt = CurrentMemoryContext;
219  tbm->status = TBM_EMPTY;
220 
221  /*
222  * Estimate number of hashtable entries we can have within maxbytes. This
223  * estimates the hash cost as sizeof(PagetableEntry), which is good enough
224  * for our purpose. Also count an extra Pointer per entry for the arrays
225  * created during iteration readout.
226  */
227  nbuckets = maxbytes /
228  (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
229  nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */
230  nbuckets = Max(nbuckets, 16); /* sanity limit */
231  tbm->maxentries = (int) nbuckets;
232  tbm->lossify_start = 0;
233 
234  return tbm;
235 }
236 
237 /*
238  * Actually create the hashtable. Since this is a moderately expensive
239  * proposition, we don't do it until we have to.
240  */
241 static void
243 {
244  Assert(tbm->status != TBM_HASH);
245  Assert(tbm->pagetable == NULL);
246 
247  tbm->pagetable = pagetable_create(tbm->mcxt, 128, NULL);
248 
249  /* If entry1 is valid, push it into the hashtable */
250  if (tbm->status == TBM_ONE_PAGE)
251  {
252  PagetableEntry *page;
253  bool found;
254  char oldstatus;
255 
256  page = pagetable_insert(tbm->pagetable,
257  tbm->entry1.blockno,
258  &found);
259  Assert(!found);
260  oldstatus = page->status;
261  memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
262  page->status = oldstatus;
263  }
264 
265  tbm->status = TBM_HASH;
266 }
267 
268 /*
269  * tbm_free - free a TIDBitmap
270  */
271 void
273 {
274  if (tbm->pagetable)
275  pagetable_destroy(tbm->pagetable);
276  if (tbm->spages)
277  pfree(tbm->spages);
278  if (tbm->schunks)
279  pfree(tbm->schunks);
280  pfree(tbm);
281 }
282 
283 /*
284  * tbm_add_tuples - add some tuple IDs to a TIDBitmap
285  *
286  * If recheck is true, then the recheck flag will be set in the
287  * TBMIterateResult when any of these tuples are reported out.
288  */
289 void
290 tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
291  bool recheck)
292 {
294  PagetableEntry *page = NULL; /* only valid when currblk is valid */
295  int i;
296 
297  Assert(!tbm->iterating);
298  for (i = 0; i < ntids; i++)
299  {
300  BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
302  int wordnum,
303  bitnum;
304 
305  /* safety check to ensure we don't overrun bit array bounds */
306  if (off < 1 || off > MAX_TUPLES_PER_PAGE)
307  elog(ERROR, "tuple offset out of range: %u", off);
308 
309  /*
310  * Look up target page unless we already did. This saves cycles when
311  * the input includes consecutive tuples on the same page, which is
312  * common enough to justify an extra test here.
313  */
314  if (blk != currblk)
315  {
316  if (tbm_page_is_lossy(tbm, blk))
317  page = NULL; /* remember page is lossy */
318  else
319  page = tbm_get_pageentry(tbm, blk);
320  currblk = blk;
321  }
322 
323  if (page == NULL)
324  continue; /* whole page is already marked */
325 
326  if (page->ischunk)
327  {
328  /* The page is a lossy chunk header, set bit for itself */
329  wordnum = bitnum = 0;
330  }
331  else
332  {
333  /* Page is exact, so set bit for individual tuple */
334  wordnum = WORDNUM(off - 1);
335  bitnum = BITNUM(off - 1);
336  }
337  page->words[wordnum] |= ((bitmapword) 1 << bitnum);
338  page->recheck |= recheck;
339 
340  if (tbm->nentries > tbm->maxentries)
341  {
342  tbm_lossify(tbm);
343  /* Page could have been converted to lossy, so force new lookup */
344  currblk = InvalidBlockNumber;
345  }
346  }
347 }
348 
349 /*
350  * tbm_add_page - add a whole page to a TIDBitmap
351  *
352  * This causes the whole page to be reported (with the recheck flag)
353  * when the TIDBitmap is scanned.
354  */
355 void
357 {
358  /* Enter the page in the bitmap, or mark it lossy if already present */
359  tbm_mark_page_lossy(tbm, pageno);
360  /* If we went over the memory limit, lossify some more pages */
361  if (tbm->nentries > tbm->maxentries)
362  tbm_lossify(tbm);
363 }
364 
365 /*
366  * tbm_union - set union
367  *
368  * a is modified in-place, b is not changed
369  */
370 void
372 {
373  Assert(!a->iterating);
374  /* Nothing to do if b is empty */
375  if (b->nentries == 0)
376  return;
377  /* Scan through chunks and pages in b, merge into a */
378  if (b->status == TBM_ONE_PAGE)
379  tbm_union_page(a, &b->entry1);
380  else
381  {
382  pagetable_iterator i;
383  PagetableEntry *bpage;
384 
385  Assert(b->status == TBM_HASH);
386  pagetable_start_iterate(b->pagetable, &i);
387  while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
388  tbm_union_page(a, bpage);
389  }
390 }
391 
392 /* Process one page of b during a union op */
393 static void
395 {
396  PagetableEntry *apage;
397  int wordnum;
398 
399  if (bpage->ischunk)
400  {
401  /* Scan b's chunk, mark each indicated page lossy in a */
402  for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
403  {
404  bitmapword w = bpage->words[wordnum];
405 
406  if (w != 0)
407  {
408  BlockNumber pg;
409 
410  pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
411  while (w != 0)
412  {
413  if (w & 1)
414  tbm_mark_page_lossy(a, pg);
415  pg++;
416  w >>= 1;
417  }
418  }
419  }
420  }
421  else if (tbm_page_is_lossy(a, bpage->blockno))
422  {
423  /* page is already lossy in a, nothing to do */
424  return;
425  }
426  else
427  {
428  apage = tbm_get_pageentry(a, bpage->blockno);
429  if (apage->ischunk)
430  {
431  /* The page is a lossy chunk header, set bit for itself */
432  apage->words[0] |= ((bitmapword) 1 << 0);
433  }
434  else
435  {
436  /* Both pages are exact, merge at the bit level */
437  for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
438  apage->words[wordnum] |= bpage->words[wordnum];
439  apage->recheck |= bpage->recheck;
440  }
441  }
442 
443  if (a->nentries > a->maxentries)
444  tbm_lossify(a);
445 }
446 
447 /*
448  * tbm_intersect - set intersection
449  *
450  * a is modified in-place, b is not changed
451  */
452 void
454 {
455  Assert(!a->iterating);
456  /* Nothing to do if a is empty */
457  if (a->nentries == 0)
458  return;
459  /* Scan through chunks and pages in a, try to match to b */
460  if (a->status == TBM_ONE_PAGE)
461  {
462  if (tbm_intersect_page(a, &a->entry1, b))
463  {
464  /* Page is now empty, remove it from a */
465  Assert(!a->entry1.ischunk);
466  a->npages--;
467  a->nentries--;
468  Assert(a->nentries == 0);
469  a->status = TBM_EMPTY;
470  }
471  }
472  else
473  {
474  pagetable_iterator i;
475  PagetableEntry *apage;
476 
477  Assert(a->status == TBM_HASH);
478  pagetable_start_iterate(a->pagetable, &i);
479  while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
480  {
481  if (tbm_intersect_page(a, apage, b))
482  {
483  /* Page or chunk is now empty, remove it from a */
484  if (apage->ischunk)
485  a->nchunks--;
486  else
487  a->npages--;
488  a->nentries--;
489  if (!pagetable_delete(a->pagetable, apage->blockno))
490  elog(ERROR, "hash table corrupted");
491  }
492  }
493  }
494 }
495 
496 /*
497  * Process one page of a during an intersection op
498  *
499  * Returns TRUE if apage is now empty and should be deleted from a
500  */
501 static bool
503 {
504  const PagetableEntry *bpage;
505  int wordnum;
506 
507  if (apage->ischunk)
508  {
509  /* Scan each bit in chunk, try to clear */
510  bool candelete = true;
511 
512  for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
513  {
514  bitmapword w = apage->words[wordnum];
515 
516  if (w != 0)
517  {
518  bitmapword neww = w;
519  BlockNumber pg;
520  int bitnum;
521 
522  pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
523  bitnum = 0;
524  while (w != 0)
525  {
526  if (w & 1)
527  {
528  if (!tbm_page_is_lossy(b, pg) &&
529  tbm_find_pageentry(b, pg) == NULL)
530  {
531  /* Page is not in b at all, lose lossy bit */
532  neww &= ~((bitmapword) 1 << bitnum);
533  }
534  }
535  pg++;
536  bitnum++;
537  w >>= 1;
538  }
539  apage->words[wordnum] = neww;
540  if (neww != 0)
541  candelete = false;
542  }
543  }
544  return candelete;
545  }
546  else if (tbm_page_is_lossy(b, apage->blockno))
547  {
548  /*
549  * Some of the tuples in 'a' might not satisfy the quals for 'b', but
550  * because the page 'b' is lossy, we don't know which ones. Therefore
551  * we mark 'a' as requiring rechecks, to indicate that at most those
552  * tuples set in 'a' are matches.
553  */
554  apage->recheck = true;
555  return false;
556  }
557  else
558  {
559  bool candelete = true;
560 
561  bpage = tbm_find_pageentry(b, apage->blockno);
562  if (bpage != NULL)
563  {
564  /* Both pages are exact, merge at the bit level */
565  Assert(!bpage->ischunk);
566  for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
567  {
568  apage->words[wordnum] &= bpage->words[wordnum];
569  if (apage->words[wordnum] != 0)
570  candelete = false;
571  }
572  apage->recheck |= bpage->recheck;
573  }
574  /* If there is no matching b page, we can just delete the a page */
575  return candelete;
576  }
577 }
578 
579 /*
580  * tbm_is_empty - is a TIDBitmap completely empty?
581  */
582 bool
584 {
585  return (tbm->nentries == 0);
586 }
587 
588 /*
589  * tbm_begin_iterate - prepare to iterate through a TIDBitmap
590  *
591  * The TBMIterator struct is created in the caller's memory context.
592  * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
593  * okay to just allow the memory context to be released, too. It is caller's
594  * responsibility not to touch the TBMIterator anymore once the TIDBitmap
595  * is freed.
596  *
597  * NB: after this is called, it is no longer allowed to modify the contents
598  * of the bitmap. However, you can call this multiple times to scan the
599  * contents repeatedly, including parallel scans.
600  */
601 TBMIterator *
603 {
604  TBMIterator *iterator;
605 
606  /*
607  * Create the TBMIterator struct, with enough trailing space to serve the
608  * needs of the TBMIterateResult sub-struct.
609  */
610  iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
612  iterator->tbm = tbm;
613 
614  /*
615  * Initialize iteration pointers.
616  */
617  iterator->spageptr = 0;
618  iterator->schunkptr = 0;
619  iterator->schunkbit = 0;
620 
621  /*
622  * If we have a hashtable, create and fill the sorted page lists, unless
623  * we already did that for a previous iterator. Note that the lists are
624  * attached to the bitmap not the iterator, so they can be used by more
625  * than one iterator.
626  */
627  if (tbm->status == TBM_HASH && !tbm->iterating)
628  {
629  pagetable_iterator i;
630  PagetableEntry *page;
631  int npages;
632  int nchunks;
633 
634  if (!tbm->spages && tbm->npages > 0)
635  tbm->spages = (PagetableEntry **)
637  tbm->npages * sizeof(PagetableEntry *));
638  if (!tbm->schunks && tbm->nchunks > 0)
639  tbm->schunks = (PagetableEntry **)
641  tbm->nchunks * sizeof(PagetableEntry *));
642 
643  npages = nchunks = 0;
644  pagetable_start_iterate(tbm->pagetable, &i);
645  while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
646  {
647  if (page->ischunk)
648  tbm->schunks[nchunks++] = page;
649  else
650  tbm->spages[npages++] = page;
651  }
652  Assert(npages == tbm->npages);
653  Assert(nchunks == tbm->nchunks);
654  if (npages > 1)
655  qsort(tbm->spages, npages, sizeof(PagetableEntry *),
657  if (nchunks > 1)
658  qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
660  }
661 
662  tbm->iterating = true;
663 
664  return iterator;
665 }
666 
667 /*
668  * tbm_iterate - scan through next page of a TIDBitmap
669  *
670  * Returns a TBMIterateResult representing one page, or NULL if there are
671  * no more pages to scan. Pages are guaranteed to be delivered in numerical
672  * order. If result->ntuples < 0, then the bitmap is "lossy" and failed to
673  * remember the exact tuples to look at on this page --- the caller must
674  * examine all tuples on the page and check if they meet the intended
675  * condition. If result->recheck is true, only the indicated tuples need
676  * be examined, but the condition must be rechecked anyway. (For ease of
677  * testing, recheck is always set true when ntuples < 0.)
678  */
681 {
682  TIDBitmap *tbm = iterator->tbm;
683  TBMIterateResult *output = &(iterator->output);
684 
685  Assert(tbm->iterating);
686 
687  /*
688  * If lossy chunk pages remain, make sure we've advanced schunkptr/
689  * schunkbit to the next set bit.
690  */
691  while (iterator->schunkptr < tbm->nchunks)
692  {
693  PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
694  int schunkbit = iterator->schunkbit;
695 
696  while (schunkbit < PAGES_PER_CHUNK)
697  {
698  int wordnum = WORDNUM(schunkbit);
699  int bitnum = BITNUM(schunkbit);
700 
701  if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
702  break;
703  schunkbit++;
704  }
705  if (schunkbit < PAGES_PER_CHUNK)
706  {
707  iterator->schunkbit = schunkbit;
708  break;
709  }
710  /* advance to next chunk */
711  iterator->schunkptr++;
712  iterator->schunkbit = 0;
713  }
714 
715  /*
716  * If both chunk and per-page data remain, must output the numerically
717  * earlier page.
718  */
719  if (iterator->schunkptr < tbm->nchunks)
720  {
721  PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
722  BlockNumber chunk_blockno;
723 
724  chunk_blockno = chunk->blockno + iterator->schunkbit;
725  if (iterator->spageptr >= tbm->npages ||
726  chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
727  {
728  /* Return a lossy page indicator from the chunk */
729  output->blockno = chunk_blockno;
730  output->ntuples = -1;
731  output->recheck = true;
732  iterator->schunkbit++;
733  return output;
734  }
735  }
736 
737  if (iterator->spageptr < tbm->npages)
738  {
739  PagetableEntry *page;
740  int ntuples;
741  int wordnum;
742 
743  /* In ONE_PAGE state, we don't allocate an spages[] array */
744  if (tbm->status == TBM_ONE_PAGE)
745  page = &tbm->entry1;
746  else
747  page = tbm->spages[iterator->spageptr];
748 
749  /* scan bitmap to extract individual offset numbers */
750  ntuples = 0;
751  for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
752  {
753  bitmapword w = page->words[wordnum];
754 
755  if (w != 0)
756  {
757  int off = wordnum * BITS_PER_BITMAPWORD + 1;
758 
759  while (w != 0)
760  {
761  if (w & 1)
762  output->offsets[ntuples++] = (OffsetNumber) off;
763  off++;
764  w >>= 1;
765  }
766  }
767  }
768  output->blockno = page->blockno;
769  output->ntuples = ntuples;
770  output->recheck = page->recheck;
771  iterator->spageptr++;
772  return output;
773  }
774 
775  /* Nothing more in the bitmap */
776  return NULL;
777 }
778 
779 /*
780  * tbm_end_iterate - finish an iteration over a TIDBitmap
781  *
782  * Currently this is just a pfree, but it might do more someday. (For
783  * instance, it could be useful to count open iterators and allow the
784  * bitmap to return to read/write status when there are no more iterators.)
785  */
786 void
788 {
789  pfree(iterator);
790 }
791 
792 /*
793  * tbm_find_pageentry - find a PagetableEntry for the pageno
794  *
795  * Returns NULL if there is no non-lossy entry for the pageno.
796  */
797 static const PagetableEntry *
799 {
800  const PagetableEntry *page;
801 
802  if (tbm->nentries == 0) /* in case pagetable doesn't exist */
803  return NULL;
804 
805  if (tbm->status == TBM_ONE_PAGE)
806  {
807  page = &tbm->entry1;
808  if (page->blockno != pageno)
809  return NULL;
810  Assert(!page->ischunk);
811  return page;
812  }
813 
814  page = pagetable_lookup(tbm->pagetable, pageno);
815  if (page == NULL)
816  return NULL;
817  if (page->ischunk)
818  return NULL; /* don't want a lossy chunk header */
819  return page;
820 }
821 
822 /*
823  * tbm_get_pageentry - find or create a PagetableEntry for the pageno
824  *
825  * If new, the entry is marked as an exact (non-chunk) entry.
826  *
827  * This may cause the table to exceed the desired memory size. It is
828  * up to the caller to call tbm_lossify() at the next safe point if so.
829  */
830 static PagetableEntry *
832 {
833  PagetableEntry *page;
834  bool found;
835 
836  if (tbm->status == TBM_EMPTY)
837  {
838  /* Use the fixed slot */
839  page = &tbm->entry1;
840  found = false;
841  tbm->status = TBM_ONE_PAGE;
842  }
843  else
844  {
845  if (tbm->status == TBM_ONE_PAGE)
846  {
847  page = &tbm->entry1;
848  if (page->blockno == pageno)
849  return page;
850  /* Time to switch from one page to a hashtable */
852  }
853 
854  /* Look up or create an entry */
855  page = pagetable_insert(tbm->pagetable, pageno, &found);
856  }
857 
858  /* Initialize it if not present before */
859  if (!found)
860  {
861  char oldstatus = page->status;
862 
863  MemSet(page, 0, sizeof(PagetableEntry));
864  page->status = oldstatus;
865  page->blockno = pageno;
866  /* must count it too */
867  tbm->nentries++;
868  tbm->npages++;
869  }
870 
871  return page;
872 }
873 
874 /*
875  * tbm_page_is_lossy - is the page marked as lossily stored?
876  */
877 static bool
879 {
880  PagetableEntry *page;
881  BlockNumber chunk_pageno;
882  int bitno;
883 
884  /* we can skip the lookup if there are no lossy chunks */
885  if (tbm->nchunks == 0)
886  return false;
887  Assert(tbm->status == TBM_HASH);
888 
889  bitno = pageno % PAGES_PER_CHUNK;
890  chunk_pageno = pageno - bitno;
891 
892  page = pagetable_lookup(tbm->pagetable, chunk_pageno);
893 
894  if (page != NULL && page->ischunk)
895  {
896  int wordnum = WORDNUM(bitno);
897  int bitnum = BITNUM(bitno);
898 
899  if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
900  return true;
901  }
902  return false;
903 }
904 
905 /*
906  * tbm_mark_page_lossy - mark the page number as lossily stored
907  *
908  * This may cause the table to exceed the desired memory size. It is
909  * up to the caller to call tbm_lossify() at the next safe point if so.
910  */
911 static void
913 {
914  PagetableEntry *page;
915  bool found;
916  BlockNumber chunk_pageno;
917  int bitno;
918  int wordnum;
919  int bitnum;
920 
921  /* We force the bitmap into hashtable mode whenever it's lossy */
922  if (tbm->status != TBM_HASH)
924 
925  bitno = pageno % PAGES_PER_CHUNK;
926  chunk_pageno = pageno - bitno;
927 
928  /*
929  * Remove any extant non-lossy entry for the page. If the page is its own
930  * chunk header, however, we skip this and handle the case below.
931  */
932  if (bitno != 0)
933  {
934  if (pagetable_delete(tbm->pagetable, pageno))
935  {
936  /* It was present, so adjust counts */
937  tbm->nentries--;
938  tbm->npages--; /* assume it must have been non-lossy */
939  }
940  }
941 
942  /* Look up or create entry for chunk-header page */
943  page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);
944 
945  /* Initialize it if not present before */
946  if (!found)
947  {
948  char oldstatus = page->status;
949 
950  MemSet(page, 0, sizeof(PagetableEntry));
951  page->status = oldstatus;
952  page->blockno = chunk_pageno;
953  page->ischunk = true;
954  /* must count it too */
955  tbm->nentries++;
956  tbm->nchunks++;
957  }
958  else if (!page->ischunk)
959  {
960  char oldstatus = page->status;
961 
962  /* chunk header page was formerly non-lossy, make it lossy */
963  MemSet(page, 0, sizeof(PagetableEntry));
964  page->status = oldstatus;
965  page->blockno = chunk_pageno;
966  page->ischunk = true;
967  /* we assume it had some tuple bit(s) set, so mark it lossy */
968  page->words[0] = ((bitmapword) 1 << 0);
969  /* adjust counts */
970  tbm->nchunks++;
971  tbm->npages--;
972  }
973 
974  /* Now set the original target page's bit */
975  wordnum = WORDNUM(bitno);
976  bitnum = BITNUM(bitno);
977  page->words[wordnum] |= ((bitmapword) 1 << bitnum);
978 }
979 
980 /*
981  * tbm_lossify - lose some information to get back under the memory limit
982  */
983 static void
985 {
986  pagetable_iterator i;
987  PagetableEntry *page;
988 
989  /*
990  * XXX Really stupid implementation: this just lossifies pages in
991  * essentially random order. We should be paying some attention to the
992  * number of bits set in each page, instead.
993  *
994  * Since we are called as soon as nentries exceeds maxentries, we should
995  * push nentries down to significantly less than maxentries, or else we'll
996  * just end up doing this again very soon. We shoot for maxentries/2.
997  */
998  Assert(!tbm->iterating);
999  Assert(tbm->status == TBM_HASH);
1000 
1001  pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
1002  while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
1003  {
1004  if (page->ischunk)
1005  continue; /* already a chunk header */
1006 
1007  /*
1008  * If the page would become a chunk header, we won't save anything by
1009  * converting it to lossy, so skip it.
1010  */
1011  if ((page->blockno % PAGES_PER_CHUNK) == 0)
1012  continue;
1013 
1014  /* This does the dirty work ... */
1015  tbm_mark_page_lossy(tbm, page->blockno);
1016 
1017  if (tbm->nentries <= tbm->maxentries / 2)
1018  {
1019  /*
1020  * We have made enough room. Remember where to start lossifying
1021  * next round, so we evenly iterate over the hashtable.
1022  */
1023  tbm->lossify_start = i.cur;
1024  break;
1025  }
1026 
1027  /*
1028  * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
1029  * hashtable and may have deleted the non-lossy chunk. We can
1030  * continue the same hash table scan, since failure to visit one
1031  * element or visiting the newly inserted element, isn't fatal.
1032  */
1033  }
1034 
1035  /*
1036  * With a big bitmap and small work_mem, it's possible that we cannot get
1037  * under maxentries. Again, if that happens, we'd end up uselessly
1038  * calling tbm_lossify over and over. To prevent this from becoming a
1039  * performance sink, force maxentries up to at least double the current
1040  * number of entries. (In essence, we're admitting inability to fit
1041  * within work_mem when we do this.) Note that this test will not fire if
1042  * we broke out of the loop early; and if we didn't, the current number of
1043  * entries is simply not reducible any further.
1044  */
1045  if (tbm->nentries > tbm->maxentries / 2)
1046  tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
1047 }
1048 
1049 /*
1050  * qsort comparator to handle PagetableEntry pointers.
1051  */
1052 static int
1053 tbm_comparator(const void *left, const void *right)
1054 {
1055  BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
1056  BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
1057 
1058  if (l < r)
1059  return -1;
1060  else if (l > r)
1061  return 1;
1062  return 0;
1063 }
static int tbm_comparator(const void *left, const void *right)
Definition: tidbitmap.c:1053
static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
Definition: tidbitmap.c:502
void tbm_end_iterate(TBMIterator *iterator)
Definition: tidbitmap.c:787
void tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
Definition: tidbitmap.c:453
void tbm_union(TIDBitmap *a, const TIDBitmap *b)
Definition: tidbitmap.c:371
struct pagetable_hash * pagetable
Definition: tidbitmap.c:131
static void output(uint64 loop_count)
#define BITNUM(x)
Definition: tidbitmap.c:75
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, bool recheck)
Definition: tidbitmap.c:290
#define Min(x, y)
Definition: c.h:802
uint32 lossify_start
Definition: tidbitmap.c:137
#define MemSet(start, val, len)
Definition: c.h:853
uint32 BlockNumber
Definition: block.h:31
#define BITS_PER_BITMAPWORD
Definition: bitmapset.h:28
NodeTag
Definition: nodes.h:26
static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
Definition: tidbitmap.c:394
#define WORDS_PER_CHUNK
Definition: tidbitmap.c:80
uint32 bitmapword
Definition: bitmapset.h:29
TBMStatus
Definition: tidbitmap.c:116
#define WORDS_PER_PAGE
Definition: tidbitmap.c:78
uint16 OffsetNumber
Definition: off.h:24
MemoryContext mcxt
Definition: tidbitmap.c:129
PagetableEntry entry1
Definition: tidbitmap.c:138
BlockNumber blockno
Definition: tidbitmap.h:40
void pfree(void *pointer)
Definition: mcxt.c:992
char * Pointer
Definition: c.h:242
#define ERROR
Definition: elog.h:43
int schunkptr
Definition: tidbitmap.c:154
BlockNumber blockno
Definition: tidbitmap.c:98
int spageptr
Definition: tidbitmap.c:153
void tbm_free(TIDBitmap *tbm)
Definition: tidbitmap.c:272
OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]
Definition: tidbitmap.h:44
static const PagetableEntry * tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:798
int maxentries
Definition: tidbitmap.c:133
unsigned int uint32
Definition: c.h:265
MemoryContext CurrentMemoryContext
Definition: mcxt.c:37
NodeTag type
Definition: tidbitmap.c:128
#define WORDNUM(x)
Definition: tidbitmap.c:74
TIDBitmap * tbm
Definition: tidbitmap.c:152
bool tbm_is_empty(const TIDBitmap *tbm)
Definition: tidbitmap.c:583
int nchunks
Definition: tidbitmap.c:135
PagetableEntry ** spages
Definition: tidbitmap.c:140
bool iterating
Definition: tidbitmap.c:136
#define PAGES_PER_CHUNK
Definition: tidbitmap.c:70
TBMIterateResult * tbm_iterate(TBMIterator *iterator)
Definition: tidbitmap.c:680
bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)]
Definition: tidbitmap.c:102
int npages
Definition: tidbitmap.c:134
#define Max(x, y)
Definition: c.h:796
#define makeNode(_type_)
Definition: nodes.h:556
static uint32 hash_blockno(BlockNumber b)
Definition: tidbitmap.c:177
#define NULL
Definition: c.h:226
#define Assert(condition)
Definition: c.h:671
PagetableEntry ** schunks
Definition: tidbitmap.c:141
TBMIterateResult output
Definition: tidbitmap.c:156
TBMIterator * tbm_begin_iterate(TIDBitmap *tbm)
Definition: tidbitmap.c:602
#define InvalidBlockNumber
Definition: block.h:33
TBMStatus status
Definition: tidbitmap.c:130
#define MAX_TUPLES_PER_PAGE
Definition: tidbitmap.c:53
#define ItemPointerGetOffsetNumber(pointer)
Definition: itemptr.h:76
static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:878
TIDBitmap * tbm_create(long maxbytes)
Definition: tidbitmap.c:210
struct PagetableEntry PagetableEntry
void tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:356
void * palloc(Size size)
Definition: mcxt.c:891
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:749
int i
static void tbm_lossify(TIDBitmap *tbm)
Definition: tidbitmap.c:984
static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:912
int schunkbit
Definition: tidbitmap.c:155
static PagetableEntry * tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:831
#define elog
Definition: elog.h:219
#define ItemPointerGetBlockNumber(pointer)
Definition: itemptr.h:66
#define qsort(a, b, c, d)
Definition: port.h:440
int nentries
Definition: tidbitmap.c:132
static void tbm_create_pagetable(TIDBitmap *tbm)
Definition: tidbitmap.c:242