/*-------------------------------------------------------------------------
 *
 * tidbitmap.c
 *	  PostgreSQL tuple-id (TID) bitmap package
 *
 * This module provides bitmap data structures that are spiritually
 * similar to Bitmapsets, but are specially adapted to store sets of
 * tuple identifiers (TIDs), or ItemPointers.  In particular, the division
 * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
 * Also, since we wish to be able to store very large tuple sets in
 * memory with this data structure, we support "lossy" storage, in which
 * we no longer remember individual tuple offsets on a page but only the
 * fact that a particular page needs to be visited.
 *
 * The "lossy" storage uses one bit per disk page, so at the standard 8K
 * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
 * of memory.  People pushing around tables of that size should have a
 * couple of Mb to spare, so we don't worry about providing a second level
 * of lossiness.  In theory we could fall back to page ranges at some
 * point, but for now that seems useless complexity.
 *
 * We also support the notion of candidate matches, or rechecking.  This
 * means we know that a search need visit only some tuples on a page,
 * but we are not certain that all of those tuples are real matches.
 * So the eventual heap scan must recheck the quals for these tuples only,
 * rather than rechecking the quals for all tuples on the page as in the
 * lossy-bitmap case.  Rechecking can be specified when TIDs are inserted
 * into a bitmap, and it can also happen internally when we AND a lossy
 * and a non-lossy page.
 *
 *
 * Copyright (c) 2003-2017, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/backend/nodes/tidbitmap.c
 *
 *-------------------------------------------------------------------------
 */
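
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the typical private (non-parallel) lifecycle of a TIDBitmap, as driven by
 * a bitmap index scan.  "tids"/"ntids" stand in for caller-supplied data.
 *
 *		TIDBitmap  *tbm = tbm_create(work_mem * 1024L, NULL);
 *		TBMIterator *it;
 *		TBMIterateResult *res;
 *
 *		tbm_add_tuples(tbm, tids, ntids, false);	(repeated per index page)
 *
 *		it = tbm_begin_iterate(tbm);
 *		while ((res = tbm_iterate(it)) != NULL)
 *		{
 *			if (res->ntuples < 0)
 *				... lossy: examine every tuple on page res->blockno ...
 *			else
 *				... exact: visit only res->offsets[0 .. res->ntuples - 1] ...
 *		}
 *		tbm_end_iterate(it);
 *		tbm_free(tbm);
 */
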
#include "postgres.h"

#include <limits.h>

#include "access/htup_details.h"
#include "nodes/bitmapset.h"
#include "nodes/tidbitmap.h"
#include "storage/lwlock.h"
#include "utils/dsa.h"
#include "utils/hashutils.h"

/*
 * The maximum number of tuples per page is not large (typically 256 with
 * 8K pages, or 1024 with 32K pages).  So there's not much point in making
 * the per-page bitmaps variable size.  We just legislate that the size
 * is this:
 */
#define MAX_TUPLES_PER_PAGE  MaxHeapTuplesPerPage

/*
 * When we have to switch over to lossy storage, we use a data structure
 * with one bit per page, where all pages having the same number DIV
 * PAGES_PER_CHUNK are aggregated into one chunk.  When a chunk is present
 * and has the bit set for a given page, there must not be a per-page entry
 * for that page in the page table.
 *
 * We actually store both exact pages and lossy chunks in the same hash
 * table, using identical data structures.  (This is because the memory
 * management for hashtables doesn't easily/efficiently allow space to be
 * transferred from one hashtable to another.)  Therefore it's best if
 * PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
 * too different.  But we also want PAGES_PER_CHUNK to be a power of 2 to
 * avoid expensive integer remainder operations.  So, define it like this:
 */
#define PAGES_PER_CHUNK  (BLCKSZ / 32)

/* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */

#define WORDNUM(x)	((x) / BITS_PER_BITMAPWORD)
#define BITNUM(x)	((x) % BITS_PER_BITMAPWORD)

/* number of active words for an exact page: */
#define WORDS_PER_PAGE	((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
/* number of active words for a lossy chunk: */
#define WORDS_PER_CHUNK  ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
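
/*
 * Worked example (editor's illustration, assuming the standard 8K BLCKSZ
 * and 32-bit bitmapwords): PAGES_PER_CHUNK = 8192 / 32 = 256, so page
 * number 1000 belongs to the chunk whose header block is
 * 1000 - (1000 % 256) = 768, and within that chunk it is bit 232, i.e.
 * word WORDNUM(232) = 7, bit BITNUM(232) = 8.
 */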

/*
 * The hashtable entries are represented by this data structure.  For
 * an exact page, blockno is the page number and bit k of the bitmap
 * represents tuple offset k+1.  For a lossy chunk, blockno is the first
 * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
 * bit k represents page blockno+k.  Note that it is not possible to
 * have exact storage for the first page of a chunk if we are using
 * lossy storage for any page in the chunk's range, since the same
 * hashtable entry has to serve both purposes.
 *
 * recheck is used only on exact pages --- it indicates that although
 * only the stated tuples need be checked, the full index qual condition
 * must be checked for each (ie, these are candidate matches).
 */
typedef struct PagetableEntry
{
	BlockNumber blockno;		/* page number (hashtable key) */
	char		status;			/* hash entry status */
	bool		ischunk;		/* T = lossy storage, F = exact */
	bool		recheck;		/* should the tuples be rechecked? */
	bitmapword	words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
} PagetableEntry;

/*
 * Holds array of pagetable entries.
 */
typedef struct PTEntryArray
{
	pg_atomic_uint32 refcount;	/* number of iterators attached */
	PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER];
} PTEntryArray;

/*
 * We want to avoid the overhead of creating the hashtable, which is
 * comparatively large, when not necessary.  Particularly when we are using a
 * bitmap scan on the inside of a nestloop join: a bitmap may well live only
 * long enough to accumulate one entry in such cases.  We therefore avoid
 * creating an actual hashtable until we need two pagetable entries.  When
 * just one pagetable entry is needed, we store it in a fixed field of
 * TIDBitMap.  (NOTE: we don't get rid of the hashtable if the bitmap later
 * shrinks down to zero or one page again.  So, status can be TBM_HASH even
 * when nentries is zero or one.)
 */
typedef enum
{
	TBM_EMPTY,					/* no hashtable, nentries == 0 */
	TBM_ONE_PAGE,				/* entry1 contains the single entry */
	TBM_HASH					/* pagetable is valid, entry1 is not */
} TBMStatus;

/*
 * Current iterating state of the TBM.
 */
typedef enum
{
	TBM_NOT_ITERATING,			/* not yet converted to page and chunk array */
	TBM_ITERATING_PRIVATE,		/* converted to local page and chunk array */
	TBM_ITERATING_SHARED		/* converted to shared page and chunk array */
} TBMIteratingState;

/*
 * Here is the representation for a whole TIDBitMap:
 */
struct TIDBitmap
{
	NodeTag		type;			/* to make it a valid Node */
	MemoryContext mcxt;			/* memory context containing me */
	TBMStatus	status;			/* see codes above */
	struct pagetable_hash *pagetable;	/* hash table of PagetableEntry's */
	int			nentries;		/* number of entries in pagetable */
	int			maxentries;		/* limit on same to meet maxbytes */
	int			npages;			/* number of exact entries in pagetable */
	int			nchunks;		/* number of lossy entries in pagetable */
	TBMIteratingState iterating;	/* tbm_begin_iterate called? */
	uint32		lossify_start;	/* offset to start lossifying hashtable at */
	PagetableEntry entry1;		/* used when status == TBM_ONE_PAGE */
	/* these are valid when iterating is true: */
	PagetableEntry **spages;	/* sorted exact-page list, or NULL */
	PagetableEntry **schunks;	/* sorted lossy-chunk list, or NULL */
	dsa_pointer dsapagetable;	/* dsa_pointer to the element array */
	dsa_pointer dsapagetableold;	/* dsa_pointer to the old element array */
	dsa_pointer ptpages;		/* dsa_pointer to the page array */
	dsa_pointer ptchunks;		/* dsa_pointer to the chunk array */
	dsa_area   *dsa;			/* reference to per-query dsa area */
};

/*
 * When iterating over a bitmap in sorted order, a TBMIterator is used to
 * track our progress.  There can be several iterators scanning the same
 * bitmap concurrently.  Note that the bitmap becomes read-only as soon as
 * any iterator is created.
 */
struct TBMIterator
{
	TIDBitmap  *tbm;			/* TIDBitmap we're iterating over */
	int			spageptr;		/* next spages index */
	int			schunkptr;		/* next schunks index */
	int			schunkbit;		/* next bit to check in current schunk */
	TBMIterateResult output;	/* MUST BE LAST (because variable-size) */
};

/*
 * Holds the shared members of the iterator so that multiple processes
 * can jointly iterate.
 */
typedef struct TBMSharedIteratorState
{
	int			nentries;		/* number of entries in pagetable */
	int			maxentries;		/* limit on same to meet maxbytes */
	int			npages;			/* number of exact entries in pagetable */
	int			nchunks;		/* number of lossy entries in pagetable */
	dsa_pointer pagetable;		/* dsa pointer to head of pagetable data */
	dsa_pointer spages;			/* dsa pointer to page array */
	dsa_pointer schunks;		/* dsa pointer to chunk array */
	LWLock		lock;			/* lock to protect below members */
	int			spageptr;		/* next spages index */
	int			schunkptr;		/* next schunks index */
	int			schunkbit;		/* next bit to check in current schunk */
} TBMSharedIteratorState;

/*
 * pagetable iteration array.
 */
typedef struct PTIterationArray
{
	pg_atomic_uint32 refcount;	/* number of iterators attached */
	int			index[FLEXIBLE_ARRAY_MEMBER];	/* index array */
} PTIterationArray;

/*
 * same as TBMIterator, but it is used for joint iteration, therefore this
 * also holds a reference to the shared state.
 */
struct TBMSharedIterator
{
	TBMSharedIteratorState *state;	/* shared state */
	PTEntryArray *ptbase;		/* pagetable element array */
	PTIterationArray *ptpages;	/* sorted exact page index list */
	PTIterationArray *ptchunks; /* sorted lossy page index list */
	TBMIterateResult output;	/* MUST BE LAST (because variable-size) */
};

/* Local function prototypes */
static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
				   const TIDBitmap *b);
static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
				   BlockNumber pageno);
static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
static void tbm_lossify(TIDBitmap *tbm);
static int	tbm_comparator(const void *left, const void *right);
static int tbm_shared_comparator(const void *left, const void *right,
					  void *arg);

/* define hashtable mapping block numbers to PagetableEntry's */
#define SH_USE_NONDEFAULT_ALLOCATOR
#define SH_PREFIX pagetable
#define SH_ELEMENT_TYPE PagetableEntry
#define SH_KEY_TYPE BlockNumber
#define SH_KEY blockno
#define SH_HASH_KEY(tb, key) murmurhash32(key)
#define SH_EQUAL(tb, a, b) a == b
#define SH_SCOPE static inline
#define SH_DEFINE
#define SH_DECLARE
#include "lib/simplehash.h"
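
/*
 * Editor's note: with SH_PREFIX set to "pagetable", the simplehash template
 * above generates the pagetable_create, pagetable_insert, pagetable_lookup,
 * pagetable_delete, pagetable_start_iterate, pagetable_start_iterate_at,
 * pagetable_iterate and pagetable_destroy functions used throughout this
 * file, plus the pagetable_hash and pagetable_iterator types.
 */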


/*
 * tbm_create - create an initially-empty bitmap
 *
 * The bitmap will live in the memory context that is CurrentMemoryContext
 * at the time of this call.  It will be limited to (approximately) maxbytes
 * total memory consumption.  If the DSA passed to this function is not NULL
 * then the memory for storing elements of the underlying page table will
 * be allocated from the DSA.
 */
TIDBitmap *
tbm_create(long maxbytes, dsa_area *dsa)
{
	TIDBitmap  *tbm;
	long		nbuckets;

	/* Create the TIDBitmap struct and zero all its fields */
	tbm = makeNode(TIDBitmap);

	tbm->mcxt = CurrentMemoryContext;
	tbm->status = TBM_EMPTY;

	/*
	 * Estimate number of hashtable entries we can have within maxbytes. This
	 * estimates the hash cost as sizeof(PagetableEntry), which is good
	 * enough for our purpose.  Also count an extra Pointer per entry for the
	 * arrays created during iteration readout.
	 */
	nbuckets = maxbytes /
		(sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
	nbuckets = Min(nbuckets, INT_MAX - 1);	/* safety limit */
	nbuckets = Max(nbuckets, 16);	/* sanity limit */
	tbm->maxentries = (int) nbuckets;
	tbm->lossify_start = 0;
	tbm->dsa = dsa;
	tbm->dsapagetable = InvalidDsaPointer;
	tbm->dsapagetableold = InvalidDsaPointer;
	tbm->ptpages = InvalidDsaPointer;
	tbm->ptchunks = InvalidDsaPointer;

	return tbm;
}
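
/*
 * Worked example (editor's illustration; the exact struct sizes are
 * platform-dependent assumptions): with maxbytes = 1MB, and assuming, say,
 * a 48-byte PagetableEntry plus two 8-byte Pointers per entry, nbuckets
 * comes out to roughly 1048576 / 64 = 16384, so tbm_lossify starts
 * compressing exact pages into lossy chunks once the page table exceeds
 * about 16k entries.
 */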

/*
 * Actually create the hashtable.  Since this is a moderately expensive
 * proposition, we don't do it until we have to.
 */
static void
tbm_create_pagetable(TIDBitmap *tbm)
{
	Assert(tbm->status != TBM_HASH);
	Assert(tbm->pagetable == NULL);

	tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);

	/* If entry1 is valid, push it into the hashtable */
	if (tbm->status == TBM_ONE_PAGE)
	{
		PagetableEntry *page;
		bool		found;
		char		oldstatus;

		page = pagetable_insert(tbm->pagetable,
								tbm->entry1.blockno,
								&found);
		Assert(!found);
		oldstatus = page->status;
		memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
		page->status = oldstatus;
	}

	tbm->status = TBM_HASH;
}

/*
 * tbm_free - free a TIDBitmap
 */
void
tbm_free(TIDBitmap *tbm)
{
	if (tbm->pagetable)
		pagetable_destroy(tbm->pagetable);
	if (tbm->spages)
		pfree(tbm->spages);
	if (tbm->schunks)
		pfree(tbm->schunks);
	pfree(tbm);
}

/*
 * tbm_free_shared_area - free shared state
 *
 * Free the shared iterator state.  Also free the shared pagetable and
 * iterator arrays if no shared iterator refers to them any longer, i.e.
 * once their refcount drops to 0.
 */
void
tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
{
	TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
	PTEntryArray *ptbase;
	PTIterationArray *ptpages;
	PTIterationArray *ptchunks;

	if (DsaPointerIsValid(istate->pagetable))
	{
		ptbase = dsa_get_address(dsa, istate->pagetable);
		if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
			dsa_free(dsa, istate->pagetable);
	}
	if (DsaPointerIsValid(istate->spages))
	{
		ptpages = dsa_get_address(dsa, istate->spages);
		if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
			dsa_free(dsa, istate->spages);
	}
	if (DsaPointerIsValid(istate->schunks))
	{
		ptchunks = dsa_get_address(dsa, istate->schunks);
		if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
			dsa_free(dsa, istate->schunks);
	}

	dsa_free(dsa, dp);
}

/*
 * tbm_add_tuples - add some tuple IDs to a TIDBitmap
 *
 * If recheck is true, then the recheck flag will be set in the
 * TBMIterateResult when any of these tuples are reported out.
 */
void
tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
			   bool recheck)
{
	BlockNumber currblk = InvalidBlockNumber;
	PagetableEntry *page = NULL;	/* only valid when currblk is valid */
	int			i;

	Assert(tbm->iterating == TBM_NOT_ITERATING);
	for (i = 0; i < ntids; i++)
	{
		BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
		OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
		int			wordnum,
					bitnum;

		/* safety check to ensure we don't overrun bit array bounds */
		if (off < 1 || off > MAX_TUPLES_PER_PAGE)
			elog(ERROR, "tuple offset out of range: %u", off);

		/*
		 * Look up target page unless we already did.  This saves cycles when
		 * the input includes consecutive tuples on the same page, which is
		 * common enough to justify an extra test here.
		 */
		if (blk != currblk)
		{
			if (tbm_page_is_lossy(tbm, blk))
				page = NULL;	/* remember page is lossy */
			else
				page = tbm_get_pageentry(tbm, blk);
			currblk = blk;
		}

		if (page == NULL)
			continue;			/* whole page is already marked */

		if (page->ischunk)
		{
			/* The page is a lossy chunk header, set bit for itself */
			wordnum = bitnum = 0;
		}
		else
		{
			/* Page is exact, so set bit for individual tuple */
			wordnum = WORDNUM(off - 1);
			bitnum = BITNUM(off - 1);
		}
		page->words[wordnum] |= ((bitmapword) 1 << bitnum);
		page->recheck |= recheck;

		if (tbm->nentries > tbm->maxentries)
		{
			tbm_lossify(tbm);
			/* Page could have been converted to lossy, so force new lookup */
			currblk = InvalidBlockNumber;
		}
	}
}

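/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a caller typically builds TIDs with ItemPointerSet before handing them
 * to tbm_add_tuples, e.g.
 *
 *		ItemPointerData tid;
 *
 *		ItemPointerSet(&tid, blockno, offnum);
 *		tbm_add_tuples(tbm, &tid, 1, false);
 *
 * Passing recheck = true would instead mark the page's tuples as candidate
 * matches that the eventual heap scan must re-verify against the quals.
 */
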
/*
 * tbm_add_page - add a whole page to a TIDBitmap
 *
 * This causes the whole page to be reported (with the recheck flag)
 * when the TIDBitmap is scanned.
 */
void
tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
{
	/* Enter the page in the bitmap, or mark it lossy if already present */
	tbm_mark_page_lossy(tbm, pageno);
	/* If we went over the memory limit, lossify some more pages */
	if (tbm->nentries > tbm->maxentries)
		tbm_lossify(tbm);
}

/*
 * tbm_union - set union
 *
 * a is modified in-place, b is not changed
 */
void
tbm_union(TIDBitmap *a, const TIDBitmap *b)
{
	Assert(!a->iterating);
	/* Nothing to do if b is empty */
	if (b->nentries == 0)
		return;
	/* Scan through chunks and pages in b, merge into a */
	if (b->status == TBM_ONE_PAGE)
		tbm_union_page(a, &b->entry1);
	else
	{
		pagetable_iterator i;
		PagetableEntry *bpage;

		Assert(b->status == TBM_HASH);
		pagetable_start_iterate(b->pagetable, &i);
		while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
			tbm_union_page(a, bpage);
	}
}

/* Process one page of b during a union op */
static void
tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
{
	PagetableEntry *apage;
	int			wordnum;

	if (bpage->ischunk)
	{
		/* Scan b's chunk, mark each indicated page lossy in a */
		for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
		{
			bitmapword	w = bpage->words[wordnum];

			if (w != 0)
			{
				BlockNumber pg;

				pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
				while (w != 0)
				{
					if (w & 1)
						tbm_mark_page_lossy(a, pg);
					pg++;
					w >>= 1;
				}
			}
		}
	}
	else if (tbm_page_is_lossy(a, bpage->blockno))
	{
		/* page is already lossy in a, nothing to do */
		return;
	}
	else
	{
		apage = tbm_get_pageentry(a, bpage->blockno);
		if (apage->ischunk)
		{
			/* The page is a lossy chunk header, set bit for itself */
			apage->words[0] |= ((bitmapword) 1 << 0);
		}
		else
		{
			/* Both pages are exact, merge at the bit level */
			for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
				apage->words[wordnum] |= bpage->words[wordnum];
			apage->recheck |= bpage->recheck;
		}
	}

	if (a->nentries > a->maxentries)
		tbm_lossify(a);
}

/*
 * tbm_intersect - set intersection
 *
 * a is modified in-place, b is not changed
 */
void
tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
{
	Assert(!a->iterating);
	/* Nothing to do if a is empty */
	if (a->nentries == 0)
		return;
	/* Scan through chunks and pages in a, try to match to b */
	if (a->status == TBM_ONE_PAGE)
	{
		if (tbm_intersect_page(a, &a->entry1, b))
		{
			/* Page is now empty, remove it from a */
			Assert(!a->entry1.ischunk);
			a->npages--;
			a->nentries--;
			Assert(a->nentries == 0);
			a->status = TBM_EMPTY;
		}
	}
	else
	{
		pagetable_iterator i;
		PagetableEntry *apage;

		Assert(a->status == TBM_HASH);
		pagetable_start_iterate(a->pagetable, &i);
		while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
		{
			if (tbm_intersect_page(a, apage, b))
			{
				/* Page or chunk is now empty, remove it from a */
				if (apage->ischunk)
					a->nchunks--;
				else
					a->npages--;
				a->nentries--;
				if (!pagetable_delete(a->pagetable, apage->blockno))
					elog(ERROR, "hash table corrupted");
			}
		}
	}
}

/*
 * Process one page of a during an intersection op
 *
 * Returns TRUE if apage is now empty and should be deleted from a
 */
static bool
tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
{
	const PagetableEntry *bpage;
	int			wordnum;

	if (apage->ischunk)
	{
		/* Scan each bit in chunk, try to clear */
		bool		candelete = true;

		for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
		{
			bitmapword	w = apage->words[wordnum];

			if (w != 0)
			{
				bitmapword	neww = w;
				BlockNumber pg;
				int			bitnum;

				pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
				bitnum = 0;
				while (w != 0)
				{
					if (w & 1)
					{
						if (!tbm_page_is_lossy(b, pg) &&
							tbm_find_pageentry(b, pg) == NULL)
						{
							/* Page is not in b at all, lose lossy bit */
							neww &= ~((bitmapword) 1 << bitnum);
						}
					}
					pg++;
					bitnum++;
					w >>= 1;
				}
				apage->words[wordnum] = neww;
				if (neww != 0)
					candelete = false;
			}
		}
		return candelete;
	}
	else if (tbm_page_is_lossy(b, apage->blockno))
	{
		/*
		 * Some of the tuples in 'a' might not satisfy the quals for 'b', but
		 * because the page 'b' is lossy, we don't know which ones. Therefore
		 * we mark 'a' as requiring rechecks, to indicate that at most those
		 * tuples set in 'a' are matches.
		 */
		apage->recheck = true;
		return false;
	}
	else
	{
		bool		candelete = true;

		bpage = tbm_find_pageentry(b, apage->blockno);
		if (bpage != NULL)
		{
			/* Both pages are exact, merge at the bit level */
			Assert(!bpage->ischunk);
			for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
			{
				apage->words[wordnum] &= bpage->words[wordnum];
				if (apage->words[wordnum] != 0)
					candelete = false;
			}
			apage->recheck |= bpage->recheck;
		}
		/* If there is no matching b page, we can just delete the a page */
		return candelete;
	}
}

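/*
 * Worked example (editor's illustration): suppose page 10 is exact in 'a'
 * with offsets {1, 3} set, and lossy in 'b'.  tbm_intersect_page keeps both
 * offsets but sets apage->recheck, so the heap scan visits only tuples
 * (10,1) and (10,3) yet re-evaluates the quals for each, since 'b' cannot
 * say which of its page-10 tuples actually matched.
 */
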
/*
 * tbm_is_empty - is a TIDBitmap completely empty?
 */
bool
tbm_is_empty(const TIDBitmap *tbm)
{
	return (tbm->nentries == 0);
}

/*
 * tbm_begin_iterate - prepare to iterate through a TIDBitmap
 *
 * The TBMIterator struct is created in the caller's memory context.
 * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
 * okay to just allow the memory context to be released, too.  It is caller's
 * responsibility not to touch the TBMIterator anymore once the TIDBitmap
 * is freed.
 *
 * NB: after this is called, it is no longer allowed to modify the contents
 * of the bitmap.  However, you can call this multiple times to scan the
 * contents repeatedly, including parallel scans.
 */
TBMIterator *
tbm_begin_iterate(TIDBitmap *tbm)
{
	TBMIterator *iterator;

	Assert(tbm->iterating != TBM_ITERATING_SHARED);

	/*
	 * Create the TBMIterator struct, with enough trailing space to serve the
	 * needs of the TBMIterateResult sub-struct.
	 */
	iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
									  MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
	iterator->tbm = tbm;

	/*
	 * Initialize iteration pointers.
	 */
	iterator->spageptr = 0;
	iterator->schunkptr = 0;
	iterator->schunkbit = 0;

	/*
	 * If we have a hashtable, create and fill the sorted page lists, unless
	 * we already did that for a previous iterator.  Note that the lists are
	 * attached to the bitmap not the iterator, so they can be used by more
	 * than one iterator.
	 */
	if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
	{
		pagetable_iterator i;
		PagetableEntry *page;
		int			npages;
		int			nchunks;

		if (!tbm->spages && tbm->npages > 0)
			tbm->spages = (PagetableEntry **)
				MemoryContextAlloc(tbm->mcxt,
								   tbm->npages * sizeof(PagetableEntry *));
		if (!tbm->schunks && tbm->nchunks > 0)
			tbm->schunks = (PagetableEntry **)
				MemoryContextAlloc(tbm->mcxt,
								   tbm->nchunks * sizeof(PagetableEntry *));

		npages = nchunks = 0;
		pagetable_start_iterate(tbm->pagetable, &i);
		while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
		{
			if (page->ischunk)
				tbm->schunks[nchunks++] = page;
			else
				tbm->spages[npages++] = page;
		}
		Assert(npages == tbm->npages);
		Assert(nchunks == tbm->nchunks);
		if (npages > 1)
			qsort(tbm->spages, npages, sizeof(PagetableEntry *),
				  tbm_comparator);
		if (nchunks > 1)
			qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
				  tbm_comparator);
	}

	tbm->iterating = TBM_ITERATING_PRIVATE;

	return iterator;
}

/*
 * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
 *
 * The necessary shared state will be allocated from the DSA passed to
 * tbm_create, so that multiple processes can attach to it and iterate jointly.
 *
 * This will convert the pagetable hash into arrays of page and chunk
 * indexes into the pagetable element array.
 */
dsa_pointer
tbm_prepare_shared_iterate(TIDBitmap *tbm)
{
	dsa_pointer dp;
	TBMSharedIteratorState *istate;
	PTEntryArray *ptbase = NULL;
	PTIterationArray *ptpages = NULL;
	PTIterationArray *ptchunks = NULL;

	Assert(tbm->dsa != NULL);
	Assert(tbm->iterating != TBM_ITERATING_PRIVATE);

	/*
	 * Allocate TBMSharedIteratorState from DSA to hold the shared members
	 * and lock; this will also be used by multiple workers during shared
	 * iteration.
	 */
	dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
	istate = dsa_get_address(tbm->dsa, dp);

	/*
	 * If we're not already iterating, create and fill the sorted page lists.
	 * (If we are, the sorted page lists are already stored in the TIDBitmap,
	 * and we can just reuse them.)
	 */
	if (tbm->iterating == TBM_NOT_ITERATING)
	{
		pagetable_iterator i;
		PagetableEntry *page;
		int			idx;
		int			npages;
		int			nchunks;

		/*
		 * Allocate the page and chunk array memory from the DSA to share
		 * across multiple processes.
		 */
		if (tbm->npages)
		{
			tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
										tbm->npages * sizeof(int));
			ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
			pg_atomic_init_u32(&ptpages->refcount, 0);
		}
		if (tbm->nchunks)
		{
			tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
										 tbm->nchunks * sizeof(int));
			ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
			pg_atomic_init_u32(&ptchunks->refcount, 0);
		}

		/*
		 * If TBM status is TBM_HASH then iterate over the pagetable and
		 * convert it to page and chunk arrays.  But if it's in the
		 * TBM_ONE_PAGE mode then directly allocate the space for one entry
		 * from the DSA.
		 */
		npages = nchunks = 0;
		if (tbm->status == TBM_HASH)
		{
			ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);

			pagetable_start_iterate(tbm->pagetable, &i);
			while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
			{
				idx = page - ptbase->ptentry;
				if (page->ischunk)
					ptchunks->index[nchunks++] = idx;
				else
					ptpages->index[npages++] = idx;
			}

			Assert(npages == tbm->npages);
			Assert(nchunks == tbm->nchunks);
		}
		else if (tbm->status == TBM_ONE_PAGE)
		{
			/*
			 * In one page mode allocate the space for one pagetable entry,
			 * initialize it, and directly store its index (i.e. 0) in the
			 * page array.
			 */
			tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
											 sizeof(PagetableEntry));
			ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
			memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry));
			ptpages->index[0] = 0;
		}

		if (ptbase != NULL)
			pg_atomic_init_u32(&ptbase->refcount, 0);
		if (npages > 1)
			qsort_arg((void *) (ptpages->index), npages, sizeof(int),
					  tbm_shared_comparator, (void *) ptbase->ptentry);
		if (nchunks > 1)
			qsort_arg((void *) (ptchunks->index), nchunks, sizeof(int),
					  tbm_shared_comparator, (void *) ptbase->ptentry);
	}

	/*
	 * Store the TBM members in the shared state so that we can share them
	 * across multiple processes.
	 */
	istate->nentries = tbm->nentries;
	istate->maxentries = tbm->maxentries;
	istate->npages = tbm->npages;
	istate->nchunks = tbm->nchunks;
	istate->pagetable = tbm->dsapagetable;
	istate->spages = tbm->ptpages;
	istate->schunks = tbm->ptchunks;

	ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
	ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
	ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);

	/*
	 * For every shared iterator referring to the pagetable and iterator
	 * arrays, increase the refcount by 1 so that while freeing the shared
	 * iterator we don't free the pagetable and iterator arrays until their
	 * refcount becomes 0.
	 */
	if (ptbase != NULL)
		pg_atomic_add_fetch_u32(&ptbase->refcount, 1);
	if (ptpages != NULL)
		pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
	if (ptchunks != NULL)
		pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);

	/* Initialize the iterator lock */
	LWLockInitialize(&istate->lock, LWTRANCHE_TBM);

	/* Initialize the shared iterator state */
	istate->schunkbit = 0;
	istate->schunkptr = 0;
	istate->spageptr = 0;

	tbm->iterating = TBM_ITERATING_SHARED;

	return dp;
}

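/*
 * Illustrative sketch (editor's addition): in a parallel bitmap scan the
 * leader publishes the bitmap and each worker attaches to it:
 *
 *		(leader, after building tbm)
 *		dsa_pointer dp = tbm_prepare_shared_iterate(tbm);
 *
 *		(each worker, given the same dsa area and dp)
 *		TBMSharedIterator *it = tbm_attach_shared_iterate(dsa, dp);
 *		TBMIterateResult *res;
 *
 *		while ((res = tbm_shared_iterate(it)) != NULL)
 *			... process res->blockno ...
 *		tbm_end_shared_iterate(it);
 *
 * The LWLock in TBMSharedIteratorState serializes the workers, so each
 * page is handed out to exactly one of them.
 */
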
/*
 * tbm_extract_page_tuple - extract the tuple offsets from a page
 *
 * The extracted offsets are stored into TBMIterateResult.
 */
static inline int
tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
{
	int			wordnum;
	int			ntuples = 0;

	for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
	{
		bitmapword	w = page->words[wordnum];

		if (w != 0)
		{
			int			off = wordnum * BITS_PER_BITMAPWORD + 1;

			while (w != 0)
			{
				if (w & 1)
					output->offsets[ntuples++] = (OffsetNumber) off;
				off++;
				w >>= 1;
			}
		}
	}

	return ntuples;
}

/*
 * tbm_advance_schunkbit - Advance the chunkbit
 */
static inline void
tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
{
	int			schunkbit = *schunkbitp;

	while (schunkbit < PAGES_PER_CHUNK)
	{
		int			wordnum = WORDNUM(schunkbit);
		int			bitnum = BITNUM(schunkbit);

		if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
			break;
		schunkbit++;
	}

	*schunkbitp = schunkbit;
}

/*
 * tbm_iterate - scan through next page of a TIDBitmap
 *
 * Returns a TBMIterateResult representing one page, or NULL if there are
 * no more pages to scan.  Pages are guaranteed to be delivered in numerical
 * order.  If result->ntuples < 0, then the bitmap is "lossy" and failed to
 * remember the exact tuples to look at on this page --- the caller must
 * examine all tuples on the page and check if they meet the intended
 * condition.  If result->recheck is true, only the indicated tuples need
 * be examined, but the condition must be rechecked anyway.  (For ease of
 * testing, recheck is always set true when ntuples < 0.)
 */
TBMIterateResult *
tbm_iterate(TBMIterator *iterator)
{
	TIDBitmap  *tbm = iterator->tbm;
	TBMIterateResult *output = &(iterator->output);

	Assert(tbm->iterating == TBM_ITERATING_PRIVATE);

	/*
	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
	 * schunkbit to the next set bit.
	 */
	while (iterator->schunkptr < tbm->nchunks)
	{
		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
		int			schunkbit = iterator->schunkbit;

		tbm_advance_schunkbit(chunk, &schunkbit);
		if (schunkbit < PAGES_PER_CHUNK)
		{
			iterator->schunkbit = schunkbit;
			break;
		}
		/* advance to next chunk */
		iterator->schunkptr++;
		iterator->schunkbit = 0;
	}

	/*
	 * If both chunk and per-page data remain, must output the numerically
	 * earlier page.
	 */
	if (iterator->schunkptr < tbm->nchunks)
	{
		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
		BlockNumber chunk_blockno;

		chunk_blockno = chunk->blockno + iterator->schunkbit;
		if (iterator->spageptr >= tbm->npages ||
			chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
		{
			/* Return a lossy page indicator from the chunk */
			output->blockno = chunk_blockno;
			output->ntuples = -1;
			output->recheck = true;
			iterator->schunkbit++;
			return output;
		}
	}

	if (iterator->spageptr < tbm->npages)
	{
		PagetableEntry *page;
		int			ntuples;

		/* In ONE_PAGE state, we don't allocate an spages[] array */
		if (tbm->status == TBM_ONE_PAGE)
			page = &tbm->entry1;
		else
			page = tbm->spages[iterator->spageptr];

		/* scan bitmap to extract individual offset numbers */
		ntuples = tbm_extract_page_tuple(page, output);
		output->blockno = page->blockno;
		output->ntuples = ntuples;
		output->recheck = page->recheck;
		iterator->spageptr++;
		return output;
	}

	/* Nothing more in the bitmap */
	return NULL;
}

/*
 * tbm_shared_iterate - scan through next page of a TIDBitmap
 *
 * As above, but this will iterate using an iterator which is shared
 * across multiple processes.  We need to acquire the iterator LWLock
 * before accessing the shared members.
 */
TBMIterateResult *
tbm_shared_iterate(TBMSharedIterator *iterator)
{
	TBMIterateResult *output = &iterator->output;
	TBMSharedIteratorState *istate = iterator->state;
	PagetableEntry *ptbase = NULL;
	int		   *idxpages = NULL;
	int		   *idxchunks = NULL;

	if (iterator->ptbase != NULL)
		ptbase = iterator->ptbase->ptentry;
	if (iterator->ptpages != NULL)
		idxpages = iterator->ptpages->index;
	if (iterator->ptchunks != NULL)
		idxchunks = iterator->ptchunks->index;

	/* Acquire the LWLock before accessing the shared members */
	LWLockAcquire(&istate->lock, LW_EXCLUSIVE);

	/*
	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
	 * schunkbit to the next set bit.
	 */
	while (istate->schunkptr < istate->nchunks)
	{
		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
		int			schunkbit = istate->schunkbit;

		tbm_advance_schunkbit(chunk, &schunkbit);
		if (schunkbit < PAGES_PER_CHUNK)
		{
			istate->schunkbit = schunkbit;
			break;
		}
		/* advance to next chunk */
		istate->schunkptr++;
		istate->schunkbit = 0;
	}

	/*
	 * If both chunk and per-page data remain, must output the numerically
	 * earlier page.
	 */
	if (istate->schunkptr < istate->nchunks)
	{
		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
		BlockNumber chunk_blockno;

		chunk_blockno = chunk->blockno + istate->schunkbit;

		if (istate->spageptr >= istate->npages ||
			chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
		{
			/* Return a lossy page indicator from the chunk */
			output->blockno = chunk_blockno;
			output->ntuples = -1;
			output->recheck = true;
			istate->schunkbit++;

			LWLockRelease(&istate->lock);
			return output;
		}
	}

	if (istate->spageptr < istate->npages)
	{
		PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
		int			ntuples;

		/* scan bitmap to extract individual offset numbers */
		ntuples = tbm_extract_page_tuple(page, output);
		output->blockno = page->blockno;
		output->ntuples = ntuples;
		output->recheck = page->recheck;
		istate->spageptr++;

		LWLockRelease(&istate->lock);

		return output;
	}

	LWLockRelease(&istate->lock);

	/* Nothing more in the bitmap */
	return NULL;
}

/*
 * tbm_end_iterate - finish an iteration over a TIDBitmap
 *
 * Currently this is just a pfree, but it might do more someday.  (For
 * instance, it could be useful to count open iterators and allow the
 * bitmap to return to read/write status when there are no more iterators.)
 */
void
tbm_end_iterate(TBMIterator *iterator)
{
	pfree(iterator);
}

/*
 * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
 *
 * This doesn't free any of the shared state associated with the iterator,
 * just our backend-private state.
 */
void
tbm_end_shared_iterate(TBMSharedIterator *iterator)
{
	pfree(iterator);
}

/*
 * tbm_find_pageentry - find a PagetableEntry for the pageno
 *
 * Returns NULL if there is no non-lossy entry for the pageno.
 */
static const PagetableEntry *
tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
{
	const PagetableEntry *page;

	if (tbm->nentries == 0)		/* in case pagetable doesn't exist */
		return NULL;

	if (tbm->status == TBM_ONE_PAGE)
	{
		page = &tbm->entry1;
		if (page->blockno != pageno)
			return NULL;
		Assert(!page->ischunk);
		return page;
	}

	page = pagetable_lookup(tbm->pagetable, pageno);
	if (page == NULL)
		return NULL;
	if (page->ischunk)
		return NULL;			/* don't want a lossy chunk header */
	return page;
}

/*
 * tbm_get_pageentry - find or create a PagetableEntry for the pageno
 *
 * If new, the entry is marked as an exact (non-chunk) entry.
 *
 * This may cause the table to exceed the desired memory size.  It is
 * up to the caller to call tbm_lossify() at the next safe point if so.
 */
static PagetableEntry *
tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
{
	PagetableEntry *page;
	bool		found;

	if (tbm->status == TBM_EMPTY)
	{
		/* Use the fixed slot */
		page = &tbm->entry1;
		found = false;
		tbm->status = TBM_ONE_PAGE;
	}
	else
	{
		if (tbm->status == TBM_ONE_PAGE)
		{
			page = &tbm->entry1;
			if (page->blockno == pageno)
				return page;
			/* Time to switch from one page to a hashtable */
			tbm_create_pagetable(tbm);
		}

		/* Look up or create an entry */
		page = pagetable_insert(tbm->pagetable, pageno, &found);
	}

	/* Initialize it if not present before */
	if (!found)
	{
		char		oldstatus = page->status;

		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = pageno;
		/* must count it too */
		tbm->nentries++;
		tbm->npages++;
	}

	return page;
}

/*
 * tbm_page_is_lossy - is the page marked as lossily stored?
 */
static bool
tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
{
	PagetableEntry *page;
	BlockNumber chunk_pageno;
	int			bitno;

	/* we can skip the lookup if there are no lossy chunks */
	if (tbm->nchunks == 0)
		return false;
	Assert(tbm->status == TBM_HASH);

	bitno = pageno % PAGES_PER_CHUNK;
	chunk_pageno = pageno - bitno;

	page = pagetable_lookup(tbm->pagetable, chunk_pageno);

	if (page != NULL && page->ischunk)
	{
		int			wordnum = WORDNUM(bitno);
		int			bitnum = BITNUM(bitno);

		if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
			return true;
	}
	return false;
}

/*
 * tbm_mark_page_lossy - mark the page number as lossily stored
 *
 * This may cause the table to exceed the desired memory size.  It is
 * up to the caller to call tbm_lossify() at the next safe point if so.
 */
static void
tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
{
	PagetableEntry *page;
	bool		found;
	BlockNumber chunk_pageno;
	int			bitno;
	int			wordnum;
	int			bitnum;

	/* We force the bitmap into hashtable mode whenever it's lossy */
	if (tbm->status != TBM_HASH)
		tbm_create_pagetable(tbm);

	bitno = pageno % PAGES_PER_CHUNK;
	chunk_pageno = pageno - bitno;

	/*
	 * Remove any extant non-lossy entry for the page.  If the page is its
	 * own chunk header, however, we skip this and handle the case below.
	 */
	if (bitno != 0)
	{
		if (pagetable_delete(tbm->pagetable, pageno))
		{
			/* It was present, so adjust counts */
			tbm->nentries--;
			tbm->npages--;		/* assume it must have been non-lossy */
		}
	}

	/* Look up or create entry for chunk-header page */
	page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);

	/* Initialize it if not present before */
	if (!found)
	{
		char		oldstatus = page->status;

		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = chunk_pageno;
		page->ischunk = true;
		/* must count it too */
		tbm->nentries++;
		tbm->nchunks++;
	}
	else if (!page->ischunk)
	{
		char		oldstatus = page->status;

		/* chunk header page was formerly non-lossy, make it lossy */
		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = chunk_pageno;
		page->ischunk = true;
		/* we assume it had some tuple bit(s) set, so mark it lossy */
		page->words[0] = ((bitmapword) 1 << 0);
		/* adjust counts */
		tbm->nchunks++;
		tbm->npages--;
	}

	/* Now set the original target page's bit */
	wordnum = WORDNUM(bitno);
	bitnum = BITNUM(bitno);
	page->words[wordnum] |= ((bitmapword) 1 << bitnum);
}

/*
 * tbm_lossify - lose some information to get back under the memory limit
 */
static void
tbm_lossify(TIDBitmap *tbm)
{
	pagetable_iterator i;
	PagetableEntry *page;

	/*
	 * XXX Really stupid implementation: this just lossifies pages in
	 * essentially random order.  We should be paying some attention to the
	 * number of bits set in each page, instead.
	 *
	 * Since we are called as soon as nentries exceeds maxentries, we should
	 * push nentries down to significantly less than maxentries, or else
	 * we'll just end up doing this again very soon.  We shoot for
	 * maxentries/2.
	 */
	Assert(tbm->iterating == TBM_NOT_ITERATING);
	Assert(tbm->status == TBM_HASH);

	pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
	while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
	{
		if (page->ischunk)
			continue;			/* already a chunk header */

		/*
		 * If the page would become a chunk header, we won't save anything by
		 * converting it to lossy, so skip it.
		 */
		if ((page->blockno % PAGES_PER_CHUNK) == 0)
			continue;

		/* This does the dirty work ... */
		tbm_mark_page_lossy(tbm, page->blockno);

		if (tbm->nentries <= tbm->maxentries / 2)
		{
			/*
			 * We have made enough room.  Remember where to start lossifying
			 * next round, so we evenly iterate over the hashtable.
			 */
			tbm->lossify_start = i.cur;
			break;
		}

		/*
		 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
		 * hashtable and may have deleted the non-lossy chunk.  We can
		 * continue the same hash table scan, since failure to visit one
		 * element or visiting the newly inserted element isn't fatal.
		 */
	}

	/*
	 * With a big bitmap and small work_mem, it's possible that we cannot get
	 * under maxentries.  Again, if that happens, we'd end up uselessly
	 * calling tbm_lossify over and over.  To prevent this from becoming a
	 * performance sink, force maxentries up to at least double the current
	 * number of entries.  (In essence, we're admitting inability to fit
	 * within work_mem when we do this.)  Note that this test will not fire
	 * if we broke out of the loop early; and if we didn't, the current
	 * number of entries is simply not reducible any further.
	 */
	if (tbm->nentries > tbm->maxentries / 2)
		tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
}

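/*
 * Worked example (editor's illustration): with maxentries = 1000, the first
 * insertion that pushes nentries to 1001 triggers tbm_lossify, which
 * converts exact pages into lossy chunk bits until nentries <= 500.  If
 * even full lossification cannot reach 500, maxentries is raised to
 * 2 * nentries so we don't immediately lossify again.
 */
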
/*
 * qsort comparator to handle PagetableEntry pointers.
 */
static int
tbm_comparator(const void *left, const void *right)
{
	BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
	BlockNumber r = (*((PagetableEntry *const *) right))->blockno;

	if (l < r)
		return -1;
	else if (l > r)
		return 1;
	return 0;
}

/*
 * As above, but this receives indexes into the PagetableEntry array, so it
 * must fetch the actual PagetableEntry using the index before comparing
 * the blockno.
 */
static int
tbm_shared_comparator(const void *left, const void *right, void *arg)
{
	PagetableEntry *base = (PagetableEntry *) arg;
	PagetableEntry *lpage = &base[*(int *) left];
	PagetableEntry *rpage = &base[*(int *) right];

	if (lpage->blockno < rpage->blockno)
		return -1;
	else if (lpage->blockno > rpage->blockno)
		return 1;
	return 0;
}

/*
 * tbm_attach_shared_iterate
 *
 * Allocate a backend-private iterator and attach the shared iterator state
 * to it so that multiple processes can iterate jointly.
 *
 * We also convert the DSA pointers to local pointers and store them in
 * our private iterator.
 */
TBMSharedIterator *
tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
{
	TBMSharedIterator *iterator;
	TBMSharedIteratorState *istate;

	/*
	 * Create the TBMSharedIterator struct, with enough trailing space to
	 * serve the needs of the TBMIterateResult sub-struct.
	 */
	iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) +
											 MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));

	istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);

	iterator->state = istate;

	iterator->ptbase = dsa_get_address(dsa, istate->pagetable);

	if (istate->npages)
		iterator->ptpages = dsa_get_address(dsa, istate->spages);
	if (istate->nchunks)
		iterator->ptchunks = dsa_get_address(dsa, istate->schunks);

	return iterator;
}

/*
 * pagetable_allocate
 *
 * Callback function for allocating the memory for hashtable elements.
 * Allocate memory for hashtable elements, using DSA if available.
 */
static inline void *
pagetable_allocate(pagetable_hash *pagetable, Size size)
{
	TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
	PTEntryArray *ptbase;

	if (tbm->dsa == NULL)
		return MemoryContextAllocExtended(pagetable->ctx, size,
										  MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);

	/*
	 * Save the dsapagetable reference in dsapagetableold before allocating
	 * new memory so that pagetable_free can free the old entry.
	 */
	tbm->dsapagetableold = tbm->dsapagetable;
	tbm->dsapagetable = dsa_allocate_extended(tbm->dsa,
											  sizeof(PTEntryArray) + size,
											  DSA_ALLOC_HUGE | DSA_ALLOC_ZERO);
	ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);

	return ptbase->ptentry;
}

/*
 * pagetable_free
 *
 * Callback function for freeing hash table elements.
 */
static inline void
pagetable_free(pagetable_hash *pagetable, void *pointer)
{
	TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;

	/* pfree the input pointer if DSA is not available */
	if (tbm->dsa == NULL)
		pfree(pointer);
	else if (DsaPointerIsValid(tbm->dsapagetableold))
	{
		dsa_free(tbm->dsa, tbm->dsapagetableold);
		tbm->dsapagetableold = InvalidDsaPointer;
	}
}