PostgreSQL Source Code  git master
nodeBitmapHeapscan.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * nodeBitmapHeapscan.c
4  * Routines to support bitmapped scans of relations
5  *
6  * NOTE: it is critical that this plan type only be used with MVCC-compliant
7  * snapshots (ie, regular snapshots, not SnapshotAny or one of the other
8  * special snapshots). The reason is that since index and heap scans are
9  * decoupled, there can be no assurance that the index tuple prompting a
10  * visit to a particular heap TID still exists when the visit is made.
11  * Therefore the tuple might not exist anymore either (which is OK because
12  * heap_fetch will cope) --- but worse, the tuple slot could have been
13  * re-used for a newer tuple. With an MVCC snapshot the newer tuple is
14  * certain to fail the time qual and so it will not be mistakenly returned,
15  * but with anything else we might return a tuple that doesn't meet the
16  * required index qual conditions.
17  *
18  *
19  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
20  * Portions Copyright (c) 1994, Regents of the University of California
21  *
22  *
23  * IDENTIFICATION
24  * src/backend/executor/nodeBitmapHeapscan.c
25  *
26  *-------------------------------------------------------------------------
27  */
28 /*
29  * INTERFACE ROUTINES
30  * ExecBitmapHeapScan scans a relation using bitmap info
31  * ExecBitmapHeapNext workhorse for above
32  * ExecInitBitmapHeapScan creates and initializes state info.
33  * ExecReScanBitmapHeapScan prepares to rescan the plan.
34  * ExecEndBitmapHeapScan releases all storage.
35  */
36 #include "postgres.h"
37 
38 #include <math.h>
39 
40 #include "access/relscan.h"
41 #include "access/tableam.h"
42 #include "access/visibilitymap.h"
43 #include "executor/executor.h"
45 #include "miscadmin.h"
46 #include "pgstat.h"
47 #include "storage/bufmgr.h"
48 #include "utils/rel.h"
49 #include "utils/spccache.h"
50 
53 static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node);
54 static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node);
55 static inline void BitmapPrefetch(BitmapHeapScanState *node,
56  TableScanDesc scan);
58 
59 
60 /* ----------------------------------------------------------------
61  * BitmapHeapNext
62  *
63  * Retrieve next tuple from the BitmapHeapScan node's currentRelation
64  * ----------------------------------------------------------------
65  */
66 static TupleTableSlot *
68 {
69  ExprContext *econtext;
70  TableScanDesc scan;
71  TIDBitmap *tbm;
72  TupleTableSlot *slot;
73  ParallelBitmapHeapState *pstate = node->pstate;
74  dsa_area *dsa = node->ss.ps.state->es_query_dsa;
75 
76  /*
77  * extract necessary information from index scan node
78  */
79  econtext = node->ss.ps.ps_ExprContext;
80  slot = node->ss.ss_ScanTupleSlot;
81  scan = node->ss.ss_currentScanDesc;
82  tbm = node->tbm;
83 
84  /*
85  * If we haven't yet performed the underlying index scan, do it, and begin
86  * the iteration over the bitmap.
87  *
88  * For prefetching, we use *two* iterators, one for the pages we are
89  * actually scanning and another that runs ahead of the first for
90  * prefetching. node->prefetch_pages tracks exactly how many pages ahead
91  * the prefetch iterator is. Also, node->prefetch_target tracks the
92  * desired prefetch distance, which starts small and increases up to the
93  * node->prefetch_maximum. This is to avoid doing a lot of prefetching in
94  * a scan that stops after a few tuples because of a LIMIT.
95  */
96  if (!node->initialized)
97  {
98  TBMIterator *tbmiterator = NULL;
99  TBMSharedIterator *shared_tbmiterator = NULL;
100 
101  if (!pstate)
102  {
103  tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
104 
105  if (!tbm || !IsA(tbm, TIDBitmap))
106  elog(ERROR, "unrecognized result from subplan");
107 
108  node->tbm = tbm;
109  tbmiterator = tbm_begin_iterate(tbm);
110 
111 #ifdef USE_PREFETCH
112  if (node->prefetch_maximum > 0)
113  {
115  node->prefetch_pages = 0;
116  node->prefetch_target = -1;
117  }
118 #endif /* USE_PREFETCH */
119  }
120  else
121  {
122  /*
123  * The leader will immediately come out of the function, but
124  * others will be blocked until leader populates the TBM and wakes
125  * them up.
126  */
128  {
129  tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
130  if (!tbm || !IsA(tbm, TIDBitmap))
131  elog(ERROR, "unrecognized result from subplan");
132 
133  node->tbm = tbm;
134 
135  /*
136  * Prepare to iterate over the TBM. This will return the
137  * dsa_pointer of the iterator state which will be used by
138  * multiple processes to iterate jointly.
139  */
141 #ifdef USE_PREFETCH
142  if (node->prefetch_maximum > 0)
143  {
144  pstate->prefetch_iterator =
146 
147  /*
148  * We don't need the mutex here as we haven't yet woke up
149  * others.
150  */
151  pstate->prefetch_pages = 0;
152  pstate->prefetch_target = -1;
153  }
154 #endif
155 
156  /* We have initialized the shared state so wake up others. */
158  }
159 
160  /* Allocate a private iterator and attach the shared state to it */
161  shared_tbmiterator =
163 
164 #ifdef USE_PREFETCH
165  if (node->prefetch_maximum > 0)
166  {
169  }
170 #endif /* USE_PREFETCH */
171  }
172 
173  /*
174  * If this is the first scan of the underlying table, create the table
175  * scan descriptor and begin the scan.
176  */
177  if (!scan)
178  {
179  bool need_tuples = false;
180 
181  /*
182  * We can potentially skip fetching heap pages if we do not need
183  * any columns of the table, either for checking non-indexable
184  * quals or for returning data. This test is a bit simplistic, as
185  * it checks the stronger condition that there's no qual or return
186  * tlist at all. But in most cases it's probably not worth working
187  * harder than that.
188  */
189  need_tuples = (node->ss.ps.plan->qual != NIL ||
190  node->ss.ps.plan->targetlist != NIL);
191 
193  node->ss.ps.state->es_snapshot,
194  0,
195  NULL,
196  need_tuples);
197 
198  node->ss.ss_currentScanDesc = scan;
199  }
200 
201  scan->st.bitmap.rs_iterator = tbmiterator;
202  scan->st.bitmap.rs_shared_iterator = shared_tbmiterator;
203  node->initialized = true;
204 
205  goto new_page;
206  }
207 
208  for (;;)
209  {
210  while (table_scan_bitmap_next_tuple(scan, slot))
211  {
212  /*
213  * Continuing in previously obtained page.
214  */
215 
217 
218 #ifdef USE_PREFETCH
219 
220  /*
221  * Try to prefetch at least a few pages even before we get to the
222  * second page if we don't stop reading after the first tuple.
223  */
224  if (!pstate)
225  {
226  if (node->prefetch_target < node->prefetch_maximum)
227  node->prefetch_target++;
228  }
229  else if (pstate->prefetch_target < node->prefetch_maximum)
230  {
231  /* take spinlock while updating shared state */
232  SpinLockAcquire(&pstate->mutex);
233  if (pstate->prefetch_target < node->prefetch_maximum)
234  pstate->prefetch_target++;
235  SpinLockRelease(&pstate->mutex);
236  }
237 #endif /* USE_PREFETCH */
238 
239  /*
240  * We issue prefetch requests *after* fetching the current page to
241  * try to avoid having prefetching interfere with the main I/O.
242  * Also, this should happen only when we have determined there is
243  * still something to do on the current page, else we may
244  * uselessly prefetch the same page we are just about to request
245  * for real.
246  */
247  BitmapPrefetch(node, scan);
248 
249  /*
250  * If we are using lossy info, we have to recheck the qual
251  * conditions at every tuple.
252  */
253  if (node->recheck)
254  {
255  econtext->ecxt_scantuple = slot;
256  if (!ExecQualAndReset(node->bitmapqualorig, econtext))
257  {
258  /* Fails recheck, so drop it and loop back for another */
259  InstrCountFiltered2(node, 1);
260  ExecClearTuple(slot);
261  continue;
262  }
263  }
264 
265  /* OK to return this tuple */
266  return slot;
267  }
268 
269 new_page:
270 
272 
273  /*
274  * Returns false if the bitmap is exhausted and there are no further
275  * blocks we need to scan.
276  */
277  if (!table_scan_bitmap_next_block(scan, &node->blockno,
278  &node->recheck,
279  &node->stats.lossy_pages,
280  &node->stats.exact_pages))
281  break;
282 
283  /*
284  * If serial, we can error out if the prefetch block doesn't stay
285  * ahead of the current block.
286  */
287  if (node->pstate == NULL &&
288  node->prefetch_iterator &&
289  node->prefetch_blockno < node->blockno)
290  elog(ERROR,
291  "prefetch and main iterators are out of sync. pfblockno: %d. blockno: %d",
292  node->prefetch_blockno, node->blockno);
293 
294  /* Adjust the prefetch target */
296  }
297 
298  /*
299  * if we get here it means we are at the end of the scan..
300  */
301  return ExecClearTuple(slot);
302 }
303 
304 /*
305  * BitmapDoneInitializingSharedState - Shared state is initialized
306  *
307  * By this time the leader has already populated the TBM and initialized the
308  * shared state so wake up other processes.
309  */
310 static inline void
312 {
313  SpinLockAcquire(&pstate->mutex);
314  pstate->state = BM_FINISHED;
315  SpinLockRelease(&pstate->mutex);
316  ConditionVariableBroadcast(&pstate->cv);
317 }
318 
319 /*
320  * BitmapAdjustPrefetchIterator - Adjust the prefetch iterator
321  *
322  * We keep track of how far the prefetch iterator is ahead of the main
323  * iterator in prefetch_pages. For each block the main iterator returns, we
324  * decrement prefetch_pages.
325  */
326 static inline void
328 {
329 #ifdef USE_PREFETCH
330  ParallelBitmapHeapState *pstate = node->pstate;
331  TBMIterateResult *tbmpre;
332 
333  if (pstate == NULL)
334  {
335  TBMIterator *prefetch_iterator = node->prefetch_iterator;
336 
337  if (node->prefetch_pages > 0)
338  {
339  /* The main iterator has closed the distance by one page */
340  node->prefetch_pages--;
341  }
342  else if (prefetch_iterator)
343  {
344  tbmpre = tbm_iterate(prefetch_iterator);
345  node->prefetch_blockno = tbmpre ? tbmpre->blockno :
347  }
348  return;
349  }
350 
351  /*
352  * XXX: There is a known issue with keeping the prefetch and current block
353  * iterators in sync for parallel bitmap table scans. This can lead to
354  * prefetching blocks that have already been read. See the discussion
355  * here:
356  * https://postgr.es/m/20240315211449.en2jcmdqxv5o6tlz%40alap3.anarazel.de
357  * Note that moving the call site of BitmapAdjustPrefetchIterator()
358  * exacerbates the effects of this bug.
359  */
360  if (node->prefetch_maximum > 0)
361  {
362  TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator;
363 
364  SpinLockAcquire(&pstate->mutex);
365  if (pstate->prefetch_pages > 0)
366  {
367  pstate->prefetch_pages--;
368  SpinLockRelease(&pstate->mutex);
369  }
370  else
371  {
372  /* Release the mutex before iterating */
373  SpinLockRelease(&pstate->mutex);
374 
375  /*
376  * In case of shared mode, we can not ensure that the current
377  * blockno of the main iterator and that of the prefetch iterator
378  * are same. It's possible that whatever blockno we are
379  * prefetching will be processed by another process. Therefore,
380  * we don't validate the blockno here as we do in non-parallel
381  * case.
382  */
383  if (prefetch_iterator)
384  {
385  tbmpre = tbm_shared_iterate(prefetch_iterator);
386  node->prefetch_blockno = tbmpre ? tbmpre->blockno :
388  }
389  }
390  }
391 #endif /* USE_PREFETCH */
392 }
393 
394 /*
395  * BitmapAdjustPrefetchTarget - Adjust the prefetch target
396  *
397  * Increase prefetch target if it's not yet at the max. Note that
398  * we will increase it to zero after fetching the very first
399  * page/tuple, then to one after the second tuple is fetched, then
400  * it doubles as later pages are fetched.
401  */
402 static inline void
404 {
405 #ifdef USE_PREFETCH
406  ParallelBitmapHeapState *pstate = node->pstate;
407 
408  if (pstate == NULL)
409  {
410  if (node->prefetch_target >= node->prefetch_maximum)
411  /* don't increase any further */ ;
412  else if (node->prefetch_target >= node->prefetch_maximum / 2)
413  node->prefetch_target = node->prefetch_maximum;
414  else if (node->prefetch_target > 0)
415  node->prefetch_target *= 2;
416  else
417  node->prefetch_target++;
418  return;
419  }
420 
421  /* Do an unlocked check first to save spinlock acquisitions. */
422  if (pstate->prefetch_target < node->prefetch_maximum)
423  {
424  SpinLockAcquire(&pstate->mutex);
425  if (pstate->prefetch_target >= node->prefetch_maximum)
426  /* don't increase any further */ ;
427  else if (pstate->prefetch_target >= node->prefetch_maximum / 2)
428  pstate->prefetch_target = node->prefetch_maximum;
429  else if (pstate->prefetch_target > 0)
430  pstate->prefetch_target *= 2;
431  else
432  pstate->prefetch_target++;
433  SpinLockRelease(&pstate->mutex);
434  }
435 #endif /* USE_PREFETCH */
436 }
437 
438 /*
439  * BitmapPrefetch - Prefetch, if prefetch_pages are behind prefetch_target
440  */
441 static inline void
443 {
444 #ifdef USE_PREFETCH
445  ParallelBitmapHeapState *pstate = node->pstate;
446 
447  if (pstate == NULL)
448  {
449  TBMIterator *prefetch_iterator = node->prefetch_iterator;
450 
451  if (prefetch_iterator)
452  {
453  while (node->prefetch_pages < node->prefetch_target)
454  {
455  TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
456  bool skip_fetch;
457 
458  if (tbmpre == NULL)
459  {
460  /* No more pages to prefetch */
461  tbm_end_iterate(prefetch_iterator);
462  node->prefetch_iterator = NULL;
463  break;
464  }
465  node->prefetch_pages++;
466  node->prefetch_blockno = tbmpre->blockno;
467 
468  /*
469  * If we expect not to have to actually read this heap page,
470  * skip this prefetch call, but continue to run the prefetch
471  * logic normally. (Would it be better not to increment
472  * prefetch_pages?)
473  */
474  skip_fetch = (!(scan->rs_flags & SO_NEED_TUPLES) &&
475  !tbmpre->recheck &&
477  tbmpre->blockno,
478  &node->pvmbuffer));
479 
480  if (!skip_fetch)
481  PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
482  }
483  }
484 
485  return;
486  }
487 
488  if (pstate->prefetch_pages < pstate->prefetch_target)
489  {
490  TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator;
491 
492  if (prefetch_iterator)
493  {
494  while (1)
495  {
496  TBMIterateResult *tbmpre;
497  bool do_prefetch = false;
498  bool skip_fetch;
499 
500  /*
501  * Recheck under the mutex. If some other process has already
502  * done enough prefetching then we need not to do anything.
503  */
504  SpinLockAcquire(&pstate->mutex);
505  if (pstate->prefetch_pages < pstate->prefetch_target)
506  {
507  pstate->prefetch_pages++;
508  do_prefetch = true;
509  }
510  SpinLockRelease(&pstate->mutex);
511 
512  if (!do_prefetch)
513  return;
514 
515  tbmpre = tbm_shared_iterate(prefetch_iterator);
516  if (tbmpre == NULL)
517  {
518  /* No more pages to prefetch */
519  tbm_end_shared_iterate(prefetch_iterator);
520  node->shared_prefetch_iterator = NULL;
521  break;
522  }
523 
524  node->prefetch_blockno = tbmpre->blockno;
525 
526  /* As above, skip prefetch if we expect not to need page */
527  skip_fetch = (!(scan->rs_flags & SO_NEED_TUPLES) &&
528  !tbmpre->recheck &&
530  tbmpre->blockno,
531  &node->pvmbuffer));
532 
533  if (!skip_fetch)
534  PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
535  }
536  }
537  }
538 #endif /* USE_PREFETCH */
539 }
540 
541 /*
542  * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual
543  */
544 static bool
546 {
547  ExprContext *econtext;
548 
549  /*
550  * extract necessary information from index scan node
551  */
552  econtext = node->ss.ps.ps_ExprContext;
553 
554  /* Does the tuple meet the original qual conditions? */
555  econtext->ecxt_scantuple = slot;
556  return ExecQualAndReset(node->bitmapqualorig, econtext);
557 }
558 
559 /* ----------------------------------------------------------------
560  * ExecBitmapHeapScan(node)
561  * ----------------------------------------------------------------
562  */
563 static TupleTableSlot *
565 {
567 
568  return ExecScan(&node->ss,
571 }
572 
573 /* ----------------------------------------------------------------
574  * ExecReScanBitmapHeapScan(node)
575  * ----------------------------------------------------------------
576  */
577 void
579 {
581 
582  TableScanDesc scan = node->ss.ss_currentScanDesc;
583 
584  if (scan)
585  {
586  /*
587  * End iteration on iterators saved in scan descriptor.
588  */
589  if (scan->st.bitmap.rs_shared_iterator)
590  {
591  tbm_end_shared_iterate(scan->st.bitmap.rs_shared_iterator);
592  scan->st.bitmap.rs_shared_iterator = NULL;
593  }
594 
595  if (scan->st.bitmap.rs_iterator)
596  {
597  tbm_end_iterate(scan->st.bitmap.rs_iterator);
598  scan->st.bitmap.rs_iterator = NULL;
599  }
600 
601  /* rescan to release any page pin */
602  table_rescan(node->ss.ss_currentScanDesc, NULL);
603  }
604 
605  /* release bitmaps and buffers if any */
606  if (node->prefetch_iterator)
608  if (node->shared_prefetch_iterator)
610  if (node->tbm)
611  tbm_free(node->tbm);
612  if (node->pvmbuffer != InvalidBuffer)
613  ReleaseBuffer(node->pvmbuffer);
614  node->tbm = NULL;
615  node->prefetch_iterator = NULL;
616  node->initialized = false;
617  node->shared_prefetch_iterator = NULL;
618  node->pvmbuffer = InvalidBuffer;
619  node->recheck = true;
620  node->blockno = InvalidBlockNumber;
622 
623  ExecScanReScan(&node->ss);
624 
625  /*
626  * if chgParam of subnode is not null then plan will be re-scanned by
627  * first ExecProcNode.
628  */
629  if (outerPlan->chgParam == NULL)
631 }
632 
633 /* ----------------------------------------------------------------
634  * ExecEndBitmapHeapScan
635  * ----------------------------------------------------------------
636  */
637 void
639 {
640  TableScanDesc scanDesc;
641 
642  /*
643  * When ending a parallel worker, copy the statistics gathered by the
644  * worker back into shared memory so that it can be picked up by the main
645  * process to report in EXPLAIN ANALYZE.
646  */
647  if (node->sinstrument != NULL && IsParallelWorker())
648  {
650 
651  Assert(ParallelWorkerNumber <= node->sinstrument->num_workers);
653 
654  /*
655  * Here we accumulate the stats rather than performing memcpy on
656  * node->stats into si. When a Gather/GatherMerge node finishes it
657  * will perform planner shutdown on the workers. On rescan it will
658  * spin up new workers which will have a new BitmapHeapScanState and
659  * zeroed stats.
660  */
661  si->exact_pages += node->stats.exact_pages;
662  si->lossy_pages += node->stats.lossy_pages;
663  }
664 
665  /*
666  * extract information from the node
667  */
668  scanDesc = node->ss.ss_currentScanDesc;
669 
670  /*
671  * close down subplans
672  */
674 
675  if (scanDesc)
676  {
677  /*
678  * End iteration on iterators saved in scan descriptor.
679  */
680  if (scanDesc->st.bitmap.rs_shared_iterator)
681  {
682  tbm_end_shared_iterate(scanDesc->st.bitmap.rs_shared_iterator);
683  scanDesc->st.bitmap.rs_shared_iterator = NULL;
684  }
685 
686  if (scanDesc->st.bitmap.rs_iterator)
687  {
688  tbm_end_iterate(scanDesc->st.bitmap.rs_iterator);
689  scanDesc->st.bitmap.rs_iterator = NULL;
690  }
691 
692  /*
693  * close table scan
694  */
695  table_endscan(scanDesc);
696  }
697 
698  /*
699  * release bitmaps and buffers if any
700  */
701  if (node->prefetch_iterator)
703  if (node->tbm)
704  tbm_free(node->tbm);
705  if (node->shared_prefetch_iterator)
707  if (node->pvmbuffer != InvalidBuffer)
708  ReleaseBuffer(node->pvmbuffer);
709 }
710 
711 /* ----------------------------------------------------------------
712  * ExecInitBitmapHeapScan
713  *
714  * Initializes the scan's state information.
715  * ----------------------------------------------------------------
716  */
718 ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
719 {
720  BitmapHeapScanState *scanstate;
721  Relation currentRelation;
722 
723  /* check for unsupported flags */
724  Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
725 
726  /*
727  * Assert caller didn't ask for an unsafe snapshot --- see comments at
728  * head of file.
729  */
731 
732  /*
733  * create state structure
734  */
735  scanstate = makeNode(BitmapHeapScanState);
736  scanstate->ss.ps.plan = (Plan *) node;
737  scanstate->ss.ps.state = estate;
738  scanstate->ss.ps.ExecProcNode = ExecBitmapHeapScan;
739 
740  scanstate->tbm = NULL;
741  scanstate->pvmbuffer = InvalidBuffer;
742 
743  /* Zero the statistics counters */
744  memset(&scanstate->stats, 0, sizeof(BitmapHeapScanInstrumentation));
745 
746  scanstate->prefetch_iterator = NULL;
747  scanstate->prefetch_pages = 0;
748  scanstate->prefetch_target = 0;
749  scanstate->initialized = false;
750  scanstate->shared_prefetch_iterator = NULL;
751  scanstate->pstate = NULL;
752  scanstate->recheck = true;
753  scanstate->blockno = InvalidBlockNumber;
755 
756  /*
757  * Miscellaneous initialization
758  *
759  * create expression context for node
760  */
761  ExecAssignExprContext(estate, &scanstate->ss.ps);
762 
763  /*
764  * open the scan relation
765  */
766  currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
767 
768  /*
769  * initialize child nodes
770  */
771  outerPlanState(scanstate) = ExecInitNode(outerPlan(node), estate, eflags);
772 
773  /*
774  * get the scan type from the relation descriptor.
775  */
776  ExecInitScanTupleSlot(estate, &scanstate->ss,
777  RelationGetDescr(currentRelation),
778  table_slot_callbacks(currentRelation));
779 
780  /*
781  * Initialize result type and projection.
782  */
783  ExecInitResultTypeTL(&scanstate->ss.ps);
784  ExecAssignScanProjectionInfo(&scanstate->ss);
785 
786  /*
787  * initialize child expressions
788  */
789  scanstate->ss.ps.qual =
790  ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate);
791  scanstate->bitmapqualorig =
792  ExecInitQual(node->bitmapqualorig, (PlanState *) scanstate);
793 
794  /*
795  * Maximum number of prefetches for the tablespace if configured,
796  * otherwise the current value of the effective_io_concurrency GUC.
797  */
798  scanstate->prefetch_maximum =
799  get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace);
800 
801  scanstate->ss.ss_currentRelation = currentRelation;
802 
803  /*
804  * all done.
805  */
806  return scanstate;
807 }
808 
809 /*----------------
810  * BitmapShouldInitializeSharedState
811  *
812  * The first process to come here and see the state to the BM_INITIAL
813  * will become the leader for the parallel bitmap scan and will be
814  * responsible for populating the TIDBitmap. The other processes will
815  * be blocked by the condition variable until the leader wakes them up.
816  * ---------------
817  */
818 static bool
820 {
822 
823  while (1)
824  {
825  SpinLockAcquire(&pstate->mutex);
826  state = pstate->state;
827  if (pstate->state == BM_INITIAL)
828  pstate->state = BM_INPROGRESS;
829  SpinLockRelease(&pstate->mutex);
830 
831  /* Exit if bitmap is done, or if we're the leader. */
832  if (state != BM_INPROGRESS)
833  break;
834 
835  /* Wait for the leader to wake us up. */
836  ConditionVariableSleep(&pstate->cv, WAIT_EVENT_PARALLEL_BITMAP_SCAN);
837  }
838 
840 
841  return (state == BM_INITIAL);
842 }
843 
844 /* ----------------------------------------------------------------
845  * ExecBitmapHeapEstimate
846  *
847  * Compute the amount of space we'll need in the parallel
848  * query DSM, and inform pcxt->estimator about our needs.
849  * ----------------------------------------------------------------
850  */
851 void
853  ParallelContext *pcxt)
854 {
855  Size size;
856 
858 
859  /* account for instrumentation, if required */
860  if (node->ss.ps.instrument && pcxt->nworkers > 0)
861  {
862  size = add_size(size, offsetof(SharedBitmapHeapInstrumentation, sinstrument));
864  }
865 
867  shm_toc_estimate_keys(&pcxt->estimator, 1);
868 }
869 
870 /* ----------------------------------------------------------------
871  * ExecBitmapHeapInitializeDSM
872  *
873  * Set up a parallel bitmap heap scan descriptor.
874  * ----------------------------------------------------------------
875  */
876 void
878  ParallelContext *pcxt)
879 {
880  ParallelBitmapHeapState *pstate;
881  SharedBitmapHeapInstrumentation *sinstrument = NULL;
882  dsa_area *dsa = node->ss.ps.state->es_query_dsa;
883  char *ptr;
884  Size size;
885 
886  /* If there's no DSA, there are no workers; initialize nothing. */
887  if (dsa == NULL)
888  return;
889 
891  if (node->ss.ps.instrument && pcxt->nworkers > 0)
892  {
893  size = add_size(size, offsetof(SharedBitmapHeapInstrumentation, sinstrument));
895  }
896 
897  ptr = shm_toc_allocate(pcxt->toc, size);
898  pstate = (ParallelBitmapHeapState *) ptr;
899  ptr += MAXALIGN(sizeof(ParallelBitmapHeapState));
900  if (node->ss.ps.instrument && pcxt->nworkers > 0)
901  sinstrument = (SharedBitmapHeapInstrumentation *) ptr;
902 
903  pstate->tbmiterator = 0;
904  pstate->prefetch_iterator = 0;
905 
906  /* Initialize the mutex */
907  SpinLockInit(&pstate->mutex);
908  pstate->prefetch_pages = 0;
909  pstate->prefetch_target = 0;
910  pstate->state = BM_INITIAL;
911 
912  ConditionVariableInit(&pstate->cv);
913 
914  if (sinstrument)
915  {
916  sinstrument->num_workers = pcxt->nworkers;
917 
918  /* ensure any unfilled slots will contain zeroes */
919  memset(sinstrument->sinstrument, 0,
920  pcxt->nworkers * sizeof(BitmapHeapScanInstrumentation));
921  }
922 
923  shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pstate);
924  node->pstate = pstate;
925  node->sinstrument = sinstrument;
926 }
927 
928 /* ----------------------------------------------------------------
929  * ExecBitmapHeapReInitializeDSM
930  *
931  * Reset shared state before beginning a fresh scan.
932  * ----------------------------------------------------------------
933  */
934 void
936  ParallelContext *pcxt)
937 {
938  ParallelBitmapHeapState *pstate = node->pstate;
939  dsa_area *dsa = node->ss.ps.state->es_query_dsa;
940 
941  /* If there's no DSA, there are no workers; do nothing. */
942  if (dsa == NULL)
943  return;
944 
945  pstate->state = BM_INITIAL;
946 
947  if (DsaPointerIsValid(pstate->tbmiterator))
948  tbm_free_shared_area(dsa, pstate->tbmiterator);
949 
952 
953  pstate->tbmiterator = InvalidDsaPointer;
955 }
956 
957 /* ----------------------------------------------------------------
958  * ExecBitmapHeapInitializeWorker
959  *
960  * Copy relevant information from TOC into planstate.
961  * ----------------------------------------------------------------
962  */
963 void
965  ParallelWorkerContext *pwcxt)
966 {
967  char *ptr;
968 
969  Assert(node->ss.ps.state->es_query_dsa != NULL);
970 
971  ptr = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
972 
973  node->pstate = (ParallelBitmapHeapState *) ptr;
974  ptr += MAXALIGN(sizeof(ParallelBitmapHeapState));
975 
976  if (node->ss.ps.instrument)
978 }
979 
980 /* ----------------------------------------------------------------
981  * ExecBitmapHeapRetrieveInstrumentation
982  *
983  * Transfer bitmap heap scan statistics from DSM to private memory.
984  * ----------------------------------------------------------------
985  */
986 void
988 {
989  SharedBitmapHeapInstrumentation *sinstrument = node->sinstrument;
990  Size size;
991 
992  if (sinstrument == NULL)
993  return;
994 
995  size = offsetof(SharedBitmapHeapInstrumentation, sinstrument)
996  + sinstrument->num_workers * sizeof(BitmapHeapScanInstrumentation);
997 
998  node->sinstrument = palloc(size);
999  memcpy(node->sinstrument, sinstrument, size);
1000 }
int ParallelWorkerNumber
Definition: parallel.c:114
#define InvalidBlockNumber
Definition: block.h:33
#define InvalidBuffer
Definition: buf.h:25
PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
Definition: bufmgr.c:639
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4924
#define MAXALIGN(LEN)
Definition: c.h:790
#define Assert(condition)
Definition: c.h:837
size_t Size
Definition: c.h:584
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
#define InvalidDsaPointer
Definition: dsa.h:78
#define DsaPointerIsValid(x)
Definition: dsa.h:106
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
void ExecReScan(PlanState *node)
Definition: execAmi.c:76
ExprState * ExecInitQual(List *qual, PlanState *parent)
Definition: execExpr.c:224
void ExecEndNode(PlanState *node)
Definition: execProcnode.c:562
Node * MultiExecProcNode(PlanState *node)
Definition: execProcnode.c:507
PlanState * ExecInitNode(Plan *node, EState *estate, int eflags)
Definition: execProcnode.c:142
TupleTableSlot * ExecScan(ScanState *node, ExecScanAccessMtd accessMtd, ExecScanRecheckMtd recheckMtd)
Definition: execScan.c:156
void ExecAssignScanProjectionInfo(ScanState *node)
Definition: execScan.c:270
void ExecScanReScan(ScanState *node)
Definition: execScan.c:297
void ExecInitScanTupleSlot(EState *estate, ScanState *scanstate, TupleDesc tupledesc, const TupleTableSlotOps *tts_ops)
Definition: execTuples.c:1898
void ExecInitResultTypeTL(PlanState *planstate)
Definition: execTuples.c:1842
void ExecAssignExprContext(EState *estate, PlanState *planstate)
Definition: execUtils.c:485
Relation ExecOpenScanRelation(EState *estate, Index scanrelid, int eflags)
Definition: execUtils.c:699
#define outerPlanState(node)
Definition: execnodes.h:1223
#define InstrCountFiltered2(node, delta)
Definition: execnodes.h:1236
SharedBitmapState
Definition: execnodes.h:1787
@ BM_INITIAL
Definition: execnodes.h:1788
@ BM_FINISHED
Definition: execnodes.h:1790
@ BM_INPROGRESS
Definition: execnodes.h:1789
struct BitmapHeapScanInstrumentation BitmapHeapScanInstrumentation
#define EXEC_FLAG_BACKWARD
Definition: executor.h:68
TupleTableSlot *(* ExecScanAccessMtd)(ScanState *node)
Definition: executor.h:484
bool(* ExecScanRecheckMtd)(ScanState *node, TupleTableSlot *slot)
Definition: executor.h:485
static bool ExecQualAndReset(ExprState *state, ExprContext *econtext)
Definition: executor.h:451
#define EXEC_FLAG_MARK
Definition: executor.h:69
#define IsParallelWorker()
Definition: parallel.h:60
void * palloc(Size size)
Definition: mcxt.c:1317
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
void ExecEndBitmapHeapScan(BitmapHeapScanState *node)
void ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node, ParallelWorkerContext *pwcxt)
void ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
void ExecBitmapHeapEstimate(BitmapHeapScanState *node, ParallelContext *pcxt)
void ExecBitmapHeapRetrieveInstrumentation(BitmapHeapScanState *node)
void ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node, ParallelContext *pcxt)
static TupleTableSlot * ExecBitmapHeapScan(PlanState *pstate)
BitmapHeapScanState * ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
static bool BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate)
static void BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan)
void ExecBitmapHeapReInitializeDSM(BitmapHeapScanState *node, ParallelContext *pcxt)
static void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node)
static TupleTableSlot * BitmapHeapNext(BitmapHeapScanState *node)
static void BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate)
static void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node)
static bool BitmapHeapRecheck(BitmapHeapScanState *node, TupleTableSlot *slot)
#define IsA(nodeptr, _type_)
Definition: nodes.h:158
#define makeNode(_type_)
Definition: nodes.h:155
#define castNode(_type_, nodeptr)
Definition: nodes.h:176
#define NIL
Definition: pg_list.h:68
#define outerPlan(node)
Definition: plannodes.h:183
#define RelationGetDescr(relation)
Definition: rel.h:531
@ MAIN_FORKNUM
Definition: relpath.h:58
void shm_toc_insert(shm_toc *toc, uint64 key, void *address)
Definition: shm_toc.c:171
void * shm_toc_allocate(shm_toc *toc, Size nbytes)
Definition: shm_toc.c:88
void * shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
Definition: shm_toc.c:232
#define shm_toc_estimate_chunk(e, sz)
Definition: shm_toc.h:51
#define shm_toc_estimate_keys(e, cnt)
Definition: shm_toc.h:53
Size add_size(Size s1, Size s2)
Definition: shmem.c:488
Size mul_size(Size s1, Size s2)
Definition: shmem.c:505
static pg_noinline void Size size
Definition: slab.c:607
#define IsMVCCSnapshot(snapshot)
Definition: snapmgr.h:62
int get_tablespace_io_concurrency(Oid spcid)
Definition: spccache.c:215
#define SpinLockInit(lock)
Definition: spin.h:57
#define SpinLockRelease(lock)
Definition: spin.h:61
#define SpinLockAcquire(lock)
Definition: spin.h:59
ParallelBitmapHeapState * pstate
Definition: execnodes.h:1863
ExprState * bitmapqualorig
Definition: execnodes.h:1853
BitmapHeapScanInstrumentation stats
Definition: execnodes.h:1856
BlockNumber prefetch_blockno
Definition: execnodes.h:1867
SharedBitmapHeapInstrumentation * sinstrument
Definition: execnodes.h:1864
TIDBitmap * tbm
Definition: execnodes.h:1854
TBMIterator * prefetch_iterator
Definition: execnodes.h:1857
BlockNumber blockno
Definition: execnodes.h:1866
TBMSharedIterator * shared_prefetch_iterator
Definition: execnodes.h:1862
List * bitmapqualorig
Definition: plannodes.h:542
struct dsa_area * es_query_dsa
Definition: execnodes.h:717
Snapshot es_snapshot
Definition: execnodes.h:632
TupleTableSlot * ecxt_scantuple
Definition: execnodes.h:258
SharedBitmapState state
Definition: execnodes.h:1812
dsa_pointer tbmiterator
Definition: execnodes.h:1807
ConditionVariable cv
Definition: execnodes.h:1813
dsa_pointer prefetch_iterator
Definition: execnodes.h:1808
shm_toc_estimator estimator
Definition: parallel.h:41
shm_toc * toc
Definition: parallel.h:44
Instrumentation * instrument
Definition: execnodes.h:1137
ExprState * qual
Definition: execnodes.h:1148
Plan * plan
Definition: execnodes.h:1127
EState * state
Definition: execnodes.h:1129
ExprContext * ps_ExprContext
Definition: execnodes.h:1166
ExecProcNodeMtd ExecProcNode
Definition: execnodes.h:1133
List * qual
Definition: plannodes.h:154
int plan_node_id
Definition: plannodes.h:152
List * targetlist
Definition: plannodes.h:153
Form_pg_class rd_rel
Definition: rel.h:111
Relation ss_currentRelation
Definition: execnodes.h:1575
TupleTableSlot * ss_ScanTupleSlot
Definition: execnodes.h:1577
PlanState ps
Definition: execnodes.h:1574
struct TableScanDescData * ss_currentScanDesc
Definition: execnodes.h:1576
Index scanrelid
Definition: plannodes.h:390
BitmapHeapScanInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER]
Definition: execnodes.h:1827
BlockNumber blockno
Definition: tidbitmap.h:42
Relation rs_rd
Definition: relscan.h:38
union TableScanDescData::@48 st
struct TableScanDescData::@48::@49 bitmap
uint32 rs_flags
Definition: relscan.h:70
Definition: dsa.c:348
Definition: regguts.h:323
const TupleTableSlotOps * table_slot_callbacks(Relation relation)
Definition: tableam.c:58
@ SO_NEED_TUPLES
Definition: tableam.h:71
static void table_endscan(TableScanDesc scan)
Definition: tableam.h:1028
static void table_rescan(TableScanDesc scan, struct ScanKeyData *key)
Definition: tableam.h:1037
static bool table_scan_bitmap_next_tuple(TableScanDesc scan, TupleTableSlot *slot)
Definition: tableam.h:2009
static bool table_scan_bitmap_next_block(TableScanDesc scan, BlockNumber *blockno, bool *recheck, uint64 *lossy_pages, uint64 *exact_pages)
Definition: tableam.h:1980
static TableScanDesc table_beginscan_bm(Relation rel, Snapshot snapshot, int nkeys, struct ScanKeyData *key, bool need_tuple)
Definition: tableam.h:957
void tbm_free(TIDBitmap *tbm)
Definition: tidbitmap.c:322
TBMIterator * tbm_begin_iterate(TIDBitmap *tbm)
Definition: tidbitmap.c:689
void tbm_end_iterate(TBMIterator *iterator)
Definition: tidbitmap.c:1146
void tbm_end_shared_iterate(TBMSharedIterator *iterator)
Definition: tidbitmap.c:1158
TBMSharedIterator * tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
Definition: tidbitmap.c:1461
dsa_pointer tbm_prepare_shared_iterate(TIDBitmap *tbm)
Definition: tidbitmap.c:766
void tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
Definition: tidbitmap.c:341
TBMIterateResult * tbm_shared_iterate(TBMSharedIterator *iterator)
Definition: tidbitmap.c:1052
TBMIterateResult * tbm_iterate(TBMIterator *iterator)
Definition: tidbitmap.c:971
static TupleTableSlot * ExecClearTuple(TupleTableSlot *slot)
Definition: tuptable.h:454
#define VM_ALL_VISIBLE(r, b, v)
Definition: visibilitymap.h:24