PostgreSQL Source Code  git master
nodeGather.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * nodeGather.c
4  * Support routines for scanning a plan via multiple workers.
5  *
6  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * A Gather executor launches parallel workers to run multiple copies of a
10  * plan. It can also run the plan itself, if the workers are not available
11  * or have not started up yet. It then merges all of the results it produces
12  * and the results from the workers into a single output stream. Therefore,
13  * it will normally be used with a plan where running multiple copies of the
14  * same plan does not produce duplicate output, such as parallel-aware
15  * SeqScan.
16  *
17  * Alternatively, a Gather node can be configured to use just one worker
18  * and the single-copy flag can be set. In this case, the Gather node will
19  * run the plan in one worker and will not execute the plan itself. In
20  * this case, it simply returns whatever tuples were returned by the worker.
21  * If a worker cannot be obtained, then it will run the plan itself and
22  * return the results. Therefore, a plan used with a single-copy Gather
23  * node need not be parallel-aware.
24  *
25  * IDENTIFICATION
26  * src/backend/executor/nodeGather.c
27  *
28  *-------------------------------------------------------------------------
29  */
30 
31 #include "postgres.h"
32 
33 #include "access/relscan.h"
34 #include "access/xact.h"
35 #include "executor/execdebug.h"
36 #include "executor/execParallel.h"
37 #include "executor/nodeGather.h"
38 #include "executor/nodeSubplan.h"
39 #include "executor/tqueue.h"
40 #include "miscadmin.h"
41 #include "optimizer/planmain.h"
42 #include "pgstat.h"
43 #include "utils/memutils.h"
44 #include "utils/rel.h"
45 
46 
47 static TupleTableSlot *ExecGather(PlanState *pstate);
48 static TupleTableSlot *gather_getnext(GatherState *gatherstate);
49 static HeapTuple gather_readnext(GatherState *gatherstate);
50 static void ExecShutdownGatherWorkers(GatherState *node);
51 
52 
53 /* ----------------------------------------------------------------
54  * ExecInitGather
55  * ----------------------------------------------------------------
56  */
58 ExecInitGather(Gather *node, EState *estate, int eflags)
59 {
60  GatherState *gatherstate;
61  Plan *outerNode;
62  bool hasoid;
63  TupleDesc tupDesc;
64 
65  /* Gather node doesn't have innerPlan node. */
66  Assert(innerPlan(node) == NULL);
67 
68  /*
69  * create state structure
70  */
71  gatherstate = makeNode(GatherState);
72  gatherstate->ps.plan = (Plan *) node;
73  gatherstate->ps.state = estate;
74  gatherstate->ps.ExecProcNode = ExecGather;
75 
76  gatherstate->initialized = false;
77  gatherstate->need_to_scan_locally =
79  gatherstate->tuples_needed = -1;
80 
81  /*
82  * Miscellaneous initialization
83  *
84  * create expression context for node
85  */
86  ExecAssignExprContext(estate, &gatherstate->ps);
87 
88  /*
89  * Gather doesn't support checking a qual (it's always more efficient to
90  * do it in the child node).
91  */
92  Assert(!node->plan.qual);
93 
94  /*
95  * tuple table initialization
96  */
97  gatherstate->funnel_slot = ExecInitExtraTupleSlot(estate);
98  ExecInitResultTupleSlot(estate, &gatherstate->ps);
99 
100  /*
101  * now initialize outer plan
102  */
103  outerNode = outerPlan(node);
104  outerPlanState(gatherstate) = ExecInitNode(outerNode, estate, eflags);
105 
106  /*
107  * Initialize funnel slot to same tuple descriptor as outer plan.
108  */
109  if (!ExecContextForcesOids(outerPlanState(gatherstate), &hasoid))
110  hasoid = false;
111  tupDesc = ExecTypeFromTL(outerNode->targetlist, hasoid);
112  ExecSetSlotDescriptor(gatherstate->funnel_slot, tupDesc);
113 
114  /*
115  * Initialize result tuple type and projection info.
116  */
117  ExecAssignResultTypeFromTL(&gatherstate->ps);
118  ExecConditionalAssignProjectionInfo(&gatherstate->ps, tupDesc, OUTER_VAR);
119 
120  return gatherstate;
121 }
122 
123 /* ----------------------------------------------------------------
124  * ExecGather(node)
125  *
126  * Scans the relation via multiple workers and returns
127  * the next qualifying tuple.
128  * ----------------------------------------------------------------
129  */
130 static TupleTableSlot *
132 {
133  GatherState *node = castNode(GatherState, pstate);
134  TupleTableSlot *slot;
135  ExprContext *econtext;
136 
138 
139  /*
140  * Initialize the parallel context and workers on first execution. We do
141  * this on first execution rather than during node initialization, as it
142  * needs to allocate a large dynamic segment, so it is better to do it
143  * only if it is really needed.
144  */
145  if (!node->initialized)
146  {
147  EState *estate = node->ps.state;
148  Gather *gather = (Gather *) node->ps.plan;
149 
150  /*
151  * Sometimes we might have to run without parallelism; but if parallel
152  * mode is active then we can try to fire up some workers.
153  */
154  if (gather->num_workers > 0 && estate->es_use_parallel_mode)
155  {
156  ParallelContext *pcxt;
157 
158  /* Initialize, or re-initialize, shared state needed by workers. */
159  if (!node->pei)
160  node->pei = ExecInitParallelPlan(node->ps.lefttree,
161  estate,
162  gather->initParam,
163  gather->num_workers,
164  node->tuples_needed);
165  else
167  node->pei,
168  gather->initParam);
169 
170  /*
171  * Register backend workers. We might not get as many as we
172  * requested, or indeed any at all.
173  */
174  pcxt = node->pei->pcxt;
175  LaunchParallelWorkers(pcxt);
176  /* We save # workers launched for the benefit of EXPLAIN */
177  node->nworkers_launched = pcxt->nworkers_launched;
178 
179  /* Set up tuple queue readers to read the results. */
180  if (pcxt->nworkers_launched > 0)
181  {
183  /* Make a working array showing the active readers */
184  node->nreaders = pcxt->nworkers_launched;
185  node->reader = (TupleQueueReader **)
186  palloc(node->nreaders * sizeof(TupleQueueReader *));
187  memcpy(node->reader, node->pei->reader,
188  node->nreaders * sizeof(TupleQueueReader *));
189  }
190  else
191  {
192  /* No workers? Then never mind. */
193  node->nreaders = 0;
194  node->reader = NULL;
195  }
196  node->nextreader = 0;
197  }
198 
199  /* Run plan locally if no workers or enabled and not single-copy. */
200  node->need_to_scan_locally = (node->nreaders == 0)
202  node->initialized = true;
203  }
204 
205  /*
206  * Reset per-tuple memory context to free any expression evaluation
207  * storage allocated in the previous tuple cycle.
208  */
209  econtext = node->ps.ps_ExprContext;
210  ResetExprContext(econtext);
211 
212  /*
213  * Get next tuple, either from one of our workers, or by running the plan
214  * ourselves.
215  */
216  slot = gather_getnext(node);
217  if (TupIsNull(slot))
218  return NULL;
219 
220  /* If no projection is required, we're done. */
221  if (node->ps.ps_ProjInfo == NULL)
222  return slot;
223 
224  /*
225  * Form the result tuple using ExecProject(), and return it.
226  */
227  econtext->ecxt_outertuple = slot;
228  return ExecProject(node->ps.ps_ProjInfo);
229 }
230 
231 /* ----------------------------------------------------------------
232  * ExecEndGather
233  *
234  * frees any storage allocated through C routines.
235  * ----------------------------------------------------------------
236  */
237 void
239 {
240  ExecEndNode(outerPlanState(node)); /* let children clean up first */
241  ExecShutdownGather(node);
242  ExecFreeExprContext(&node->ps);
244 }
245 
246 /*
247  * Read the next tuple. We might fetch a tuple from one of the tuple queues
248  * using gather_readnext, or if no tuple queue contains a tuple and the
249  * single_copy flag is not set, we might generate one locally instead.
250  */
251 static TupleTableSlot *
253 {
254  PlanState *outerPlan = outerPlanState(gatherstate);
255  TupleTableSlot *outerTupleSlot;
256  TupleTableSlot *fslot = gatherstate->funnel_slot;
257  HeapTuple tup;
258 
259  while (gatherstate->nreaders > 0 || gatherstate->need_to_scan_locally)
260  {
262 
263  if (gatherstate->nreaders > 0)
264  {
265  tup = gather_readnext(gatherstate);
266 
267  if (HeapTupleIsValid(tup))
268  {
269  ExecStoreTuple(tup, /* tuple to store */
270  fslot, /* slot in which to store the tuple */
271  InvalidBuffer, /* buffer associated with this
272  * tuple */
273  true); /* pfree tuple when done with it */
274  return fslot;
275  }
276  }
277 
278  if (gatherstate->need_to_scan_locally)
279  {
280  EState *estate = gatherstate->ps.state;
281 
282  /* Install our DSA area while executing the plan. */
283  estate->es_query_dsa =
284  gatherstate->pei ? gatherstate->pei->area : NULL;
285  outerTupleSlot = ExecProcNode(outerPlan);
286  estate->es_query_dsa = NULL;
287 
288  if (!TupIsNull(outerTupleSlot))
289  return outerTupleSlot;
290 
291  gatherstate->need_to_scan_locally = false;
292  }
293  }
294 
295  return ExecClearTuple(fslot);
296 }
297 
298 /*
299  * Attempt to read a tuple from one of our parallel workers.
300  */
301 static HeapTuple
303 {
304  int nvisited = 0;
305 
306  for (;;)
307  {
308  TupleQueueReader *reader;
309  HeapTuple tup;
310  bool readerdone;
311 
312  /* Check for async events, particularly messages from workers. */
314 
315  /* Attempt to read a tuple, but don't block if none is available. */
316  Assert(gatherstate->nextreader < gatherstate->nreaders);
317  reader = gatherstate->reader[gatherstate->nextreader];
318  tup = TupleQueueReaderNext(reader, true, &readerdone);
319 
320  /*
321  * If this reader is done, remove it from our working array of active
322  * readers. If all readers are done, we're outta here.
323  */
324  if (readerdone)
325  {
326  Assert(!tup);
327  --gatherstate->nreaders;
328  if (gatherstate->nreaders == 0)
329  return NULL;
330  memmove(&gatherstate->reader[gatherstate->nextreader],
331  &gatherstate->reader[gatherstate->nextreader + 1],
332  sizeof(TupleQueueReader *)
333  * (gatherstate->nreaders - gatherstate->nextreader));
334  if (gatherstate->nextreader >= gatherstate->nreaders)
335  gatherstate->nextreader = 0;
336  continue;
337  }
338 
339  /* If we got a tuple, return it. */
340  if (tup)
341  return tup;
342 
343  /*
344  * Advance nextreader pointer in round-robin fashion. Note that we
345  * only reach this code if we weren't able to get a tuple from the
346  * current worker. We used to advance the nextreader pointer after
347  * every tuple, but it turns out to be much more efficient to keep
348  * reading from the same queue until that would require blocking.
349  */
350  gatherstate->nextreader++;
351  if (gatherstate->nextreader >= gatherstate->nreaders)
352  gatherstate->nextreader = 0;
353 
354  /* Have we visited every (surviving) TupleQueueReader? */
355  nvisited++;
356  if (nvisited >= gatherstate->nreaders)
357  {
358  /*
359  * If (still) running plan locally, return NULL so caller can
360  * generate another tuple from the local copy of the plan.
361  */
362  if (gatherstate->need_to_scan_locally)
363  return NULL;
364 
365  /* Nothing to do except wait for developments. */
368  nvisited = 0;
369  }
370  }
371 }
372 
373 /* ----------------------------------------------------------------
374  * ExecShutdownGatherWorkers
375  *
376  * Stop all the parallel workers.
377  * ----------------------------------------------------------------
378  */
379 static void
381 {
382  if (node->pei != NULL)
383  ExecParallelFinish(node->pei);
384 
385  /* Flush local copy of reader array */
386  if (node->reader)
387  pfree(node->reader);
388  node->reader = NULL;
389 }
390 
391 /* ----------------------------------------------------------------
392  * ExecShutdownGather
393  *
394  * Destroy the setup for parallel workers including parallel context.
395  * ----------------------------------------------------------------
396  */
397 void
399 {
401 
402  /* Now destroy the parallel context. */
403  if (node->pei != NULL)
404  {
405  ExecParallelCleanup(node->pei);
406  node->pei = NULL;
407  }
408 }
409 
410 /* ----------------------------------------------------------------
411  * Join Support
412  * ----------------------------------------------------------------
413  */
414 
415 /* ----------------------------------------------------------------
416  * ExecReScanGather
417  *
418  * Prepare to re-scan the result of a Gather.
419  * ----------------------------------------------------------------
420  */
421 void
423 {
424  Gather *gather = (Gather *) node->ps.plan;
426 
427  /* Make sure any existing workers are gracefully shut down */
429 
430  /* Mark node so that shared state will be rebuilt at next call */
431  node->initialized = false;
432 
433  /*
434  * Set child node's chgParam to tell it that the next scan might deliver a
435  * different set of rows within the leader process. (The overall rowset
436  * shouldn't change, but the leader process's subset might; hence nodes
437  * between here and the parallel table scan node mustn't optimize on the
438  * assumption of an unchanging rowset.)
439  */
440  if (gather->rescan_param >= 0)
441  outerPlan->chgParam = bms_add_member(outerPlan->chgParam,
442  gather->rescan_param);
443 
444  /*
445  * If chgParam of subnode is not null then plan will be re-scanned by
446  * first ExecProcNode. Note: because this does nothing if we have a
447  * rescan_param, it's currently guaranteed that parallel-aware child nodes
448  * will not see a ReScan call until after they get a ReInitializeDSM call.
449  * That ordering might not be something to rely on, though. A good rule
450  * of thumb is that ReInitializeDSM should reset only shared state, ReScan
451  * should reset only local state, and anything that depends on both of
452  * those steps being finished must wait until the first ExecProcNode call.
453  */
454  if (outerPlan->chgParam == NULL)
456 }
TupleTableSlot * ExecStoreTuple(HeapTuple tuple, TupleTableSlot *slot, Buffer buffer, bool shouldFree)
Definition: execTuples.c:320
List * qual
Definition: plannodes.h:145
struct dsa_area * es_query_dsa
Definition: execnodes.h:524
ParallelContext * pcxt
Definition: execParallel.h:27
static TupleTableSlot * ExecGather(PlanState *pstate)
Definition: nodeGather.c:131
int nworkers_launched
Definition: execnodes.h:1961
TupleTableSlot * ExecInitExtraTupleSlot(EState *estate)
Definition: execTuples.c:852
ProjectionInfo * ps_ProjInfo
Definition: execnodes.h:893
void ExecParallelFinish(ParallelExecutorInfo *pei)
Definition: execParallel.c:995
#define castNode(_type_, nodeptr)
Definition: nodes.h:581
void ExecEndNode(PlanState *node)
Definition: execProcnode.c:539
ExprContext * ps_ExprContext
Definition: execnodes.h:892
void ExecReScan(PlanState *node)
Definition: execAmi.c:76
TupleTableSlot * ExecClearTuple(TupleTableSlot *slot)
Definition: execTuples.c:439
#define InvalidBuffer
Definition: buf.h:25
bool need_to_scan_locally
Definition: execnodes.h:1955
HeapTuple TupleQueueReaderNext(TupleQueueReader *reader, bool nowait, bool *done)
Definition: tqueue.c:172
struct TupleQueueReader ** reader
Definition: execnodes.h:1964
EState * state
Definition: execnodes.h:860
void ResetLatch(volatile Latch *latch)
Definition: latch.c:497
void ExecConditionalAssignProjectionInfo(PlanState *planstate, TupleDesc inputDesc, Index varno)
Definition: execUtils.c:515
GatherState * ExecInitGather(Gather *node, EState *estate, int eflags)
Definition: nodeGather.c:58
void ExecFreeExprContext(PlanState *planstate)
Definition: execUtils.c:603
void ExecAssignResultTypeFromTL(PlanState *planstate)
Definition: execUtils.c:448
struct PlanState * lefttree
Definition: execnodes.h:877
bool es_use_parallel_mode
Definition: execnodes.h:521
TupleTableSlot * ps_ResultTupleSlot
Definition: execnodes.h:891
int WaitLatch(volatile Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:336
void pfree(void *pointer)
Definition: mcxt.c:936
bool initialized
Definition: execnodes.h:1954
static void ExecShutdownGatherWorkers(GatherState *node)
Definition: nodeGather.c:380
Plan plan
Definition: plannodes.h:840
bool single_copy
Definition: plannodes.h:843
void ExecInitResultTupleSlot(EState *estate, PlanState *planstate)
Definition: execTuples.c:832
struct ParallelExecutorInfo * pei
Definition: execnodes.h:1959
#define outerPlanState(node)
Definition: execnodes.h:904
#define innerPlan(node)
Definition: plannodes.h:173
void ExecParallelCreateReaders(ParallelExecutorInfo *pei)
Definition: execParallel.c:796
#define memmove(d, s, c)
Definition: c.h:1069
struct TupleQueueReader ** reader
Definition: execParallel.h:35
#define TupIsNull(slot)
Definition: tuptable.h:138
PlanState ps
Definition: execnodes.h:1953
int nworkers_launched
Definition: parallel.h:37
void LaunchParallelWorkers(ParallelContext *pcxt)
Definition: parallel.c:451
static HeapTuple gather_readnext(GatherState *gatherstate)
Definition: nodeGather.c:302
TupleDesc ExecTypeFromTL(List *targetList, bool hasoid)
Definition: execTuples.c:888
#define outerPlan(node)
Definition: plannodes.h:174
void ExecReScanGather(GatherState *node)
Definition: nodeGather.c:422
void ExecShutdownGather(GatherState *node)
Definition: nodeGather.c:398
ExecProcNodeMtd ExecProcNode
Definition: execnodes.h:864
void ExecSetSlotDescriptor(TupleTableSlot *slot, TupleDesc tupdesc)
Definition: execTuples.c:247
static TupleTableSlot * ExecProcNode(PlanState *node)
Definition: executor.h:240
TupleTableSlot * funnel_slot
Definition: execnodes.h:1958
Plan * plan
Definition: execnodes.h:858
int num_workers
Definition: plannodes.h:841
bool parallel_leader_participation
Definition: planner.c:64
#define makeNode(_type_)
Definition: nodes.h:560
void ExecParallelCleanup(ParallelExecutorInfo *pei)
TupleTableSlot * ecxt_outertuple
Definition: execnodes.h:210
#define HeapTupleIsValid(tuple)
Definition: htup.h:77
#define Assert(condition)
Definition: c.h:680
int rescan_param
Definition: plannodes.h:842
void ExecAssignExprContext(EState *estate, PlanState *planstate)
Definition: execUtils.c:426
void ExecParallelReinitialize(PlanState *planstate, ParallelExecutorInfo *pei, Bitmapset *sendParams)
Definition: execParallel.c:822
Bitmapset * bms_add_member(Bitmapset *a, int x)
Definition: bitmapset.c:742
List * targetlist
Definition: plannodes.h:144
static TupleTableSlot * gather_getnext(GatherState *gatherstate)
Definition: nodeGather.c:252
Bitmapset * initParam
Definition: plannodes.h:845
int nextreader
Definition: execnodes.h:1963
void * palloc(Size size)
Definition: mcxt.c:835
struct Latch * MyLatch
Definition: globals.c:52
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:98
ParallelExecutorInfo * ExecInitParallelPlan(PlanState *planstate, EState *estate, Bitmapset *sendParams, int nworkers, int64 tuples_needed)
Definition: execParallel.c:561
bool ExecContextForcesOids(PlanState *planstate, bool *hasoids)
Definition: execMain.c:1513
PlanState * ExecInitNode(Plan *node, EState *estate, int eflags)
Definition: execProcnode.c:139
#define WL_LATCH_SET
Definition: latch.h:124
#define OUTER_VAR
Definition: primnodes.h:154
static TupleTableSlot * ExecProject(ProjectionInfo *projInfo)
Definition: executor.h:326
void ExecEndGather(GatherState *node)
Definition: nodeGather.c:238
#define ResetExprContext(econtext)
Definition: executor.h:468
int64 tuples_needed
Definition: execnodes.h:1956