PostgreSQL source file: src/backend/executor/nodeAgg.c (executor routines for aggregate nodes).
1 /*-------------------------------------------------------------------------
2  *
3  * nodeAgg.c
4  * Routines to handle aggregate nodes.
5  *
6  * ExecAgg normally evaluates each aggregate in the following steps:
7  *
8  * transvalue = initcond
9  * foreach input_tuple do
10  * transvalue = transfunc(transvalue, input_value(s))
11  * result = finalfunc(transvalue, direct_argument(s))
12  *
13  * If a finalfunc is not supplied then the result is just the ending
14  * value of transvalue.
15  *
16  * Other behaviors can be selected by the "aggsplit" mode, which exists
17  * to support partial aggregation. It is possible to:
18  * * Skip running the finalfunc, so that the output is always the
19  * final transvalue state.
20  * * Substitute the combinefunc for the transfunc, so that transvalue
21  * states (propagated up from a child partial-aggregation step) are merged
22  * rather than processing raw input rows. (The statements below about
23  * the transfunc apply equally to the combinefunc, when it's selected.)
24  * * Apply the serializefunc to the output values (this only makes sense
25  * when skipping the finalfunc, since the serializefunc works on the
26  * transvalue data type).
27  * * Apply the deserializefunc to the input values (this only makes sense
28  * when using the combinefunc, for similar reasons).
29  * It is the planner's responsibility to connect up Agg nodes using these
30  * alternate behaviors in a way that makes sense, with partial aggregation
31  * results being fed to nodes that expect them.
32  *
33  * If a normal aggregate call specifies DISTINCT or ORDER BY, we sort the
34  * input tuples and eliminate duplicates (if required) before performing
35  * the above-depicted process. (However, we don't do that for ordered-set
36  * aggregates; their "ORDER BY" inputs are ordinary aggregate arguments
37  * so far as this module is concerned.) Note that partial aggregation
38  * is not supported in these cases, since we couldn't ensure global
39  * ordering or distinctness of the inputs.
40  *
41  * If transfunc is marked "strict" in pg_proc and initcond is NULL,
42  * then the first non-NULL input_value is assigned directly to transvalue,
43  * and transfunc isn't applied until the second non-NULL input_value.
44  * The agg's first input type and transtype must be the same in this case!
45  *
46  * If transfunc is marked "strict" then NULL input_values are skipped,
47  * keeping the previous transvalue. If transfunc is not strict then it
48  * is called for every input tuple and must deal with NULL initcond
49  * or NULL input_values for itself.
50  *
51  * If finalfunc is marked "strict" then it is not called when the
52  * ending transvalue is NULL, instead a NULL result is created
53  * automatically (this is just the usual handling of strict functions,
54  * of course). A non-strict finalfunc can make its own choice of
55  * what to return for a NULL ending transvalue.
56  *
57  * Ordered-set aggregates are treated specially in one other way: we
58  * evaluate any "direct" arguments and pass them to the finalfunc along
59  * with the transition value.
60  *
61  * A finalfunc can have additional arguments beyond the transvalue and
62  * any "direct" arguments, corresponding to the input arguments of the
63  * aggregate. These are always just passed as NULL. Such arguments may be
64  * needed to allow resolution of a polymorphic aggregate's result type.
65  *
66  * We compute aggregate input expressions and run the transition functions
67  * in a temporary econtext (aggstate->tmpcontext). This is reset at least
68  * once per input tuple, so when the transvalue datatype is
69  * pass-by-reference, we have to be careful to copy it into a longer-lived
70  * memory context, and free the prior value to avoid memory leakage. We
71  * store transvalues in another set of econtexts, aggstate->aggcontexts
72  * (one per grouping set, see below), which are also used for the hashtable
73  * structures in AGG_HASHED mode. These econtexts are rescanned, not just
74  * reset, at group boundaries so that aggregate transition functions can
75  * register shutdown callbacks via AggRegisterCallback.
76  *
77  * The node's regular econtext (aggstate->ss.ps.ps_ExprContext) is used to
78  * run finalize functions and compute the output tuple; this context can be
79  * reset once per output tuple.
80  *
81  * The executor's AggState node is passed as the fmgr "context" value in
82  * all transfunc and finalfunc calls. It is not recommended that the
83  * transition functions look at the AggState node directly, but they can
84  * use AggCheckCallContext() to verify that they are being called by
85  * nodeAgg.c (and not as ordinary SQL functions). The main reason a
86  * transition function might want to know this is so that it can avoid
87  * palloc'ing a fixed-size pass-by-ref transition value on every call:
88  * it can instead just scribble on and return its left input. Ordinarily
89  * it is completely forbidden for functions to modify pass-by-ref inputs,
90  * but in the aggregate case we know the left input is either the initial
91  * transition value or a previous function result, and in either case its
92  * value need not be preserved. See int8inc() for an example. Notice that
93  * the EEOP_AGG_PLAIN_TRANS step is coded to avoid a data copy step when
94  * the previous transition value pointer is returned. It is also possible
95  * to avoid repeated data copying when the transition value is an expanded
96  * object: to do that, the transition function must take care to return
97  * an expanded object that is in a child context of the memory context
98  * returned by AggCheckCallContext(). Also, some transition functions want
99  * to store working state in addition to the nominal transition value; they
100  * can use the memory context returned by AggCheckCallContext() to do that.
101  *
102  * Note: AggCheckCallContext() is available as of PostgreSQL 9.0. The
103  * AggState is available as context in earlier releases (back to 8.1),
104  * but direct examination of the node is needed to use it before 9.0.
105  *
106  * As of 9.4, aggregate transition functions can also use AggGetAggref()
107  * to get hold of the Aggref expression node for their aggregate call.
108  * This is mainly intended for ordered-set aggregates, which are not
109  * supported as window functions. (A regular aggregate function would
110  * need some fallback logic to use this, since there's no Aggref node
111  * for a window function.)
112  *
113  * Grouping sets:
114  *
115  * A list of grouping sets which is structurally equivalent to a ROLLUP
116  * clause (e.g. (a,b,c), (a,b), (a)) can be processed in a single pass over
117  * ordered data. We do this by keeping a separate set of transition values
118  * for each grouping set being concurrently processed; for each input tuple
119  * we update them all, and on group boundaries we reset those states
120  * (starting at the front of the list) whose grouping values have changed
121  * (the list of grouping sets is ordered from most specific to least
122  * specific).
123  *
124  * Where more complex grouping sets are used, we break them down into
125  * "phases", where each phase has a different sort order (except phase 0
126  * which is reserved for hashing). During each phase but the last, the
127  * input tuples are additionally stored in a tuplesort which is keyed to the
128  * next phase's sort order; during each phase but the first, the input
129  * tuples are drawn from the previously sorted data. (The sorting of the
130  * data for the first phase is handled by the planner, as it might be
131  * satisfied by underlying nodes.)
132  *
133  * Hashing can be mixed with sorted grouping. To do this, we have an
134  * AGG_MIXED strategy that populates the hashtables during the first sorted
135  * phase, and switches to reading them out after completing all sort phases.
136  * We can also support AGG_HASHED with multiple hash tables and no sorting
137  * at all.
138  *
139  * From the perspective of aggregate transition and final functions, the
140  * only issue regarding grouping sets is this: a single call site (flinfo)
141  * of an aggregate function may be used for updating several different
142  * transition values in turn. So the function must not cache in the flinfo
143  * anything which logically belongs as part of the transition value (most
144  * importantly, the memory context in which the transition value exists).
145  * The support API functions (AggCheckCallContext, AggRegisterCallback) are
146  * sensitive to the grouping set for which the aggregate function is
147  * currently being called.
148  *
149  * Plan structure:
150  *
151  * What we get from the planner is actually one "real" Agg node which is
152  * part of the plan tree proper, but which optionally has an additional list
153  * of Agg nodes hung off the side via the "chain" field. This is because an
154  * Agg node happens to be a convenient representation of all the data we
155  * need for grouping sets.
156  *
157  * For many purposes, we treat the "real" node as if it were just the first
158  * node in the chain. The chain must be ordered such that hashed entries
159  * come before sorted/plain entries; the real node is marked AGG_MIXED if
160  * there are both types present (in which case the real node describes one
161  * of the hashed groupings, other AGG_HASHED nodes may optionally follow in
162  * the chain, followed in turn by AGG_SORTED or (one) AGG_PLAIN node). If
163  * the real node is marked AGG_HASHED or AGG_SORTED, then all the chained
164  * nodes must be of the same type; if it is AGG_PLAIN, there can be no
165  * chained nodes.
166  *
167  * We collect all hashed nodes into a single "phase", numbered 0, and create
168  * a sorted phase (numbered 1..n) for each AGG_SORTED or AGG_PLAIN node.
169  * Phase 0 is allocated even if there are no hashes, but remains unused in
170  * that case.
171  *
172  * AGG_HASHED nodes actually refer to only a single grouping set each,
173  * because for each hashed grouping we need a separate grpColIdx and
174  * numGroups estimate. AGG_SORTED nodes represent a "rollup", a list of
175  * grouping sets that share a sort order. Each AGG_SORTED node other than
176  * the first one has an associated Sort node which describes the sort order
177  * to be used; the first sorted node takes its input from the outer subtree,
178  * which the planner has already arranged to provide ordered data.
179  *
180  * Memory and ExprContext usage:
181  *
182  * Because we're accumulating aggregate values across input rows, we need to
183  * use more memory contexts than just simple input/output tuple contexts.
184  * In fact, for a rollup, we need a separate context for each grouping set
185  * so that we can reset the inner (finer-grained) aggregates on their group
186  * boundaries while continuing to accumulate values for outer
187  * (coarser-grained) groupings. On top of this, we might be simultaneously
188  * populating hashtables; however, we only need one context for all the
189  * hashtables.
190  *
191  * So we create an array, aggcontexts, with an ExprContext for each grouping
192  * set in the largest rollup that we're going to process, and use the
193  * per-tuple memory context of those ExprContexts to store the aggregate
194  * transition values. hashcontext is the single context created to support
195  * all hash tables.
196  *
197  * Spilling To Disk
198  *
199  * When performing hash aggregation, if the hash table memory exceeds the
200  * limit (see hash_agg_check_limits()), we enter "spill mode". In spill
201  * mode, we advance the transition states only for groups already in the
202  * hash table. For tuples that would need to create a new hash table
203  * entries (and initialize new transition states), we instead spill them to
204  * disk to be processed later. The tuples are spilled in a partitioned
205  * manner, so that subsequent batches are smaller and less likely to exceed
206  * hash_mem (if a batch does exceed hash_mem, it must be spilled
207  * recursively).
208  *
209  * Spilled data is written to logical tapes. These provide better control
210  * over memory usage, disk space, and the number of files than if we were
211  * to use a BufFile for each spill. We don't know the number of tapes needed
212  * at the start of the algorithm (because it can recurse), so a tape set is
213  * allocated at the beginning, and individual tapes are created as needed.
214  * As a particular tape is read, logtape.c recycles its disk space. When a
215  * tape is read to completion, it is destroyed entirely.
216  *
217  * Tapes' buffers can take up substantial memory when many tapes are open at
218  * once. We only need one tape open at a time in read mode (using a buffer
219  * that's a multiple of BLCKSZ); but we need one tape open in write mode (each
220  * requiring a buffer of size BLCKSZ) for each partition.
221  *
222  * Note that it's possible for transition states to start small but then
223  * grow very large; for instance in the case of ARRAY_AGG. In such cases,
224  * it's still possible to significantly exceed hash_mem. We try to avoid
225  * this situation by estimating what will fit in the available memory, and
226  * imposing a limit on the number of groups separately from the amount of
227  * memory consumed.
228  *
229  * Transition / Combine function invocation:
230  *
231  * For performance reasons transition functions, including combine
232  * functions, aren't invoked one-by-one from nodeAgg.c after computing
233  * arguments using the expression evaluation engine. Instead
234  * ExecBuildAggTrans() builds one large expression that does both argument
235  * evaluation and transition function invocation. That avoids performance
236  * issues due to repeated uses of expression evaluation, complications due
237  * to filter expressions having to be evaluated early, and allows to JIT
238  * the entire expression into one native function.
239  *
240  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
241  * Portions Copyright (c) 1994, Regents of the University of California
242  *
243  * IDENTIFICATION
244  * src/backend/executor/nodeAgg.c
245  *
246  *-------------------------------------------------------------------------
247  */
248 
249 #include "postgres.h"
250 
251 #include "access/htup_details.h"
252 #include "access/parallel.h"
253 #include "catalog/objectaccess.h"
254 #include "catalog/pg_aggregate.h"
255 #include "catalog/pg_proc.h"
256 #include "catalog/pg_type.h"
257 #include "common/hashfn.h"
258 #include "executor/execExpr.h"
259 #include "executor/executor.h"
260 #include "executor/nodeAgg.h"
261 #include "lib/hyperloglog.h"
262 #include "miscadmin.h"
263 #include "nodes/makefuncs.h"
264 #include "nodes/nodeFuncs.h"
265 #include "optimizer/optimizer.h"
266 #include "parser/parse_agg.h"
267 #include "parser/parse_coerce.h"
268 #include "utils/acl.h"
269 #include "utils/builtins.h"
270 #include "utils/datum.h"
271 #include "utils/dynahash.h"
272 #include "utils/expandeddatum.h"
273 #include "utils/logtape.h"
274 #include "utils/lsyscache.h"
275 #include "utils/memutils.h"
276 #include "utils/syscache.h"
277 #include "utils/tuplesort.h"
278 
/*
 * Control how many partitions are created when spilling HashAgg to
 * disk.
 *
 * HASHAGG_PARTITION_FACTOR is multiplied by the estimated number of
 * partitions needed such that each partition will fit in memory. The factor
 * is set higher than one because there's not a high cost to having a few too
 * many partitions, and it makes it less likely that a partition will need to
 * be spilled recursively. Another benefit of having more, smaller partitions
 * is that small hash tables may perform better than large ones due to memory
 * caching effects.
 *
 * We also specify a min and max number of partitions per spill. Too few might
 * mean a lot of wasted I/O from repeated spilling of the same tuples. Too
 * many will result in lots of memory wasted buffering the spill files (which
 * could instead be spent on a larger hash table).
 */
#define HASHAGG_PARTITION_FACTOR 1.50
#define HASHAGG_MIN_PARTITIONS 4
#define HASHAGG_MAX_PARTITIONS 1024

/*
 * For reading from tapes, the buffer size must be a multiple of
 * BLCKSZ. Larger values help when reading from multiple tapes concurrently,
 * but that doesn't happen in HashAgg, so we simply use BLCKSZ. Writing to a
 * tape always uses a buffer of size BLCKSZ.
 */
#define HASHAGG_READ_BUFFER_SIZE BLCKSZ
#define HASHAGG_WRITE_BUFFER_SIZE BLCKSZ

/*
 * HyperLogLog is used for estimating the cardinality of the spilled tuples in
 * a given partition. 5 bits corresponds to a size of about 32 bytes and a
 * worst-case error of around 18%. That's effective enough to choose a
 * reasonable number of partitions when recursing.
 */
#define HASHAGG_HLL_BIT_WIDTH 5

/*
 * Estimate chunk overhead as a constant 16 bytes. XXX: should this be
 * improved?
 */
#define CHUNKHDRSZ 16
322 
323 /*
324  * Represents partitioned spill data for a single hashtable. Contains the
325  * necessary information to route tuples to the correct partition, and to
326  * transform the spilled data into new batches.
327  *
328  * The high bits are used for partition selection (when recursing, we ignore
329  * the bits that have already been used for partition selection at an earlier
330  * level).
331  */
332 typedef struct HashAggSpill
333 {
334  int npartitions; /* number of partitions */
335  LogicalTape **partitions; /* spill partition tapes */
336  int64 *ntuples; /* number of tuples in each partition */
337  uint32 mask; /* mask to find partition from hash value */
338  int shift; /* after masking, shift by this amount */
339  hyperLogLogState *hll_card; /* cardinality estimate for contents */
340 } HashAggSpill;
341 
342 /*
343  * Represents work to be done for one pass of hash aggregation (with only one
344  * grouping set).
345  *
346  * Also tracks the bits of the hash already used for partition selection by
347  * earlier iterations, so that this batch can use new bits. If all bits have
348  * already been used, no partitioning will be done (any spilled data will go
349  * to a single output tape).
350  */
351 typedef struct HashAggBatch
352 {
353  int setno; /* grouping set */
354  int used_bits; /* number of bits of hash already used */
355  LogicalTape *input_tape; /* input partition tape */
356  int64 input_tuples; /* number of tuples in this batch */
357  double input_card; /* estimated group cardinality */
358 } HashAggBatch;
359 
360 /* used to find referenced colnos */
361 typedef struct FindColsContext
362 {
363  bool is_aggref; /* is under an aggref */
364  Bitmapset *aggregated; /* column references under an aggref */
365  Bitmapset *unaggregated; /* other column references */
367 
368 static void select_current_set(AggState *aggstate, int setno, bool is_hash);
369 static void initialize_phase(AggState *aggstate, int newphase);
370 static TupleTableSlot *fetch_input_tuple(AggState *aggstate);
371 static void initialize_aggregates(AggState *aggstate,
372  AggStatePerGroup *pergroups,
373  int numReset);
374 static void advance_transition_function(AggState *aggstate,
375  AggStatePerTrans pertrans,
376  AggStatePerGroup pergroupstate);
377 static void advance_aggregates(AggState *aggstate);
378 static void process_ordered_aggregate_single(AggState *aggstate,
379  AggStatePerTrans pertrans,
380  AggStatePerGroup pergroupstate);
381 static void process_ordered_aggregate_multi(AggState *aggstate,
382  AggStatePerTrans pertrans,
383  AggStatePerGroup pergroupstate);
384 static void finalize_aggregate(AggState *aggstate,
385  AggStatePerAgg peragg,
386  AggStatePerGroup pergroupstate,
387  Datum *resultVal, bool *resultIsNull);
388 static void finalize_partialaggregate(AggState *aggstate,
389  AggStatePerAgg peragg,
390  AggStatePerGroup pergroupstate,
391  Datum *resultVal, bool *resultIsNull);
392 static inline void prepare_hash_slot(AggStatePerHash perhash,
393  TupleTableSlot *inputslot,
394  TupleTableSlot *hashslot);
395 static void prepare_projection_slot(AggState *aggstate,
396  TupleTableSlot *slot,
397  int currentSet);
398 static void finalize_aggregates(AggState *aggstate,
399  AggStatePerAgg peragg,
400  AggStatePerGroup pergroup);
401 static TupleTableSlot *project_aggregates(AggState *aggstate);
402 static void find_cols(AggState *aggstate, Bitmapset **aggregated,
403  Bitmapset **unaggregated);
404 static bool find_cols_walker(Node *node, FindColsContext *context);
405 static void build_hash_tables(AggState *aggstate);
406 static void build_hash_table(AggState *aggstate, int setno, long nbuckets);
407 static void hashagg_recompile_expressions(AggState *aggstate, bool minslot,
408  bool nullcheck);
409 static long hash_choose_num_buckets(double hashentrysize,
410  long estimated_nbuckets,
411  Size memory);
412 static int hash_choose_num_partitions(double input_groups,
413  double hashentrysize,
414  int used_bits,
415  int *log2_npartittions);
416 static void initialize_hash_entry(AggState *aggstate,
417  TupleHashTable hashtable,
418  TupleHashEntry entry);
419 static void lookup_hash_entries(AggState *aggstate);
420 static TupleTableSlot *agg_retrieve_direct(AggState *aggstate);
421 static void agg_fill_hash_table(AggState *aggstate);
422 static bool agg_refill_hash_table(AggState *aggstate);
425 static void hash_agg_check_limits(AggState *aggstate);
426 static void hash_agg_enter_spill_mode(AggState *aggstate);
427 static void hash_agg_update_metrics(AggState *aggstate, bool from_tape,
428  int npartitions);
429 static void hashagg_finish_initial_spills(AggState *aggstate);
430 static void hashagg_reset_spill_state(AggState *aggstate);
431 static HashAggBatch *hashagg_batch_new(LogicalTape *input_tape, int setno,
432  int64 input_tuples, double input_card,
433  int used_bits);
434 static MinimalTuple hashagg_batch_read(HashAggBatch *batch, uint32 *hashp);
435 static void hashagg_spill_init(HashAggSpill *spill, LogicalTapeSet *lts,
436  int used_bits, double input_groups,
437  double hashentrysize);
438 static Size hashagg_spill_tuple(AggState *aggstate, HashAggSpill *spill,
439  TupleTableSlot *slot, uint32 hash);
440 static void hashagg_spill_finish(AggState *aggstate, HashAggSpill *spill,
441  int setno);
442 static Datum GetAggInitVal(Datum textInitVal, Oid transtype);
443 static void build_pertrans_for_aggref(AggStatePerTrans pertrans,
444  AggState *aggstate, EState *estate,
445  Aggref *aggref, Oid transfn_oid,
446  Oid aggtranstype, Oid aggserialfn,
447  Oid aggdeserialfn, Datum initValue,
448  bool initValueIsNull, Oid *inputTypes,
449  int numArguments);
450 
451 
452 /*
453  * Select the current grouping set; affects current_set and
454  * curaggcontext.
455  */
456 static void
457 select_current_set(AggState *aggstate, int setno, bool is_hash)
458 {
459  /*
460  * When changing this, also adapt ExecAggPlainTransByVal() and
461  * ExecAggPlainTransByRef().
462  */
463  if (is_hash)
464  aggstate->curaggcontext = aggstate->hashcontext;
465  else
466  aggstate->curaggcontext = aggstate->aggcontexts[setno];
467 
468  aggstate->current_set = setno;
469 }
470 
471 /*
472  * Switch to phase "newphase", which must either be 0 or 1 (to reset) or
473  * current_phase + 1. Juggle the tuplesorts accordingly.
474  *
475  * Phase 0 is for hashing, which we currently handle last in the AGG_MIXED
476  * case, so when entering phase 0, all we need to do is drop open sorts.
477  */
478 static void
479 initialize_phase(AggState *aggstate, int newphase)
480 {
481  Assert(newphase <= 1 || newphase == aggstate->current_phase + 1);
482 
483  /*
484  * Whatever the previous state, we're now done with whatever input
485  * tuplesort was in use.
486  */
487  if (aggstate->sort_in)
488  {
489  tuplesort_end(aggstate->sort_in);
490  aggstate->sort_in = NULL;
491  }
492 
493  if (newphase <= 1)
494  {
495  /*
496  * Discard any existing output tuplesort.
497  */
498  if (aggstate->sort_out)
499  {
500  tuplesort_end(aggstate->sort_out);
501  aggstate->sort_out = NULL;
502  }
503  }
504  else
505  {
506  /*
507  * The old output tuplesort becomes the new input one, and this is the
508  * right time to actually sort it.
509  */
510  aggstate->sort_in = aggstate->sort_out;
511  aggstate->sort_out = NULL;
512  Assert(aggstate->sort_in);
513  tuplesort_performsort(aggstate->sort_in);
514  }
515 
516  /*
517  * If this isn't the last phase, we need to sort appropriately for the
518  * next phase in sequence.
519  */
520  if (newphase > 0 && newphase < aggstate->numphases - 1)
521  {
522  Sort *sortnode = aggstate->phases[newphase + 1].sortnode;
523  PlanState *outerNode = outerPlanState(aggstate);
524  TupleDesc tupDesc = ExecGetResultType(outerNode);
525 
526  aggstate->sort_out = tuplesort_begin_heap(tupDesc,
527  sortnode->numCols,
528  sortnode->sortColIdx,
529  sortnode->sortOperators,
530  sortnode->collations,
531  sortnode->nullsFirst,
532  work_mem,
533  NULL, false);
534  }
535 
536  aggstate->current_phase = newphase;
537  aggstate->phase = &aggstate->phases[newphase];
538 }
539 
540 /*
541  * Fetch a tuple from either the outer plan (for phase 1) or from the sorter
542  * populated by the previous phase. Copy it to the sorter for the next phase
543  * if any.
544  *
545  * Callers cannot rely on memory for tuple in returned slot remaining valid
546  * past any subsequently fetched tuple.
547  */
548 static TupleTableSlot *
550 {
551  TupleTableSlot *slot;
552 
553  if (aggstate->sort_in)
554  {
555  /* make sure we check for interrupts in either path through here */
557  if (!tuplesort_gettupleslot(aggstate->sort_in, true, false,
558  aggstate->sort_slot, NULL))
559  return NULL;
560  slot = aggstate->sort_slot;
561  }
562  else
563  slot = ExecProcNode(outerPlanState(aggstate));
564 
565  if (!TupIsNull(slot) && aggstate->sort_out)
566  tuplesort_puttupleslot(aggstate->sort_out, slot);
567 
568  return slot;
569 }
570 
571 /*
572  * (Re)Initialize an individual aggregate.
573  *
574  * This function handles only one grouping set, already set in
575  * aggstate->current_set.
576  *
577  * When called, CurrentMemoryContext should be the per-query context.
578  */
579 static void
581  AggStatePerGroup pergroupstate)
582 {
583  /*
584  * Start a fresh sort operation for each DISTINCT/ORDER BY aggregate.
585  */
586  if (pertrans->numSortCols > 0)
587  {
588  /*
589  * In case of rescan, maybe there could be an uncompleted sort
590  * operation? Clean it up if so.
591  */
592  if (pertrans->sortstates[aggstate->current_set])
593  tuplesort_end(pertrans->sortstates[aggstate->current_set]);
594 
595 
596  /*
597  * We use a plain Datum sorter when there's a single input column;
598  * otherwise sort the full tuple. (See comments for
599  * process_ordered_aggregate_single.)
600  */
601  if (pertrans->numInputs == 1)
602  {
603  Form_pg_attribute attr = TupleDescAttr(pertrans->sortdesc, 0);
604 
605  pertrans->sortstates[aggstate->current_set] =
606  tuplesort_begin_datum(attr->atttypid,
607  pertrans->sortOperators[0],
608  pertrans->sortCollations[0],
609  pertrans->sortNullsFirst[0],
610  work_mem, NULL, false);
611  }
612  else
613  pertrans->sortstates[aggstate->current_set] =
614  tuplesort_begin_heap(pertrans->sortdesc,
615  pertrans->numSortCols,
616  pertrans->sortColIdx,
617  pertrans->sortOperators,
618  pertrans->sortCollations,
619  pertrans->sortNullsFirst,
620  work_mem, NULL, false);
621  }
622 
623  /*
624  * (Re)set transValue to the initial value.
625  *
626  * Note that when the initial value is pass-by-ref, we must copy it (into
627  * the aggcontext) since we will pfree the transValue later.
628  */
629  if (pertrans->initValueIsNull)
630  pergroupstate->transValue = pertrans->initValue;
631  else
632  {
633  MemoryContext oldContext;
634 
636  pergroupstate->transValue = datumCopy(pertrans->initValue,
637  pertrans->transtypeByVal,
638  pertrans->transtypeLen);
639  MemoryContextSwitchTo(oldContext);
640  }
641  pergroupstate->transValueIsNull = pertrans->initValueIsNull;
642 
643  /*
644  * If the initial value for the transition state doesn't exist in the
645  * pg_aggregate table then we will let the first non-NULL value returned
646  * from the outer procNode become the initial value. (This is useful for
647  * aggregates like max() and min().) The noTransValue flag signals that we
648  * still need to do this.
649  */
650  pergroupstate->noTransValue = pertrans->initValueIsNull;
651 }
652 
653 /*
654  * Initialize all aggregate transition states for a new group of input values.
655  *
656  * If there are multiple grouping sets, we initialize only the first numReset
657  * of them (the grouping sets are ordered so that the most specific one, which
658  * is reset most often, is first). As a convenience, if numReset is 0, we
659  * reinitialize all sets.
660  *
661  * NB: This cannot be used for hash aggregates, as for those the grouping set
662  * number has to be specified from further up.
663  *
664  * When called, CurrentMemoryContext should be the per-query context.
665  */
666 static void
668  AggStatePerGroup *pergroups,
669  int numReset)
670 {
671  int transno;
672  int numGroupingSets = Max(aggstate->phase->numsets, 1);
673  int setno = 0;
674  int numTrans = aggstate->numtrans;
675  AggStatePerTrans transstates = aggstate->pertrans;
676 
677  if (numReset == 0)
678  numReset = numGroupingSets;
679 
680  for (setno = 0; setno < numReset; setno++)
681  {
682  AggStatePerGroup pergroup = pergroups[setno];
683 
684  select_current_set(aggstate, setno, false);
685 
686  for (transno = 0; transno < numTrans; transno++)
687  {
688  AggStatePerTrans pertrans = &transstates[transno];
689  AggStatePerGroup pergroupstate = &pergroup[transno];
690 
691  initialize_aggregate(aggstate, pertrans, pergroupstate);
692  }
693  }
694 }
695 
696 /*
697  * Given new input value(s), advance the transition function of one aggregate
698  * state within one grouping set only (already set in aggstate->current_set)
699  *
700  * The new values (and null flags) have been preloaded into argument positions
701  * 1 and up in pertrans->transfn_fcinfo, so that we needn't copy them again to
702  * pass to the transition function. We also expect that the static fields of
703  * the fcinfo are already initialized; that was done by ExecInitAgg().
704  *
705  * It doesn't matter which memory context this is called in.
706  */
static void
/*
 * NOTE(review): the function-name/signature line was lost in extraction;
 * judging from the header comment above, this is
 * advance_transition_function(AggState *aggstate, ...) — confirm upstream.
 */
							AggStatePerTrans pertrans,
							AggStatePerGroup pergroupstate)
{
	FunctionCallInfo fcinfo = pertrans->transfn_fcinfo;
	MemoryContext oldContext;
	Datum		newVal;

	if (pertrans->transfn.fn_strict)
	{
		/*
		 * For a strict transfn, nothing happens when there's a NULL input; we
		 * just keep the prior transValue.
		 */
		int			numTransInputs = pertrans->numTransInputs;
		int			i;

		/* bail out on the first NULL argument (args start at position 1) */
		for (i = 1; i <= numTransInputs; i++)
		{
			if (fcinfo->args[i].isnull)
				return;
		}
		if (pergroupstate->noTransValue)
		{
			/*
			 * transValue has not been initialized. This is the first non-NULL
			 * input value. We use it as the initial value for transValue. (We
			 * already checked that the agg's input type is binary-compatible
			 * with its transtype, so straight copy here is OK.)
			 *
			 * We must copy the datum into aggcontext if it is pass-by-ref. We
			 * do not need to pfree the old transValue, since it's NULL.
			 */
			/*
			 * NOTE(review): a line assigning oldContext (presumably a
			 * MemoryContextSwitchTo into the aggregate context, so the copy
			 * below lands there) is missing from this text; oldContext is
			 * restored a few lines down but never set on this path here —
			 * verify against upstream.
			 */
			pergroupstate->transValue = datumCopy(fcinfo->args[1].value,
												  pertrans->transtypeByVal,
												  pertrans->transtypeLen);
			pergroupstate->transValueIsNull = false;
			pergroupstate->noTransValue = false;
			MemoryContextSwitchTo(oldContext);
			return;
		}
		if (pergroupstate->transValueIsNull)
		{
			/*
			 * Don't call a strict function with NULL inputs. Note it is
			 * possible to get here despite the above tests, if the transfn is
			 * strict *and* returned a NULL on a prior cycle. If that happens
			 * we will propagate the NULL all the way to the end.
			 */
			return;
		}
	}

	/* We run the transition functions in per-input-tuple memory context */
	oldContext = MemoryContextSwitchTo(aggstate->tmpcontext->ecxt_per_tuple_memory);

	/* set up aggstate->curpertrans for AggGetAggref() */
	aggstate->curpertrans = pertrans;

	/*
	 * OK to call the transition function
	 */
	fcinfo->args[0].value = pergroupstate->transValue;
	fcinfo->args[0].isnull = pergroupstate->transValueIsNull;
	fcinfo->isnull = false;		/* just in case transfn doesn't set it */

	newVal = FunctionCallInvoke(fcinfo);

	/* no longer inside a transfn call */
	aggstate->curpertrans = NULL;

	/*
	 * If pass-by-ref datatype, must copy the new value into aggcontext and
	 * free the prior transValue. But if transfn returned a pointer to its
	 * first input, we don't need to do anything. Also, if transfn returned a
	 * pointer to a R/W expanded object that is already a child of the
	 * aggcontext, assume we can adopt that value without copying it.
	 *
	 * It's safe to compare newVal with pergroup->transValue without regard
	 * for either being NULL, because ExecAggTransReparent() takes care to set
	 * transValue to 0 when NULL. Otherwise we could end up accidentally not
	 * reparenting, when the transValue has the same numerical value as
	 * newValue, despite being NULL. This is a somewhat hot path, making it
	 * undesirable to instead solve this with another branch for the common
	 * case of the transition function returning its (modified) input
	 * argument.
	 */
	if (!pertrans->transtypeByVal &&
		DatumGetPointer(newVal) != DatumGetPointer(pergroupstate->transValue))
		newVal = ExecAggTransReparent(aggstate, pertrans,
									  newVal, fcinfo->isnull,
									  pergroupstate->transValue,
									  pergroupstate->transValueIsNull);

	pergroupstate->transValue = newVal;
	pergroupstate->transValueIsNull = fcinfo->isnull;

	MemoryContextSwitchTo(oldContext);
}
807 
808 /*
809  * Advance each aggregate transition state for one input tuple. The input
810  * tuple has been stored in tmpcontext->ecxt_outertuple, so that it is
811  * accessible to ExecEvalExpr.
812  *
813  * We have two sets of transition states to handle: one for sorted aggregation
814  * and one for hashed; we do them both here, to avoid multiple evaluation of
815  * the inputs.
816  *
817  * When called, CurrentMemoryContext should be the per-query context.
818  */
static void
/*
 * NOTE(review): signature line lost in extraction; per the comment above this
 * is presumably advance_aggregates(AggState *aggstate) — confirm upstream.
 */
{
	bool		dummynull;

	/*
	 * NOTE(review): the head of this call expression is missing from this
	 * text (presumably an ExecEvalExprSwitchContext() invocation of the
	 * phase's compiled transition expression) — verify against upstream.
	 * The expression's scalar result is not interesting; only its side
	 * effects on the transition states matter, hence dummynull.
	 */
				 aggstate->tmpcontext,
				 &dummynull);
}
828 
829 /*
830  * Run the transition function for a DISTINCT or ORDER BY aggregate
831  * with only one input. This is called after we have completed
832  * entering all the input values into the sort object. We complete the
833  * sort, read out the values in sorted order, and run the transition
834  * function on each value (applying DISTINCT if appropriate).
835  *
836  * Note that the strictness of the transition function was checked when
837  * entering the values into the sort, so we don't check it again here;
838  * we just apply standard SQL DISTINCT logic.
839  *
840  * The one-input case is handled separately from the multi-input case
841  * for performance reasons: for single by-value inputs, such as the
842  * common case of count(distinct id), the tuplesort_getdatum code path
843  * is around 300% faster. (The speedup for by-reference types is less
844  * but still noticeable.)
845  *
846  * This function handles only one grouping set (already set in
847  * aggstate->current_set).
848  *
849  * When called, CurrentMemoryContext should be the per-query context.
850  */
static void
/*
 * NOTE(review): signature line lost in extraction; per the comment above this
 * is presumably process_ordered_aggregate_single(AggState *aggstate, ...) —
 * confirm upstream.
 */
								 AggStatePerTrans pertrans,
								 AggStatePerGroup pergroupstate)
{
	Datum		oldVal = (Datum) 0;
	bool		oldIsNull = true;
	bool		haveOldVal = false;
	MemoryContext workcontext = aggstate->tmpcontext->ecxt_per_tuple_memory;
	MemoryContext oldContext;
	bool		isDistinct = (pertrans->numDistinctCols > 0);
	Datum		newAbbrevVal = (Datum) 0;
	Datum		oldAbbrevVal = (Datum) 0;
	FunctionCallInfo fcinfo = pertrans->transfn_fcinfo;
	Datum	   *newVal;
	bool	   *isNull;

	/* this fast path handles at most one DISTINCT column */
	Assert(pertrans->numDistinctCols < 2);

	tuplesort_performsort(pertrans->sortstates[aggstate->current_set]);

	/* Load the column into argument 1 (arg 0 will be transition value) */
	newVal = &fcinfo->args[1].value;
	isNull = &fcinfo->args[1].isnull;

	/*
	 * Note: if input type is pass-by-ref, the datums returned by the sort are
	 * freshly palloc'd in the per-query context, so we must be careful to
	 * pfree them when they are no longer needed.
	 */

	while (tuplesort_getdatum(pertrans->sortstates[aggstate->current_set],
							  true, newVal, isNull, &newAbbrevVal))
	{
		/*
		 * Clear and select the working context for evaluation of the equality
		 * function and transition function.
		 */
		MemoryContextReset(workcontext);
		oldContext = MemoryContextSwitchTo(workcontext);

		/*
		 * If DISTINCT mode, and not distinct from prior, skip it.
		 */
		/*
		 * NOTE(review): one line of this condition is missing from this text
		 * (presumably the datum-equality call, e.g.
		 * DatumGetBool(FunctionCall2Coll(pertrans->equalfnOne, ...) preceding
		 * the collation/operand lines below) — verify against upstream.
		 */
		if (isDistinct &&
			haveOldVal &&
			((oldIsNull && *isNull) ||
			 (!oldIsNull && !*isNull &&
			  oldAbbrevVal == newAbbrevVal &&
										   pertrans->aggCollation,
										   oldVal, *newVal)))))
		{
			/* equal to prior, so forget this one */
			if (!pertrans->inputtypeByVal && !*isNull)
				pfree(DatumGetPointer(*newVal));
		}
		else
		{
			advance_transition_function(aggstate, pertrans, pergroupstate);
			/* forget the old value, if any */
			if (!oldIsNull && !pertrans->inputtypeByVal)
				pfree(DatumGetPointer(oldVal));
			/* and remember the new one for subsequent equality checks */
			oldVal = *newVal;
			oldAbbrevVal = newAbbrevVal;
			oldIsNull = *isNull;
			haveOldVal = true;
		}

		MemoryContextSwitchTo(oldContext);
	}

	/* release the last remembered value, if pass-by-ref */
	if (!oldIsNull && !pertrans->inputtypeByVal)
		pfree(DatumGetPointer(oldVal));

	/* done with this sort state; caller expects it cleared */
	tuplesort_end(pertrans->sortstates[aggstate->current_set]);
	pertrans->sortstates[aggstate->current_set] = NULL;
}
930 
931 /*
932  * Run the transition function for a DISTINCT or ORDER BY aggregate
933  * with more than one input. This is called after we have completed
934  * entering all the input values into the sort object. We complete the
935  * sort, read out the values in sorted order, and run the transition
936  * function on each value (applying DISTINCT if appropriate).
937  *
938  * This function handles only one grouping set (already set in
939  * aggstate->current_set).
940  *
941  * When called, CurrentMemoryContext should be the per-query context.
942  */
static void
/*
 * NOTE(review): signature line lost in extraction; per the comment above this
 * is presumably process_ordered_aggregate_multi(AggState *aggstate, ...) —
 * confirm upstream.
 */
								AggStatePerTrans pertrans,
								AggStatePerGroup pergroupstate)
{
	ExprContext *tmpcontext = aggstate->tmpcontext;
	FunctionCallInfo fcinfo = pertrans->transfn_fcinfo;
	TupleTableSlot *slot1 = pertrans->sortslot;
	TupleTableSlot *slot2 = pertrans->uniqslot;	/* prior tuple, for DISTINCT */
	int			numTransInputs = pertrans->numTransInputs;
	int			numDistinctCols = pertrans->numDistinctCols;
	Datum		newAbbrevVal = (Datum) 0;
	Datum		oldAbbrevVal = (Datum) 0;
	bool		haveOldValue = false;
	TupleTableSlot *save = aggstate->tmpcontext->ecxt_outertuple;
	int			i;

	tuplesort_performsort(pertrans->sortstates[aggstate->current_set]);

	ExecClearTuple(slot1);
	if (slot2)
		ExecClearTuple(slot2);

	while (tuplesort_gettupleslot(pertrans->sortstates[aggstate->current_set],
								  true, true, slot1, &newAbbrevVal))
	{
		/*
		 * NOTE(review): a statement appears to be missing here in this text
		 * (presumably CHECK_FOR_INTERRUPTS();) — verify against upstream.
		 */

		/* expose current and prior tuples to the equality qual */
		tmpcontext->ecxt_outertuple = slot1;
		tmpcontext->ecxt_innertuple = slot2;

		/* run transfn unless DISTINCT says this tuple duplicates the prior */
		if (numDistinctCols == 0 ||
			!haveOldValue ||
			newAbbrevVal != oldAbbrevVal ||
			!ExecQual(pertrans->equalfnMulti, tmpcontext))
		{
			/*
			 * Extract the first numTransInputs columns as datums to pass to
			 * the transfn.
			 */
			slot_getsomeattrs(slot1, numTransInputs);

			/* Load values into fcinfo */
			/* Start from 1, since the 0th arg will be the transition value */
			for (i = 0; i < numTransInputs; i++)
			{
				fcinfo->args[i + 1].value = slot1->tts_values[i];
				fcinfo->args[i + 1].isnull = slot1->tts_isnull[i];
			}

			advance_transition_function(aggstate, pertrans, pergroupstate);

			if (numDistinctCols > 0)
			{
				/* swap the slot pointers to retain the current tuple */
				TupleTableSlot *tmpslot = slot2;

				slot2 = slot1;
				slot1 = tmpslot;
				/* avoid ExecQual() calls by reusing abbreviated keys */
				oldAbbrevVal = newAbbrevVal;
				haveOldValue = true;
			}
		}

		/* Reset context each time */
		ResetExprContext(tmpcontext);

		ExecClearTuple(slot1);
	}

	if (slot2)
		ExecClearTuple(slot2);

	/* done with this sort state; caller expects it cleared */
	tuplesort_end(pertrans->sortstates[aggstate->current_set]);
	pertrans->sortstates[aggstate->current_set] = NULL;

	/* restore previous slot, potentially in use for grouping sets */
	tmpcontext->ecxt_outertuple = save;
}
1023 
1024 /*
1025  * Compute the final value of one aggregate.
1026  *
1027  * This function handles only one grouping set (already set in
1028  * aggstate->current_set).
1029  *
1030  * The finalfn will be run, and the result delivered, in the
1031  * output-tuple context; caller's CurrentMemoryContext does not matter.
1032  *
1033  * The finalfn uses the state as set in the transno. This also might be
1034  * being used by another aggregate function, so it's important that we do
1035  * nothing destructive here.
1036  */
static void
/*
 * NOTE(review): signature line lost in extraction; per the comment above this
 * is presumably finalize_aggregate(AggState *aggstate, ...) — confirm
 * upstream.
 */
				   AggStatePerAgg peragg,
				   AggStatePerGroup pergroupstate,
				   Datum *resultVal, bool *resultIsNull)
{
	LOCAL_FCINFO(fcinfo, FUNC_MAX_ARGS);
	bool		anynull = false;
	MemoryContext oldContext;
	int			i;
	ListCell   *lc;
	AggStatePerTrans pertrans = &aggstate->pertrans[peragg->transno];

	/*
	 * NOTE(review): a line assigning oldContext appears to be missing here
	 * (presumably a MemoryContextSwitchTo into the output-tuple context, per
	 * the function's header comment); oldContext is restored at the bottom —
	 * verify against upstream.
	 */

	/*
	 * Evaluate any direct arguments. We do this even if there's no finalfn
	 * (which is unlikely anyway), so that side-effects happen as expected.
	 * The direct arguments go into arg positions 1 and up, leaving position 0
	 * for the transition state value.
	 */
	i = 1;
	foreach(lc, peragg->aggdirectargs)
	{
		ExprState  *expr = (ExprState *) lfirst(lc);

		fcinfo->args[i].value = ExecEvalExpr(expr,
											 aggstate->ss.ps.ps_ExprContext,
											 &fcinfo->args[i].isnull);
		anynull |= fcinfo->args[i].isnull;
		i++;
	}

	/*
	 * Apply the agg's finalfn if one is provided, else return transValue.
	 */
	if (OidIsValid(peragg->finalfn_oid))
	{
		int			numFinalArgs = peragg->numFinalArgs;

		/* set up aggstate->curperagg for AggGetAggref() */
		aggstate->curperagg = peragg;

		InitFunctionCallInfoData(*fcinfo, &peragg->finalfn,
								 numFinalArgs,
								 pertrans->aggCollation,
								 (void *) aggstate, NULL);

		/* Fill in the transition state value */
		fcinfo->args[0].value =
			MakeExpandedObjectReadOnly(pergroupstate->transValue,
									   pergroupstate->transValueIsNull,
									   pertrans->transtypeLen);
		fcinfo->args[0].isnull = pergroupstate->transValueIsNull;
		anynull |= pergroupstate->transValueIsNull;

		/* Fill any remaining argument positions with nulls */
		for (; i < numFinalArgs; i++)
		{
			fcinfo->args[i].value = (Datum) 0;
			fcinfo->args[i].isnull = true;
			anynull = true;
		}

		if (fcinfo->flinfo->fn_strict && anynull)
		{
			/* don't call a strict function with NULL inputs */
			*resultVal = (Datum) 0;
			*resultIsNull = true;
		}
		else
		{
			*resultVal = FunctionCallInvoke(fcinfo);
			*resultIsNull = fcinfo->isnull;
		}
		aggstate->curperagg = NULL;
	}
	else
	{
		/* Don't need MakeExpandedObjectReadOnly; datumCopy will copy it */
		*resultVal = pergroupstate->transValue;
		*resultIsNull = pergroupstate->transValueIsNull;
	}

	/*
	 * If result is pass-by-ref, make sure it is in the right context.
	 */
	/*
	 * NOTE(review): a line of this condition is missing from this text
	 * (presumably a memory-context containment test wrapping the
	 * DatumGetPointer(*resultVal) operand below) — verify against upstream.
	 */
	if (!peragg->resulttypeByVal && !*resultIsNull &&
						  DatumGetPointer(*resultVal)))
		*resultVal = datumCopy(*resultVal,
							   peragg->resulttypeByVal,
							   peragg->resulttypeLen);

	MemoryContextSwitchTo(oldContext);
}
1133 
1134 /*
1135  * Compute the output value of one partial aggregate.
1136  *
1137  * The serialization function will be run, and the result delivered, in the
1138  * output-tuple context; caller's CurrentMemoryContext does not matter.
1139  */
static void
/*
 * NOTE(review): signature line lost in extraction; per the comment above this
 * is presumably finalize_partialaggregate(AggState *aggstate, ...) — confirm
 * upstream.
 */
						  AggStatePerAgg peragg,
						  AggStatePerGroup pergroupstate,
						  Datum *resultVal, bool *resultIsNull)
{
	AggStatePerTrans pertrans = &aggstate->pertrans[peragg->transno];
	MemoryContext oldContext;

	/*
	 * NOTE(review): a line assigning oldContext appears to be missing here
	 * (presumably a MemoryContextSwitchTo into the output-tuple context, per
	 * the function's header comment); oldContext is restored at the bottom —
	 * verify against upstream.
	 */

	/*
	 * serialfn_oid will be set if we must serialize the transvalue before
	 * returning it
	 */
	if (OidIsValid(pertrans->serialfn_oid))
	{
		/* Don't call a strict serialization function with NULL input. */
		if (pertrans->serialfn.fn_strict && pergroupstate->transValueIsNull)
		{
			*resultVal = (Datum) 0;
			*resultIsNull = true;
		}
		else
		{
			FunctionCallInfo fcinfo = pertrans->serialfn_fcinfo;

			/* serialfn takes the transvalue as its single argument */
			fcinfo->args[0].value =
				MakeExpandedObjectReadOnly(pergroupstate->transValue,
										   pergroupstate->transValueIsNull,
										   pertrans->transtypeLen);
			fcinfo->args[0].isnull = pergroupstate->transValueIsNull;
			fcinfo->isnull = false;

			*resultVal = FunctionCallInvoke(fcinfo);
			*resultIsNull = fcinfo->isnull;
		}
	}
	else
	{
		/* Don't need MakeExpandedObjectReadOnly; datumCopy will copy it */
		*resultVal = pergroupstate->transValue;
		*resultIsNull = pergroupstate->transValueIsNull;
	}

	/* If result is pass-by-ref, make sure it is in the right context. */
	/*
	 * NOTE(review): a line of this condition is missing from this text
	 * (presumably a memory-context containment test wrapping the
	 * DatumGetPointer(*resultVal) operand below) — verify against upstream.
	 */
	if (!peragg->resulttypeByVal && !*resultIsNull &&
						  DatumGetPointer(*resultVal)))
		*resultVal = datumCopy(*resultVal,
							   peragg->resulttypeByVal,
							   peragg->resulttypeLen);

	MemoryContextSwitchTo(oldContext);
}
1195 
1196 /*
1197  * Extract the attributes that make up the grouping key into the
1198  * hashslot. This is necessary to compute the hash or perform a lookup.
1199  */
static inline void
/*
 * NOTE(review): signature line lost in extraction; per the comment above this
 * is presumably prepare_hash_slot(AggStatePerHash perhash, ...) — confirm
 * upstream.
 */
				  TupleTableSlot *inputslot,
				  TupleTableSlot *hashslot)
{
	int			i;

	/* transfer just the needed columns into hashslot */
	slot_getsomeattrs(inputslot, perhash->largestGrpColIdx);
	ExecClearTuple(hashslot);

	for (i = 0; i < perhash->numhashGrpCols; i++)
	{
		/* hashGrpColIdxInput maps hashslot position -> 1-based input attno */
		int			varNumber = perhash->hashGrpColIdxInput[i] - 1;

		hashslot->tts_values[i] = inputslot->tts_values[varNumber];
		hashslot->tts_isnull[i] = inputslot->tts_isnull[varNumber];
	}
	/* mark the slot as holding a valid virtual tuple */
	ExecStoreVirtualTuple(hashslot);
}
1220 
1221 /*
1222  * Prepare to finalize and project based on the specified representative tuple
1223  * slot and grouping set.
1224  *
1225  * In the specified tuple slot, force to null all attributes that should be
1226  * read as null in the context of the current grouping set. Also stash the
1227  * current group bitmap where GroupingExpr can get at it.
1228  *
1229  * This relies on three conditions:
1230  *
1231  * 1) Nothing is ever going to try and extract the whole tuple from this slot,
1232  * only reference it in evaluations, which will only access individual
1233  * attributes.
1234  *
1235  * 2) No system columns are going to need to be nulled. (If a system column is
1236  * referenced in a group clause, it is actually projected in the outer plan
1237  * tlist.)
1238  *
1239  * 3) Within a given phase, we never need to recover the value of an attribute
1240  * once it has been set to null.
1241  *
1242  * Poking into the slot this way is a bit ugly, but the consensus is that the
1243  * alternative was worse.
1244  */
static void
prepare_projection_slot(AggState *aggstate, TupleTableSlot *slot, int currentSet)
{
	if (aggstate->phase->grouped_cols)
	{
		/* bitmap of columns actually grouped in this grouping set */
		Bitmapset  *grouped_cols = aggstate->phase->grouped_cols[currentSet];

		/* stash it where GroupingExpr evaluation can find it */
		aggstate->grouped_cols = grouped_cols;

		if (TTS_EMPTY(slot))
		{
			/*
			 * Force all values to be NULL if working on an empty input tuple
			 * (i.e. an empty grouping set for which no input rows were
			 * supplied).
			 */
			ExecStoreAllNullTuple(slot);
		}
		else if (aggstate->all_grouped_cols)
		{
			ListCell   *lc;

			/* all_grouped_cols is arranged in desc order */
			/*
			 * NOTE(review): a statement appears to be missing here in this
			 * text (presumably slot_getallattrs(slot); to deform the tuple
			 * before poking tts_isnull below) — verify against upstream.
			 */

			foreach(lc, aggstate->all_grouped_cols)
			{
				int			attnum = lfirst_int(lc);

				/* null out columns not grouped in the current set */
				if (!bms_is_member(attnum, grouped_cols))
					slot->tts_isnull[attnum - 1] = true;
			}
		}
	}
}
1280 
1281 /*
1282  * Compute the final value of all aggregates for one group.
1283  *
1284  * This function handles only one grouping set at a time, which the caller must
1285  * have selected. It's also the caller's responsibility to adjust the supplied
1286  * pergroup parameter to point to the current set's transvalues.
1287  *
1288  * Results are stored in the output econtext aggvalues/aggnulls.
1289  */
static void
/*
 * NOTE(review): signature line lost in extraction; per the comment above this
 * is presumably finalize_aggregates(AggState *aggstate, ...) — confirm
 * upstream.
 */
					AggStatePerAgg peraggs,
					AggStatePerGroup pergroup)
{
	ExprContext *econtext = aggstate->ss.ps.ps_ExprContext;
	Datum	   *aggvalues = econtext->ecxt_aggvalues;
	bool	   *aggnulls = econtext->ecxt_aggnulls;
	int			aggno;
	int			transno;

	/*
	 * If there were any DISTINCT and/or ORDER BY aggregates, sort their
	 * inputs and run the transition functions.
	 */
	for (transno = 0; transno < aggstate->numtrans; transno++)
	{
		AggStatePerTrans pertrans = &aggstate->pertrans[transno];
		AggStatePerGroup pergroupstate;

		pergroupstate = &pergroup[transno];

		if (pertrans->numSortCols > 0)
		{
			/* sorted input is incompatible with hashed strategies */
			Assert(aggstate->aggstrategy != AGG_HASHED &&
				   aggstate->aggstrategy != AGG_MIXED);

			/*
			 * NOTE(review): the call heads on both branches below are
			 * missing from this text (presumably
			 * process_ordered_aggregate_single(aggstate, and
			 * process_ordered_aggregate_multi(aggstate, respectively) —
			 * verify against upstream.
			 */
			if (pertrans->numInputs == 1)
												 pertrans,
												 pergroupstate);
			else
												pertrans,
												pergroupstate);
		}
	}

	/*
	 * Run the final functions.
	 */
	for (aggno = 0; aggno < aggstate->numaggs; aggno++)
	{
		AggStatePerAgg peragg = &peraggs[aggno];
		int			transno = peragg->transno;
		AggStatePerGroup pergroupstate;

		pergroupstate = &pergroup[transno];

		/* partial aggregation skips the finalfn and may serialize instead */
		if (DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit))
			finalize_partialaggregate(aggstate, peragg, pergroupstate,
									  &aggvalues[aggno], &aggnulls[aggno]);
		else
			finalize_aggregate(aggstate, peragg, pergroupstate,
							   &aggvalues[aggno], &aggnulls[aggno]);
	}
}
1347 
1348 /*
1349  * Project the result of a group (whose aggs have already been calculated by
1350  * finalize_aggregates). Returns the result slot, or NULL if no row is
1351  * projected (suppressed by qual).
1352  */
static TupleTableSlot *
/*
 * NOTE(review): signature line lost in extraction; per the comment above this
 * is presumably project_aggregates(AggState *aggstate) — confirm upstream.
 */
{
	ExprContext *econtext = aggstate->ss.ps.ps_ExprContext;

	/*
	 * Check the qual (HAVING clause); if the group does not match, ignore it.
	 */
	if (ExecQual(aggstate->ss.ps.qual, econtext))
	{
		/*
		 * Form and return projection tuple using the aggregate results and
		 * the representative input tuple.
		 */
		return ExecProject(aggstate->ss.ps.ps_ProjInfo);
	}
	else
		/* count the row as filtered out by the HAVING qual */
		InstrCountFiltered1(aggstate, 1);

	return NULL;
}
1374 
1375 /*
1376  * Find input-tuple columns that are needed, dividing them into
1377  * aggregated and unaggregated sets.
1378  */
1379 static void
1380 find_cols(AggState *aggstate, Bitmapset **aggregated, Bitmapset **unaggregated)
1381 {
1382  Agg *agg = (Agg *) aggstate->ss.ps.plan;
1383  FindColsContext context;
1384 
1385  context.is_aggref = false;
1386  context.aggregated = NULL;
1387  context.unaggregated = NULL;
1388 
1389  /* Examine tlist and quals */
1390  (void) find_cols_walker((Node *) agg->plan.targetlist, &context);
1391  (void) find_cols_walker((Node *) agg->plan.qual, &context);
1392 
1393  /* In some cases, grouping columns will not appear in the tlist */
1394  for (int i = 0; i < agg->numCols; i++)
1395  context.unaggregated = bms_add_member(context.unaggregated,
1396  agg->grpColIdx[i]);
1397 
1398  *aggregated = context.aggregated;
1399  *unaggregated = context.unaggregated;
1400 }
1401 
static bool
/*
 * NOTE(review): signature line lost in extraction; presumably
 * find_cols_walker(Node *node, FindColsContext *context) — confirm upstream.
 */
{
	if (node == NULL)
		return false;
	if (IsA(node, Var))
	{
		Var		   *var = (Var *) node;

		/* setrefs.c should have set the varno to OUTER_VAR */
		Assert(var->varno == OUTER_VAR);
		Assert(var->varlevelsup == 0);
		/* classify the column by whether we are currently inside an Aggref */
		if (context->is_aggref)
			context->aggregated = bms_add_member(context->aggregated,
												 var->varattno);
		else
			context->unaggregated = bms_add_member(context->unaggregated,
												   var->varattno);
		return false;
	}
	if (IsA(node, Aggref))
	{
		/* Aggrefs don't nest, so the flag must be clear here */
		Assert(!context->is_aggref);
		context->is_aggref = true;
		/* Vars found below this point are aggregated inputs */
		expression_tree_walker(node, find_cols_walker, (void *) context);
		context->is_aggref = false;
		return false;
	}
	/*
	 * NOTE(review): the head of the recursive call is missing from this text
	 * (presumably "return expression_tree_walker(node, find_cols_walker,") —
	 * verify against upstream.
	 */
								  (void *) context);
}
1433 
1434 /*
1435  * (Re-)initialize the hash table(s) to empty.
1436  *
1437  * To implement hashed aggregation, we need a hashtable that stores a
1438  * representative tuple and an array of AggStatePerGroup structs for each
1439  * distinct set of GROUP BY column values. We compute the hash key from the
1440  * GROUP BY columns. The per-group data is allocated in lookup_hash_entry(),
1441  * for each entry.
1442  *
1443  * We have a separate hashtable and associated perhash data structure for each
1444  * grouping set for which we're doing hashing.
1445  *
1446  * The contents of the hash tables always live in the hashcontext's per-tuple
1447  * memory context (there is only one of these for all tables together, since
1448  * they are all reset at the same time).
1449  */
static void
/*
 * NOTE(review): signature line lost in extraction; per the comment above this
 * is presumably build_hash_tables(AggState *aggstate) — confirm upstream.
 */
{
	int			setno;

	/* create or reset one hashtable per hashed grouping set */
	for (setno = 0; setno < aggstate->num_hashes; ++setno)
	{
		AggStatePerHash perhash = &aggstate->perhash[setno];
		long		nbuckets;
		Size		memory;

		if (perhash->hashtable != NULL)
		{
			/* table already exists; just empty it for reuse */
			ResetTupleHashTable(perhash->hashtable);
			continue;
		}

		Assert(perhash->aggnode->numGroups > 0);

		/* split the memory budget evenly across the hashtables */
		memory = aggstate->hash_mem_limit / aggstate->num_hashes;

		/* choose reasonable number of buckets per hashtable */
		nbuckets = hash_choose_num_buckets(aggstate->hashentrysize,
										   perhash->aggnode->numGroups,
										   memory);

		build_hash_table(aggstate, setno, nbuckets);
	}

	/* no groups stored yet in the (re)built tables */
	aggstate->hash_ngroups_current = 0;
}
1481 
1482 /*
1483  * Build a single hashtable for this grouping set.
1484  */
1485 static void
1486 build_hash_table(AggState *aggstate, int setno, long nbuckets)
1487 {
1488  AggStatePerHash perhash = &aggstate->perhash[setno];
1489  MemoryContext metacxt = aggstate->hash_metacxt;
1490  MemoryContext hashcxt = aggstate->hashcontext->ecxt_per_tuple_memory;
1491  MemoryContext tmpcxt = aggstate->tmpcontext->ecxt_per_tuple_memory;
1492  Size additionalsize;
1493 
1494  Assert(aggstate->aggstrategy == AGG_HASHED ||
1495  aggstate->aggstrategy == AGG_MIXED);
1496 
1497  /*
1498  * Used to make sure initial hash table allocation does not exceed
1499  * hash_mem. Note that the estimate does not include space for
1500  * pass-by-reference transition data values, nor for the representative
1501  * tuple of each group.
1502  */
1503  additionalsize = aggstate->numtrans * sizeof(AggStatePerGroupData);
1504 
1505  perhash->hashtable = BuildTupleHashTableExt(&aggstate->ss.ps,
1506  perhash->hashslot->tts_tupleDescriptor,
1507  perhash->numCols,
1508  perhash->hashGrpColIdxHash,
1509  perhash->eqfuncoids,
1510  perhash->hashfunctions,
1511  perhash->aggnode->grpCollations,
1512  nbuckets,
1513  additionalsize,
1514  metacxt,
1515  hashcxt,
1516  tmpcxt,
1517  DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit));
1518 }
1519 
1520 /*
1521  * Compute columns that actually need to be stored in hashtable entries. The
1522  * incoming tuples from the child plan node will contain grouping columns,
1523  * other columns referenced in our targetlist and qual, columns used to
1524  * compute the aggregate functions, and perhaps just junk columns we don't use
1525  * at all. Only columns of the first two types need to be stored in the
1526  * hashtable, and getting rid of the others can make the table entries
1527  * significantly smaller. The hashtable only contains the relevant columns,
1528  * and is packed/unpacked in lookup_hash_entry() / agg_retrieve_hash_table()
1529  * into the format of the normal input descriptor.
1530  *
1531  * Additional columns, in addition to the columns grouped by, come from two
1532  * sources: Firstly functionally dependent columns that we don't need to group
1533  * by themselves, and secondly ctids for row-marks.
1534  *
1535  * To eliminate duplicates, we build a bitmapset of the needed columns, and
1536  * then build an array of the columns included in the hashtable. We might
1537  * still have duplicates if the passed-in grpColIdx has them, which can happen
1538  * in edge cases from semijoins/distinct; these can't always be removed,
1539  * because it's not certain that the duplicate cols will be using the same
1540  * hash function.
1541  *
1542  * Note that the array is preserved over ExecReScanAgg, so we allocate it in
1543  * the per-query context (unlike the hash table itself).
1544  */
static void
/*
 * NOTE(review): signature line lost in extraction; per the comment above this
 * is presumably find_hash_columns(AggState *aggstate) — confirm upstream.
 */
{
	Bitmapset  *base_colnos;
	Bitmapset  *aggregated_colnos;
	TupleDesc	scanDesc = aggstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor;
	List	   *outerTlist = outerPlanState(aggstate)->plan->targetlist;
	int			numHashes = aggstate->num_hashes;
	EState	   *estate = aggstate->ss.ps.state;
	int			j;

	/* Find Vars that will be needed in tlist and qual */
	find_cols(aggstate, &aggregated_colnos, &base_colnos);
	aggstate->colnos_needed = bms_union(base_colnos, aggregated_colnos);
	aggstate->max_colno_needed = 0;
	aggstate->all_cols_needed = true;

	/* record highest needed column and whether every column is needed */
	for (int i = 0; i < scanDesc->natts; i++)
	{
		int			colno = i + 1;

		if (bms_is_member(colno, aggstate->colnos_needed))
			aggstate->max_colno_needed = colno;
		else
			aggstate->all_cols_needed = false;
	}

	for (j = 0; j < numHashes; ++j)
	{
		AggStatePerHash perhash = &aggstate->perhash[j];
		Bitmapset  *colnos = bms_copy(base_colnos);
		AttrNumber *grpColIdx = perhash->aggnode->grpColIdx;
		List	   *hashTlist = NIL;
		TupleDesc	hashDesc;
		int			maxCols;
		int			i;

		perhash->largestGrpColIdx = 0;

		/*
		 * If we're doing grouping sets, then some Vars might be referenced in
		 * tlist/qual for the benefit of other grouping sets, but not needed
		 * when hashing; i.e. prepare_projection_slot will null them out, so
		 * there'd be no point storing them. Use prepare_projection_slot's
		 * logic to determine which.
		 */
		if (aggstate->phases[0].grouped_cols)
		{
			Bitmapset  *grouped_cols = aggstate->phases[0].grouped_cols[j];
			ListCell   *lc;

			foreach(lc, aggstate->all_grouped_cols)
			{
				int			attnum = lfirst_int(lc);

				if (!bms_is_member(attnum, grouped_cols))
					colnos = bms_del_member(colnos, attnum);
			}
		}

		/*
		 * Compute maximum number of input columns accounting for possible
		 * duplications in the grpColIdx array, which can happen in some edge
		 * cases where HashAggregate was generated as part of a semijoin or a
		 * DISTINCT.
		 */
		maxCols = bms_num_members(colnos) + perhash->numCols;

		perhash->hashGrpColIdxInput =
			palloc(maxCols * sizeof(AttrNumber));
		perhash->hashGrpColIdxHash =
			palloc(perhash->numCols * sizeof(AttrNumber));

		/* Add all the grouping columns to colnos */
		for (i = 0; i < perhash->numCols; i++)
			colnos = bms_add_member(colnos, grpColIdx[i]);

		/*
		 * First build mapping for columns directly hashed. These are the
		 * first, because they'll be accessed when computing hash values and
		 * comparing tuples for exact matches. We also build simple mapping
		 * for execGrouping, so it knows where to find the to-be-hashed /
		 * compared columns in the input.
		 */
		for (i = 0; i < perhash->numCols; i++)
		{
			perhash->hashGrpColIdxInput[i] = grpColIdx[i];
			perhash->hashGrpColIdxHash[i] = i + 1;
			perhash->numhashGrpCols++;
			/* delete already mapped columns */
			bms_del_member(colnos, grpColIdx[i]);
		}

		/* and add the remaining columns */
		while ((i = bms_first_member(colnos)) >= 0)
		{
			perhash->hashGrpColIdxInput[perhash->numhashGrpCols] = i;
			perhash->numhashGrpCols++;
		}

		/* and build a tuple descriptor for the hashtable */
		for (i = 0; i < perhash->numhashGrpCols; i++)
		{
			int			varNumber = perhash->hashGrpColIdxInput[i] - 1;

			hashTlist = lappend(hashTlist, list_nth(outerTlist, varNumber));
			perhash->largestGrpColIdx =
				Max(varNumber + 1, perhash->largestGrpColIdx);
		}

		hashDesc = ExecTypeFromTL(hashTlist);

		/* look up equality/hash function OIDs for the grouping operators */
		execTuplesHashPrepare(perhash->numCols,
							  perhash->aggnode->grpOperators,
							  &perhash->eqfuncoids,
							  &perhash->hashfunctions);
		/*
		 * NOTE(review): the final argument line of this call is missing from
		 * this text (presumably a slot-ops argument such as
		 * &TTSOpsMinimalTuple) — verify against upstream.
		 */
		perhash->hashslot =
			ExecAllocTableSlot(&estate->es_tupleTable, hashDesc,

		/* the per-hash structures keep copies; free the temporaries */
		list_free(hashTlist);
		bms_free(colnos);
	}

	bms_free(base_colnos);
}
1671 
1672 /*
1673  * Estimate per-hash-table-entry overhead.
1674  */
1675 Size
1676 hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace)
1677 {
1678  Size tupleChunkSize;
1679  Size pergroupChunkSize;
1680  Size transitionChunkSize;
1681  Size tupleSize = (MAXALIGN(SizeofMinimalTupleHeader) +
1682  tupleWidth);
1683  Size pergroupSize = numTrans * sizeof(AggStatePerGroupData);
1684 
1685  tupleChunkSize = CHUNKHDRSZ + tupleSize;
1686 
1687  if (pergroupSize > 0)
1688  pergroupChunkSize = CHUNKHDRSZ + pergroupSize;
1689  else
1690  pergroupChunkSize = 0;
1691 
1692  if (transitionSpace > 0)
1693  transitionChunkSize = CHUNKHDRSZ + transitionSpace;
1694  else
1695  transitionChunkSize = 0;
1696 
1697  return
1698  sizeof(TupleHashEntryData) +
1699  tupleChunkSize +
1700  pergroupChunkSize +
1701  transitionChunkSize;
1702 }
1703 
1704 /*
1705  * hashagg_recompile_expressions()
1706  *
1707  * Identifies the right phase, compiles the right expression given the
1708  * arguments, and then sets phase->evalfunc to that expression.
1709  *
1710  * Different versions of the compiled expression are needed depending on
1711  * whether hash aggregation has spilled or not, and whether it's reading from
1712  * the outer plan or a tape. Before spilling to disk, the expression reads
1713  * from the outer plan and does not need to perform a NULL check. After
1714  * HashAgg begins to spill, new groups will not be created in the hash table,
1715  * and the AggStatePerGroup array may be NULL; therefore we need to add a null
1716  * pointer check to the expression. Then, when reading spilled data from a
1717  * tape, we change the outer slot type to be a fixed minimal tuple slot.
1718  *
1719  * It would be wasteful to recompile every time, so cache the compiled
1720  * expressions in the AggStatePerPhase, and reuse when appropriate.
1721  */
1722 static void
1723 hashagg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck)
1724 {
1725  AggStatePerPhase phase;
1726  int i = minslot ? 1 : 0;
1727  int j = nullcheck ? 1 : 0;
1728 
1729  Assert(aggstate->aggstrategy == AGG_HASHED ||
1730  aggstate->aggstrategy == AGG_MIXED);
1731 
1732  if (aggstate->aggstrategy == AGG_HASHED)
1733  phase = &aggstate->phases[0];
1734  else /* AGG_MIXED */
1735  phase = &aggstate->phases[1];
1736 
1737  if (phase->evaltrans_cache[i][j] == NULL)
1738  {
1739  const TupleTableSlotOps *outerops = aggstate->ss.ps.outerops;
1740  bool outerfixed = aggstate->ss.ps.outeropsfixed;
1741  bool dohash = true;
1742  bool dosort = false;
1743 
1744  /*
1745  * If minslot is true, that means we are processing a spilled batch
1746  * (inside agg_refill_hash_table()), and we must not advance the
1747  * sorted grouping sets.
1748  */
1749  if (aggstate->aggstrategy == AGG_MIXED && !minslot)
1750  dosort = true;
1751 
1752  /* temporarily change the outerops while compiling the expression */
1753  if (minslot)
1754  {
1755  aggstate->ss.ps.outerops = &TTSOpsMinimalTuple;
1756  aggstate->ss.ps.outeropsfixed = true;
1757  }
1758 
1759  phase->evaltrans_cache[i][j] = ExecBuildAggTrans(aggstate, phase,
1760  dosort, dohash,
1761  nullcheck);
1762 
1763  /* change back */
1764  aggstate->ss.ps.outerops = outerops;
1765  aggstate->ss.ps.outeropsfixed = outerfixed;
1766  }
1767 
1768  phase->evaltrans = phase->evaltrans_cache[i][j];
1769 }
1770 
1771 /*
1772  * Set limits that trigger spilling to avoid exceeding hash_mem. Consider the
1773  * number of partitions we expect to create (if we do spill).
1774  *
1775  * There are two limits: a memory limit, and also an ngroups limit. The
1776  * ngroups limit becomes important when we expect transition values to grow
1777  * substantially larger than the initial value.
1778  */
1779 void
1780 hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
1781  Size *mem_limit, uint64 *ngroups_limit,
1782  int *num_partitions)
1783 {
1784  int npartitions;
1785  Size partition_mem;
1786  Size hash_mem_limit = get_hash_memory_limit();
1787 
1788  /* if not expected to spill, use all of hash_mem */
1789  if (input_groups * hashentrysize <= hash_mem_limit)
1790  {
1791  if (num_partitions != NULL)
1792  *num_partitions = 0;
1793  *mem_limit = hash_mem_limit;
1794  *ngroups_limit = hash_mem_limit / hashentrysize;
1795  return;
1796  }
1797 
1798  /*
1799  * Calculate expected memory requirements for spilling, which is the size
1800  * of the buffers needed for all the tapes that need to be open at once.
1801  * Then, subtract that from the memory available for holding hash tables.
1802  */
1803  npartitions = hash_choose_num_partitions(input_groups,
1804  hashentrysize,
1805  used_bits,
1806  NULL);
1807  if (num_partitions != NULL)
1808  *num_partitions = npartitions;
1809 
1810  partition_mem =
1813 
1814  /*
1815  * Don't set the limit below 3/4 of hash_mem. In that case, we are at the
1816  * minimum number of partitions, so we aren't going to dramatically exceed
1817  * work mem anyway.
1818  */
1819  if (hash_mem_limit > 4 * partition_mem)
1820  *mem_limit = hash_mem_limit - partition_mem;
1821  else
1822  *mem_limit = hash_mem_limit * 0.75;
1823 
1824  if (*mem_limit > hashentrysize)
1825  *ngroups_limit = *mem_limit / hashentrysize;
1826  else
1827  *ngroups_limit = 1;
1828 }
1829 
1830 /*
1831  * hash_agg_check_limits
1832  *
1833  * After adding a new group to the hash table, check whether we need to enter
1834  * spill mode. Allocations may happen without adding new groups (for instance,
1835  * if the transition state size grows), so this check is imperfect.
1836  */
1837 static void
1839 {
1840  uint64 ngroups = aggstate->hash_ngroups_current;
1841  Size meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt,
1842  true);
1844  true);
1845 
1846  /*
1847  * Don't spill unless there's at least one group in the hash table so we
1848  * can be sure to make progress even in edge cases.
1849  */
1850  if (aggstate->hash_ngroups_current > 0 &&
1851  (meta_mem + hashkey_mem > aggstate->hash_mem_limit ||
1852  ngroups > aggstate->hash_ngroups_limit))
1853  {
1854  hash_agg_enter_spill_mode(aggstate);
1855  }
1856 }
1857 
1858 /*
1859  * Enter "spill mode", meaning that no new groups are added to any of the hash
1860  * tables. Tuples that would create a new group are instead spilled, and
1861  * processed later.
1862  */
1863 static void
1865 {
1866  aggstate->hash_spill_mode = true;
1867  hashagg_recompile_expressions(aggstate, aggstate->table_filled, true);
1868 
1869  if (!aggstate->hash_ever_spilled)
1870  {
1871  Assert(aggstate->hash_tapeset == NULL);
1872  Assert(aggstate->hash_spills == NULL);
1873 
1874  aggstate->hash_ever_spilled = true;
1875 
1876  aggstate->hash_tapeset = LogicalTapeSetCreate(true, NULL, -1);
1877 
1878  aggstate->hash_spills = palloc(sizeof(HashAggSpill) * aggstate->num_hashes);
1879 
1880  for (int setno = 0; setno < aggstate->num_hashes; setno++)
1881  {
1882  AggStatePerHash perhash = &aggstate->perhash[setno];
1883  HashAggSpill *spill = &aggstate->hash_spills[setno];
1884 
1885  hashagg_spill_init(spill, aggstate->hash_tapeset, 0,
1886  perhash->aggnode->numGroups,
1887  aggstate->hashentrysize);
1888  }
1889  }
1890 }
1891 
1892 /*
1893  * Update metrics after filling the hash table.
1894  *
1895  * If reading from the outer plan, from_tape should be false; if reading from
1896  * another tape, from_tape should be true.
1897  */
1898 static void
1899 hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
1900 {
1901  Size meta_mem;
1902  Size hashkey_mem;
1903  Size buffer_mem;
1904  Size total_mem;
1905 
1906  if (aggstate->aggstrategy != AGG_MIXED &&
1907  aggstate->aggstrategy != AGG_HASHED)
1908  return;
1909 
1910  /* memory for the hash table itself */
1911  meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true);
1912 
1913  /* memory for the group keys and transition states */
1914  hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, true);
1915 
1916  /* memory for read/write tape buffers, if spilled */
1917  buffer_mem = npartitions * HASHAGG_WRITE_BUFFER_SIZE;
1918  if (from_tape)
1919  buffer_mem += HASHAGG_READ_BUFFER_SIZE;
1920 
1921  /* update peak mem */
1922  total_mem = meta_mem + hashkey_mem + buffer_mem;
1923  if (total_mem > aggstate->hash_mem_peak)
1924  aggstate->hash_mem_peak = total_mem;
1925 
1926  /* update disk usage */
1927  if (aggstate->hash_tapeset != NULL)
1928  {
1929  uint64 disk_used = LogicalTapeSetBlocks(aggstate->hash_tapeset) * (BLCKSZ / 1024);
1930 
1931  if (aggstate->hash_disk_used < disk_used)
1932  aggstate->hash_disk_used = disk_used;
1933  }
1934 
1935  /* update hashentrysize estimate based on contents */
1936  if (aggstate->hash_ngroups_current > 0)
1937  {
1938  aggstate->hashentrysize =
1939  sizeof(TupleHashEntryData) +
1940  (hashkey_mem / (double) aggstate->hash_ngroups_current);
1941  }
1942 }
1943 
1944 /*
1945  * Choose a reasonable number of buckets for the initial hash table size.
1946  */
1947 static long
1948 hash_choose_num_buckets(double hashentrysize, long ngroups, Size memory)
1949 {
1950  long max_nbuckets;
1951  long nbuckets = ngroups;
1952 
1953  max_nbuckets = memory / hashentrysize;
1954 
1955  /*
1956  * Underestimating is better than overestimating. Too many buckets crowd
1957  * out space for group keys and transition state values.
1958  */
1959  max_nbuckets >>= 1;
1960 
1961  if (nbuckets > max_nbuckets)
1962  nbuckets = max_nbuckets;
1963 
1964  return Max(nbuckets, 1);
1965 }
1966 
1967 /*
1968  * Determine the number of partitions to create when spilling, which will
1969  * always be a power of two. If log2_npartitions is non-NULL, set
1970  * *log2_npartitions to the log2() of the number of partitions.
1971  */
1972 static int
1973 hash_choose_num_partitions(double input_groups, double hashentrysize,
1974  int used_bits, int *log2_npartitions)
1975 {
1976  Size hash_mem_limit = get_hash_memory_limit();
1977  double partition_limit;
1978  double mem_wanted;
1979  double dpartitions;
1980  int npartitions;
1981  int partition_bits;
1982 
1983  /*
1984  * Avoid creating so many partitions that the memory requirements of the
1985  * open partition files are greater than 1/4 of hash_mem.
1986  */
1987  partition_limit =
1988  (hash_mem_limit * 0.25 - HASHAGG_READ_BUFFER_SIZE) /
1990 
1991  mem_wanted = HASHAGG_PARTITION_FACTOR * input_groups * hashentrysize;
1992 
1993  /* make enough partitions so that each one is likely to fit in memory */
1994  dpartitions = 1 + (mem_wanted / hash_mem_limit);
1995 
1996  if (dpartitions > partition_limit)
1997  dpartitions = partition_limit;
1998 
1999  if (dpartitions < HASHAGG_MIN_PARTITIONS)
2000  dpartitions = HASHAGG_MIN_PARTITIONS;
2001  if (dpartitions > HASHAGG_MAX_PARTITIONS)
2002  dpartitions = HASHAGG_MAX_PARTITIONS;
2003 
2004  /* HASHAGG_MAX_PARTITIONS limit makes this safe */
2005  npartitions = (int) dpartitions;
2006 
2007  /* ceil(log2(npartitions)) */
2008  partition_bits = my_log2(npartitions);
2009 
2010  /* make sure that we don't exhaust the hash bits */
2011  if (partition_bits + used_bits >= 32)
2012  partition_bits = 32 - used_bits;
2013 
2014  if (log2_npartitions != NULL)
2015  *log2_npartitions = partition_bits;
2016 
2017  /* number of partitions will be a power of two */
2018  npartitions = 1 << partition_bits;
2019 
2020  return npartitions;
2021 }
2022 
2023 /*
2024  * Initialize a freshly-created TupleHashEntry.
2025  */
2026 static void
2028  TupleHashEntry entry)
2029 {
2030  AggStatePerGroup pergroup;
2031  int transno;
2032 
2033  aggstate->hash_ngroups_current++;
2034  hash_agg_check_limits(aggstate);
2035 
2036  /* no need to allocate or initialize per-group state */
2037  if (aggstate->numtrans == 0)
2038  return;
2039 
2040  pergroup = (AggStatePerGroup)
2041  MemoryContextAlloc(hashtable->tablecxt,
2042  sizeof(AggStatePerGroupData) * aggstate->numtrans);
2043 
2044  entry->additional = pergroup;
2045 
2046  /*
2047  * Initialize aggregates for new tuple group, lookup_hash_entries()
2048  * already has selected the relevant grouping set.
2049  */
2050  for (transno = 0; transno < aggstate->numtrans; transno++)
2051  {
2052  AggStatePerTrans pertrans = &aggstate->pertrans[transno];
2053  AggStatePerGroup pergroupstate = &pergroup[transno];
2054 
2055  initialize_aggregate(aggstate, pertrans, pergroupstate);
2056  }
2057 }
2058 
2059 /*
2060  * Look up hash entries for the current tuple in all hashed grouping sets.
2061  *
2062  * Be aware that lookup_hash_entry can reset the tmpcontext.
2063  *
2064  * Some entries may be left NULL if we are in "spill mode". The same tuple
2065  * will belong to different groups for each grouping set, so may match a group
2066  * already in memory for one set and match a group not in memory for another
2067  * set. When in "spill mode", the tuple will be spilled for each grouping set
2068  * where it doesn't match a group in memory.
2069  *
2070  * NB: It's possible to spill the same tuple for several different grouping
2071  * sets. This may seem wasteful, but it's actually a trade-off: if we spill
2072  * the tuple multiple times for multiple grouping sets, it can be partitioned
2073  * for each grouping set, making the refilling of the hash table very
2074  * efficient.
2075  */
2076 static void
2078 {
2079  AggStatePerGroup *pergroup = aggstate->hash_pergroup;
2080  TupleTableSlot *outerslot = aggstate->tmpcontext->ecxt_outertuple;
2081  int setno;
2082 
2083  for (setno = 0; setno < aggstate->num_hashes; setno++)
2084  {
2085  AggStatePerHash perhash = &aggstate->perhash[setno];
2086  TupleHashTable hashtable = perhash->hashtable;
2087  TupleTableSlot *hashslot = perhash->hashslot;
2088  TupleHashEntry entry;
2089  uint32 hash;
2090  bool isnew = false;
2091  bool *p_isnew;
2092 
2093  /* if hash table already spilled, don't create new entries */
2094  p_isnew = aggstate->hash_spill_mode ? NULL : &isnew;
2095 
2096  select_current_set(aggstate, setno, true);
2097  prepare_hash_slot(perhash,
2098  outerslot,
2099  hashslot);
2100 
2101  entry = LookupTupleHashEntry(hashtable, hashslot,
2102  p_isnew, &hash);
2103 
2104  if (entry != NULL)
2105  {
2106  if (isnew)
2107  initialize_hash_entry(aggstate, hashtable, entry);
2108  pergroup[setno] = entry->additional;
2109  }
2110  else
2111  {
2112  HashAggSpill *spill = &aggstate->hash_spills[setno];
2113  TupleTableSlot *slot = aggstate->tmpcontext->ecxt_outertuple;
2114 
2115  if (spill->partitions == NULL)
2116  hashagg_spill_init(spill, aggstate->hash_tapeset, 0,
2117  perhash->aggnode->numGroups,
2118  aggstate->hashentrysize);
2119 
2120  hashagg_spill_tuple(aggstate, spill, slot, hash);
2121  pergroup[setno] = NULL;
2122  }
2123  }
2124 }
2125 
2126 /*
2127  * ExecAgg -
2128  *
2129  * ExecAgg receives tuples from its outer subplan and aggregates over
2130  * the appropriate attribute for each aggregate function use (Aggref
2131  * node) appearing in the targetlist or qual of the node. The number
2132  * of tuples to aggregate over depends on whether grouped or plain
2133  * aggregation is selected. In grouped aggregation, we produce a result
2134  * row for each group; in plain aggregation there's a single result row
2135  * for the whole query. In either case, the value of each aggregate is
2136  * stored in the expression context to be used when ExecProject evaluates
2137  * the result tuple.
2138  */
2139 static TupleTableSlot *
2141 {
2142  AggState *node = castNode(AggState, pstate);
2143  TupleTableSlot *result = NULL;
2144 
2146 
2147  if (!node->agg_done)
2148  {
2149  /* Dispatch based on strategy */
2150  switch (node->phase->aggstrategy)
2151  {
2152  case AGG_HASHED:
2153  if (!node->table_filled)
2154  agg_fill_hash_table(node);
2155  /* FALLTHROUGH */
2156  case AGG_MIXED:
2157  result = agg_retrieve_hash_table(node);
2158  break;
2159  case AGG_PLAIN:
2160  case AGG_SORTED:
2161  result = agg_retrieve_direct(node);
2162  break;
2163  }
2164 
2165  if (!TupIsNull(result))
2166  return result;
2167  }
2168 
2169  return NULL;
2170 }
2171 
2172 /*
2173  * ExecAgg for non-hashed case
2174  */
2175 static TupleTableSlot *
2177 {
2178  Agg *node = aggstate->phase->aggnode;
2179  ExprContext *econtext;
2180  ExprContext *tmpcontext;
2181  AggStatePerAgg peragg;
2182  AggStatePerGroup *pergroups;
2183  TupleTableSlot *outerslot;
2184  TupleTableSlot *firstSlot;
2185  TupleTableSlot *result;
2186  bool hasGroupingSets = aggstate->phase->numsets > 0;
2187  int numGroupingSets = Max(aggstate->phase->numsets, 1);
2188  int currentSet;
2189  int nextSetSize;
2190  int numReset;
2191  int i;
2192 
2193  /*
2194  * get state info from node
2195  *
2196  * econtext is the per-output-tuple expression context
2197  *
2198  * tmpcontext is the per-input-tuple expression context
2199  */
2200  econtext = aggstate->ss.ps.ps_ExprContext;
2201  tmpcontext = aggstate->tmpcontext;
2202 
2203  peragg = aggstate->peragg;
2204  pergroups = aggstate->pergroups;
2205  firstSlot = aggstate->ss.ss_ScanTupleSlot;
2206 
2207  /*
2208  * We loop retrieving groups until we find one matching
2209  * aggstate->ss.ps.qual
2210  *
2211  * For grouping sets, we have the invariant that aggstate->projected_set
2212  * is either -1 (initial call) or the index (starting from 0) in
2213  * gset_lengths for the group we just completed (either by projecting a
2214  * row or by discarding it in the qual).
2215  */
2216  while (!aggstate->agg_done)
2217  {
2218  /*
2219  * Clear the per-output-tuple context for each group, as well as
2220  * aggcontext (which contains any pass-by-ref transvalues of the old
2221  * group). Some aggregate functions store working state in child
2222  * contexts; those now get reset automatically without us needing to
2223  * do anything special.
2224  *
2225  * We use ReScanExprContext not just ResetExprContext because we want
2226  * any registered shutdown callbacks to be called. That allows
2227  * aggregate functions to ensure they've cleaned up any non-memory
2228  * resources.
2229  */
2230  ReScanExprContext(econtext);
2231 
2232  /*
2233  * Determine how many grouping sets need to be reset at this boundary.
2234  */
2235  if (aggstate->projected_set >= 0 &&
2236  aggstate->projected_set < numGroupingSets)
2237  numReset = aggstate->projected_set + 1;
2238  else
2239  numReset = numGroupingSets;
2240 
2241  /*
2242  * numReset can change on a phase boundary, but that's OK; we want to
2243  * reset the contexts used in _this_ phase, and later, after possibly
2244  * changing phase, initialize the right number of aggregates for the
2245  * _new_ phase.
2246  */
2247 
2248  for (i = 0; i < numReset; i++)
2249  {
2250  ReScanExprContext(aggstate->aggcontexts[i]);
2251  }
2252 
2253  /*
2254  * Check if input is complete and there are no more groups to project
2255  * in this phase; move to next phase or mark as done.
2256  */
2257  if (aggstate->input_done == true &&
2258  aggstate->projected_set >= (numGroupingSets - 1))
2259  {
2260  if (aggstate->current_phase < aggstate->numphases - 1)
2261  {
2262  initialize_phase(aggstate, aggstate->current_phase + 1);
2263  aggstate->input_done = false;
2264  aggstate->projected_set = -1;
2265  numGroupingSets = Max(aggstate->phase->numsets, 1);
2266  node = aggstate->phase->aggnode;
2267  numReset = numGroupingSets;
2268  }
2269  else if (aggstate->aggstrategy == AGG_MIXED)
2270  {
2271  /*
2272  * Mixed mode; we've output all the grouped stuff and have
2273  * full hashtables, so switch to outputting those.
2274  */
2275  initialize_phase(aggstate, 0);
2276  aggstate->table_filled = true;
2278  &aggstate->perhash[0].hashiter);
2279  select_current_set(aggstate, 0, true);
2280  return agg_retrieve_hash_table(aggstate);
2281  }
2282  else
2283  {
2284  aggstate->agg_done = true;
2285  break;
2286  }
2287  }
2288 
2289  /*
2290  * Get the number of columns in the next grouping set after the last
2291  * projected one (if any). This is the number of columns to compare to
2292  * see if we reached the boundary of that set too.
2293  */
2294  if (aggstate->projected_set >= 0 &&
2295  aggstate->projected_set < (numGroupingSets - 1))
2296  nextSetSize = aggstate->phase->gset_lengths[aggstate->projected_set + 1];
2297  else
2298  nextSetSize = 0;
2299 
2300  /*----------
2301  * If a subgroup for the current grouping set is present, project it.
2302  *
2303  * We have a new group if:
2304  * - we're out of input but haven't projected all grouping sets
2305  * (checked above)
2306  * OR
2307  * - we already projected a row that wasn't from the last grouping
2308  * set
2309  * AND
2310  * - the next grouping set has at least one grouping column (since
2311  * empty grouping sets project only once input is exhausted)
2312  * AND
2313  * - the previous and pending rows differ on the grouping columns
2314  * of the next grouping set
2315  *----------
2316  */
2317  tmpcontext->ecxt_innertuple = econtext->ecxt_outertuple;
2318  if (aggstate->input_done ||
2319  (node->aggstrategy != AGG_PLAIN &&
2320  aggstate->projected_set != -1 &&
2321  aggstate->projected_set < (numGroupingSets - 1) &&
2322  nextSetSize > 0 &&
2323  !ExecQualAndReset(aggstate->phase->eqfunctions[nextSetSize - 1],
2324  tmpcontext)))
2325  {
2326  aggstate->projected_set += 1;
2327 
2328  Assert(aggstate->projected_set < numGroupingSets);
2329  Assert(nextSetSize > 0 || aggstate->input_done);
2330  }
2331  else
2332  {
2333  /*
2334  * We no longer care what group we just projected, the next
2335  * projection will always be the first (or only) grouping set
2336  * (unless the input proves to be empty).
2337  */
2338  aggstate->projected_set = 0;
2339 
2340  /*
2341  * If we don't already have the first tuple of the new group,
2342  * fetch it from the outer plan.
2343  */
2344  if (aggstate->grp_firstTuple == NULL)
2345  {
2346  outerslot = fetch_input_tuple(aggstate);
2347  if (!TupIsNull(outerslot))
2348  {
2349  /*
2350  * Make a copy of the first input tuple; we will use this
2351  * for comparisons (in group mode) and for projection.
2352  */
2353  aggstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot);
2354  }
2355  else
2356  {
2357  /* outer plan produced no tuples at all */
2358  if (hasGroupingSets)
2359  {
2360  /*
2361  * If there was no input at all, we need to project
2362  * rows only if there are grouping sets of size 0.
2363  * Note that this implies that there can't be any
2364  * references to ungrouped Vars, which would otherwise
2365  * cause issues with the empty output slot.
2366  *
2367  * XXX: This is no longer true, we currently deal with
2368  * this in finalize_aggregates().
2369  */
2370  aggstate->input_done = true;
2371 
2372  while (aggstate->phase->gset_lengths[aggstate->projected_set] > 0)
2373  {
2374  aggstate->projected_set += 1;
2375  if (aggstate->projected_set >= numGroupingSets)
2376  {
2377  /*
2378  * We can't set agg_done here because we might
2379  * have more phases to do, even though the
2380  * input is empty. So we need to restart the
2381  * whole outer loop.
2382  */
2383  break;
2384  }
2385  }
2386 
2387  if (aggstate->projected_set >= numGroupingSets)
2388  continue;
2389  }
2390  else
2391  {
2392  aggstate->agg_done = true;
2393  /* If we are grouping, we should produce no tuples too */
2394  if (node->aggstrategy != AGG_PLAIN)
2395  return NULL;
2396  }
2397  }
2398  }
2399 
2400  /*
2401  * Initialize working state for a new input tuple group.
2402  */
2403  initialize_aggregates(aggstate, pergroups, numReset);
2404 
2405  if (aggstate->grp_firstTuple != NULL)
2406  {
2407  /*
2408  * Store the copied first input tuple in the tuple table slot
2409  * reserved for it. The tuple will be deleted when it is
2410  * cleared from the slot.
2411  */
2413  firstSlot, true);
2414  aggstate->grp_firstTuple = NULL; /* don't keep two pointers */
2415 
2416  /* set up for first advance_aggregates call */
2417  tmpcontext->ecxt_outertuple = firstSlot;
2418 
2419  /*
2420  * Process each outer-plan tuple, and then fetch the next one,
2421  * until we exhaust the outer plan or cross a group boundary.
2422  */
2423  for (;;)
2424  {
2425  /*
2426  * During phase 1 only of a mixed agg, we need to update
2427  * hashtables as well in advance_aggregates.
2428  */
2429  if (aggstate->aggstrategy == AGG_MIXED &&
2430  aggstate->current_phase == 1)
2431  {
2432  lookup_hash_entries(aggstate);
2433  }
2434 
2435  /* Advance the aggregates (or combine functions) */
2436  advance_aggregates(aggstate);
2437 
2438  /* Reset per-input-tuple context after each tuple */
2439  ResetExprContext(tmpcontext);
2440 
2441  outerslot = fetch_input_tuple(aggstate);
2442  if (TupIsNull(outerslot))
2443  {
2444  /* no more outer-plan tuples available */
2445 
2446  /* if we built hash tables, finalize any spills */
2447  if (aggstate->aggstrategy == AGG_MIXED &&
2448  aggstate->current_phase == 1)
2450 
2451  if (hasGroupingSets)
2452  {
2453  aggstate->input_done = true;
2454  break;
2455  }
2456  else
2457  {
2458  aggstate->agg_done = true;
2459  break;
2460  }
2461  }
2462  /* set up for next advance_aggregates call */
2463  tmpcontext->ecxt_outertuple = outerslot;
2464 
2465  /*
2466  * If we are grouping, check whether we've crossed a group
2467  * boundary.
2468  */
2469  if (node->aggstrategy != AGG_PLAIN)
2470  {
2471  tmpcontext->ecxt_innertuple = firstSlot;
2472  if (!ExecQual(aggstate->phase->eqfunctions[node->numCols - 1],
2473  tmpcontext))
2474  {
2475  aggstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot);
2476  break;
2477  }
2478  }
2479  }
2480  }
2481 
2482  /*
2483  * Use the representative input tuple for any references to
2484  * non-aggregated input columns in aggregate direct args, the node
2485  * qual, and the tlist. (If we are not grouping, and there are no
2486  * input rows at all, we will come here with an empty firstSlot
2487  * ... but if not grouping, there can't be any references to
2488  * non-aggregated input columns, so no problem.)
2489  */
2490  econtext->ecxt_outertuple = firstSlot;
2491  }
2492 
2493  Assert(aggstate->projected_set >= 0);
2494 
2495  currentSet = aggstate->projected_set;
2496 
2497  prepare_projection_slot(aggstate, econtext->ecxt_outertuple, currentSet);
2498 
2499  select_current_set(aggstate, currentSet, false);
2500 
2501  finalize_aggregates(aggstate,
2502  peragg,
2503  pergroups[currentSet]);
2504 
2505  /*
2506  * If there's no row to project right now, we must continue rather
2507  * than returning a null since there might be more groups.
2508  */
2509  result = project_aggregates(aggstate);
2510  if (result)
2511  return result;
2512  }
2513 
2514  /* No more groups */
2515  return NULL;
2516 }
2517 
2518 /*
2519  * ExecAgg for hashed case: read input and build hash table
2520  */
2521 static void
2523 {
2524  TupleTableSlot *outerslot;
2525  ExprContext *tmpcontext = aggstate->tmpcontext;
2526 
2527  /*
2528  * Process each outer-plan tuple, and then fetch the next one, until we
2529  * exhaust the outer plan.
2530  */
2531  for (;;)
2532  {
2533  outerslot = fetch_input_tuple(aggstate);
2534  if (TupIsNull(outerslot))
2535  break;
2536 
2537  /* set up for lookup_hash_entries and advance_aggregates */
2538  tmpcontext->ecxt_outertuple = outerslot;
2539 
2540  /* Find or build hashtable entries */
2541  lookup_hash_entries(aggstate);
2542 
2543  /* Advance the aggregates (or combine functions) */
2544  advance_aggregates(aggstate);
2545 
2546  /*
2547  * Reset per-input-tuple context after each tuple, but note that the
2548  * hash lookups do this too
2549  */
2550  ResetExprContext(aggstate->tmpcontext);
2551  }
2552 
2553  /* finalize spills, if any */
2555 
2556  aggstate->table_filled = true;
2557  /* Initialize to walk the first hash table */
2558  select_current_set(aggstate, 0, true);
2560  &aggstate->perhash[0].hashiter);
2561 }
2562 
2563 /*
2564  * If any data was spilled during hash aggregation, reset the hash table and
2565  * reprocess one batch of spilled data. After reprocessing a batch, the hash
2566  * table will again contain data, ready to be consumed by
2567  * agg_retrieve_hash_table_in_memory().
2568  *
2569  * Should only be called after all in memory hash table entries have been
2570  * finalized and emitted.
2571  *
2572  * Return false when input is exhausted and there's no more work to be done;
2573  * otherwise return true.
2574  */
2575 static bool
2577 {
2578  HashAggBatch *batch;
2579  AggStatePerHash perhash;
2580  HashAggSpill spill;
2581  LogicalTapeSet *tapeset = aggstate->hash_tapeset;
2582  bool spill_initialized = false;
2583 
2584  if (aggstate->hash_batches == NIL)
2585  return false;
2586 
2587  batch = linitial(aggstate->hash_batches);
2588  aggstate->hash_batches = list_delete_first(aggstate->hash_batches);
2589 
2590  hash_agg_set_limits(aggstate->hashentrysize, batch->input_card,
2591  batch->used_bits, &aggstate->hash_mem_limit,
2592  &aggstate->hash_ngroups_limit, NULL);
2593 
2594  /*
2595  * Each batch only processes one grouping set; set the rest to NULL so
2596  * that advance_aggregates() knows to ignore them. We don't touch
2597  * pergroups for sorted grouping sets here, because they will be needed if
2598  * we rescan later. The expressions for sorted grouping sets will not be
2599  * evaluated after we recompile anyway.
2600  */
2601  MemSet(aggstate->hash_pergroup, 0,
2602  sizeof(AggStatePerGroup) * aggstate->num_hashes);
2603 
2604  /* free memory and reset hash tables */
2605  ReScanExprContext(aggstate->hashcontext);
2606  for (int setno = 0; setno < aggstate->num_hashes; setno++)
2607  ResetTupleHashTable(aggstate->perhash[setno].hashtable);
2608 
2609  aggstate->hash_ngroups_current = 0;
2610 
2611  /*
2612  * In AGG_MIXED mode, hash aggregation happens in phase 1 and the output
2613  * happens in phase 0. So, we switch to phase 1 when processing a batch,
2614  * and back to phase 0 after the batch is done.
2615  */
2616  Assert(aggstate->current_phase == 0);
2617  if (aggstate->phase->aggstrategy == AGG_MIXED)
2618  {
2619  aggstate->current_phase = 1;
2620  aggstate->phase = &aggstate->phases[aggstate->current_phase];
2621  }
2622 
2623  select_current_set(aggstate, batch->setno, true);
2624 
2625  perhash = &aggstate->perhash[aggstate->current_set];
2626 
2627  /*
2628  * Spilled tuples are always read back as MinimalTuples, which may be
2629  * different from the outer plan, so recompile the aggregate expressions.
2630  *
2631  * We still need the NULL check, because we are only processing one
2632  * grouping set at a time and the rest will be NULL.
2633  */
2634  hashagg_recompile_expressions(aggstate, true, true);
2635 
2636  for (;;)
2637  {
2638  TupleTableSlot *spillslot = aggstate->hash_spill_rslot;
2639  TupleTableSlot *hashslot = perhash->hashslot;
2640  TupleHashEntry entry;
2641  MinimalTuple tuple;
2642  uint32 hash;
2643  bool isnew = false;
2644  bool *p_isnew = aggstate->hash_spill_mode ? NULL : &isnew;
2645 
2647 
2648  tuple = hashagg_batch_read(batch, &hash);
2649  if (tuple == NULL)
2650  break;
2651 
2652  ExecStoreMinimalTuple(tuple, spillslot, true);
2653  aggstate->tmpcontext->ecxt_outertuple = spillslot;
2654 
2655  prepare_hash_slot(perhash,
2656  aggstate->tmpcontext->ecxt_outertuple,
2657  hashslot);
2658  entry = LookupTupleHashEntryHash(
2659  perhash->hashtable, hashslot, p_isnew, hash);
2660 
2661  if (entry != NULL)
2662  {
2663  if (isnew)
2664  initialize_hash_entry(aggstate, perhash->hashtable, entry);
2665  aggstate->hash_pergroup[batch->setno] = entry->additional;
2666  advance_aggregates(aggstate);
2667  }
2668  else
2669  {
2670  if (!spill_initialized)
2671  {
2672  /*
2673  * Avoid initializing the spill until we actually need it so
2674  * that we don't assign tapes that will never be used.
2675  */
2676  spill_initialized = true;
2677  hashagg_spill_init(&spill, tapeset, batch->used_bits,
2678  batch->input_card, aggstate->hashentrysize);
2679  }
2680  /* no memory for a new group, spill */
2681  hashagg_spill_tuple(aggstate, &spill, spillslot, hash);
2682 
2683  aggstate->hash_pergroup[batch->setno] = NULL;
2684  }
2685 
2686  /*
2687  * Reset per-input-tuple context after each tuple, but note that the
2688  * hash lookups do this too
2689  */
2690  ResetExprContext(aggstate->tmpcontext);
2691  }
2692 
2693  LogicalTapeClose(batch->input_tape);
2694 
2695  /* change back to phase 0 */
2696  aggstate->current_phase = 0;
2697  aggstate->phase = &aggstate->phases[aggstate->current_phase];
2698 
2699  if (spill_initialized)
2700  {
2701  hashagg_spill_finish(aggstate, &spill, batch->setno);
2702  hash_agg_update_metrics(aggstate, true, spill.npartitions);
2703  }
2704  else
2705  hash_agg_update_metrics(aggstate, true, 0);
2706 
2707  aggstate->hash_spill_mode = false;
2708 
2709  /* prepare to walk the first hash table */
2710  select_current_set(aggstate, batch->setno, true);
2711  ResetTupleHashIterator(aggstate->perhash[batch->setno].hashtable,
2712  &aggstate->perhash[batch->setno].hashiter);
2713 
2714  pfree(batch);
2715 
2716  return true;
2717 }
2718 
2719 /*
2720  * ExecAgg for hashed case: retrieving groups from hash table
2721  *
2722  * After exhausting in-memory tuples, also try refilling the hash table using
2723  * previously-spilled tuples. Only returns NULL after all in-memory and
2724  * spilled tuples are exhausted.
2725  */
2726 static TupleTableSlot *
2728 {
2729  TupleTableSlot *result = NULL;
2730 
2731  while (result == NULL)
2732  {
2733  result = agg_retrieve_hash_table_in_memory(aggstate);
2734  if (result == NULL)
2735  {
2736  if (!agg_refill_hash_table(aggstate))
2737  {
2738  aggstate->agg_done = true;
2739  break;
2740  }
2741  }
2742  }
2743 
2744  return result;
2745 }
2746 
2747 /*
2748  * Retrieve the groups from the in-memory hash tables without considering any
2749  * spilled tuples.
2750  */
2751 static TupleTableSlot *
2753 {
2754  ExprContext *econtext;
2755  AggStatePerAgg peragg;
2756  AggStatePerGroup pergroup;
2757  TupleHashEntryData *entry;
2758  TupleTableSlot *firstSlot;
2759  TupleTableSlot *result;
2760  AggStatePerHash perhash;
2761 
2762  /*
2763  * get state info from node.
2764  *
2765  * econtext is the per-output-tuple expression context.
2766  */
2767  econtext = aggstate->ss.ps.ps_ExprContext;
2768  peragg = aggstate->peragg;
2769  firstSlot = aggstate->ss.ss_ScanTupleSlot;
2770 
2771  /*
2772  * Note that perhash (and therefore anything accessed through it) can
2773  * change inside the loop, as we change between grouping sets.
2774  */
2775  perhash = &aggstate->perhash[aggstate->current_set];
2776 
2777  /*
2778  * We loop retrieving groups until we find one satisfying
2779  * aggstate->ss.ps.qual
2780  */
2781  for (;;)
2782  {
2783  TupleTableSlot *hashslot = perhash->hashslot;
2784  int i;
2785 
2787 
2788  /*
2789  * Find the next entry in the hash table
2790  */
2791  entry = ScanTupleHashTable(perhash->hashtable, &perhash->hashiter);
2792  if (entry == NULL)
2793  {
2794  int nextset = aggstate->current_set + 1;
2795 
2796  if (nextset < aggstate->num_hashes)
2797  {
2798  /*
2799  * Switch to next grouping set, reinitialize, and restart the
2800  * loop.
2801  */
2802  select_current_set(aggstate, nextset, true);
2803 
2804  perhash = &aggstate->perhash[aggstate->current_set];
2805 
2806  ResetTupleHashIterator(perhash->hashtable, &perhash->hashiter);
2807 
2808  continue;
2809  }
2810  else
2811  {
2812  return NULL;
2813  }
2814  }
2815 
2816  /*
2817  * Clear the per-output-tuple context for each group
2818  *
2819  * We intentionally don't use ReScanExprContext here; if any aggs have
2820  * registered shutdown callbacks, they mustn't be called yet, since we
2821  * might not be done with that agg.
2822  */
2823  ResetExprContext(econtext);
2824 
2825  /*
2826  * Transform representative tuple back into one with the right
2827  * columns.
2828  */
2829  ExecStoreMinimalTuple(entry->firstTuple, hashslot, false);
2830  slot_getallattrs(hashslot);
2831 
2832  ExecClearTuple(firstSlot);
2833  memset(firstSlot->tts_isnull, true,
2834  firstSlot->tts_tupleDescriptor->natts * sizeof(bool));
2835 
2836  for (i = 0; i < perhash->numhashGrpCols; i++)
2837  {
2838  int varNumber = perhash->hashGrpColIdxInput[i] - 1;
2839 
2840  firstSlot->tts_values[varNumber] = hashslot->tts_values[i];
2841  firstSlot->tts_isnull[varNumber] = hashslot->tts_isnull[i];
2842  }
2843  ExecStoreVirtualTuple(firstSlot);
2844 
2845  pergroup = (AggStatePerGroup) entry->additional;
2846 
2847  /*
2848  * Use the representative input tuple for any references to
2849  * non-aggregated input columns in the qual and tlist.
2850  */
2851  econtext->ecxt_outertuple = firstSlot;
2852 
2853  prepare_projection_slot(aggstate,
2854  econtext->ecxt_outertuple,
2855  aggstate->current_set);
2856 
2857  finalize_aggregates(aggstate, peragg, pergroup);
2858 
2859  result = project_aggregates(aggstate);
2860  if (result)
2861  return result;
2862  }
2863 
2864  /* No more groups */
2865  return NULL;
2866 }
2867 
2868 /*
2869  * hashagg_spill_init
2870  *
2871  * Called after we determined that spilling is necessary. Chooses the number
2872  * of partitions to create, and initializes them.
2873  */
2874 static void
2875 hashagg_spill_init(HashAggSpill *spill, LogicalTapeSet *tapeset, int used_bits,
2876  double input_groups, double hashentrysize)
2877 {
2878  int npartitions;
2879  int partition_bits;
2880 
2881  npartitions = hash_choose_num_partitions(input_groups, hashentrysize,
2882  used_bits, &partition_bits);
2883 
2884  spill->partitions = palloc0(sizeof(LogicalTape *) * npartitions);
2885  spill->ntuples = palloc0(sizeof(int64) * npartitions);
2886  spill->hll_card = palloc0(sizeof(hyperLogLogState) * npartitions);
2887 
2888  for (int i = 0; i < npartitions; i++)
2889  spill->partitions[i] = LogicalTapeCreate(tapeset);
2890 
2891  spill->shift = 32 - used_bits - partition_bits;
2892  spill->mask = (npartitions - 1) << spill->shift;
2893  spill->npartitions = npartitions;
2894 
2895  for (int i = 0; i < npartitions; i++)
2897 }
2898 
2899 /*
2900  * hashagg_spill_tuple
2901  *
2902  * No room for new groups in the hash table. Save for later in the appropriate
2903  * partition.
2904  */
2905 static Size
2907  TupleTableSlot *inputslot, uint32 hash)
2908 {
2909  TupleTableSlot *spillslot;
2910  int partition;
2911  MinimalTuple tuple;
2912  LogicalTape *tape;
2913  int total_written = 0;
2914  bool shouldFree;
2915 
2916  Assert(spill->partitions != NULL);
2917 
2918  /* spill only attributes that we actually need */
2919  if (!aggstate->all_cols_needed)
2920  {
2921  spillslot = aggstate->hash_spill_wslot;
2922  slot_getsomeattrs(inputslot, aggstate->max_colno_needed);
2923  ExecClearTuple(spillslot);
2924  for (int i = 0; i < spillslot->tts_tupleDescriptor->natts; i++)
2925  {
2926  if (bms_is_member(i + 1, aggstate->colnos_needed))
2927  {
2928  spillslot->tts_values[i] = inputslot->tts_values[i];
2929  spillslot->tts_isnull[i] = inputslot->tts_isnull[i];
2930  }
2931  else
2932  spillslot->tts_isnull[i] = true;
2933  }
2934  ExecStoreVirtualTuple(spillslot);
2935  }
2936  else
2937  spillslot = inputslot;
2938 
2939  tuple = ExecFetchSlotMinimalTuple(spillslot, &shouldFree);
2940 
2941  partition = (hash & spill->mask) >> spill->shift;
2942  spill->ntuples[partition]++;
2943 
2944  /*
2945  * All hash values destined for a given partition have some bits in
2946  * common, which causes bad HLL cardinality estimates. Hash the hash to
2947  * get a more uniform distribution.
2948  */
2949  addHyperLogLog(&spill->hll_card[partition], hash_bytes_uint32(hash));
2950 
2951  tape = spill->partitions[partition];
2952 
2953  LogicalTapeWrite(tape, (void *) &hash, sizeof(uint32));
2954  total_written += sizeof(uint32);
2955 
2956  LogicalTapeWrite(tape, (void *) tuple, tuple->t_len);
2957  total_written += tuple->t_len;
2958 
2959  if (shouldFree)
2960  pfree(tuple);
2961 
2962  return total_written;
2963 }
2964 
2965 /*
2966  * hashagg_batch_new
2967  *
2968  * Construct a HashAggBatch item, which represents one iteration of HashAgg to
2969  * be done.
2970  */
2971 static HashAggBatch *
2972 hashagg_batch_new(LogicalTape *input_tape, int setno,
2973  int64 input_tuples, double input_card, int used_bits)
2974 {
2975  HashAggBatch *batch = palloc0(sizeof(HashAggBatch));
2976 
2977  batch->setno = setno;
2978  batch->used_bits = used_bits;
2979  batch->input_tape = input_tape;
2980  batch->input_tuples = input_tuples;
2981  batch->input_card = input_card;
2982 
2983  return batch;
2984 }
2985 
2986 /*
2987  * read_spilled_tuple
2988  * read the next tuple from a batch's tape. Return NULL if no more.
2989  */
2990 static MinimalTuple
2992 {
2993  LogicalTape *tape = batch->input_tape;
2994  MinimalTuple tuple;
2995  uint32 t_len;
2996  size_t nread;
2997  uint32 hash;
2998 
2999  nread = LogicalTapeRead(tape, &hash, sizeof(uint32));
3000  if (nread == 0)
3001  return NULL;
3002  if (nread != sizeof(uint32))
3003  ereport(ERROR,
3005  errmsg("unexpected EOF for tape %p: requested %zu bytes, read %zu bytes",
3006  tape, sizeof(uint32), nread)));
3007  if (hashp != NULL)
3008  *hashp = hash;
3009 
3010  nread = LogicalTapeRead(tape, &t_len, sizeof(t_len));
3011  if (nread != sizeof(uint32))
3012  ereport(ERROR,
3014  errmsg("unexpected EOF for tape %p: requested %zu bytes, read %zu bytes",
3015  tape, sizeof(uint32), nread)));
3016 
3017  tuple = (MinimalTuple) palloc(t_len);
3018  tuple->t_len = t_len;
3019 
3020  nread = LogicalTapeRead(tape,
3021  (void *) ((char *) tuple + sizeof(uint32)),
3022  t_len - sizeof(uint32));
3023  if (nread != t_len - sizeof(uint32))
3024  ereport(ERROR,
3026  errmsg("unexpected EOF for tape %p: requested %zu bytes, read %zu bytes",
3027  tape, t_len - sizeof(uint32), nread)));
3028 
3029  return tuple;
3030 }
3031 
3032 /*
3033  * hashagg_finish_initial_spills
3034  *
3035  * After a HashAggBatch has been processed, it may have spilled tuples to
3036  * disk. If so, turn the spilled partitions into new batches that must later
3037  * be executed.
3038  */
3039 static void
3041 {
3042  int setno;
3043  int total_npartitions = 0;
3044 
3045  if (aggstate->hash_spills != NULL)
3046  {
3047  for (setno = 0; setno < aggstate->num_hashes; setno++)
3048  {
3049  HashAggSpill *spill = &aggstate->hash_spills[setno];
3050 
3051  total_npartitions += spill->npartitions;
3052  hashagg_spill_finish(aggstate, spill, setno);
3053  }
3054 
3055  /*
3056  * We're not processing tuples from outer plan any more; only
3057  * processing batches of spilled tuples. The initial spill structures
3058  * are no longer needed.
3059  */
3060  pfree(aggstate->hash_spills);
3061  aggstate->hash_spills = NULL;
3062  }
3063 
3064  hash_agg_update_metrics(aggstate, false, total_npartitions);
3065  aggstate->hash_spill_mode = false;
3066 }
3067 
3068 /*
3069  * hashagg_spill_finish
3070  *
3071  * Transform spill partitions into new batches.
3072  */
3073 static void
3074 hashagg_spill_finish(AggState *aggstate, HashAggSpill *spill, int setno)
3075 {
3076  int i;
3077  int used_bits = 32 - spill->shift;
3078 
3079  if (spill->npartitions == 0)
3080  return; /* didn't spill */
3081 
3082  for (i = 0; i < spill->npartitions; i++)
3083  {
3084  LogicalTape *tape = spill->partitions[i];
3085  HashAggBatch *new_batch;
3086  double cardinality;
3087 
3088  /* if the partition is empty, don't create a new batch of work */
3089  if (spill->ntuples[i] == 0)
3090  continue;
3091 
3092  cardinality = estimateHyperLogLog(&spill->hll_card[i]);
3093  freeHyperLogLog(&spill->hll_card[i]);
3094 
3095  /* rewinding frees the buffer while not in use */
3097 
3098  new_batch = hashagg_batch_new(tape, setno,
3099  spill->ntuples[i], cardinality,
3100  used_bits);
3101  aggstate->hash_batches = lcons(new_batch, aggstate->hash_batches);
3102  aggstate->hash_batches_used++;
3103  }
3104 
3105  pfree(spill->ntuples);
3106  pfree(spill->hll_card);
3107  pfree(spill->partitions);
3108 }
3109 
3110 /*
3111  * Free resources related to a spilled HashAgg.
3112  */
3113 static void
3115 {
3116  ListCell *lc;
3117 
3118  /* free spills from initial pass */
3119  if (aggstate->hash_spills != NULL)
3120  {
3121  int setno;
3122 
3123  for (setno = 0; setno < aggstate->num_hashes; setno++)
3124  {
3125  HashAggSpill *spill = &aggstate->hash_spills[setno];
3126 
3127  pfree(spill->ntuples);
3128  pfree(spill->partitions);
3129  }
3130  pfree(aggstate->hash_spills);
3131  aggstate->hash_spills = NULL;
3132  }
3133 
3134  /* free batches */
3135  foreach(lc, aggstate->hash_batches)
3136  {
3137  HashAggBatch *batch = (HashAggBatch *) lfirst(lc);
3138 
3139  pfree(batch);
3140  }
3141  list_free(aggstate->hash_batches);
3142  aggstate->hash_batches = NIL;
3143 
3144  /* close tape set */
3145  if (aggstate->hash_tapeset != NULL)
3146  {
3147  LogicalTapeSetClose(aggstate->hash_tapeset);
3148  aggstate->hash_tapeset = NULL;
3149  }
3150 }
3151 
3152 
3153 /* -----------------
3154  * ExecInitAgg
3155  *
3156  * Creates the run-time information for the agg node produced by the
3157  * planner and initializes its outer subtree.
3158  *
3159  * -----------------
3160  */
3161 AggState *
3162 ExecInitAgg(Agg *node, EState *estate, int eflags)
3163 {
3164  AggState *aggstate;
3165  AggStatePerAgg peraggs;
3166  AggStatePerTrans pertransstates;
3167  AggStatePerGroup *pergroups;
3168  Plan *outerPlan;
3169  ExprContext *econtext;
3170  TupleDesc scanDesc;
3171  int max_aggno;
3172  int max_transno;
3173  int numaggrefs;
3174  int numaggs;
3175  int numtrans;
3176  int phase;
3177  int phaseidx;
3178  ListCell *l;
3179  Bitmapset *all_grouped_cols = NULL;
3180  int numGroupingSets = 1;
3181  int numPhases;
3182  int numHashes;
3183  int i = 0;
3184  int j = 0;
3185  bool use_hashing = (node->aggstrategy == AGG_HASHED ||
3186  node->aggstrategy == AGG_MIXED);
3187 
3188  /* check for unsupported flags */
3189  Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
3190 
3191  /*
3192  * create state structure
3193  */
3194  aggstate = makeNode(AggState);
3195  aggstate->ss.ps.plan = (Plan *) node;
3196  aggstate->ss.ps.state = estate;
3197  aggstate->ss.ps.ExecProcNode = ExecAgg;
3198 
3199  aggstate->aggs = NIL;
3200  aggstate->numaggs = 0;
3201  aggstate->numtrans = 0;
3202  aggstate->aggstrategy = node->aggstrategy;
3203  aggstate->aggsplit = node->aggsplit;
3204  aggstate->maxsets = 0;
3205  aggstate->projected_set = -1;
3206  aggstate->current_set = 0;
3207  aggstate->peragg = NULL;
3208  aggstate->pertrans = NULL;
3209  aggstate->curperagg = NULL;
3210  aggstate->curpertrans = NULL;
3211  aggstate->input_done = false;
3212  aggstate->agg_done = false;
3213  aggstate->pergroups = NULL;
3214  aggstate->grp_firstTuple = NULL;
3215  aggstate->sort_in = NULL;
3216  aggstate->sort_out = NULL;
3217 
3218  /*
3219  * phases[0] always exists, but is dummy in sorted/plain mode
3220  */
3221  numPhases = (use_hashing ? 1 : 2);
3222  numHashes = (use_hashing ? 1 : 0);
3223 
3224  /*
3225  * Calculate the maximum number of grouping sets in any phase; this
3226  * determines the size of some allocations. Also calculate the number of
3227  * phases, since all hashed/mixed nodes contribute to only a single phase.
3228  */
3229  if (node->groupingSets)
3230  {
3231  numGroupingSets = list_length(node->groupingSets);
3232 
3233  foreach(l, node->chain)
3234  {
3235  Agg *agg = lfirst(l);
3236 
3237  numGroupingSets = Max(numGroupingSets,
3238  list_length(agg->groupingSets));
3239 
3240  /*
3241  * additional AGG_HASHED aggs become part of phase 0, but all
3242  * others add an extra phase.
3243  */
3244  if (agg->aggstrategy != AGG_HASHED)
3245  ++numPhases;
3246  else
3247  ++numHashes;
3248  }
3249  }
3250 
3251  aggstate->maxsets = numGroupingSets;
3252  aggstate->numphases = numPhases;
3253 
3254  aggstate->aggcontexts = (ExprContext **)
3255  palloc0(sizeof(ExprContext *) * numGroupingSets);
3256 
3257  /*
3258  * Create expression contexts. We need three or more, one for
3259  * per-input-tuple processing, one for per-output-tuple processing, one
3260  * for all the hashtables, and one for each grouping set. The per-tuple
3261  * memory context of the per-grouping-set ExprContexts (aggcontexts)
3262  * replaces the standalone memory context formerly used to hold transition
3263  * values. We cheat a little by using ExecAssignExprContext() to build
3264  * all of them.
3265  *
3266  * NOTE: the details of what is stored in aggcontexts and what is stored
3267  * in the regular per-query memory context are driven by a simple
3268  * decision: we want to reset the aggcontext at group boundaries (if not
3269  * hashing) and in ExecReScanAgg to recover no-longer-wanted space.
3270  */
3271  ExecAssignExprContext(estate, &aggstate->ss.ps);
3272  aggstate->tmpcontext = aggstate->ss.ps.ps_ExprContext;
3273 
3274  for (i = 0; i < numGroupingSets; ++i)
3275  {
3276  ExecAssignExprContext(estate, &aggstate->ss.ps);
3277  aggstate->aggcontexts[i] = aggstate->ss.ps.ps_ExprContext;
3278  }
3279 
3280  if (use_hashing)
3281  aggstate->hashcontext = CreateWorkExprContext(estate);
3282 
3283  ExecAssignExprContext(estate, &aggstate->ss.ps);
3284 
3285  /*
3286  * Initialize child nodes.
3287  *
3288  * If we are doing a hashed aggregation then the child plan does not need
3289  * to handle REWIND efficiently; see ExecReScanAgg.
3290  */
3291  if (node->aggstrategy == AGG_HASHED)
3292  eflags &= ~EXEC_FLAG_REWIND;
3293  outerPlan = outerPlan(node);
3294  outerPlanState(aggstate) = ExecInitNode(outerPlan, estate, eflags);
3295 
3296  /*
3297  * initialize source tuple type.
3298  */
3299  aggstate->ss.ps.outerops =
3301  &aggstate->ss.ps.outeropsfixed);
3302  aggstate->ss.ps.outeropsset = true;
3303 
3304  ExecCreateScanSlotFromOuterPlan(estate, &aggstate->ss,
3305  aggstate->ss.ps.outerops);
3306  scanDesc = aggstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor;
3307 
3308  /*
3309  * If there are more than two phases (including a potential dummy phase
3310  * 0), input will be resorted using tuplesort. Need a slot for that.
3311  */
3312  if (numPhases > 2)
3313  {
3314  aggstate->sort_slot = ExecInitExtraTupleSlot(estate, scanDesc,
3316 
3317  /*
3318  * The output of the tuplesort, and the output from the outer child
3319  * might not use the same type of slot. In most cases the child will
3320  * be a Sort, and thus return a TTSOpsMinimalTuple type slot - but the
3321  * input can also be presorted due an index, in which case it could be
3322  * a different type of slot.
3323  *
3324  * XXX: For efficiency it would be good to instead/additionally
3325  * generate expressions with corresponding settings of outerops* for
3326  * the individual phases - deforming is often a bottleneck for
3327  * aggregations with lots of rows per group. If there's multiple
3328  * sorts, we know that all but the first use TTSOpsMinimalTuple (via
3329  * the nodeAgg.c internal tuplesort).
3330  */
3331  if (aggstate->ss.ps.outeropsfixed &&
3332  aggstate->ss.ps.outerops != &TTSOpsMinimalTuple)
3333  aggstate->ss.ps.outeropsfixed = false;
3334  }
3335 
3336  /*
3337  * Initialize result type, slot and projection.
3338  */
3340  ExecAssignProjectionInfo(&aggstate->ss.ps, NULL);
3341 
3342  /*
3343  * initialize child expressions
3344  *
3345  * We expect the parser to have checked that no aggs contain other agg
3346  * calls in their arguments (and just to be sure, we verify it again while
3347  * initializing the plan node). This would make no sense under SQL
3348  * semantics, and it's forbidden by the spec. Because it is true, we
3349  * don't need to worry about evaluating the aggs in any particular order.
3350  *
3351  * Note: execExpr.c finds Aggrefs for us, and adds them to aggstate->aggs.
3352  * Aggrefs in the qual are found here; Aggrefs in the targetlist are found
3353  * during ExecAssignProjectionInfo, above.
3354  */
3355  aggstate->ss.ps.qual =
3356  ExecInitQual(node->plan.qual, (PlanState *) aggstate);
3357 
3358  /*
3359  * We should now have found all Aggrefs in the targetlist and quals.
3360  */
3361  numaggrefs = list_length(aggstate->aggs);
3362  max_aggno = -1;
3363  max_transno = -1;
3364  foreach(l, aggstate->aggs)
3365  {
3366  Aggref *aggref = (Aggref *) lfirst(l);
3367 
3368  max_aggno = Max(max_aggno, aggref->aggno);
3369  max_transno = Max(max_transno, aggref->aggtransno);
3370  }
3371  numaggs = max_aggno + 1;
3372  numtrans = max_transno + 1;
3373 
3374  /*
3375  * For each phase, prepare grouping set data and fmgr lookup data for
3376  * compare functions. Accumulate all_grouped_cols in passing.
3377  */
3378  aggstate->phases = palloc0(numPhases * sizeof(AggStatePerPhaseData));
3379 
3380  aggstate->num_hashes = numHashes;
3381  if (numHashes)
3382  {
3383  aggstate->perhash = palloc0(sizeof(AggStatePerHashData) * numHashes);
3384  aggstate->phases[0].numsets = 0;
3385  aggstate->phases[0].gset_lengths = palloc(numHashes * sizeof(int));
3386  aggstate->phases[0].grouped_cols = palloc(numHashes * sizeof(Bitmapset *));
3387  }
3388 
3389  phase = 0;
3390  for (phaseidx = 0; phaseidx <= list_length(node->chain); ++phaseidx)
3391  {
3392  Agg *aggnode;
3393  Sort *sortnode;
3394 
3395  if (phaseidx > 0)
3396  {
3397  aggnode = list_nth_node(Agg, node->chain, phaseidx - 1);
3398  sortnode = castNode(Sort, aggnode->plan.lefttree);
3399  }
3400  else
3401  {
3402  aggnode = node;
3403  sortnode = NULL;
3404  }
3405 
3406  Assert(phase <= 1 || sortnode);
3407 
3408  if (aggnode->aggstrategy == AGG_HASHED
3409  || aggnode->aggstrategy == AGG_MIXED)
3410  {
3411  AggStatePerPhase phasedata = &aggstate->phases[0];
3412  AggStatePerHash perhash;
3413  Bitmapset *cols = NULL;
3414 
3415  Assert(phase == 0);
3416  i = phasedata->numsets++;
3417  perhash = &aggstate->perhash[i];
3418 
3419  /* phase 0 always points to the "real" Agg in the hash case */
3420  phasedata->aggnode = node;
3421  phasedata->aggstrategy = node->aggstrategy;
3422 
3423  /* but the actual Agg node representing this hash is saved here */
3424  perhash->aggnode = aggnode;
3425 
3426  phasedata->gset_lengths[i] = perhash->numCols = aggnode->numCols;
3427 
3428  for (j = 0; j < aggnode->numCols; ++j)
3429  cols = bms_add_member(cols, aggnode->grpColIdx[j]);
3430 
3431  phasedata->grouped_cols[i] = cols;
3432 
3433  all_grouped_cols = bms_add_members(all_grouped_cols, cols);
3434  continue;
3435  }
3436  else
3437  {
3438  AggStatePerPhase phasedata = &aggstate->phases[++phase];
3439  int num_sets;
3440 
3441  phasedata->numsets = num_sets = list_length(aggnode->groupingSets);
3442 
3443  if (num_sets)
3444  {
3445  phasedata->gset_lengths = palloc(num_sets * sizeof(int));
3446  phasedata->grouped_cols = palloc(num_sets * sizeof(Bitmapset *));
3447 
3448  i = 0;
3449  foreach(l, aggnode->groupingSets)
3450  {
3451  int current_length = list_length(lfirst(l));
3452  Bitmapset *cols = NULL;
3453 
3454  /* planner forces this to be correct */
3455  for (j = 0; j < current_length; ++j)
3456  cols = bms_add_member(cols, aggnode->grpColIdx[j]);
3457 
3458  phasedata->grouped_cols[i] = cols;
3459  phasedata->gset_lengths[i] = current_length;
3460 
3461  ++i;
3462  }
3463 
3464  all_grouped_cols = bms_add_members(all_grouped_cols,
3465  phasedata->grouped_cols[0]);
3466  }
3467  else
3468  {
3469  Assert(phaseidx == 0);
3470 
3471  phasedata->gset_lengths = NULL;
3472  phasedata->grouped_cols = NULL;
3473  }
3474 
3475  /*
3476  * If we are grouping, precompute fmgr lookup data for inner loop.
3477  */
3478  if (aggnode->aggstrategy == AGG_SORTED)
3479  {
3480  int i = 0;
3481 
3482  Assert(aggnode->numCols > 0);
3483 
3484  /*
3485  * Build a separate function for each subset of columns that
3486  * need to be compared.
3487  */
3488  phasedata->eqfunctions =
3489  (ExprState **) palloc0(aggnode->numCols * sizeof(ExprState *));
3490 
3491  /* for each grouping set */
3492  for (i = 0; i < phasedata->numsets; i++)
3493  {
3494  int length = phasedata->gset_lengths[i];
3495 
3496  if (phasedata->eqfunctions[length - 1] != NULL)
3497  continue;
3498 
3499  phasedata->eqfunctions[length - 1] =
3500  execTuplesMatchPrepare(scanDesc,
3501  length,
3502  aggnode->grpColIdx,
3503  aggnode->grpOperators,
3504  aggnode->grpCollations,
3505  (PlanState *) aggstate);
3506  }
3507 
3508  /* and for all grouped columns, unless already computed */
3509  if (phasedata->eqfunctions[aggnode->numCols - 1] == NULL)
3510  {
3511  phasedata->eqfunctions[aggnode->numCols - 1] =
3512  execTuplesMatchPrepare(scanDesc,
3513  aggnode->numCols,
3514  aggnode->grpColIdx,
3515  aggnode->grpOperators,
3516  aggnode->grpCollations,
3517  (PlanState *) aggstate);
3518  }
3519  }
3520 
3521  phasedata->aggnode = aggnode;
3522  phasedata->aggstrategy = aggnode->aggstrategy;
3523  phasedata->sortnode = sortnode;
3524  }
3525  }
3526 
3527  /*
3528  * Convert all_grouped_cols to a descending-order list.
3529  */
3530  i = -1;
3531  while ((i = bms_next_member(all_grouped_cols, i)) >= 0)
3532  aggstate->all_grouped_cols = lcons_int(i, aggstate->all_grouped_cols);
3533 
3534  /*
3535  * Set up aggregate-result storage in the output expr context, and also
3536  * allocate my private per-agg working storage
3537  */
3538  econtext = aggstate->ss.ps.ps_ExprContext;
3539  econtext->ecxt_aggvalues = (Datum *) palloc0(sizeof(Datum) * numaggs);
3540  econtext->ecxt_aggnulls = (bool *) palloc0(sizeof(bool) * numaggs);
3541 
3542  peraggs = (AggStatePerAgg) palloc0(sizeof(AggStatePerAggData) * numaggs);
3543  pertransstates = (AggStatePerTrans) palloc0(sizeof(AggStatePerTransData) * numtrans);
3544 
3545  aggstate->peragg = peraggs;
3546  aggstate->pertrans = pertransstates;
3547 
3548 
3549  aggstate->all_pergroups =
3551  * (numGroupingSets + numHashes));
3552  pergroups = aggstate->all_pergroups;
3553 
3554  if (node->aggstrategy != AGG_HASHED)
3555  {
3556  for (i = 0; i < numGroupingSets; i++)
3557  {
3558  pergroups[i] = (AggStatePerGroup) palloc0(sizeof(AggStatePerGroupData)
3559  * numaggs);
3560  }
3561 
3562  aggstate->pergroups = pergroups;
3563  pergroups += numGroupingSets;
3564  }
3565 
3566  /*
3567  * Hashing can only appear in the initial phase.
3568  */
3569  if (use_hashing)
3570  {
3571  Plan *outerplan = outerPlan(node);
3572  uint64 totalGroups = 0;
3573  int i;
3574 
3575  aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt,
3576  "HashAgg meta context",
3578  aggstate->hash_spill_rslot = ExecInitExtraTupleSlot(estate, scanDesc,
3580  aggstate->hash_spill_wslot = ExecInitExtraTupleSlot(estate, scanDesc,
3581  &TTSOpsVirtual);
3582 
3583  /* this is an array of pointers, not structures */
3584  aggstate->hash_pergroup = pergroups;
3585 
3586  aggstate->hashentrysize = hash_agg_entry_size(aggstate->numtrans,
3587  outerplan->plan_width,
3588  node->transitionSpace);
3589 
3590  /*
3591  * Consider all of the grouping sets together when setting the limits
3592  * and estimating the number of partitions. This can be inaccurate
3593  * when there is more than one grouping set, but should still be
3594  * reasonable.
3595  */
3596  for (i = 0; i < aggstate->num_hashes; i++)
3597  totalGroups += aggstate->perhash[i].aggnode->numGroups;
3598 
3599  hash_agg_set_limits(aggstate->hashentrysize, totalGroups, 0,
3600  &aggstate->hash_mem_limit,
3601  &aggstate->hash_ngroups_limit,
3602  &aggstate->hash_planned_partitions);
3603  find_hash_columns(aggstate);
3604 
3605  /* Skip massive memory allocation if we are just doing EXPLAIN */
3606  if (!(eflags & EXEC_FLAG_EXPLAIN_ONLY))
3607  build_hash_tables(aggstate);
3608 
3609  aggstate->table_filled = false;
3610 
3611  /* Initialize this to 1, meaning nothing spilled, yet */
3612  aggstate->hash_batches_used = 1;
3613  }
3614 
3615  /*
3616  * Initialize current phase-dependent values to initial phase. The initial
3617  * phase is 1 (first sort pass) for all strategies that use sorting (if
3618  * hashing is being done too, then phase 0 is processed last); but if only
3619  * hashing is being done, then phase 0 is all there is.
3620  */
3621  if (node->aggstrategy == AGG_HASHED)
3622  {
3623  aggstate->current_phase = 0;
3624  initialize_phase(aggstate, 0);
3625  select_current_set(aggstate, 0, true);
3626  }
3627  else
3628  {
3629  aggstate->current_phase = 1;
3630  initialize_phase(aggstate, 1);
3631  select_current_set(aggstate, 0, false);
3632  }
3633 
3634  /*
3635  * Perform lookups of aggregate function info, and initialize the
3636  * unchanging fields of the per-agg and per-trans data.
3637  */
3638  foreach(l, aggstate->aggs)
3639  {
3640  Aggref *aggref = lfirst(l);
3641  AggStatePerAgg peragg;
3642  AggStatePerTrans pertrans;
3643  Oid aggTransFnInputTypes[FUNC_MAX_ARGS];
3644  int numAggTransFnArgs;
3645  int numDirectArgs;
3646  HeapTuple aggTuple;
3647  Form_pg_aggregate aggform;
3648  AclResult aclresult;
3649  Oid finalfn_oid;
3650  Oid serialfn_oid,
3651  deserialfn_oid;
3652  Oid aggOwner;
3653  Expr *finalfnexpr;
3654  Oid aggtranstype;
3655 
3656  /* Planner should have assigned aggregate to correct level */
3657  Assert(aggref->agglevelsup == 0);
3658  /* ... and the split mode should match */
3659  Assert(aggref->aggsplit == aggstate->aggsplit);
3660 
3661  peragg = &peraggs[aggref->aggno];
3662 
3663  /* Check if we initialized the state for this aggregate already. */
3664  if (peragg->aggref != NULL)
3665  continue;
3666 
3667  peragg->aggref = aggref;
3668  peragg->transno = aggref->aggtransno;
3669 
3670  /* Fetch the pg_aggregate row */
3671  aggTuple = SearchSysCache1(AGGFNOID,
3672  ObjectIdGetDatum(aggref->aggfnoid));
3673  if (!HeapTupleIsValid(aggTuple))
3674  elog(ERROR, "cache lookup failed for aggregate %u",
3675  aggref->aggfnoid);
3676  aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple);
3677 
3678  /* Check permission to call aggregate function */
3679  aclresult = pg_proc_aclcheck(aggref->aggfnoid, GetUserId(),
3680  ACL_EXECUTE);
3681  if (aclresult != ACLCHECK_OK)
3682  aclcheck_error(aclresult, OBJECT_AGGREGATE,
3683  get_func_name(aggref->aggfnoid));
3685 
3686  /* planner recorded transition state type in the Aggref itself */
3687  aggtranstype = aggref->aggtranstype;
3688  Assert(OidIsValid(aggtranstype));
3689 
3690  /* Final function only required if we're finalizing the aggregates */
3691  if (DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit))
3692  peragg->finalfn_oid = finalfn_oid = InvalidOid;
3693  else
3694  peragg->finalfn_oid = finalfn_oid = aggform->aggfinalfn;
3695 
3696  serialfn_oid = InvalidOid;
3697  deserialfn_oid = InvalidOid;
3698 
3699  /*
3700  * Check if serialization/deserialization is required. We only do it
3701  * for aggregates that have transtype INTERNAL.
3702  */
3703  if (aggtranstype == INTERNALOID)
3704  {
3705  /*
3706  * The planner should only have generated a serialize agg node if
3707  * every aggregate with an INTERNAL state has a serialization
3708  * function. Verify that.
3709  */
3710  if (DO_AGGSPLIT_SERIALIZE(aggstate->aggsplit))
3711  {
3712  /* serialization only valid when not running finalfn */
3713  Assert(DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit));
3714 
3715  if (!OidIsValid(aggform->aggserialfn))
3716  elog(ERROR, "serialfunc not provided for serialization aggregation");
3717  serialfn_oid = aggform->aggserialfn;
3718  }
3719 
3720  /* Likewise for deserialization functions */
3721  if (DO_AGGSPLIT_DESERIALIZE(aggstate->aggsplit))
3722  {
3723  /* deserialization only valid when combining states */
3724  Assert(DO_AGGSPLIT_COMBINE(aggstate->aggsplit));
3725 
3726  if (!OidIsValid(aggform->aggdeserialfn))
3727  elog(ERROR, "deserialfunc not provided for deserialization aggregation");
3728  deserialfn_oid = aggform->aggdeserialfn;
3729  }
3730  }
3731 
3732  /* Check that aggregate owner has permission to call component fns */
3733  {
3734  HeapTuple procTuple;
3735 
3736  procTuple = SearchSysCache1(PROCOID,
3737  ObjectIdGetDatum(aggref->aggfnoid));
3738  if (!HeapTupleIsValid(procTuple))
3739  elog(ERROR, "cache lookup failed for function %u",
3740  aggref->aggfnoid);
3741  aggOwner = ((Form_pg_proc) GETSTRUCT(procTuple))->proowner;
3742  ReleaseSysCache(procTuple);
3743 
3744  if (OidIsValid(finalfn_oid))
3745  {
3746  aclresult = pg_proc_aclcheck(finalfn_oid, aggOwner,
3747  ACL_EXECUTE);
3748  if (aclresult != ACLCHECK_OK)
3749  aclcheck_error(aclresult, OBJECT_FUNCTION,
3750  get_func_name(finalfn_oid));
3751  InvokeFunctionExecuteHook(finalfn_oid);
3752  }
3753  if (OidIsValid(serialfn_oid))
3754  {
3755  aclresult = pg_proc_aclcheck(serialfn_oid, aggOwner,
3756  ACL_EXECUTE);
3757  if (aclresult != ACLCHECK_OK)
3758  aclcheck_error(aclresult, OBJECT_FUNCTION,
3759  get_func_name(serialfn_oid));
3760  InvokeFunctionExecuteHook(serialfn_oid);
3761  }
3762  if (OidIsValid(deserialfn_oid))
3763  {
3764  aclresult = pg_proc_aclcheck(deserialfn_oid, aggOwner,
3765  ACL_EXECUTE);
3766  if (aclresult != ACLCHECK_OK)
3767  aclcheck_error(aclresult, OBJECT_FUNCTION,
3768  get_func_name(deserialfn_oid));
3769  InvokeFunctionExecuteHook(deserialfn_oid);
3770  }
3771  }
3772 
3773  /*
3774  * Get actual datatypes of the (nominal) aggregate inputs. These
3775  * could be different from the agg's declared input types, when the
3776  * agg accepts ANY or a polymorphic type.
3777  */
3778  numAggTransFnArgs = get_aggregate_argtypes(aggref,
3779  aggTransFnInputTypes);
3780 
3781  /* Count the "direct" arguments, if any */
3782  numDirectArgs = list_length(aggref->aggdirectargs);
3783 
3784  /* Detect how many arguments to pass to the finalfn */
3785  if (aggform->aggfinalextra)
3786  peragg->numFinalArgs = numAggTransFnArgs + 1;
3787  else
3788  peragg->numFinalArgs = numDirectArgs + 1;
3789 
3790  /* Initialize any direct-argument expressions */
3791  peragg->aggdirectargs = ExecInitExprList(aggref->aggdirectargs,
3792  (PlanState *) aggstate);
3793 
3794  /*
3795  * build expression trees using actual argument & result types for the
3796  * finalfn, if it exists and is required.
3797  */
3798  if (OidIsValid(finalfn_oid))
3799  {
3800  build_aggregate_finalfn_expr(aggTransFnInputTypes,
3801  peragg->numFinalArgs,
3802  aggtranstype,
3803  aggref->aggtype,
3804  aggref->inputcollid,
3805  finalfn_oid,
3806  &finalfnexpr);
3807  fmgr_info(finalfn_oid, &peragg->finalfn);
3808  fmgr_info_set_expr((Node *) finalfnexpr, &peragg->finalfn);
3809  }
3810 
3811  /* get info about the output value's datatype */
3812  get_typlenbyval(aggref->aggtype,
3813  &peragg->resulttypeLen,
3814  &peragg->resulttypeByVal);
3815 
3816  /*
3817  * Build working state for invoking the transition function, if we
3818  * haven't done it already.
3819  */
3820  pertrans = &pertransstates[aggref->aggtransno];
3821  if (pertrans->aggref == NULL)
3822  {
3823  Datum textInitVal;
3824  Datum initValue;
3825  bool initValueIsNull;
3826  Oid transfn_oid;
3827 
3828  /*
3829  * If this aggregation is performing state combines, then instead
3830  * of using the transition function, we'll use the combine
3831  * function.
3832  */
3833  if (DO_AGGSPLIT_COMBINE(aggstate->aggsplit))
3834  {
3835  transfn_oid = aggform->aggcombinefn;
3836 
3837  /* If not set then the planner messed up */
3838  if (!OidIsValid(transfn_oid))
3839  elog(ERROR, "combinefn not set for aggregate function");
3840  }
3841  else
3842  transfn_oid = aggform->aggtransfn;
3843 
3844  aclresult = pg_proc_aclcheck(transfn_oid, aggOwner, ACL_EXECUTE);
3845  if (aclresult != ACLCHECK_OK)
3846  aclcheck_error(aclresult, OBJECT_FUNCTION,
3847  get_func_name(transfn_oid));
3848  InvokeFunctionExecuteHook(transfn_oid);
3849 
3850  /*
3851  * initval is potentially null, so don't try to access it as a
3852  * struct field. Must do it the hard way with SysCacheGetAttr.
3853  */
3854  textInitVal = SysCacheGetAttr(AGGFNOID, aggTuple,
3855  Anum_pg_aggregate_agginitval,
3856  &initValueIsNull);
3857  if (initValueIsNull)
3858  initValue = (Datum) 0;
3859  else
3860  initValue = GetAggInitVal(textInitVal, aggtranstype);
3861 
3862  if (DO_AGGSPLIT_COMBINE(aggstate->aggsplit))
3863  {
3864  Oid combineFnInputTypes[] = {aggtranstype,
3865  aggtranstype};
3866 
3867  /*
3868  * When combining there's only one input, the to-be-combined
3869  * transition value. The transition value is not counted
3870  * here.
3871  */
3872  pertrans->numTransInputs = 1;
3873 
3874  /* aggcombinefn always has two arguments of aggtranstype */
3875  build_pertrans_for_aggref(pertrans, aggstate, estate,
3876  aggref, transfn_oid, aggtranstype,
3877  serialfn_oid, deserialfn_oid,
3878  initValue, initValueIsNull,
3879  combineFnInputTypes, 2);
3880 
3881  /*
3882  * Ensure that a combine function to combine INTERNAL states
3883  * is not strict. This should have been checked during CREATE
3884  * AGGREGATE, but the strict property could have been changed
3885  * since then.
3886  */
3887  if (pertrans->transfn.fn_strict && aggtranstype == INTERNALOID)
3888  ereport(ERROR,
3889  (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
3890  errmsg("combine function with transition type %s must not be declared STRICT",
3891  format_type_be(aggtranstype))));
3892  }
3893  else
3894  {
3895  /* Detect how many arguments to pass to the transfn */
3896  if (AGGKIND_IS_ORDERED_SET(aggref->aggkind))
3897  pertrans->numTransInputs = list_length(aggref->args);
3898  else
3899  pertrans->numTransInputs = numAggTransFnArgs;
3900 
3901  build_pertrans_for_aggref(pertrans, aggstate, estate,
3902  aggref, transfn_oid, aggtranstype,
3903  serialfn_oid, deserialfn_oid,
3904  initValue, initValueIsNull,
3905  aggTransFnInputTypes,
3906  numAggTransFnArgs);
3907 
3908  /*
3909  * If the transfn is strict and the initval is NULL, make sure
3910  * input type and transtype are the same (or at least
3911  * binary-compatible), so that it's OK to use the first
3912  * aggregated input value as the initial transValue. This
3913  * should have been checked at agg definition time, but we
3914  * must check again in case the transfn's strictness property
3915  * has been changed.
3916  */
3917  if (pertrans->transfn.fn_strict && pertrans->initValueIsNull)
3918  {
3919  if (numAggTransFnArgs <= numDirectArgs ||
3920  !IsBinaryCoercible(aggTransFnInputTypes[numDirectArgs],
3921  aggtranstype))
3922  ereport(ERROR,
3923  (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
3924  errmsg("aggregate %u needs to have compatible input type and transition type",
3925  aggref->aggfnoid)));
3926  }
3927  }
3928  }
3929  else
3930  pertrans->aggshared = true;
3931  ReleaseSysCache(aggTuple);
3932  }
3933 
3934  /*
3935  * Update aggstate->numaggs to be the number of unique aggregates found.
3936  * Also set numstates to the number of unique transition states found.
3937  */
3938  aggstate->numaggs = numaggs;
3939  aggstate->numtrans = numtrans;
3940 
3941  /*
3942  * Last, check whether any more aggregates got added onto the node while
3943  * we processed the expressions for the aggregate arguments (including not
3944  * only the regular arguments and FILTER expressions handled immediately
3945  * above, but any direct arguments we might've handled earlier). If so,
3946  * we have nested aggregate functions, which is semantically nonsensical,
3947  * so complain. (This should have been caught by the parser, so we don't
3948  * need to work hard on a helpful error message; but we defend against it
3949  * here anyway, just to be sure.)
3950  */
3951  if (numaggrefs != list_length(aggstate->aggs))
3952  ereport(ERROR,
3953  (errcode(ERRCODE_GROUPING_ERROR),
3954  errmsg("aggregate function calls cannot be nested")));
3955 
3956  /*
3957  * Build expressions doing all the transition work at once. We build a
3958  * different one for each phase, as the number of transition function
3959  * invocation can differ between phases. Note this'll work both for
3960  * transition and combination functions (although there'll only be one
3961  * phase in the latter case).
3962  */
3963  for (phaseidx = 0; phaseidx < aggstate->numphases; phaseidx++)
3964  {
3965  AggStatePerPhase phase = &aggstate->phases[phaseidx];
3966  bool dohash = false;
3967  bool dosort = false;
3968 
3969  /* phase 0 doesn't necessarily exist */
3970  if (!phase->aggnode)
3971  continue;
3972 
3973  if (aggstate->aggstrategy == AGG_MIXED && phaseidx == 1)
3974  {
3975  /*
3976  * Phase one, and only phase one, in a mixed agg performs both
3977  * sorting and aggregation.
3978  */
3979  dohash = true;
3980  dosort = true;
3981  }
3982  else if (aggstate->aggstrategy == AGG_MIXED && phaseidx == 0)
3983  {
3984  /*
3985  * No need to compute a transition function for an AGG_MIXED phase
3986  * 0 - the contents of the hashtables will have been computed
3987  * during phase 1.
3988  */
3989  continue;
3990  }
3991  else if (phase->aggstrategy == AGG_PLAIN ||
3992  phase->aggstrategy == AGG_SORTED)
3993  {
3994  dohash = false;
3995  dosort = true;
3996  }
3997  else if (phase->aggstrategy == AGG_HASHED)
3998  {
3999  dohash = true;
4000  dosort = false;
4001  }
4002  else
4003  Assert(false);
4004 
4005  phase->evaltrans = ExecBuildAggTrans(aggstate, phase, dosort, dohash,
4006  false);
4007 
4008  /* cache compiled expression for outer slot without NULL check */
4009  phase->evaltrans_cache[0][0] = phase->evaltrans;
4010  }
4011 
4012  return aggstate;
4013 }
4014 
4015 /*
4016  * Build the state needed to calculate a state value for an aggregate.
4017  *
4018  * This initializes all the fields in 'pertrans'. 'aggref' is the aggregate
4019  * to initialize the state for. 'transfn_oid', 'aggtranstype', and the rest
4020  * of the arguments could be calculated from 'aggref', but the caller has
4021  * calculated them already, so might as well pass them.
4022  *
4023  * 'transfn_oid' may be either the Oid of the aggtransfn or the aggcombinefn.
4024  */
4025 static void
/*
 * NOTE(review): the extraction dropped original line 4026 here — the
 * function-name line, presumably
 * "build_pertrans_for_aggref(AggStatePerTrans pertrans," (the callers above
 * pass 'pertrans' as the first argument).  Verify against upstream nodeAgg.c.
 */
4027  AggState *aggstate, EState *estate,
4028  Aggref *aggref,
4029  Oid transfn_oid, Oid aggtranstype,
4030  Oid aggserialfn, Oid aggdeserialfn,
4031  Datum initValue, bool initValueIsNull,
4032  Oid *inputTypes, int numArguments)
4033 {
4034  int numGroupingSets = Max(aggstate->maxsets, 1);
4035  Expr *transfnexpr;
4036  int numTransArgs;
4037  Expr *serialfnexpr = NULL;
4038  Expr *deserialfnexpr = NULL;
4039  ListCell *lc;
4040  int numInputs;
4041  int numDirectArgs;
4042  List *sortlist;
4043  int numSortCols;
4044  int numDistinctCols;
4045  int i;
4046 
4047  /* Begin filling in the pertrans data */
4048  pertrans->aggref = aggref;
4049  pertrans->aggshared = false;
4050  pertrans->aggCollation = aggref->inputcollid;
4051  pertrans->transfn_oid = transfn_oid;
4052  pertrans->serialfn_oid = aggserialfn;
4053  pertrans->deserialfn_oid = aggdeserialfn;
4054  pertrans->initValue = initValue;
4055  pertrans->initValueIsNull = initValueIsNull;
4056 
4057  /* Count the "direct" arguments, if any */
4058  numDirectArgs = list_length(aggref->aggdirectargs);
4059 
4060  /* Count the number of aggregated input columns */
4061  pertrans->numInputs = numInputs = list_length(aggref->args);
4062 
4063  pertrans->aggtranstype = aggtranstype;
4064 
4065  /* account for the current transition state */
4066  numTransArgs = pertrans->numTransInputs + 1;
4067 
4068  /*
4069  * Set up infrastructure for calling the transfn. Note that invtrans is
4070  * not needed here.
4071  */
4072  build_aggregate_transfn_expr(inputTypes,
4073  numArguments,
4074  numDirectArgs,
4075  aggref->aggvariadic,
4076  aggtranstype,
4077  aggref->inputcollid,
4078  transfn_oid,
4079  InvalidOid,
4080  &transfnexpr,
4081  NULL);
4082 
4083  fmgr_info(transfn_oid, &pertrans->transfn);
4084  fmgr_info_set_expr((Node *) transfnexpr, &pertrans->transfn);
4085 
4086  pertrans->transfn_fcinfo =
/*
 * NOTE(review): extraction dropped lines 4087-4088 — presumably the
 * palloc(SizeForFunctionCallInfo(...)) and InitFunctionCallInfoData(...)
 * calls that the trailing argument list below belongs to.  Verify upstream.
 */
4089  &pertrans->transfn,
4090  numTransArgs,
4091  pertrans->aggCollation,
4092  (void *) aggstate, NULL);
4093 
4094  /* get info about the state value's datatype */
4095  get_typlenbyval(aggtranstype,
4096  &pertrans->transtypeLen,
4097  &pertrans->transtypeByVal);
4098 
4099  if (OidIsValid(aggserialfn))
4100  {
4101  build_aggregate_serialfn_expr(aggserialfn,
4102  &serialfnexpr);
4103  fmgr_info(aggserialfn, &pertrans->serialfn);
4104  fmgr_info_set_expr((Node *) serialfnexpr, &pertrans->serialfn);
4105 
4106  pertrans->serialfn_fcinfo =
/* NOTE(review): lines 4107-4108 dropped by extraction (fcinfo allocation
 * and InitFunctionCallInfoData for the 1-argument serialization call). */
4109  &pertrans->serialfn,
4110  1,
4111  InvalidOid,
4112  (void *) aggstate, NULL);
4113  }
4114 
4115  if (OidIsValid(aggdeserialfn))
4116  {
4117  build_aggregate_deserialfn_expr(aggdeserialfn,
4118  &deserialfnexpr);
4119  fmgr_info(aggdeserialfn, &pertrans->deserialfn);
4120  fmgr_info_set_expr((Node *) deserialfnexpr, &pertrans->deserialfn);
4121 
4122  pertrans->deserialfn_fcinfo =
/* NOTE(review): lines 4123-4124 dropped by extraction (fcinfo allocation
 * and InitFunctionCallInfoData for the 2-argument deserialization call). */
4125  &pertrans->deserialfn,
4126  2,
4127  InvalidOid,
4128  (void *) aggstate, NULL);
4129 
4130  }
4131 
4132  /*
4133  * If we're doing either DISTINCT or ORDER BY for a plain agg, then we
4134  * have a list of SortGroupClause nodes; fish out the data in them and
4135  * stick them into arrays. We ignore ORDER BY for an ordered-set agg,
4136  * however; the agg's transfn and finalfn are responsible for that.
4137  *
4138  * Note that by construction, if there is a DISTINCT clause then the ORDER
4139  * BY clause is a prefix of it (see transformDistinctClause).
4140  */
4141  if (AGGKIND_IS_ORDERED_SET(aggref->aggkind))
4142  {
4143  sortlist = NIL;
4144  numSortCols = numDistinctCols = 0;
4145  }
4146  else if (aggref->aggdistinct)
4147  {
4148  sortlist = aggref->aggdistinct;
4149  numSortCols = numDistinctCols = list_length(sortlist);
4150  Assert(numSortCols >= list_length(aggref->aggorder));
4151  }
4152  else
4153  {
4154  sortlist = aggref->aggorder;
4155  numSortCols = list_length(sortlist);
4156  numDistinctCols = 0;
4157  }
4158 
4159  pertrans->numSortCols = numSortCols;
4160  pertrans->numDistinctCols = numDistinctCols;
4161 
4162  /*
4163  * If we have either sorting or filtering to do, create a tupledesc and
4164  * slot corresponding to the aggregated inputs (including sort
4165  * expressions) of the agg.
4166  */
4167  if (numSortCols > 0 || aggref->aggfilter)
4168  {
4169  pertrans->sortdesc = ExecTypeFromTL(aggref->args);
4170  pertrans->sortslot =
4171  ExecInitExtraTupleSlot(estate, pertrans->sortdesc,
/* NOTE(review): line 4172 dropped by extraction — the final slot-ops
 * argument of ExecInitExtraTupleSlot (a TupleTableSlotOps pointer). */
4173  }
4174 
4175  if (numSortCols > 0)
4176  {
4177  /*
4178  * We don't implement DISTINCT or ORDER BY aggs in the HASHED case
4179  * (yet)
4180  */
4181  Assert(aggstate->aggstrategy != AGG_HASHED && aggstate->aggstrategy != AGG_MIXED);
4182 
4183  /* ORDER BY aggregates are not supported with partial aggregation */
4184  Assert(!DO_AGGSPLIT_COMBINE(aggstate->aggsplit));
4185 
4186  /* If we have only one input, we need its len/byval info. */
4187  if (numInputs == 1)
4188  {
4189  get_typlenbyval(inputTypes[numDirectArgs],
4190  &pertrans->inputtypeLen,
4191  &pertrans->inputtypeByVal);
4192  }
4193  else if (numDistinctCols > 0)
4194  {
4195  /* we will need an extra slot to store prior values */
4196  pertrans->uniqslot =
4197  ExecInitExtraTupleSlot(estate, pertrans->sortdesc,
/* NOTE(review): line 4198 dropped by extraction — the final slot-ops
 * argument of ExecInitExtraTupleSlot, as above. */
4199  }
4200 
4201  /* Extract the sort information for use later */
4202  pertrans->sortColIdx =
4203  (AttrNumber *) palloc(numSortCols * sizeof(AttrNumber));
4204  pertrans->sortOperators =
4205  (Oid *) palloc(numSortCols * sizeof(Oid));
4206  pertrans->sortCollations =
4207  (Oid *) palloc(numSortCols * sizeof(Oid));
4208  pertrans->sortNullsFirst =
4209  (bool *) palloc(numSortCols * sizeof(bool));
4210 
4211  i = 0;
4212  foreach(lc, sortlist)
4213  {
4214  SortGroupClause *sortcl = (SortGroupClause *) lfirst(lc);
4215  TargetEntry *tle = get_sortgroupclause_tle(sortcl, aggref->args);
4216 
4217  /* the parser should have made sure of this */
4218  Assert(OidIsValid(sortcl->sortop));
4219 
4220  pertrans->sortColIdx[i] = tle->resno;
4221  pertrans->sortOperators[i] = sortcl->sortop;
4222  pertrans->sortCollations[i] = exprCollation((Node *) tle->expr);
4223  pertrans->sortNullsFirst[i] = sortcl->nulls_first;
4224  i++;
4225  }
4226  Assert(i == numSortCols);
4227  }
4228 
4229  if (aggref->aggdistinct)
4230  {
4231  Oid *ops;
4232 
4233  Assert(numArguments > 0);
4234  Assert(list_length(aggref->aggdistinct) == numDistinctCols);
4235 
4236  ops = palloc(numDistinctCols * sizeof(Oid));
4237 
4238  i = 0;
4239  foreach(lc, aggref->aggdistinct)
4240  ops[i++] = ((SortGroupClause *) lfirst(lc))->eqop;
4241 
4242  /* lookup / build the necessary comparators */
4243  if (numDistinctCols == 1)
4244  fmgr_info(get_opcode(ops[0]), &pertrans->equalfnOne);
4245  else
4246  pertrans->equalfnMulti =
4247  execTuplesMatchPrepare(pertrans->sortdesc,
4248  numDistinctCols,
4249  pertrans->sortColIdx,
4250  ops,
4251  pertrans->sortCollations,
4252  &aggstate->ss.ps);
4253  pfree(ops);
4254  }
4255 
4256  pertrans->sortstates = (Tuplesortstate **)
4257  palloc0(sizeof(Tuplesortstate *) * numGroupingSets);
4258 }
4259 
4260 
4261 static Datum
4262 GetAggInitVal(Datum textInitVal, Oid transtype)
4263 {
4264  Oid typinput,
4265  typioparam;
4266  char *strInitVal;
4267  Datum initVal;
4268 
4269  getTypeInputInfo(transtype, &typinput, &typioparam);
4270  strInitVal = TextDatumGetCString(textInitVal);
4271  initVal = OidInputFunctionCall(typinput, strInitVal,
4272  typioparam, -1);
4273  pfree(strInitVal);
4274  return initVal;
4275 }
4276 
4277 void
4279 {
4281  int transno;
4282  int numGroupingSets = Max(node->maxsets, 1);
4283  int setno;
4284 
4285  /*
4286  * When ending a parallel worker, copy the statistics gathered by the
4287  * worker back into shared memory so that it can be picked up by the main
4288  * process to report in EXPLAIN ANALYZE.
4289  */
4290  if (node->shared_info && IsParallelWorker())
4291  {
4293 
4294  Assert(ParallelWorkerNumber <= node->shared_info->num_workers);
4297  si->hash_disk_used = node->hash_disk_used;
4298  si->hash_mem_peak = node->hash_mem_peak;
4299  }
4300 
4301  /* Make sure we have closed any open tuplesorts */
4302 
4303  if (node->sort_in)
4304  tuplesort_end(node->sort_in);
4305  if (node->sort_out)
4306  tuplesort_end(node->sort_out);
4307 
4309 
4310  if (node->hash_metacxt != NULL)
4311  {
4313  node->hash_metacxt = NULL;
4314  }
4315 
4316  for (transno = 0; transno < node->numtrans; transno++)
4317  {
4318  AggStatePerTrans pertrans = &node->pertrans[transno];
4319 
4320  for (setno = 0; setno < numGroupingSets; setno++)
4321  {
4322  if (pertrans->sortstates[setno])
4323  tuplesort_end(pertrans->sortstates[setno]);
4324  }
4325  }
4326 
4327  /* And ensure any agg shutdown callbacks have been called */
4328  for (setno = 0; setno < numGroupingSets; setno++)
4329  ReScanExprContext(node->aggcontexts[setno]);
4330  if (node->hashcontext)
4332 
4333  /*
4334  * We don't actually free any ExprContexts here (see comment in
4335  * ExecFreeExprContext), just unlinking the output one from the plan node
4336  * suffices.
4337  */
4338  ExecFreeExprContext(&node->ss.ps);
4339 
4340  /* clean up tuple table */
4342 
4343  outerPlan = outerPlanState(node);
4344  ExecEndNode(outerPlan);
4345 }
4346 
4347 void
4349 {
4350  ExprContext *econtext = node->ss.ps.ps_ExprContext;
4352  Agg *aggnode = (Agg *) node->ss.ps.plan;
4353  int transno;
4354  int numGroupingSets = Max(node->maxsets, 1);
4355  int setno;
4356 
4357  node->agg_done = false;
4358 
4359  if (node->aggstrategy == AGG_HASHED)
4360  {
4361  /*
4362  * In the hashed case, if we haven't yet built the hash table then we
4363  * can just return; nothing done yet, so nothing to undo. If subnode's
4364  * chgParam is not NULL then it will be re-scanned by ExecProcNode,
4365  * else no reason to re-scan it at all.
4366  */
4367  if (!node->table_filled)
4368  return;
4369 
4370  /*
4371  * If we do have the hash table, and it never spilled, and the subplan
4372  * does not have any parameter changes, and none of our own parameter
4373  * changes affect input expressions of the aggregated functions, then
4374  * we can just rescan the existing hash table; no need to build it
4375  * again.
4376  */
4377  if (outerPlan->chgParam == NULL && !node->hash_ever_spilled &&
4378  !bms_overlap(node->ss.ps.chgParam, aggnode->aggParams))
4379  {
4381  &node->perhash[0].hashiter);
4382  select_current_set(node, 0, true);
4383  return;
4384  }
4385  }
4386 
4387  /* Make sure we have closed any open tuplesorts */
4388  for (transno = 0; transno < node->numtrans; transno++)
4389  {
4390  for (setno = 0; setno < numGroupingSets; setno++)
4391  {
4392  AggStatePerTrans pertrans = &node->pertrans[transno];
4393 
4394  if (pertrans->sortstates[setno])
4395  {
4396  tuplesort_end(pertrans->sortstates[setno]);
4397  pertrans->sortstates[setno] = NULL;
4398  }
4399  }
4400  }
4401 
4402  /*
4403  * We don't need to ReScanExprContext the output tuple context here;
4404  * ExecReScan already did it. But we do need to reset our per-grouping-set
4405  * contexts, which may have transvalues stored in them. (We use rescan
4406  * rather than just reset because transfns may have registered callbacks
4407  * that need to be run now.) For the AGG_HASHED case, see below.
4408  */
4409 
4410  for (setno = 0; setno < numGroupingSets; setno++)
4411  {
4412  ReScanExprContext(node->aggcontexts[setno]);
4413  }
4414 
4415  /* Release first tuple of group, if we have made a copy */
4416  if (node->grp_firstTuple != NULL)
4417  {
4419  node->grp_firstTuple = NULL;
4420  }
4422 
4423  /* Forget current agg values */
4424  MemSet(econtext->ecxt_aggvalues, 0, sizeof(Datum) * node->numaggs);
4425  MemSet(econtext->ecxt_aggnulls, 0, sizeof(bool) * node->numaggs);
4426 
4427  /*
4428  * With AGG_HASHED/MIXED, the hash table is allocated in a sub-context of
4429  * the hashcontext. This used to be an issue, but now, resetting a context
4430  * automatically deletes sub-contexts too.
4431  */
4432  if (node->aggstrategy == AGG_HASHED || node->aggstrategy == AGG_MIXED)
4433  {
4435 
4436  node->hash_ever_spilled = false;
4437  node->hash_spill_mode = false;
4438  node->hash_ngroups_current = 0;
4439 
4441  /* Rebuild an empty hash table */
4442  build_hash_tables(node);
4443  node->table_filled = false;
4444  /* iterator will be reset when the table is filled */
4445 
4446  hashagg_recompile_expressions(node, false, false);
4447  }
4448 
4449  if (node->aggstrategy != AGG_HASHED)
4450  {
4451  /*
4452  * Reset the per-group state (in particular, mark transvalues null)
4453  */
4454  for (setno = 0; setno < numGroupingSets; setno++)
4455  {
4456  MemSet(node->pergroups[setno], 0,
4457  sizeof(AggStatePerGroupData) * node->numaggs);
4458  }
4459 
4460  /* reset to phase 1 */
4461  initialize_phase(node, 1);
4462 
4463  node->input_done = false;
4464  node->projected_set = -1;
4465  }
4466 
4467  if (outerPlan->chgParam == NULL)
4468  ExecReScan(outerPlan);
4469 }
4470 
4471 
4472 /***********************************************************************
4473  * API exposed to aggregate functions
4474  ***********************************************************************/
4475 
4476 
4477 /*
4478  * AggCheckCallContext - test if a SQL function is being called as an aggregate
4479  *
4480  * The transition and/or final functions of an aggregate may want to verify
4481  * that they are being called as aggregates, rather than as plain SQL
4482  * functions. They should use this function to do so. The return value
4483  * is nonzero if being called as an aggregate, or zero if not. (Specific
4484  * nonzero values are AGG_CONTEXT_AGGREGATE or AGG_CONTEXT_WINDOW, but more
4485  * values could conceivably appear in future.)
4486  *
4487  * If aggcontext isn't NULL, the function also stores at *aggcontext the
4488  * identity of the memory context that aggregate transition values are being
4489  * stored in. Note that the same aggregate call site (flinfo) may be called
4490  * interleaved on different transition values in different contexts, so it's
4491  * not kosher to cache aggcontext under fn_extra. It is, however, kosher to
4492  * cache it in the transvalue itself (for internal-type transvalues).
4493  */
4494 int
4496 {
4497  if (fcinfo->context && IsA(fcinfo->context, AggState))
4498  {
4499  if (aggcontext)
4500  {
4501  AggState *aggstate = ((AggState *) fcinfo->context);
4502  ExprContext *cxt = aggstate->curaggcontext;
4503 
4504  *aggcontext = cxt->ecxt_per_tuple_memory;
4505  }
4506  return AGG_CONTEXT_AGGREGATE;
4507  }
4508  if (fcinfo->context && IsA(fcinfo->context, WindowAggState))
4509  {
4510  if (aggcontext)
4511  *aggcontext = ((WindowAggState *) fcinfo->context)->curaggcontext;
4512  return AGG_CONTEXT_WINDOW;
4513  }
4514 
4515  /* this is just to prevent "uninitialized variable" warnings */
4516  if (aggcontext)
4517  *aggcontext = NULL;
4518  return 0;
4519 }
4520 
4521 /*
4522  * AggGetAggref - allow an aggregate support function to get its Aggref
4523  *
4524  * If the function is being called as an aggregate support function,
4525  * return the Aggref node for the aggregate call. Otherwise, return NULL.
4526  *
4527  * Aggregates sharing the same inputs and transition functions can get
4528  * merged into a single transition calculation. If the transition function
4529  * calls AggGetAggref, it will get some one of the Aggrefs for which it is
4530  * executing. It must therefore not pay attention to the Aggref fields that
4531  * relate to the final function, as those are indeterminate. But if a final
4532  * function calls AggGetAggref, it will get a precise result.
4533  *
4534  * Note that if an aggregate is being used as a window function, this will
4535  * return NULL. We could provide a similar function to return the relevant
4536  * WindowFunc node in such cases, but it's not needed yet.
4537  */
4538 Aggref *
4540 {
4541  if (fcinfo->context && IsA(fcinfo->context, AggState))
4542  {
4543  AggState *aggstate = (AggState *) fcinfo->context;
4544  AggStatePerAgg curperagg;
4545  AggStatePerTrans curpertrans;
4546 
4547  /* check curperagg (valid when in a final function) */
4548  curperagg = aggstate->curperagg;
4549 
4550  if (curperagg)
4551  return curperagg->aggref;
4552 
4553  /* check curpertrans (valid when in a transition function) */
4554  curpertrans = aggstate->curpertrans;
4555 
4556  if (curpertrans)
4557  return curpertrans->aggref;
4558  }
4559  return NULL;
4560 }
4561 
4562 /*
4563  * AggGetTempMemoryContext - fetch short-term memory context for aggregates
4564  *
4565  * This is useful in agg final functions; the context returned is one that
4566  * the final function can safely reset as desired. This isn't useful for
4567  * transition functions, since the context returned MAY (we don't promise)
4568  * be the same as the context those are called in.
4569  *
4570  * As above, this is currently not useful for aggs called as window functions.
4571  */
4574 {
4575  if (fcinfo->context && IsA(fcinfo->context, AggState))
4576  {
4577  AggState *aggstate = (AggState *) fcinfo->context;
4578 
4579  return aggstate->tmpcontext->ecxt_per_tuple_memory;
4580  }
4581  return NULL;
4582 }
4583 
4584 /*
4585  * AggStateIsShared - find out whether transition state is shared
4586  *
4587  * If the function is being called as an aggregate support function,
4588  * return true if the aggregate's transition state is shared across
4589  * multiple aggregates, false if it is not.
4590  *
4591  * Returns true if not called as an aggregate support function.
4592  * This is intended as a conservative answer, ie "no you'd better not
4593  * scribble on your input". In particular, will return true if the
4594  * aggregate is being used as a window function, which is a scenario
4595  * in which changing the transition state is a bad idea. We might
4596  * want to refine the behavior for the window case in future.
4597  */
4598 bool
4600 {
4601  if (fcinfo->context && IsA(fcinfo->context, AggState))
4602  {
4603  AggState *aggstate = (AggState *) fcinfo->context;
4604  AggStatePerAgg curperagg;
4605  AggStatePerTrans curpertrans;
4606 
4607  /* check curperagg (valid when in a final function) */
4608  curperagg = aggstate->curperagg;
4609 
4610  if (curperagg)
4611  return aggstate->pertrans[curperagg->transno].aggshared;
4612 
4613  /* check curpertrans (valid when in a transition function) */
4614  curpertrans = aggstate->curpertrans;
4615 
4616  if (curpertrans)
4617  return curpertrans->aggshared;
4618  }
4619  return true;
4620 }
4621 
4622 /*
4623  * AggRegisterCallback - register a cleanup callback for an aggregate
4624  *
4625  * This is useful for aggs to register shutdown callbacks, which will ensure
4626  * that non-memory resources are freed. The callback will occur just before
4627  * the associated aggcontext (as returned by AggCheckCallContext) is reset,
4628  * either between groups or as a result of rescanning the query. The callback
4629  * will NOT be called on error paths. The typical use-case is for freeing of
4630  * tuplestores or tuplesorts maintained in aggcontext, or pins held by slots
4631  * created by the agg functions. (The callback will not be called until after
4632  * the result of the finalfn is no longer needed, so it's safe for the finalfn
4633  * to return data that will be freed by the callback.)
4634  *
4635  * As above, this is currently not useful for aggs called as window functions.
4636  */
4637 void
4640  Datum arg)
4641 {
4642  if (fcinfo->context && IsA(fcinfo->context, AggState))
4643  {
4644  AggState *aggstate = (AggState *) fcinfo->context;
4645  ExprContext *cxt = aggstate->curaggcontext;
4646 
4647  RegisterExprContextCallback(cxt, func, arg);
4648 
4649  return;
4650  }
4651  elog(ERROR, "aggregate function cannot register a callback in this context");
4652 }
4653 
4654 
4655 /* ----------------------------------------------------------------
4656  * Parallel Query Support
4657  * ----------------------------------------------------------------
4658  */
4659 
4660  /* ----------------------------------------------------------------
4661  * ExecAggEstimate
4662  *
4663  * Estimate space required to propagate aggregate statistics.
4664  * ----------------------------------------------------------------
4665  */
4666 void
4668 {
4669  Size size;
4670 
4671  /* don't need this if not instrumenting or no workers */
4672  if (!node->ss.ps.instrument || pcxt->nworkers == 0)
4673  return;
4674 
4675  size = mul_size(pcxt->nworkers, sizeof(AggregateInstrumentation));
4676  size = add_size(size, offsetof(SharedAggInfo, sinstrument));
4677  shm_toc_estimate_chunk(&pcxt->estimator, size);
4678  shm_toc_estimate_keys(&pcxt->estimator, 1);
4679 }
4680 
4681 /* ----------------------------------------------------------------
4682  * ExecAggInitializeDSM
4683  *
4684  * Initialize DSM space for aggregate statistics.
4685  * ----------------------------------------------------------------
4686  */
4687 void
4689 {
4690  Size size;
4691 
4692  /* don't need this if not instrumenting or no workers */
4693  if (!node->ss.ps.instrument || pcxt->nworkers == 0)
4694  return;
4695 
4696  size = offsetof(SharedAggInfo, sinstrument)
4697  + pcxt->nworkers * sizeof(AggregateInstrumentation);
4698  node->shared_info = shm_toc_allocate(pcxt->toc, size);
4699  /* ensure any unfilled slots will contain zeroes */
4700  memset(node->shared_info, 0, size);
4701  node->shared_info->num_workers = pcxt->nworkers;
4702  shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id,
4703  node->shared_info);
4704 }
4705 
4706 /* ----------------------------------------------------------------
4707  * ExecAggInitializeWorker
4708  *
4709  * Attach worker to DSM space for aggregate statistics.
4710  * ----------------------------------------------------------------
4711  */
4712 void
4714 {
4715  node->shared_info =
4716  shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true);
4717 }
4718 
4719 /* ----------------------------------------------------------------
4720  * ExecAggRetrieveInstrumentation
4721  *
4722  * Transfer aggregate statistics from DSM to private memory.
4723  * ----------------------------------------------------------------
4724  */
4725 void
4727 {
4728  Size size;
4729  SharedAggInfo *si;
4730 
4731  if (node->shared_info == NULL)
4732  return;
4733 
4734  size = offsetof(SharedAggInfo, sinstrument)
4736  si = palloc(size);
4737  memcpy(si, node->shared_info, size);
4738  node->shared_info = si;
4739 }
static void hashagg_reset_spill_state(AggState *aggstate)
Definition: nodeAgg.c:3114
LogicalTape * LogicalTapeCreate(LogicalTapeSet *lts)
Definition: logtape.c:689
List * aggdistinct
Definition: primnodes.h:332
struct AggStatePerTransData * AggStatePerTrans
Definition: execnodes.h:2270
ExprState ** eqfunctions
Definition: nodeAgg.h:278
struct HashAggSpill * hash_spills
Definition: execnodes.h:2321
AggStatePerGroup * hash_pergroup
Definition: execnodes.h:2341
int varno
Definition: primnodes.h:189
#define NIL
Definition: pg_list.h:65
static TupleTableSlot * fetch_input_tuple(AggState *aggstate)
Definition: nodeAgg.c:549
struct AggStatePerGroupData * AggStatePerGroup
Definition: execnodes.h:2271
#define ScanTupleHashTable(htable, iter)
Definition: execnodes.h:772
static void select_current_set(AggState *aggstate, int setno, bool is_hash)
Definition: nodeAgg.c:457
int numCols
Definition: plannodes.h:865
static int hash_choose_num_partitions(double input_groups, double hashentrysize, int used_bits, int *log2_npartittions)
Definition: nodeAgg.c:1973
List * qual
Definition: plannodes.h:142
bool tuplesort_getdatum(Tuplesortstate *state, bool forward, Datum *val, bool *isNull, Datum *abbrev)
Definition: tuplesort.c:2494
TupleHashTable BuildTupleHashTableExt(PlanState *parent, TupleDesc inputDesc, int numCols, AttrNumber *keyColIdx, const Oid *eqfuncoids, FmgrInfo *hashfunctions, Oid *collations, long nbuckets, Size additionalsize, MemoryContext metacxt, MemoryContext tablecxt, MemoryContext tempcxt, bool use_variable_hash_iv)
Definition: execGrouping.c:154
bool aggvariadic
Definition: primnodes.h:335
int bms_first_member(Bitmapset *a)
Definition: bitmapset.c:996
AggStatePerPhase phases
Definition: execnodes.h:2308
double hashentrysize
Definition: execnodes.h:2333
#define IsA(nodeptr, _type_)
Definition: nodes.h:588
void tuplesort_performsort(Tuplesortstate *state)
Definition: tuplesort.c:2043
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:218
#define AllocSetContextCreate
Definition: memutils.h:173
AttrNumber * hashGrpColIdxInput
Definition: nodeAgg.h:311
Datum * ecxt_aggvalues
Definition: execnodes.h:246
static void hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
Definition: nodeAgg.c:1899
TupleHashEntry LookupTupleHashEntryHash(TupleHashTable hashtable, TupleTableSlot *slot, bool *isnew, uint32 hash)
Definition: execGrouping.c:361
uint64 hash_ngroups_limit
Definition: execnodes.h:2330
#define HASHAGG_MAX_PARTITIONS
Definition: nodeAgg.c:298
TupleTableSlot * ExecStoreMinimalTuple(MinimalTuple mtup, TupleTableSlot *slot, bool shouldFree)
Definition: execTuples.c:1446
static Datum ExecEvalExprSwitchContext(ExprState *state, ExprContext *econtext, bool *isNull)
Definition: executor.h:331
Index varlevelsup
Definition: primnodes.h:196
TargetEntry * get_sortgroupclause_tle(SortGroupClause *sgClause, List *targetList)
Definition: tlist.c:356
TupleTableSlot * ExecInitExtraTupleSlot(EState *estate, TupleDesc tupledesc, const TupleTableSlotOps *tts_ops)
Definition: execTuples.c:1831
#define GETSTRUCT(TUP)
Definition: htup_details.h:654
Bitmapset * bms_copy(const Bitmapset *a)
Definition: bitmapset.c:74
Tuplesortstate * tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, bool nullsFirstFlag, int workMem, SortCoordinate coordinate, bool randomAccess)
Definition: tuplesort.c:1246
static void hash_agg_check_limits(AggState *aggstate)
Definition: nodeAgg.c:1838
static long hash_choose_num_buckets(double hashentrysize, long estimated_nbuckets, Size memory)
Definition: nodeAgg.c:1948
AttrNumber * grpColIdx
Definition: plannodes.h:866
ProjectionInfo * ps_ProjInfo
Definition: execnodes.h:1007
uint64 transitionSpace
Definition: plannodes.h:870
Instrumentation * instrument
Definition: execnodes.h:977
static void agg_fill_hash_table(AggState *aggstate)
Definition: nodeAgg.c:2522
int aggtransno
Definition: primnodes.h:341
Bitmapset * colnos_needed
Definition: execnodes.h:2303
const TupleTableSlotOps * ExecGetResultSlotOps(PlanState *planstate, bool *isfixed)
Definition: execUtils.c:499
static TupleTableSlot * ExecClearTuple(TupleTableSlot *slot)
Definition: tuptable.h:425
List * lcons_int(int datum, List *list)
Definition: list.c:486
TupleTableSlot * ExecStoreAllNullTuple(TupleTableSlot *slot)
Definition: execTuples.c:1576
int numaggs
Definition: execnodes.h:2279
Oid GetUserId(void)
Definition: miscinit.c:495
bool agg_done
Definition: execnodes.h:2297
#define castNode(_type_, nodeptr)
Definition: nodes.h:606
Oid * grpCollations
Definition: plannodes.h:868
LogicalTape ** partitions
Definition: nodeAgg.c:335
void ExecEndNode(PlanState *node)
Definition: execProcnode.c:556
#define TTS_EMPTY(slot)
Definition: tuptable.h:97
TupleTableSlot * sort_slot
Definition: execnodes.h:2311
List * all_grouped_cols
Definition: execnodes.h:2302
Tuplesortstate * sort_out
Definition: execnodes.h:2310
MinimalTuple ExecFetchSlotMinimalTuple(TupleTableSlot *slot, bool *shouldFree)
Definition: execTuples.c:1692
#define TupleDescAttr(tupdesc, i)
Definition: tupdesc.h:92
static void finalize_partialaggregate(AggState *aggstate, AggStatePerAgg peragg, AggStatePerGroup pergroupstate, Datum *resultVal, bool *resultIsNull)
Definition: nodeAgg.c:1141
ScanState ss
Definition: execnodes.h:2277
FmgrInfo equalfnOne
Definition: nodeAgg.h:110
ExprContext * ps_ExprContext
Definition: execnodes.h:1006
MinimalTuple firstTuple
Definition: execnodes.h:725
shm_toc_estimator estimator
Definition: parallel.h:42
MemoryContext ecxt_per_tuple_memory
Definition: execnodes.h:235
static void hashagg_spill_finish(AggState *aggstate, HashAggSpill *spill, int setno)
Definition: nodeAgg.c:3074
ExprState * evaltrans
Definition: nodeAgg.h:283
#define SizeForFunctionCallInfo(nargs)
Definition: fmgr.h:102
int64 input_tuples
Definition: nodeAgg.c:356
void ExecReScan(PlanState *node)
Definition: execAmi.c:78
int bms_next_member(const Bitmapset *a, int prevbit)
Definition: bitmapset.c:1043
const TupleTableSlotOps TTSOpsVirtual
Definition: execTuples.c:83
int plan_node_id
Definition: plannodes.h:140
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:109
Oid inputcollid
Definition: primnodes.h:326
int current_phase
Definition: execnodes.h:2285
static void hashagg_finish_initial_spills(AggState *aggstate)
Definition: nodeAgg.c:3040
static void slot_getsomeattrs(TupleTableSlot *slot, int attnum)
Definition: tuptable.h:341
Definition: nodes.h:537
AggSplit aggsplit
Definition: execnodes.h:2282
static TupleTableSlot * ExecAgg(PlanState *pstate)
Definition: nodeAgg.c:2140
bool * nullsFirst
Definition: plannodes.h:818
int errcode(int sqlerrcode)
Definition: elog.c:698
List * args
Definition: primnodes.h:330
#define MemSet(start, val, len)
Definition: c.h:1008
AttrNumber varattno
Definition: primnodes.h:191
static HashAggBatch * hashagg_batch_new(LogicalTape *input_tape, int setno, int64 input_tuples, double input_card, int used_bits)
Definition: nodeAgg.c:2972
char * format_type_be(Oid type_oid)
Definition: format_type.c:339
fmNodePtr context
Definition: fmgr.h:88
Datum * tts_values
Definition: tuptable.h:126
TupleTableSlot * ss_ScanTupleSlot
Definition: execnodes.h:1381
static void build_pertrans_for_aggref(AggStatePerTrans pertrans, AggState *aggstate, EState *estate, Aggref *aggref, Oid transfn_oid, Oid aggtranstype, Oid aggserialfn, Oid aggdeserialfn, Datum initValue, bool initValueIsNull, Oid *inputTypes, int numArguments)
Definition: nodeAgg.c:4026
void MemoryContextReset(MemoryContext context)
Definition: mcxt.c:143
void build_aggregate_deserialfn_expr(Oid deserialfn_oid, Expr **deserialfnexpr)
Definition: parse_agg.c:2050
static void finalize_aggregate(AggState *aggstate, AggStatePerAgg peragg, AggStatePerGroup pergroupstate, Datum *resultVal, bool *resultIsNull)
Definition: nodeAgg.c:1038
bool all_cols_needed
Definition: execnodes.h:2305
void build_aggregate_finalfn_expr(Oid *agg_input_types, int num_finalfn_inputs, Oid agg_state_type, Oid agg_result_type, Oid agg_input_collation, Oid finalfn_oid, Expr **finalfnexpr)
Definition: parse_agg.c:2074
AggregateInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER]
Definition: execnodes.h:2253
TupleTableSlot * hash_spill_rslot
Definition: execnodes.h:2323
AggStatePerTrans pertrans
Definition: execnodes.h:2287
EState * state
Definition: execnodes.h:969
int projected_set
Definition: execnodes.h:2298
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1148
void heap_freetuple(HeapTuple htup)
Definition: heaptuple.c:1338
unsigned int Oid
Definition: postgres_ext.h:31
uint32 hash_bytes_uint32(uint32 k)
Definition: hashfn.c:610
static bool ExecQual(ExprState *state, ExprContext *econtext)
Definition: executor.h:396
HeapTuple grp_firstTuple
Definition: execnodes.h:2315
#define shm_toc_estimate_chunk(e, sz)
Definition: shm_toc.h:51
Definition: primnodes.h:186
Aggref * aggref
Definition: nodeAgg.h:187
static TupleTableSlot * project_aggregates(AggState *aggstate)
Definition: nodeAgg.c:1354
static void advance_aggregates(AggState *aggstate)
Definition: nodeAgg.c:820
int current_set
Definition: execnodes.h:2300
uint32 mask
Definition: nodeAgg.c:337
#define OidIsValid(objectId)
Definition: c.h:710
void LogicalTapeClose(LogicalTape *lt)
Definition: logtape.c:742
#define DO_AGGSPLIT_COMBINE(as)
Definition: nodes.h:800
FunctionCallInfo transfn_fcinfo
Definition: nodeAgg.h:162
TupleHashEntry LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, bool *isnew, uint32 *hash)
Definition: execGrouping.c:306
void ExecAggInitializeDSM(AggState *node, ParallelContext *pcxt)
Definition: nodeAgg.c:4688
struct HashAggBatch HashAggBatch
void ExecFreeExprContext(PlanState *planstate)
Definition: execUtils.c:650
Datum ExecAggTransReparent(AggState *aggstate, AggStatePerTrans pertrans, Datum newValue, bool newValueIsNull, Datum oldValue, bool oldValueIsNull)
int numtrans
Definition: execnodes.h:2280
void ExecForceStoreHeapTuple(HeapTuple tuple, TupleTableSlot *slot, bool shouldFree)
Definition: execTuples.c:1469
static void hash_agg_enter_spill_mode(AggState *aggstate)
Definition: nodeAgg.c:1864
TupleDesc sortdesc
Definition: nodeAgg.h:138
Oid * sortOperators
Definition: plannodes.h:816
void execTuplesHashPrepare(int numCols, const Oid *eqOperators, Oid **eqFuncOids, FmgrInfo **hashFunctions)
Definition: execGrouping.c:96
ExprState * ExecInitQual(List *qual, PlanState *parent)
Definition: execExpr.c:209
void ResetTupleHashTable(TupleHashTable hashtable)
Definition: execGrouping.c:285
ExprContext * tmpcontext
Definition: execnodes.h:2290
FmgrInfo transfn
Definition: nodeAgg.h:81
#define HASHAGG_PARTITION_FACTOR
Definition: nodeAgg.c:296
static void build_hash_table(AggState *aggstate, int setno, long nbuckets)
Definition: nodeAgg.c:1486
int max_colno_needed
Definition: execnodes.h:2304
static void prepare_projection_slot(AggState *aggstate, TupleTableSlot *slot, int currentSet)
Definition: nodeAgg.c:1246
bool hash_spill_mode
Definition: execnodes.h:2327
#define FUNC_MAX_ARGS
List * hash_batches
Definition: execnodes.h:2325
Aggref * aggref
Definition: nodeAgg.h:44
Tuplesortstate * tuplesort_begin_heap(TupleDesc tupDesc, int nkeys, AttrNumber *attNums, Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags, int workMem, SortCoordinate coordinate, bool randomAccess)
Definition: tuplesort.c:896
#define linitial_int(l)
Definition: pg_list.h:175
Bitmapset ** grouped_cols
Definition: nodeAgg.h:277
PlanState ps
Definition: execnodes.h:1378
int maxsets
Definition: execnodes.h:2307
static bool agg_refill_hash_table(AggState *aggstate)
Definition: nodeAgg.c:2576
static bool find_cols_walker(Node *node, FindColsContext *context)
Definition: nodeAgg.c:1403
Size hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace)
Definition: nodeAgg.c:1676
void aclcheck_error(AclResult aclerr, ObjectType objtype, const char *objectname)
Definition: aclchk.c:3308
void initHyperLogLog(hyperLogLogState *cState, uint8 bwidth)
Definition: hyperloglog.c:66
#define DO_AGGSPLIT_SERIALIZE(as)
Definition: nodes.h:802
#define HASHAGG_MIN_PARTITIONS
Definition: nodeAgg.c:297
void pfree(void *pointer)
Definition: mcxt.c:1169
MemoryContext es_query_cxt
Definition: execnodes.h:601
AggStrategy aggstrategy
Definition: plannodes.h:863
AggState * ExecInitAgg(Agg *node, EState *estate, int eflags)
Definition: nodeAgg.c:3162
#define linitial(l)
Definition: pg_list.h:174
bool table_filled
Definition: execnodes.h:2317
AggStrategy aggstrategy
Definition: execnodes.h:2281
#define HASHAGG_HLL_BIT_WIDTH
Definition: nodeAgg.c:315
static TupleTableSlot * agg_retrieve_hash_table(AggState *aggstate)
Definition: nodeAgg.c:2727
static void find_cols(AggState *aggstate, Bitmapset **aggregated, Bitmapset **unaggregated)
Definition: nodeAgg.c:1380
#define ObjectIdGetDatum(X)
Definition: postgres.h:551
#define ERROR
Definition: elog.h:46
bool fn_strict
Definition: fmgr.h:61
#define lfirst_int(lc)
Definition: pg_list.h:170
static void * list_nth(const List *list, int n)
Definition: pg_list.h:278
char * get_func_name(Oid funcid)
Definition: lsyscache.c:1579
MemoryContext hash_metacxt
Definition: execnodes.h:2319
void LogicalTapeWrite(LogicalTape *lt, void *ptr, size_t size)
Definition: logtape.c:770
struct LogicalTapeSet * hash_tapeset
Definition: execnodes.h:2320
NullableDatum args[FLEXIBLE_ARRAY_MEMBER]
Definition: fmgr.h:95
void fmgr_info(Oid functionId, FmgrInfo *finfo)
Definition: fmgr.c:126
static TupleTableSlot * agg_retrieve_direct(AggState *aggstate)
Definition: nodeAgg.c:2176
#define AGG_CONTEXT_AGGREGATE
Definition: fmgr.h:738
struct TupleHashEntryData TupleHashEntryData
static void slot_getallattrs(TupleTableSlot *slot)
Definition: tuptable.h:354
static void find_hash_columns(AggState *aggstate)
Definition: nodeAgg.c:1546
ExprState * equalfnMulti
Definition: nodeAgg.h:111
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:195
Tuplesortstate * sort_in
Definition: execnodes.h:2309
#define EXEC_FLAG_BACKWARD
Definition: executor.h:58
#define outerPlanState(node)
Definition: execnodes.h:1063
bool tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy, TupleTableSlot *slot, Datum *abbrev)
Definition: tuplesort.c:2408
size_t LogicalTapeRead(LogicalTape *lt, void *ptr, size_t size)
Definition: logtape.c:937
int bms_num_members(const Bitmapset *a)
Definition: bitmapset.c:646
static void finalize_aggregates(AggState *aggstate, AggStatePerAgg peragg, AggStatePerGroup pergroup)
Definition: nodeAgg.c:1291
bool AggStateIsShared(FunctionCallInfo fcinfo)
Definition: nodeAgg.c:4599
#define list_nth_node(type, list, n)
Definition: pg_list.h:306
Tuplesortstate ** sortstates
Definition: nodeAgg.h:154
#define FunctionCallInvoke(fcinfo)
Definition: fmgr.h:172
Bitmapset * aggParams
Definition: plannodes.h:871
static int initValue(long lng_val)
Definition: informix.c:677
MemoryContext tablecxt
Definition: execnodes.h:747
void ExecAssignProjectionInfo(PlanState *planstate, TupleDesc inputDesc)
Definition: execUtils.c:535
bool * tts_isnull
Definition: tuptable.h:128
int npartitions
Definition: nodeAgg.c:334
static Datum ExecEvalExpr(ExprState *state, ExprContext *econtext, bool *isNull)
Definition: executor.h:316
MinimalTupleData * MinimalTuple
Definition: htup.h:27
static void process_ordered_aggregate_multi(AggState *aggstate, AggStatePerTrans pertrans, AggStatePerGroup pergroupstate)
Definition: nodeAgg.c:944
List * aggorder
Definition: primnodes.h:331
void ExecAggEstimate(AggState *node, ParallelContext *pcxt)
Definition: nodeAgg.c:4667
int errcode_for_file_access(void)
Definition: elog.c:721
#define fmgr_info_set_expr(expr, finfo)
Definition: fmgr.h:135
AttrNumber resno
Definition: primnodes.h:1456
#define DatumGetBool(X)
Definition: postgres.h:437
int ParallelWorkerNumber
Definition: parallel.c:112
static Size hashagg_spill_tuple(AggState *aggstate, HashAggSpill *spill, TupleTableSlot *slot, uint32 hash)
Definition: nodeAgg.c:2906
TupleTableSlot * ecxt_innertuple
Definition: execnodes.h:229
List * ExecInitExprList(List *nodes, PlanState *parent)
Definition: execExpr.c:318
#define MakeExpandedObjectReadOnly(d, isnull, typlen)
Index agglevelsup
Definition: primnodes.h:338
int used_bits
Definition: nodeAgg.c:354
struct AggregateInstrumentation AggregateInstrumentation
Bitmapset * unaggregated
Definition: nodeAgg.c:365
#define TupIsNull(slot)
Definition: tuptable.h:292
FormData_pg_attribute * Form_pg_attribute
Definition: pg_attribute.h:207
unsigned int uint32
Definition: c.h:441
List * aggdirectargs
Definition: primnodes.h:329
static Datum GetAggInitVal(Datum textInitVal, Oid transtype)
Definition: nodeAgg.c:4262
AggStatePerAgg curperagg
Definition: execnodes.h:2293
AttrNumber * sortColIdx
Definition: nodeAgg.h:100
struct AggStatePerGroupData AggStatePerGroupData
AggStatePerHash perhash
Definition: execnodes.h:2340
bool outeropsset
Definition: execnodes.h:1050
MemoryContext CurrentMemoryContext
Definition: mcxt.c:42
static void initialize_aggregates(AggState *aggstate, AggStatePerGroup *pergroups, int numReset)
Definition: nodeAgg.c:667
void LogicalTapeRewindForRead(LogicalTape *lt, size_t buffer_size)
Definition: logtape.c:855
AggStrategy aggstrategy
Definition: nodeAgg.h:274
ExprState * evaltrans_cache[2][2]
Definition: nodeAgg.h:291
#define InstrCountFiltered1(node, delta)
Definition: execnodes.h:1071
#define EXEC_FLAG_REWIND
Definition: executor.h:57
hyperLogLogState * hll_card
Definition: nodeAgg.c:339
void getTypeInputInfo(Oid type, Oid *typInput, Oid *typIOParam)
Definition: lsyscache.c:2821
Datum value
Definition: postgres.h:422
static void hashagg_spill_init(HashAggSpill *spill, LogicalTapeSet *lts, int used_bits, double input_groups, double hashentrysize)
Definition: nodeAgg.c:2875
Bitmapset * grouped_cols
Definition: execnodes.h:2301
#define IsParallelWorker()
Definition: parallel.h:61
Datum datumCopy(Datum value, bool typByVal, int typLen)
Definition: datum.c:131
TupleTableSlot * ExecAllocTableSlot(List **tupleTable, TupleDesc desc, const TupleTableSlotOps *tts_ops)
Definition: execTuples.c:1171
void ExecAggRetrieveInstrumentation(AggState *node)
Definition: nodeAgg.c:4726
int hash_batches_used
Definition: execnodes.h:2338
MemoryContext AggGetTempMemoryContext(FunctionCallInfo fcinfo)
Definition: nodeAgg.c:4573
Bitmapset * chgParam
Definition: execnodes.h:999
#define InvokeFunctionExecuteHook(objectId)
Definition: objectaccess.h:191
bool IsBinaryCoercible(Oid srctype, Oid targettype)
int my_log2(long num)
Definition: dynahash.c:1765
double input_card
Definition: nodeAgg.c:357
#define outerPlan(node)
Definition: plannodes.h:171
List * lappend(List *list, void *datum)
Definition: list.c:336
Bitmapset * aggregated
Definition: nodeAgg.c:364
TupleHashIterator hashiter
Definition: nodeAgg.h:304
int numCols
Definition: plannodes.h:814
static void initialize_aggregate(AggState *aggstate, AggStatePerTrans pertrans, AggStatePerGroup pergroupstate)
Definition: nodeAgg.c:580
int num_hashes
Definition: execnodes.h:2318
Plan plan
Definition: plannodes.h:862
AttrNumber * hashGrpColIdxHash
Definition: nodeAgg.h:312
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Definition: syscache.c:1127
bool input_done
Definition: execnodes.h:2296
#define SizeofMinimalTupleHeader
Definition: htup_details.h:648
TupleDesc tts_tupleDescriptor
Definition: tuptable.h:124
ExprContext * curaggcontext
Definition: execnodes.h:2292
ExprContext * hashcontext
Definition: execnodes.h:2288
bool * ecxt_aggnulls
Definition: execnodes.h:248
static bool ExecQualAndReset(ExprState *state, ExprContext *econtext)
Definition: executor.h:423
Size mul_size(Size s1, Size s2)
Definition: shmem.c:519
#define TextDatumGetCString(d)
Definition: builtins.h:87
List * es_tupleTable
Definition: execnodes.h:603
#define HASHAGG_READ_BUFFER_SIZE
Definition: nodeAgg.c:306
AggStatePerPhase phase
Definition: execnodes.h:2283
void * palloc0(Size size)
Definition: mcxt.c:1093
ExecProcNodeMtd ExecProcNode
Definition: execnodes.h:973
AclResult
Definition: acl.h:177
uintptr_t Datum
Definition: postgres.h:411
void ReleaseSysCache(HeapTuple tuple)
Definition: syscache.c:1175
struct FunctionCallInfoBaseData * FunctionCallInfo
Definition: fmgr.h:38
Size add_size(Size s1, Size s2)
Definition: shmem.c:502
static TupleTableSlot * ExecProcNode(PlanState *node)
Definition: executor.h:252
Datum SysCacheGetAttr(int cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull)
Definition: syscache.c:1388
FmgrInfo deserialfn
Definition: nodeAgg.h:87
int work_mem
Definition: globals.c:124
List * groupingSets
Definition: plannodes.h:873
int16 resulttypeLen
Definition: nodeAgg.h:216
static void initialize_phase(AggState *aggstate, int newphase)
Definition: nodeAgg.c:479
double estimateHyperLogLog(hyperLogLogState *cState)
Definition: hyperloglog.c:186
struct FindColsContext FindColsContext
FormData_pg_proc * Form_pg_proc
Definition: pg_proc.h:136
Plan * plan
Definition: execnodes.h:967
#define InvalidOid
Definition: postgres_ext.h:36
RegProcedure get_opcode(Oid opno)
Definition: lsyscache.c:1256
Oid aggfnoid
Definition: primnodes.h:323
int16 attnum
Definition: pg_attribute.h:83
#define ResetTupleHashIterator(htable, iter)
Definition: execnodes.h:770
#define ereport(elevel,...)
Definition: elog.h:157
static HeapTuple ExecCopySlotHeapTuple(TupleTableSlot *slot)
Definition: tuptable.h:452
static void advance_transition_function(AggState *aggstate, AggStatePerTrans pertrans, AggStatePerGroup pergroupstate)
Definition: nodeAgg.c:708
#define LOCAL_FCINFO(name, nargs)
Definition: fmgr.h:110
static void hashagg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck)
Definition: nodeAgg.c:1723
List * lcons(void *datum, List *list)
Definition: list.c:468
static void prepare_hash_slot(AggStatePerHash perhash, TupleTableSlot *inputslot, TupleTableSlot *hashslot)
Definition: nodeAgg.c:1201
int aggno
Definition: primnodes.h:340
uint64 hash_disk_used
Definition: execnodes.h:2337
Size MemoryContextMemAllocated(MemoryContext context, bool recurse)
Definition: mcxt.c:477
void bms_free(Bitmapset *a)
Definition: bitmapset.c:208
#define Max(x, y)
Definition: c.h:980
ExprContext ** aggcontexts
Definition: execnodes.h:2289
#define makeNode(_type_)
Definition: nodes.h:585
TupleTableSlot * ecxt_outertuple
Definition: execnodes.h:231
int plan_width
Definition: plannodes.h:124
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
FmgrInfo * hashfunctions
Definition: nodeAgg.h:306
#define Assert(condition)
Definition: c.h:804
#define lfirst(lc)
Definition: pg_list.h:169
void RegisterExprContextCallback(ExprContext *econtext, ExprContextCallbackFunction function, Datum arg)
Definition: execUtils.c:925
FmgrInfo serialfn
Definition: nodeAgg.h:84
ExprState * execTuplesMatchPrepare(TupleDesc desc, int numCols, const AttrNumber *keyColIdx, const Oid *eqOperators, const Oid *collations, PlanState *parent)
Definition: execGrouping.c:59
FunctionCallInfo deserialfn_fcinfo
Definition: nodeAgg.h:167
#define EXEC_FLAG_MARK
Definition: executor.h:59
AggSplit aggsplit
Definition: plannodes.h:864
struct AggStatePerAggData * AggStatePerAgg
Definition: execnodes.h:2269
void ExecReScanAgg(AggState *node)
Definition: nodeAgg.c:4348
void build_aggregate_serialfn_expr(Oid serialfn_oid, Expr **serialfnexpr)
Definition: parse_agg.c:2027
FormData_pg_aggregate * Form_pg_aggregate
Definition: pg_aggregate.h:109
Expr * expr
Definition: primnodes.h:1455
AggSplit aggsplit
Definition: primnodes.h:339
bool MemoryContextContains(MemoryContext context, void *pointer)
Definition: mcxt.c:758
void(* ExprContextCallbackFunction)(Datum arg)
Definition: execnodes.h:189
void build_aggregate_transfn_expr(Oid *agg_input_types, int agg_num_inputs, int agg_num_direct_inputs, bool agg_variadic, Oid agg_state_type, Oid agg_input_collation, Oid transfn_oid, Oid invtransfn_oid, Expr **transfnexpr, Expr **invtransfnexpr)
Definition: parse_agg.c:1966
bool hash_ever_spilled
Definition: execnodes.h:2326
AggStatePerGroup * pergroups
Definition: execnodes.h:2313
void freeHyperLogLog(hyperLogLogState *cState)
Definition: hyperloglog.c:151
size_t Size
Definition: c.h:540
Bitmapset * bms_union(const Bitmapset *a, const Bitmapset *b)
Definition: bitmapset.c:225
void ExecAssignExprContext(EState *estate, PlanState *planstate)
Definition: execUtils.c:480
#define AGG_CONTEXT_WINDOW
Definition: fmgr.h:739
#define InitFunctionCallInfoData(Fcinfo, Flinfo, Nargs, Collation, Context, Resultinfo)
Definition: fmgr.h:150
FunctionCallInfo serialfn_fcinfo
Definition: nodeAgg.h:165
#define shm_toc_estimate_keys(e, cnt)
Definition: shm_toc.h:53
bool expression_tree_walker(Node *node, bool(*walker)(), void *context)
Definition: nodeFuncs.c:1904
static int list_length(const List *l)
Definition: pg_list.h:149
long numGroups
Definition: plannodes.h:869
Oid exprCollation(const Node *expr)
Definition: nodeFuncs.c:759
#define DO_AGGSPLIT_SKIPFINAL(as)
Definition: nodes.h:801
void get_typlenbyval(Oid typid, int16 *typlen, bool *typbyval)
Definition: lsyscache.c:2198
void addHyperLogLog(hyperLogLogState *cState, uint32 hash)
Definition: hyperloglog.c:167
Expr * aggfilter
Definition: primnodes.h:333
int AggCheckCallContext(FunctionCallInfo fcinfo, MemoryContext *aggcontext)
Definition: nodeAgg.c:4495
TupleDesc ExecTypeFromTL(List *targetList)
Definition: execTuples.c:1938
size_t get_hash_memory_limit(void)
Definition: nodeHash.c:3401
#define MAXALIGN(LEN)
Definition: c.h:757
void ExecInitResultTupleSlotTL(PlanState *planstate, const TupleTableSlotOps *tts_ops)
Definition: execTuples.c:1799
void ReScanExprContext(ExprContext *econtext)
Definition: execUtils.c:438
static TupleTableSlot * agg_retrieve_hash_table_in_memory(AggState *aggstate)
Definition: nodeAgg.c:2752
bool outeropsfixed
Definition: execnodes.h:1046
void * shm_toc_allocate(shm_toc *toc, Size nbytes)
Definition: shm_toc.c:88
Bitmapset * bms_add_member(Bitmapset *a, int x)
Definition: bitmapset.c:736
#define DO_AGGSPLIT_DESERIALIZE(as)
Definition: nodes.h:803
Size hash_mem_limit
Definition: execnodes.h:2329
struct Plan * lefttree
Definition: plannodes.h:143
TupleTableSlot * uniqslot
Definition: nodeAgg.h:137
int numphases
Definition: execnodes.h:2284
TupleDesc ExecGetResultType(PlanState *planstate)
Definition: execUtils.c:490
LogicalTapeSet * LogicalTapeSetCreate(bool preallocate, SharedFileSet *fileset, int worker)
Definition: logtape.c:565
List * targetlist
Definition: plannodes.h:141
ExprState * qual
Definition: execnodes.h:988
void ExecAggInitializeWorker(AggState *node, ParallelWorkerContext *pwcxt)
Definition: nodeAgg.c:4713
#define DatumGetPointer(X)
Definition: postgres.h:593
AttrNumber * sortColIdx
Definition: plannodes.h:815
#define CHUNKHDRSZ
Definition: nodeAgg.c:321
#define HASHAGG_WRITE_BUFFER_SIZE
Definition: nodeAgg.c:307
bool bms_overlap(const Bitmapset *a, const Bitmapset *b)
Definition: bitmapset.c:494
void AggRegisterCallback(FunctionCallInfo fcinfo, ExprContextCallbackFunction func, Datum arg)
Definition: nodeAgg.c:4638
void hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits, Size *mem_limit, uint64 *ngroups_limit, int *num_partitions)
Definition: nodeAgg.c:1780
Size hash_mem_peak
Definition: execnodes.h:2334
void shm_toc_insert(shm_toc *toc, uint64 key, void *address)
Definition: shm_toc.c:171
Oid * grpOperators
Definition: plannodes.h:867
void * palloc(Size size)
Definition: mcxt.c:1062
int errmsg(const char *fmt,...)
Definition: elog.c:909
List * chain
Definition: plannodes.h:874
AggStatePerAgg peragg
Definition: execnodes.h:2286
static MinimalTuple hashagg_batch_read(HashAggBatch *batch, uint32 *hashp)
Definition: nodeAgg.c:2991
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:863
#define ACL_EXECUTE
Definition: parsenodes.h:89
void list_free(List *list)
Definition: list.c:1391
SharedAggInfo * shared_info
Definition: execnodes.h:2349
#define elog(elevel,...)
Definition: elog.h:232
AclResult pg_proc_aclcheck(Oid proc_oid, Oid roleid, AclMode mode)
Definition: aclchk.c:4718
int i
List * aggdirectargs
Definition: nodeAgg.h:210
Oid aggtranstype
Definition: primnodes.h:327
uint64 hash_ngroups_current
Definition: execnodes.h:2335
void LogicalTapeSetClose(LogicalTapeSet *lts)
Definition: logtape.c:676
void * arg
AggStatePerTrans curpertrans
Definition: execnodes.h:2295
Oid aggtype
Definition: primnodes.h:324
static void process_ordered_aggregate_single(AggState *aggstate, AggStatePerTrans pertrans, AggStatePerGroup pergroupstate)
Definition: nodeAgg.c:852
static void initialize_hash_entry(AggState *aggstate, TupleHashTable hashtable, TupleHashEntry entry)
Definition: nodeAgg.c:2027
bool resulttypeByVal
Definition: nodeAgg.h:217
char aggkind
Definition: primnodes.h:337
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:120
Definition: plannodes.h:860
ExprContext * CreateWorkExprContext(EState *estate)
Definition: execUtils.c:316
List * aggs
Definition: execnodes.h:2278
TupleTableSlot * sortslot
Definition: nodeAgg.h:136
void ExecCreateScanSlotFromOuterPlan(EState *estate, ScanState *scanstate, const TupleTableSlotOps *tts_ops)
Definition: execUtils.c:682
long LogicalTapeSetBlocks(LogicalTapeSet *lts)
Definition: logtape.c:1192
void tuplesort_end(Tuplesortstate *state)
Definition: tuplesort.c:1467
TupleTableSlot * hashslot
Definition: nodeAgg.h:305
Oid * collations
Definition: plannodes.h:817
int get_aggregate_argtypes(Aggref *aggref, Oid *inputTypes)
Definition: parse_agg.c:1880
PlanState * ExecInitNode(Plan *node, EState *estate, int eflags)
Definition: execProcnode.c:141
Bitmapset * bms_del_member(Bitmapset *a, int x)
Definition: bitmapset.c:773
bool is_aggref
Definition: nodeAgg.c:363
Definition: pg_list.h:50
static unsigned hash(unsigned *uv, int n)
Definition: rege_dfa.c:715
bool bms_is_member(int x, const Bitmapset *a)
Definition: bitmapset.c:427
Datum OidInputFunctionCall(Oid functionId, char *str, Oid typioparam, int32 typmod)
Definition: fmgr.c:1644
TupleHashTable hashtable
Definition: nodeAgg.h:303
#define EXEC_FLAG_EXPLAIN_ONLY
Definition: executor.h:56
int16 AttrNumber
Definition: attnum.h:21
void ExecEndAgg(AggState *node)
Definition: nodeAgg.c:4278