PostgreSQL Source Code  git master
nodeAgg.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * nodeAgg.c
4  * Routines to handle aggregate nodes.
5  *
6  * ExecAgg normally evaluates each aggregate in the following steps:
7  *
8  * transvalue = initcond
9  * foreach input_tuple do
10  * transvalue = transfunc(transvalue, input_value(s))
11  * result = finalfunc(transvalue, direct_argument(s))
12  *
13  * If a finalfunc is not supplied then the result is just the ending
14  * value of transvalue.
15  *
16  * Other behaviors can be selected by the "aggsplit" mode, which exists
17  * to support partial aggregation. It is possible to:
18  * * Skip running the finalfunc, so that the output is always the
19  * final transvalue state.
20  * * Substitute the combinefunc for the transfunc, so that transvalue
21  * states (propagated up from a child partial-aggregation step) are merged
22  * rather than processing raw input rows. (The statements below about
23  * the transfunc apply equally to the combinefunc, when it's selected.)
24  * * Apply the serializefunc to the output values (this only makes sense
25  * when skipping the finalfunc, since the serializefunc works on the
26  * transvalue data type).
27  * * Apply the deserializefunc to the input values (this only makes sense
28  * when using the combinefunc, for similar reasons).
29  * It is the planner's responsibility to connect up Agg nodes using these
30  * alternate behaviors in a way that makes sense, with partial aggregation
31  * results being fed to nodes that expect them.
32  *
33  * If a normal aggregate call specifies DISTINCT or ORDER BY, we sort the
34  * input tuples and eliminate duplicates (if required) before performing
35  * the above-depicted process. (However, we don't do that for ordered-set
36  * aggregates; their "ORDER BY" inputs are ordinary aggregate arguments
37  * so far as this module is concerned.) Note that partial aggregation
38  * is not supported in these cases, since we couldn't ensure global
39  * ordering or distinctness of the inputs.
40  *
41  * If transfunc is marked "strict" in pg_proc and initcond is NULL,
42  * then the first non-NULL input_value is assigned directly to transvalue,
43  * and transfunc isn't applied until the second non-NULL input_value.
44  * The agg's first input type and transtype must be the same in this case!
45  *
46  * If transfunc is marked "strict" then NULL input_values are skipped,
47  * keeping the previous transvalue. If transfunc is not strict then it
48  * is called for every input tuple and must deal with NULL initcond
49  * or NULL input_values for itself.
50  *
51  * If finalfunc is marked "strict" then it is not called when the
52  * ending transvalue is NULL, instead a NULL result is created
53  * automatically (this is just the usual handling of strict functions,
54  * of course). A non-strict finalfunc can make its own choice of
55  * what to return for a NULL ending transvalue.
56  *
57  * Ordered-set aggregates are treated specially in one other way: we
58  * evaluate any "direct" arguments and pass them to the finalfunc along
59  * with the transition value.
60  *
61  * A finalfunc can have additional arguments beyond the transvalue and
62  * any "direct" arguments, corresponding to the input arguments of the
63  * aggregate. These are always just passed as NULL. Such arguments may be
64  * needed to allow resolution of a polymorphic aggregate's result type.
65  *
66  * We compute aggregate input expressions and run the transition functions
67  * in a temporary econtext (aggstate->tmpcontext). This is reset at least
68  * once per input tuple, so when the transvalue datatype is
69  * pass-by-reference, we have to be careful to copy it into a longer-lived
70  * memory context, and free the prior value to avoid memory leakage. We
71  * store transvalues in another set of econtexts, aggstate->aggcontexts
72  * (one per grouping set, see below), which are also used for the hashtable
73  * structures in AGG_HASHED mode. These econtexts are rescanned, not just
74  * reset, at group boundaries so that aggregate transition functions can
75  * register shutdown callbacks via AggRegisterCallback.
76  *
77  * The node's regular econtext (aggstate->ss.ps.ps_ExprContext) is used to
78  * run finalize functions and compute the output tuple; this context can be
79  * reset once per output tuple.
80  *
81  * The executor's AggState node is passed as the fmgr "context" value in
82  * all transfunc and finalfunc calls. It is not recommended that the
83  * transition functions look at the AggState node directly, but they can
84  * use AggCheckCallContext() to verify that they are being called by
85  * nodeAgg.c (and not as ordinary SQL functions). The main reason a
86  * transition function might want to know this is so that it can avoid
87  * palloc'ing a fixed-size pass-by-ref transition value on every call:
88  * it can instead just scribble on and return its left input. Ordinarily
89  * it is completely forbidden for functions to modify pass-by-ref inputs,
90  * but in the aggregate case we know the left input is either the initial
91  * transition value or a previous function result, and in either case its
92  * value need not be preserved. See int8inc() for an example. Notice that
93  * the EEOP_AGG_PLAIN_TRANS step is coded to avoid a data copy step when
94  * the previous transition value pointer is returned. It is also possible
95  * to avoid repeated data copying when the transition value is an expanded
96  * object: to do that, the transition function must take care to return
97  * an expanded object that is in a child context of the memory context
98  * returned by AggCheckCallContext(). Also, some transition functions want
99  * to store working state in addition to the nominal transition value; they
100  * can use the memory context returned by AggCheckCallContext() to do that.
101  *
102  * Note: AggCheckCallContext() is available as of PostgreSQL 9.0. The
103  * AggState is available as context in earlier releases (back to 8.1),
104  * but direct examination of the node is needed to use it before 9.0.
105  *
106  * As of 9.4, aggregate transition functions can also use AggGetAggref()
107  * to get hold of the Aggref expression node for their aggregate call.
108  * This is mainly intended for ordered-set aggregates, which are not
109  * supported as window functions. (A regular aggregate function would
110  * need some fallback logic to use this, since there's no Aggref node
111  * for a window function.)
112  *
113  * Grouping sets:
114  *
115  * A list of grouping sets which is structurally equivalent to a ROLLUP
116  * clause (e.g. (a,b,c), (a,b), (a)) can be processed in a single pass over
117  * ordered data. We do this by keeping a separate set of transition values
118  * for each grouping set being concurrently processed; for each input tuple
119  * we update them all, and on group boundaries we reset those states
120  * (starting at the front of the list) whose grouping values have changed
121  * (the list of grouping sets is ordered from most specific to least
122  * specific).
123  *
124  * Where more complex grouping sets are used, we break them down into
125  * "phases", where each phase has a different sort order (except phase 0
126  * which is reserved for hashing). During each phase but the last, the
127  * input tuples are additionally stored in a tuplesort which is keyed to the
128  * next phase's sort order; during each phase but the first, the input
129  * tuples are drawn from the previously sorted data. (The sorting of the
130  * data for the first phase is handled by the planner, as it might be
131  * satisfied by underlying nodes.)
132  *
133  * Hashing can be mixed with sorted grouping. To do this, we have an
134  * AGG_MIXED strategy that populates the hashtables during the first sorted
135  * phase, and switches to reading them out after completing all sort phases.
136  * We can also support AGG_HASHED with multiple hash tables and no sorting
137  * at all.
138  *
139  * From the perspective of aggregate transition and final functions, the
140  * only issue regarding grouping sets is this: a single call site (flinfo)
141  * of an aggregate function may be used for updating several different
142  * transition values in turn. So the function must not cache in the flinfo
143  * anything which logically belongs as part of the transition value (most
144  * importantly, the memory context in which the transition value exists).
145  * The support API functions (AggCheckCallContext, AggRegisterCallback) are
146  * sensitive to the grouping set for which the aggregate function is
147  * currently being called.
148  *
149  * Plan structure:
150  *
151  * What we get from the planner is actually one "real" Agg node which is
152  * part of the plan tree proper, but which optionally has an additional list
153  * of Agg nodes hung off the side via the "chain" field. This is because an
154  * Agg node happens to be a convenient representation of all the data we
155  * need for grouping sets.
156  *
157  * For many purposes, we treat the "real" node as if it were just the first
158  * node in the chain. The chain must be ordered such that hashed entries
159  * come before sorted/plain entries; the real node is marked AGG_MIXED if
160  * there are both types present (in which case the real node describes one
161  * of the hashed groupings, other AGG_HASHED nodes may optionally follow in
162  * the chain, followed in turn by AGG_SORTED or (one) AGG_PLAIN node). If
163  * the real node is marked AGG_HASHED or AGG_SORTED, then all the chained
164  * nodes must be of the same type; if it is AGG_PLAIN, there can be no
165  * chained nodes.
166  *
167  * We collect all hashed nodes into a single "phase", numbered 0, and create
168  * a sorted phase (numbered 1..n) for each AGG_SORTED or AGG_PLAIN node.
169  * Phase 0 is allocated even if there are no hashes, but remains unused in
170  * that case.
171  *
172  * AGG_HASHED nodes actually refer to only a single grouping set each,
173  * because for each hashed grouping we need a separate grpColIdx and
174  * numGroups estimate. AGG_SORTED nodes represent a "rollup", a list of
175  * grouping sets that share a sort order. Each AGG_SORTED node other than
176  * the first one has an associated Sort node which describes the sort order
177  * to be used; the first sorted node takes its input from the outer subtree,
178  * which the planner has already arranged to provide ordered data.
179  *
180  * Memory and ExprContext usage:
181  *
182  * Because we're accumulating aggregate values across input rows, we need to
183  * use more memory contexts than just simple input/output tuple contexts.
184  * In fact, for a rollup, we need a separate context for each grouping set
185  * so that we can reset the inner (finer-grained) aggregates on their group
186  * boundaries while continuing to accumulate values for outer
187  * (coarser-grained) groupings. On top of this, we might be simultaneously
188  * populating hashtables; however, we only need one context for all the
189  * hashtables.
190  *
191  * So we create an array, aggcontexts, with an ExprContext for each grouping
192  * set in the largest rollup that we're going to process, and use the
193  * per-tuple memory context of those ExprContexts to store the aggregate
194  * transition values. hashcontext is the single context created to support
195  * all hash tables.
196  *
197  * Spilling To Disk
198  *
199  * When performing hash aggregation, if the hash table memory exceeds the
200  * limit (see hash_agg_check_limits()), we enter "spill mode". In spill
201  * mode, we advance the transition states only for groups already in the
202  * hash table. For tuples that would need to create new hash table
203  * entries (and initialize new transition states), we instead spill them to
204  * disk to be processed later. The tuples are spilled in a partitioned
205  * manner, so that subsequent batches are smaller and less likely to exceed
206  * hash_mem (if a batch does exceed hash_mem, it must be spilled
207  * recursively).
208  *
209  * Spilled data is written to logical tapes. These provide better control
210  * over memory usage, disk space, and the number of files than if we were
211  * to use a BufFile for each spill.
212  *
213  * Note that it's possible for transition states to start small but then
214  * grow very large; for instance in the case of ARRAY_AGG. In such cases,
215  * it's still possible to significantly exceed hash_mem. We try to avoid
216  * this situation by estimating what will fit in the available memory, and
217  * imposing a limit on the number of groups separately from the amount of
218  * memory consumed.
219  *
220  * Transition / Combine function invocation:
221  *
222  * For performance reasons transition functions, including combine
223  * functions, aren't invoked one-by-one from nodeAgg.c after computing
224  * arguments using the expression evaluation engine. Instead
225  * ExecBuildAggTrans() builds one large expression that does both argument
226  * evaluation and transition function invocation. That avoids performance
227  * issues due to repeated uses of expression evaluation, complications due
228  * to filter expressions having to be evaluated early, and allows JITing
229  * the entire expression into one native function.
230  *
231  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
232  * Portions Copyright (c) 1994, Regents of the University of California
233  *
234  * IDENTIFICATION
235  * src/backend/executor/nodeAgg.c
236  *
237  *-------------------------------------------------------------------------
238  */
239 
240 #include "postgres.h"
241 
242 #include "access/htup_details.h"
243 #include "access/parallel.h"
244 #include "catalog/objectaccess.h"
245 #include "catalog/pg_aggregate.h"
246 #include "catalog/pg_proc.h"
247 #include "catalog/pg_type.h"
248 #include "common/hashfn.h"
249 #include "executor/execExpr.h"
250 #include "executor/executor.h"
251 #include "executor/nodeAgg.h"
252 #include "lib/hyperloglog.h"
253 #include "miscadmin.h"
254 #include "nodes/makefuncs.h"
255 #include "nodes/nodeFuncs.h"
256 #include "optimizer/optimizer.h"
257 #include "parser/parse_agg.h"
258 #include "parser/parse_coerce.h"
259 #include "utils/acl.h"
260 #include "utils/builtins.h"
261 #include "utils/datum.h"
262 #include "utils/dynahash.h"
263 #include "utils/expandeddatum.h"
264 #include "utils/logtape.h"
265 #include "utils/lsyscache.h"
266 #include "utils/memutils.h"
267 #include "utils/syscache.h"
268 #include "utils/tuplesort.h"
269 
270 /*
271  * Control how many partitions are created when spilling HashAgg to
272  * disk.
273  *
274  * HASHAGG_PARTITION_FACTOR is multiplied by the estimated number of
275  * partitions needed such that each partition will fit in memory. The factor
276  * is set higher than one because there's not a high cost to having a few too
277  * many partitions, and it makes it less likely that a partition will need to
278  * be spilled recursively. Another benefit of having more, smaller partitions
279  * is that small hash tables may perform better than large ones due to memory
280  * caching effects.
281  *
282  * We also specify a min and max number of partitions per spill. Too few might
283  * mean a lot of wasted I/O from repeated spilling of the same tuples. Too
284  * many will result in lots of memory wasted buffering the spill files (which
285  * could instead be spent on a larger hash table).
286  */
287 #define HASHAGG_PARTITION_FACTOR 1.50
288 #define HASHAGG_MIN_PARTITIONS 4
289 #define HASHAGG_MAX_PARTITIONS 1024
290 
291 /*
292  * For reading from tapes, the buffer size must be a multiple of
293  * BLCKSZ. Larger values help when reading from multiple tapes concurrently,
294  * but that doesn't happen in HashAgg, so we simply use BLCKSZ. Writing to a
295  * tape always uses a buffer of size BLCKSZ.
296  */
297 #define HASHAGG_READ_BUFFER_SIZE BLCKSZ
298 #define HASHAGG_WRITE_BUFFER_SIZE BLCKSZ
299 
300 /*
301  * HyperLogLog is used for estimating the cardinality of the spilled tuples in
302  * a given partition. 5 bits corresponds to a size of about 32 bytes and a
303  * worst-case error of around 18%. That's effective enough to choose a
304  * reasonable number of partitions when recursing.
305  */
306 #define HASHAGG_HLL_BIT_WIDTH 5
307 
308 /*
309  * Estimate chunk overhead as a constant 16 bytes. XXX: should this be
310  * improved?
311  */
312 #define CHUNKHDRSZ 16
313 
314 /*
315  * Track all tapes needed for a HashAgg that spills. We don't know the maximum
316  * number of tapes needed at the start of the algorithm (because it can
317  * recurse), so one tape set is allocated and extended as needed for new
318  * tapes. When a particular tape is already read, rewind it for write mode and
319  * put it in the free list.
320  *
321  * Tapes' buffers can take up substantial memory when many tapes are open at
322  * once. We only need one tape open at a time in read mode (using a buffer
323  * that's a multiple of BLCKSZ); but we need one tape open in write mode (each
324  * requiring a buffer of size BLCKSZ) for each partition.
325  */
326 typedef struct HashTapeInfo
327 {
329  int ntapes;
330  int *freetapes;
333 } HashTapeInfo;
334 
335 /*
336  * Represents partitioned spill data for a single hashtable. Contains the
337  * necessary information to route tuples to the correct partition, and to
338  * transform the spilled data into new batches.
339  *
340  * The high bits are used for partition selection (when recursing, we ignore
341  * the bits that have already been used for partition selection at an earlier
342  * level).
343  */
344 typedef struct HashAggSpill
345 {
346  LogicalTapeSet *tapeset; /* borrowed reference to tape set (not owned) */
347  int npartitions; /* number of output partitions */
348  int *partitions; /* spill partition tape numbers, one per partition */
349  int64 *ntuples; /* number of tuples spilled to each partition */
350  uint32 mask; /* mask to find partition from hash value */
351  int shift; /* after masking, shift right by this amount */
352  hyperLogLogState *hll_card; /* cardinality estimate for contents */
353 } HashAggSpill;
354 
355 /*
356  * Represents work to be done for one pass of hash aggregation (with only one
357  * grouping set).
358  *
359  * Also tracks the bits of the hash already used for partition selection by
360  * earlier iterations, so that this batch can use new bits. If all bits have
361  * already been used, no partitioning will be done (any spilled data will go
362  * to a single output tape).
363  */
364 typedef struct HashAggBatch
365 {
366  int setno; /* grouping set this batch belongs to */
367  int used_bits; /* number of bits of hash already used */
368  LogicalTapeSet *tapeset; /* borrowed reference to tape set (not owned) */
369  int input_tapenum; /* input partition tape */
370  int64 input_tuples; /* number of tuples in this batch */
371  double input_card; /* estimated group cardinality */
372 } HashAggBatch;
373 
374 /* used to find referenced colnos */
375 typedef struct FindColsContext
376 {
377  bool is_aggref; /* is under an aggref */
378  Bitmapset *aggregated; /* column references under an aggref */
379  Bitmapset *unaggregated; /* other column references */
381 
382 static void select_current_set(AggState *aggstate, int setno, bool is_hash);
383 static void initialize_phase(AggState *aggstate, int newphase);
384 static TupleTableSlot *fetch_input_tuple(AggState *aggstate);
385 static void initialize_aggregates(AggState *aggstate,
386  AggStatePerGroup *pergroups,
387  int numReset);
388 static void advance_transition_function(AggState *aggstate,
389  AggStatePerTrans pertrans,
390  AggStatePerGroup pergroupstate);
391 static void advance_aggregates(AggState *aggstate);
392 static void process_ordered_aggregate_single(AggState *aggstate,
393  AggStatePerTrans pertrans,
394  AggStatePerGroup pergroupstate);
395 static void process_ordered_aggregate_multi(AggState *aggstate,
396  AggStatePerTrans pertrans,
397  AggStatePerGroup pergroupstate);
398 static void finalize_aggregate(AggState *aggstate,
399  AggStatePerAgg peragg,
400  AggStatePerGroup pergroupstate,
401  Datum *resultVal, bool *resultIsNull);
402 static void finalize_partialaggregate(AggState *aggstate,
403  AggStatePerAgg peragg,
404  AggStatePerGroup pergroupstate,
405  Datum *resultVal, bool *resultIsNull);
406 static inline void prepare_hash_slot(AggStatePerHash perhash,
407  TupleTableSlot *inputslot,
408  TupleTableSlot *hashslot);
409 static void prepare_projection_slot(AggState *aggstate,
410  TupleTableSlot *slot,
411  int currentSet);
412 static void finalize_aggregates(AggState *aggstate,
413  AggStatePerAgg peragg,
414  AggStatePerGroup pergroup);
415 static TupleTableSlot *project_aggregates(AggState *aggstate);
416 static void find_cols(AggState *aggstate, Bitmapset **aggregated,
417  Bitmapset **unaggregated);
418 static bool find_cols_walker(Node *node, FindColsContext *context);
419 static void build_hash_tables(AggState *aggstate);
420 static void build_hash_table(AggState *aggstate, int setno, long nbuckets);
421 static void hashagg_recompile_expressions(AggState *aggstate, bool minslot,
422  bool nullcheck);
423 static long hash_choose_num_buckets(double hashentrysize,
424  long estimated_nbuckets,
425  Size memory);
426 static int hash_choose_num_partitions(double input_groups,
427  double hashentrysize,
428  int used_bits,
429  int *log2_npartittions);
430 static void initialize_hash_entry(AggState *aggstate,
431  TupleHashTable hashtable,
432  TupleHashEntry entry);
433 static void lookup_hash_entries(AggState *aggstate);
434 static TupleTableSlot *agg_retrieve_direct(AggState *aggstate);
435 static void agg_fill_hash_table(AggState *aggstate);
436 static bool agg_refill_hash_table(AggState *aggstate);
439 static void hash_agg_check_limits(AggState *aggstate);
440 static void hash_agg_enter_spill_mode(AggState *aggstate);
441 static void hash_agg_update_metrics(AggState *aggstate, bool from_tape,
442  int npartitions);
443 static void hashagg_finish_initial_spills(AggState *aggstate);
444 static void hashagg_reset_spill_state(AggState *aggstate);
446  int input_tapenum, int setno,
447  int64 input_tuples, double input_card,
448  int used_bits);
449 static MinimalTuple hashagg_batch_read(HashAggBatch *batch, uint32 *hashp);
450 static void hashagg_spill_init(HashAggSpill *spill, HashTapeInfo *tapeinfo,
451  int used_bits, double input_groups,
452  double hashentrysize);
453 static Size hashagg_spill_tuple(AggState *aggstate, HashAggSpill *spill,
454  TupleTableSlot *slot, uint32 hash);
455 static void hashagg_spill_finish(AggState *aggstate, HashAggSpill *spill,
456  int setno);
457 static void hashagg_tapeinfo_init(AggState *aggstate);
458 static void hashagg_tapeinfo_assign(HashTapeInfo *tapeinfo, int *dest,
459  int ndest);
460 static void hashagg_tapeinfo_release(HashTapeInfo *tapeinfo, int tapenum);
461 static Datum GetAggInitVal(Datum textInitVal, Oid transtype);
462 static void build_pertrans_for_aggref(AggStatePerTrans pertrans,
463  AggState *aggstate, EState *estate,
464  Aggref *aggref, Oid transfn_oid,
465  Oid aggtranstype, Oid aggserialfn,
466  Oid aggdeserialfn, Datum initValue,
467  bool initValueIsNull, Oid *inputTypes,
468  int numArguments);
469 
470 
471 /*
472  * Select the current grouping set; affects current_set and
473  * curaggcontext.
474  */
475 static void
476 select_current_set(AggState *aggstate, int setno, bool is_hash)
477 {
478  /*
479  * When changing this, also adapt ExecAggPlainTransByVal() and
480  * ExecAggPlainTransByRef().
481  */
482  if (is_hash)
483  aggstate->curaggcontext = aggstate->hashcontext;
484  else
485  aggstate->curaggcontext = aggstate->aggcontexts[setno];
486 
487  aggstate->current_set = setno;
488 }
489 
490 /*
491  * Switch to phase "newphase", which must either be 0 or 1 (to reset) or
492  * current_phase + 1. Juggle the tuplesorts accordingly.
493  *
494  * Phase 0 is for hashing, which we currently handle last in the AGG_MIXED
495  * case, so when entering phase 0, all we need to do is drop open sorts.
496  */
497 static void
498 initialize_phase(AggState *aggstate, int newphase)
499 {
500  Assert(newphase <= 1 || newphase == aggstate->current_phase + 1);
501 
502  /*
503  * Whatever the previous state, we're now done with whatever input
504  * tuplesort was in use.
505  */
506  if (aggstate->sort_in)
507  {
508  tuplesort_end(aggstate->sort_in);
509  aggstate->sort_in = NULL;
510  }
511 
512  if (newphase <= 1)
513  {
514  /*
515  * Discard any existing output tuplesort.
516  */
517  if (aggstate->sort_out)
518  {
519  tuplesort_end(aggstate->sort_out);
520  aggstate->sort_out = NULL;
521  }
522  }
523  else
524  {
525  /*
526  * The old output tuplesort becomes the new input one, and this is the
527  * right time to actually sort it.
528  */
529  aggstate->sort_in = aggstate->sort_out;
530  aggstate->sort_out = NULL;
531  Assert(aggstate->sort_in);
532  tuplesort_performsort(aggstate->sort_in);
533  }
534 
535  /*
536  * If this isn't the last phase, we need to sort appropriately for the
537  * next phase in sequence.
538  */
539  if (newphase > 0 && newphase < aggstate->numphases - 1)
540  {
541  Sort *sortnode = aggstate->phases[newphase + 1].sortnode;
542  PlanState *outerNode = outerPlanState(aggstate);
543  TupleDesc tupDesc = ExecGetResultType(outerNode);
544 
545  aggstate->sort_out = tuplesort_begin_heap(tupDesc,
546  sortnode->numCols,
547  sortnode->sortColIdx,
548  sortnode->sortOperators,
549  sortnode->collations,
550  sortnode->nullsFirst,
551  work_mem,
552  NULL, false);
553  }
554 
555  aggstate->current_phase = newphase;
556  aggstate->phase = &aggstate->phases[newphase];
557 }
558 
559 /*
560  * Fetch a tuple from either the outer plan (for phase 1) or from the sorter
561  * populated by the previous phase. Copy it to the sorter for the next phase
562  * if any.
563  *
564  * Callers cannot rely on memory for tuple in returned slot remaining valid
565  * past any subsequently fetched tuple.
566  */
567 static TupleTableSlot *
569 {
570  TupleTableSlot *slot;
571 
572  if (aggstate->sort_in)
573  {
574  /* make sure we check for interrupts in either path through here */
576  if (!tuplesort_gettupleslot(aggstate->sort_in, true, false,
577  aggstate->sort_slot, NULL))
578  return NULL;
579  slot = aggstate->sort_slot;
580  }
581  else
582  slot = ExecProcNode(outerPlanState(aggstate));
583 
584  if (!TupIsNull(slot) && aggstate->sort_out)
585  tuplesort_puttupleslot(aggstate->sort_out, slot);
586 
587  return slot;
588 }
589 
590 /*
591  * (Re)Initialize an individual aggregate.
592  *
593  * This function handles only one grouping set, already set in
594  * aggstate->current_set.
595  *
596  * When called, CurrentMemoryContext should be the per-query context.
597  */
598 static void
600  AggStatePerGroup pergroupstate)
601 {
602  /*
603  * Start a fresh sort operation for each DISTINCT/ORDER BY aggregate.
604  */
605  if (pertrans->numSortCols > 0)
606  {
607  /*
608  * In case of rescan, maybe there could be an uncompleted sort
609  * operation? Clean it up if so.
610  */
611  if (pertrans->sortstates[aggstate->current_set])
612  tuplesort_end(pertrans->sortstates[aggstate->current_set]);
613 
614 
615  /*
616  * We use a plain Datum sorter when there's a single input column;
617  * otherwise sort the full tuple. (See comments for
618  * process_ordered_aggregate_single.)
619  */
620  if (pertrans->numInputs == 1)
621  {
622  Form_pg_attribute attr = TupleDescAttr(pertrans->sortdesc, 0);
623 
624  pertrans->sortstates[aggstate->current_set] =
625  tuplesort_begin_datum(attr->atttypid,
626  pertrans->sortOperators[0],
627  pertrans->sortCollations[0],
628  pertrans->sortNullsFirst[0],
629  work_mem, NULL, false);
630  }
631  else
632  pertrans->sortstates[aggstate->current_set] =
633  tuplesort_begin_heap(pertrans->sortdesc,
634  pertrans->numSortCols,
635  pertrans->sortColIdx,
636  pertrans->sortOperators,
637  pertrans->sortCollations,
638  pertrans->sortNullsFirst,
639  work_mem, NULL, false);
640  }
641 
642  /*
643  * (Re)set transValue to the initial value.
644  *
645  * Note that when the initial value is pass-by-ref, we must copy it (into
646  * the aggcontext) since we will pfree the transValue later.
647  */
648  if (pertrans->initValueIsNull)
649  pergroupstate->transValue = pertrans->initValue;
650  else
651  {
652  MemoryContext oldContext;
653 
655  pergroupstate->transValue = datumCopy(pertrans->initValue,
656  pertrans->transtypeByVal,
657  pertrans->transtypeLen);
658  MemoryContextSwitchTo(oldContext);
659  }
660  pergroupstate->transValueIsNull = pertrans->initValueIsNull;
661 
662  /*
663  * If the initial value for the transition state doesn't exist in the
664  * pg_aggregate table then we will let the first non-NULL value returned
665  * from the outer procNode become the initial value. (This is useful for
666  * aggregates like max() and min().) The noTransValue flag signals that we
667  * still need to do this.
668  */
669  pergroupstate->noTransValue = pertrans->initValueIsNull;
670 }
671 
672 /*
673  * Initialize all aggregate transition states for a new group of input values.
674  *
675  * If there are multiple grouping sets, we initialize only the first numReset
676  * of them (the grouping sets are ordered so that the most specific one, which
677  * is reset most often, is first). As a convenience, if numReset is 0, we
678  * reinitialize all sets.
679  *
680  * NB: This cannot be used for hash aggregates, as for those the grouping set
681  * number has to be specified from further up.
682  *
683  * When called, CurrentMemoryContext should be the per-query context.
684  */
685 static void
687  AggStatePerGroup *pergroups,
688  int numReset)
689 {
690  int transno;
691  int numGroupingSets = Max(aggstate->phase->numsets, 1);
692  int setno = 0;
693  int numTrans = aggstate->numtrans;
694  AggStatePerTrans transstates = aggstate->pertrans;
695 
696  if (numReset == 0)
697  numReset = numGroupingSets;
698 
699  for (setno = 0; setno < numReset; setno++)
700  {
701  AggStatePerGroup pergroup = pergroups[setno];
702 
703  select_current_set(aggstate, setno, false);
704 
705  for (transno = 0; transno < numTrans; transno++)
706  {
707  AggStatePerTrans pertrans = &transstates[transno];
708  AggStatePerGroup pergroupstate = &pergroup[transno];
709 
710  initialize_aggregate(aggstate, pertrans, pergroupstate);
711  }
712  }
713 }
714 
715 /*
716  * Given new input value(s), advance the transition function of one aggregate
717  * state within one grouping set only (already set in aggstate->current_set)
718  *
719  * The new values (and null flags) have been preloaded into argument positions
720  * 1 and up in pertrans->transfn_fcinfo, so that we needn't copy them again to
721  * pass to the transition function. We also expect that the static fields of
722  * the fcinfo are already initialized; that was done by ExecInitAgg().
723  *
724  * It doesn't matter which memory context this is called in.
725  */
726 static void
728  AggStatePerTrans pertrans,
729  AggStatePerGroup pergroupstate)
730 {
731  FunctionCallInfo fcinfo = pertrans->transfn_fcinfo;
732  MemoryContext oldContext;
733  Datum newVal;
734 
735  if (pertrans->transfn.fn_strict)
736  {
737  /*
738  * For a strict transfn, nothing happens when there's a NULL input; we
739  * just keep the prior transValue.
740  */
741  int numTransInputs = pertrans->numTransInputs;
742  int i;
743 
744  for (i = 1; i <= numTransInputs; i++)
745  {
746  if (fcinfo->args[i].isnull)
747  return;
748  }
749  if (pergroupstate->noTransValue)
750  {
751  /*
752  * transValue has not been initialized. This is the first non-NULL
753  * input value. We use it as the initial value for transValue. (We
754  * already checked that the agg's input type is binary-compatible
755  * with its transtype, so straight copy here is OK.)
756  *
757  * We must copy the datum into aggcontext if it is pass-by-ref. We
758  * do not need to pfree the old transValue, since it's NULL.
759  */
761  pergroupstate->transValue = datumCopy(fcinfo->args[1].value,
762  pertrans->transtypeByVal,
763  pertrans->transtypeLen);
764  pergroupstate->transValueIsNull = false;
765  pergroupstate->noTransValue = false;
766  MemoryContextSwitchTo(oldContext);
767  return;
768  }
769  if (pergroupstate->transValueIsNull)
770  {
771  /*
772  * Don't call a strict function with NULL inputs. Note it is
773  * possible to get here despite the above tests, if the transfn is
774  * strict *and* returned a NULL on a prior cycle. If that happens
775  * we will propagate the NULL all the way to the end.
776  */
777  return;
778  }
779  }
780 
781  /* We run the transition functions in per-input-tuple memory context */
782  oldContext = MemoryContextSwitchTo(aggstate->tmpcontext->ecxt_per_tuple_memory);
783 
784  /* set up aggstate->curpertrans for AggGetAggref() */
785  aggstate->curpertrans = pertrans;
786 
787  /*
788  * OK to call the transition function
789  */
790  fcinfo->args[0].value = pergroupstate->transValue;
791  fcinfo->args[0].isnull = pergroupstate->transValueIsNull;
792  fcinfo->isnull = false; /* just in case transfn doesn't set it */
793 
794  newVal = FunctionCallInvoke(fcinfo);
795 
796  aggstate->curpertrans = NULL;
797 
798  /*
799  * If pass-by-ref datatype, must copy the new value into aggcontext and
800  * free the prior transValue. But if transfn returned a pointer to its
801  * first input, we don't need to do anything. Also, if transfn returned a
802  * pointer to a R/W expanded object that is already a child of the
803  * aggcontext, assume we can adopt that value without copying it.
804  *
805  * It's safe to compare newVal with pergroup->transValue without regard
806  * for either being NULL, because ExecAggTransReparent() takes care to set
807  * transValue to 0 when NULL. Otherwise we could end up accidentally not
808  * reparenting, when the transValue has the same numerical value as
809  * newValue, despite being NULL. This is a somewhat hot path, making it
810  * undesirable to instead solve this with another branch for the common
811  * case of the transition function returning its (modified) input
812  * argument.
813  */
814  if (!pertrans->transtypeByVal &&
815  DatumGetPointer(newVal) != DatumGetPointer(pergroupstate->transValue))
816  newVal = ExecAggTransReparent(aggstate, pertrans,
817  newVal, fcinfo->isnull,
818  pergroupstate->transValue,
819  pergroupstate->transValueIsNull);
820 
821  pergroupstate->transValue = newVal;
822  pergroupstate->transValueIsNull = fcinfo->isnull;
823 
824  MemoryContextSwitchTo(oldContext);
825 }
826 
827 /*
828  * Advance each aggregate transition state for one input tuple. The input
829  * tuple has been stored in tmpcontext->ecxt_outertuple, so that it is
830  * accessible to ExecEvalExpr.
831  *
832  * We have two sets of transition states to handle: one for sorted aggregation
833  * and one for hashed; we do them both here, to avoid multiple evaluation of
834  * the inputs.
835  *
836  * When called, CurrentMemoryContext should be the per-query context.
837  */
838 static void
840 {
841  bool dummynull;
842 
844  aggstate->tmpcontext,
845  &dummynull);
846 }
847 
848 /*
849  * Run the transition function for a DISTINCT or ORDER BY aggregate
850  * with only one input. This is called after we have completed
851  * entering all the input values into the sort object. We complete the
852  * sort, read out the values in sorted order, and run the transition
853  * function on each value (applying DISTINCT if appropriate).
854  *
855  * Note that the strictness of the transition function was checked when
856  * entering the values into the sort, so we don't check it again here;
857  * we just apply standard SQL DISTINCT logic.
858  *
859  * The one-input case is handled separately from the multi-input case
860  * for performance reasons: for single by-value inputs, such as the
861  * common case of count(distinct id), the tuplesort_getdatum code path
862  * is around 300% faster. (The speedup for by-reference types is less
863  * but still noticeable.)
864  *
865  * This function handles only one grouping set (already set in
866  * aggstate->current_set).
867  *
868  * When called, CurrentMemoryContext should be the per-query context.
869  */
870 static void
872  AggStatePerTrans pertrans,
873  AggStatePerGroup pergroupstate)
874 {
875  Datum oldVal = (Datum) 0;
876  bool oldIsNull = true;
877  bool haveOldVal = false;
878  MemoryContext workcontext = aggstate->tmpcontext->ecxt_per_tuple_memory;
879  MemoryContext oldContext;
880  bool isDistinct = (pertrans->numDistinctCols > 0);
881  Datum newAbbrevVal = (Datum) 0;
882  Datum oldAbbrevVal = (Datum) 0;
883  FunctionCallInfo fcinfo = pertrans->transfn_fcinfo;
884  Datum *newVal;
885  bool *isNull;
886 
887  Assert(pertrans->numDistinctCols < 2);
888 
889  tuplesort_performsort(pertrans->sortstates[aggstate->current_set]);
890 
891  /* Load the column into argument 1 (arg 0 will be transition value) */
892  newVal = &fcinfo->args[1].value;
893  isNull = &fcinfo->args[1].isnull;
894 
895  /*
896  * Note: if input type is pass-by-ref, the datums returned by the sort are
897  * freshly palloc'd in the per-query context, so we must be careful to
898  * pfree them when they are no longer needed.
899  */
900 
901  while (tuplesort_getdatum(pertrans->sortstates[aggstate->current_set],
902  true, newVal, isNull, &newAbbrevVal))
903  {
904  /*
905  * Clear and select the working context for evaluation of the equality
906  * function and transition function.
907  */
908  MemoryContextReset(workcontext);
909  oldContext = MemoryContextSwitchTo(workcontext);
910 
911  /*
912  * If DISTINCT mode, and not distinct from prior, skip it.
913  */
914  if (isDistinct &&
915  haveOldVal &&
916  ((oldIsNull && *isNull) ||
917  (!oldIsNull && !*isNull &&
918  oldAbbrevVal == newAbbrevVal &&
920  pertrans->aggCollation,
921  oldVal, *newVal)))))
922  {
923  /* equal to prior, so forget this one */
924  if (!pertrans->inputtypeByVal && !*isNull)
925  pfree(DatumGetPointer(*newVal));
926  }
927  else
928  {
929  advance_transition_function(aggstate, pertrans, pergroupstate);
930  /* forget the old value, if any */
931  if (!oldIsNull && !pertrans->inputtypeByVal)
932  pfree(DatumGetPointer(oldVal));
933  /* and remember the new one for subsequent equality checks */
934  oldVal = *newVal;
935  oldAbbrevVal = newAbbrevVal;
936  oldIsNull = *isNull;
937  haveOldVal = true;
938  }
939 
940  MemoryContextSwitchTo(oldContext);
941  }
942 
943  if (!oldIsNull && !pertrans->inputtypeByVal)
944  pfree(DatumGetPointer(oldVal));
945 
946  tuplesort_end(pertrans->sortstates[aggstate->current_set]);
947  pertrans->sortstates[aggstate->current_set] = NULL;
948 }
949 
950 /*
951  * Run the transition function for a DISTINCT or ORDER BY aggregate
952  * with more than one input. This is called after we have completed
953  * entering all the input values into the sort object. We complete the
954  * sort, read out the values in sorted order, and run the transition
955  * function on each value (applying DISTINCT if appropriate).
956  *
957  * This function handles only one grouping set (already set in
958  * aggstate->current_set).
959  *
960  * When called, CurrentMemoryContext should be the per-query context.
961  */
962 static void
964  AggStatePerTrans pertrans,
965  AggStatePerGroup pergroupstate)
966 {
967  ExprContext *tmpcontext = aggstate->tmpcontext;
968  FunctionCallInfo fcinfo = pertrans->transfn_fcinfo;
969  TupleTableSlot *slot1 = pertrans->sortslot;
970  TupleTableSlot *slot2 = pertrans->uniqslot;
971  int numTransInputs = pertrans->numTransInputs;
972  int numDistinctCols = pertrans->numDistinctCols;
973  Datum newAbbrevVal = (Datum) 0;
974  Datum oldAbbrevVal = (Datum) 0;
975  bool haveOldValue = false;
976  TupleTableSlot *save = aggstate->tmpcontext->ecxt_outertuple;
977  int i;
978 
979  tuplesort_performsort(pertrans->sortstates[aggstate->current_set]);
980 
981  ExecClearTuple(slot1);
982  if (slot2)
983  ExecClearTuple(slot2);
984 
985  while (tuplesort_gettupleslot(pertrans->sortstates[aggstate->current_set],
986  true, true, slot1, &newAbbrevVal))
987  {
989 
990  tmpcontext->ecxt_outertuple = slot1;
991  tmpcontext->ecxt_innertuple = slot2;
992 
993  if (numDistinctCols == 0 ||
994  !haveOldValue ||
995  newAbbrevVal != oldAbbrevVal ||
996  !ExecQual(pertrans->equalfnMulti, tmpcontext))
997  {
998  /*
999  * Extract the first numTransInputs columns as datums to pass to
1000  * the transfn.
1001  */
1002  slot_getsomeattrs(slot1, numTransInputs);
1003 
1004  /* Load values into fcinfo */
1005  /* Start from 1, since the 0th arg will be the transition value */
1006  for (i = 0; i < numTransInputs; i++)
1007  {
1008  fcinfo->args[i + 1].value = slot1->tts_values[i];
1009  fcinfo->args[i + 1].isnull = slot1->tts_isnull[i];
1010  }
1011 
1012  advance_transition_function(aggstate, pertrans, pergroupstate);
1013 
1014  if (numDistinctCols > 0)
1015  {
1016  /* swap the slot pointers to retain the current tuple */
1017  TupleTableSlot *tmpslot = slot2;
1018 
1019  slot2 = slot1;
1020  slot1 = tmpslot;
1021  /* avoid ExecQual() calls by reusing abbreviated keys */
1022  oldAbbrevVal = newAbbrevVal;
1023  haveOldValue = true;
1024  }
1025  }
1026 
1027  /* Reset context each time */
1028  ResetExprContext(tmpcontext);
1029 
1030  ExecClearTuple(slot1);
1031  }
1032 
1033  if (slot2)
1034  ExecClearTuple(slot2);
1035 
1036  tuplesort_end(pertrans->sortstates[aggstate->current_set]);
1037  pertrans->sortstates[aggstate->current_set] = NULL;
1038 
1039  /* restore previous slot, potentially in use for grouping sets */
1040  tmpcontext->ecxt_outertuple = save;
1041 }
1042 
1043 /*
1044  * Compute the final value of one aggregate.
1045  *
1046  * This function handles only one grouping set (already set in
1047  * aggstate->current_set).
1048  *
1049  * The finalfn will be run, and the result delivered, in the
1050  * output-tuple context; caller's CurrentMemoryContext does not matter.
1051  *
1052  * The finalfn uses the state as set in the transno. This also might be
1053  * being used by another aggregate function, so it's important that we do
1054  * nothing destructive here.
1055  */
1056 static void
1058  AggStatePerAgg peragg,
1059  AggStatePerGroup pergroupstate,
1060  Datum *resultVal, bool *resultIsNull)
1061 {
1062  LOCAL_FCINFO(fcinfo, FUNC_MAX_ARGS);
1063  bool anynull = false;
1064  MemoryContext oldContext;
1065  int i;
1066  ListCell *lc;
1067  AggStatePerTrans pertrans = &aggstate->pertrans[peragg->transno];
1068 
1070 
1071  /*
1072  * Evaluate any direct arguments. We do this even if there's no finalfn
1073  * (which is unlikely anyway), so that side-effects happen as expected.
1074  * The direct arguments go into arg positions 1 and up, leaving position 0
1075  * for the transition state value.
1076  */
1077  i = 1;
1078  foreach(lc, peragg->aggdirectargs)
1079  {
1080  ExprState *expr = (ExprState *) lfirst(lc);
1081 
1082  fcinfo->args[i].value = ExecEvalExpr(expr,
1083  aggstate->ss.ps.ps_ExprContext,
1084  &fcinfo->args[i].isnull);
1085  anynull |= fcinfo->args[i].isnull;
1086  i++;
1087  }
1088 
1089  /*
1090  * Apply the agg's finalfn if one is provided, else return transValue.
1091  */
1092  if (OidIsValid(peragg->finalfn_oid))
1093  {
1094  int numFinalArgs = peragg->numFinalArgs;
1095 
1096  /* set up aggstate->curperagg for AggGetAggref() */
1097  aggstate->curperagg = peragg;
1098 
1099  InitFunctionCallInfoData(*fcinfo, &peragg->finalfn,
1100  numFinalArgs,
1101  pertrans->aggCollation,
1102  (void *) aggstate, NULL);
1103 
1104  /* Fill in the transition state value */
1105  fcinfo->args[0].value =
1106  MakeExpandedObjectReadOnly(pergroupstate->transValue,
1107  pergroupstate->transValueIsNull,
1108  pertrans->transtypeLen);
1109  fcinfo->args[0].isnull = pergroupstate->transValueIsNull;
1110  anynull |= pergroupstate->transValueIsNull;
1111 
1112  /* Fill any remaining argument positions with nulls */
1113  for (; i < numFinalArgs; i++)
1114  {
1115  fcinfo->args[i].value = (Datum) 0;
1116  fcinfo->args[i].isnull = true;
1117  anynull = true;
1118  }
1119 
1120  if (fcinfo->flinfo->fn_strict && anynull)
1121  {
1122  /* don't call a strict function with NULL inputs */
1123  *resultVal = (Datum) 0;
1124  *resultIsNull = true;
1125  }
1126  else
1127  {
1128  *resultVal = FunctionCallInvoke(fcinfo);
1129  *resultIsNull = fcinfo->isnull;
1130  }
1131  aggstate->curperagg = NULL;
1132  }
1133  else
1134  {
1135  /* Don't need MakeExpandedObjectReadOnly; datumCopy will copy it */
1136  *resultVal = pergroupstate->transValue;
1137  *resultIsNull = pergroupstate->transValueIsNull;
1138  }
1139 
1140  /*
1141  * If result is pass-by-ref, make sure it is in the right context.
1142  */
1143  if (!peragg->resulttypeByVal && !*resultIsNull &&
1145  DatumGetPointer(*resultVal)))
1146  *resultVal = datumCopy(*resultVal,
1147  peragg->resulttypeByVal,
1148  peragg->resulttypeLen);
1149 
1150  MemoryContextSwitchTo(oldContext);
1151 }
1152 
1153 /*
1154  * Compute the output value of one partial aggregate.
1155  *
1156  * The serialization function will be run, and the result delivered, in the
1157  * output-tuple context; caller's CurrentMemoryContext does not matter.
1158  */
1159 static void
1161  AggStatePerAgg peragg,
1162  AggStatePerGroup pergroupstate,
1163  Datum *resultVal, bool *resultIsNull)
1164 {
1165  AggStatePerTrans pertrans = &aggstate->pertrans[peragg->transno];
1166  MemoryContext oldContext;
1167 
1169 
1170  /*
1171  * serialfn_oid will be set if we must serialize the transvalue before
1172  * returning it
1173  */
1174  if (OidIsValid(pertrans->serialfn_oid))
1175  {
1176  /* Don't call a strict serialization function with NULL input. */
1177  if (pertrans->serialfn.fn_strict && pergroupstate->transValueIsNull)
1178  {
1179  *resultVal = (Datum) 0;
1180  *resultIsNull = true;
1181  }
1182  else
1183  {
1184  FunctionCallInfo fcinfo = pertrans->serialfn_fcinfo;
1185 
1186  fcinfo->args[0].value =
1187  MakeExpandedObjectReadOnly(pergroupstate->transValue,
1188  pergroupstate->transValueIsNull,
1189  pertrans->transtypeLen);
1190  fcinfo->args[0].isnull = pergroupstate->transValueIsNull;
1191  fcinfo->isnull = false;
1192 
1193  *resultVal = FunctionCallInvoke(fcinfo);
1194  *resultIsNull = fcinfo->isnull;
1195  }
1196  }
1197  else
1198  {
1199  /* Don't need MakeExpandedObjectReadOnly; datumCopy will copy it */
1200  *resultVal = pergroupstate->transValue;
1201  *resultIsNull = pergroupstate->transValueIsNull;
1202  }
1203 
1204  /* If result is pass-by-ref, make sure it is in the right context. */
1205  if (!peragg->resulttypeByVal && !*resultIsNull &&
1207  DatumGetPointer(*resultVal)))
1208  *resultVal = datumCopy(*resultVal,
1209  peragg->resulttypeByVal,
1210  peragg->resulttypeLen);
1211 
1212  MemoryContextSwitchTo(oldContext);
1213 }
1214 
1215 /*
1216  * Extract the attributes that make up the grouping key into the
1217  * hashslot. This is necessary to compute the hash or perform a lookup.
1218  */
1219 static inline void
1221  TupleTableSlot *inputslot,
1222  TupleTableSlot *hashslot)
1223 {
1224  int i;
1225 
1226  /* transfer just the needed columns into hashslot */
1227  slot_getsomeattrs(inputslot, perhash->largestGrpColIdx);
1228  ExecClearTuple(hashslot);
1229 
1230  for (i = 0; i < perhash->numhashGrpCols; i++)
1231  {
1232  int varNumber = perhash->hashGrpColIdxInput[i] - 1;
1233 
1234  hashslot->tts_values[i] = inputslot->tts_values[varNumber];
1235  hashslot->tts_isnull[i] = inputslot->tts_isnull[varNumber];
1236  }
1237  ExecStoreVirtualTuple(hashslot);
1238 }
1239 
1240 /*
1241  * Prepare to finalize and project based on the specified representative tuple
1242  * slot and grouping set.
1243  *
1244  * In the specified tuple slot, force to null all attributes that should be
1245  * read as null in the context of the current grouping set. Also stash the
1246  * current group bitmap where GroupingExpr can get at it.
1247  *
1248  * This relies on three conditions:
1249  *
1250  * 1) Nothing is ever going to try and extract the whole tuple from this slot,
1251  * only reference it in evaluations, which will only access individual
1252  * attributes.
1253  *
1254  * 2) No system columns are going to need to be nulled. (If a system column is
1255  * referenced in a group clause, it is actually projected in the outer plan
1256  * tlist.)
1257  *
1258  * 3) Within a given phase, we never need to recover the value of an attribute
1259  * once it has been set to null.
1260  *
1261  * Poking into the slot this way is a bit ugly, but the consensus is that the
1262  * alternative was worse.
1263  */
1264 static void
1265 prepare_projection_slot(AggState *aggstate, TupleTableSlot *slot, int currentSet)
1266 {
1267  if (aggstate->phase->grouped_cols)
1268  {
1269  Bitmapset *grouped_cols = aggstate->phase->grouped_cols[currentSet];
1270 
1271  aggstate->grouped_cols = grouped_cols;
1272 
1273  if (TTS_EMPTY(slot))
1274  {
1275  /*
1276  * Force all values to be NULL if working on an empty input tuple
1277  * (i.e. an empty grouping set for which no input rows were
1278  * supplied).
1279  */
1280  ExecStoreAllNullTuple(slot);
1281  }
1282  else if (aggstate->all_grouped_cols)
1283  {
1284  ListCell *lc;
1285 
1286  /* all_grouped_cols is arranged in desc order */
1288 
1289  foreach(lc, aggstate->all_grouped_cols)
1290  {
1291  int attnum = lfirst_int(lc);
1292 
1293  if (!bms_is_member(attnum, grouped_cols))
1294  slot->tts_isnull[attnum - 1] = true;
1295  }
1296  }
1297  }
1298 }
1299 
1300 /*
1301  * Compute the final value of all aggregates for one group.
1302  *
1303  * This function handles only one grouping set at a time, which the caller must
1304  * have selected. It's also the caller's responsibility to adjust the supplied
1305  * pergroup parameter to point to the current set's transvalues.
1306  *
1307  * Results are stored in the output econtext aggvalues/aggnulls.
1308  */
1309 static void
1311  AggStatePerAgg peraggs,
1312  AggStatePerGroup pergroup)
1313 {
1314  ExprContext *econtext = aggstate->ss.ps.ps_ExprContext;
1315  Datum *aggvalues = econtext->ecxt_aggvalues;
1316  bool *aggnulls = econtext->ecxt_aggnulls;
1317  int aggno;
1318  int transno;
1319 
1320  /*
1321  * If there were any DISTINCT and/or ORDER BY aggregates, sort their
1322  * inputs and run the transition functions.
1323  */
1324  for (transno = 0; transno < aggstate->numtrans; transno++)
1325  {
1326  AggStatePerTrans pertrans = &aggstate->pertrans[transno];
1327  AggStatePerGroup pergroupstate;
1328 
1329  pergroupstate = &pergroup[transno];
1330 
1331  if (pertrans->numSortCols > 0)
1332  {
1333  Assert(aggstate->aggstrategy != AGG_HASHED &&
1334  aggstate->aggstrategy != AGG_MIXED);
1335 
1336  if (pertrans->numInputs == 1)
1338  pertrans,
1339  pergroupstate);
1340  else
1342  pertrans,
1343  pergroupstate);
1344  }
1345  }
1346 
1347  /*
1348  * Run the final functions.
1349  */
1350  for (aggno = 0; aggno < aggstate->numaggs; aggno++)
1351  {
1352  AggStatePerAgg peragg = &peraggs[aggno];
1353  int transno = peragg->transno;
1354  AggStatePerGroup pergroupstate;
1355 
1356  pergroupstate = &pergroup[transno];
1357 
1358  if (DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit))
1359  finalize_partialaggregate(aggstate, peragg, pergroupstate,
1360  &aggvalues[aggno], &aggnulls[aggno]);
1361  else
1362  finalize_aggregate(aggstate, peragg, pergroupstate,
1363  &aggvalues[aggno], &aggnulls[aggno]);
1364  }
1365 }
1366 
1367 /*
1368  * Project the result of a group (whose aggs have already been calculated by
1369  * finalize_aggregates). Returns the result slot, or NULL if no row is
1370  * projected (suppressed by qual).
1371  */
1372 static TupleTableSlot *
1374 {
1375  ExprContext *econtext = aggstate->ss.ps.ps_ExprContext;
1376 
1377  /*
1378  * Check the qual (HAVING clause); if the group does not match, ignore it.
1379  */
1380  if (ExecQual(aggstate->ss.ps.qual, econtext))
1381  {
1382  /*
1383  * Form and return projection tuple using the aggregate results and
1384  * the representative input tuple.
1385  */
1386  return ExecProject(aggstate->ss.ps.ps_ProjInfo);
1387  }
1388  else
1389  InstrCountFiltered1(aggstate, 1);
1390 
1391  return NULL;
1392 }
1393 
/*
 * Find input-tuple columns that are needed, dividing them into
 * aggregated and unaggregated sets.
 */
static void
find_cols(AggState *aggstate, Bitmapset **aggregated, Bitmapset **unaggregated)
{
	Agg		   *agg = (Agg *) aggstate->ss.ps.plan;
	FindColsContext context;

	/* start with empty sets; the walker accumulates into them */
	context.is_aggref = false;
	context.aggregated = NULL;
	context.unaggregated = NULL;

	/* Examine tlist and quals */
	(void) find_cols_walker((Node *) agg->plan.targetlist, &context);
	(void) find_cols_walker((Node *) agg->plan.qual, &context);

	/* In some cases, grouping columns will not appear in the tlist */
	for (int i = 0; i < agg->numCols; i++)
		context.unaggregated = bms_add_member(context.unaggregated,
											  agg->grpColIdx[i]);

	/* hand back ownership of the two bitmapsets to the caller */
	*aggregated = context.aggregated;
	*unaggregated = context.unaggregated;
}
1420 
1421 static bool
1423 {
1424  if (node == NULL)
1425  return false;
1426  if (IsA(node, Var))
1427  {
1428  Var *var = (Var *) node;
1429 
1430  /* setrefs.c should have set the varno to OUTER_VAR */
1431  Assert(var->varno == OUTER_VAR);
1432  Assert(var->varlevelsup == 0);
1433  if (context->is_aggref)
1434  context->aggregated = bms_add_member(context->aggregated,
1435  var->varattno);
1436  else
1437  context->unaggregated = bms_add_member(context->unaggregated,
1438  var->varattno);
1439  return false;
1440  }
1441  if (IsA(node, Aggref))
1442  {
1443  Assert(!context->is_aggref);
1444  context->is_aggref = true;
1445  expression_tree_walker(node, find_cols_walker, (void *) context);
1446  context->is_aggref = false;
1447  return false;
1448  }
1450  (void *) context);
1451 }
1452 
1453 /*
1454  * (Re-)initialize the hash table(s) to empty.
1455  *
1456  * To implement hashed aggregation, we need a hashtable that stores a
1457  * representative tuple and an array of AggStatePerGroup structs for each
1458  * distinct set of GROUP BY column values. We compute the hash key from the
1459  * GROUP BY columns. The per-group data is allocated in lookup_hash_entry(),
1460  * for each entry.
1461  *
1462  * We have a separate hashtable and associated perhash data structure for each
1463  * grouping set for which we're doing hashing.
1464  *
1465  * The contents of the hash tables always live in the hashcontext's per-tuple
1466  * memory context (there is only one of these for all tables together, since
1467  * they are all reset at the same time).
1468  */
1469 static void
1471 {
1472  int setno;
1473 
1474  for (setno = 0; setno < aggstate->num_hashes; ++setno)
1475  {
1476  AggStatePerHash perhash = &aggstate->perhash[setno];
1477  long nbuckets;
1478  Size memory;
1479 
1480  if (perhash->hashtable != NULL)
1481  {
1482  ResetTupleHashTable(perhash->hashtable);
1483  continue;
1484  }
1485 
1486  Assert(perhash->aggnode->numGroups > 0);
1487 
1488  memory = aggstate->hash_mem_limit / aggstate->num_hashes;
1489 
1490  /* choose reasonable number of buckets per hashtable */
1491  nbuckets = hash_choose_num_buckets(aggstate->hashentrysize,
1492  perhash->aggnode->numGroups,
1493  memory);
1494 
1495  build_hash_table(aggstate, setno, nbuckets);
1496  }
1497 
1498  aggstate->hash_ngroups_current = 0;
1499 }
1500 
/*
 * Build a single hashtable for this grouping set.
 */
static void
build_hash_table(AggState *aggstate, int setno, long nbuckets)
{
	AggStatePerHash perhash = &aggstate->perhash[setno];
	MemoryContext metacxt = aggstate->hash_metacxt;
	MemoryContext hashcxt = aggstate->hashcontext->ecxt_per_tuple_memory;
	MemoryContext tmpcxt = aggstate->tmpcontext->ecxt_per_tuple_memory;
	Size		additionalsize;

	/* only the hashing strategies ever build hash tables */
	Assert(aggstate->aggstrategy == AGG_HASHED ||
		   aggstate->aggstrategy == AGG_MIXED);

	/*
	 * Used to make sure initial hash table allocation does not exceed
	 * hash_mem. Note that the estimate does not include space for
	 * pass-by-reference transition data values, nor for the representative
	 * tuple of each group.
	 */
	additionalsize = aggstate->numtrans * sizeof(AggStatePerGroupData);

	/*
	 * NOTE(review): argument order here is position-critical; the three
	 * memory contexts are (metadata, table entries, transient) in that order.
	 */
	perhash->hashtable = BuildTupleHashTableExt(&aggstate->ss.ps,
												perhash->hashslot->tts_tupleDescriptor,
												perhash->numCols,
												perhash->hashGrpColIdxHash,
												perhash->eqfuncoids,
												perhash->hashfunctions,
												perhash->aggnode->grpCollations,
												nbuckets,
												additionalsize,
												metacxt,
												hashcxt,
												tmpcxt,
												DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit));
}
1538 
1539 /*
1540  * Compute columns that actually need to be stored in hashtable entries. The
1541  * incoming tuples from the child plan node will contain grouping columns,
1542  * other columns referenced in our targetlist and qual, columns used to
1543  * compute the aggregate functions, and perhaps just junk columns we don't use
1544  * at all. Only columns of the first two types need to be stored in the
1545  * hashtable, and getting rid of the others can make the table entries
1546  * significantly smaller. The hashtable only contains the relevant columns,
1547  * and is packed/unpacked in lookup_hash_entry() / agg_retrieve_hash_table()
1548  * into the format of the normal input descriptor.
1549  *
1550  * Additional columns, in addition to the columns grouped by, come from two
1551  * sources: Firstly functionally dependent columns that we don't need to group
1552  * by themselves, and secondly ctids for row-marks.
1553  *
1554  * To eliminate duplicates, we build a bitmapset of the needed columns, and
1555  * then build an array of the columns included in the hashtable. We might
1556  * still have duplicates if the passed-in grpColIdx has them, which can happen
1557  * in edge cases from semijoins/distinct; these can't always be removed,
1558  * because it's not certain that the duplicate cols will be using the same
1559  * hash function.
1560  *
1561  * Note that the array is preserved over ExecReScanAgg, so we allocate it in
1562  * the per-query context (unlike the hash table itself).
1563  */
1564 static void
1566 {
1567  Bitmapset *base_colnos;
1568  Bitmapset *aggregated_colnos;
1569  TupleDesc scanDesc = aggstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor;
1570  List *outerTlist = outerPlanState(aggstate)->plan->targetlist;
1571  int numHashes = aggstate->num_hashes;
1572  EState *estate = aggstate->ss.ps.state;
1573  int j;
1574 
1575  /* Find Vars that will be needed in tlist and qual */
1576  find_cols(aggstate, &aggregated_colnos, &base_colnos);
1577  aggstate->colnos_needed = bms_union(base_colnos, aggregated_colnos);
1578  aggstate->max_colno_needed = 0;
1579  aggstate->all_cols_needed = true;
1580 
1581  for (int i = 0; i < scanDesc->natts; i++)
1582  {
1583  int colno = i + 1;
1584 
1585  if (bms_is_member(colno, aggstate->colnos_needed))
1586  aggstate->max_colno_needed = colno;
1587  else
1588  aggstate->all_cols_needed = false;
1589  }
1590 
1591  for (j = 0; j < numHashes; ++j)
1592  {
1593  AggStatePerHash perhash = &aggstate->perhash[j];
1594  Bitmapset *colnos = bms_copy(base_colnos);
1595  AttrNumber *grpColIdx = perhash->aggnode->grpColIdx;
1596  List *hashTlist = NIL;
1597  TupleDesc hashDesc;
1598  int maxCols;
1599  int i;
1600 
1601  perhash->largestGrpColIdx = 0;
1602 
1603  /*
1604  * If we're doing grouping sets, then some Vars might be referenced in
1605  * tlist/qual for the benefit of other grouping sets, but not needed
1606  * when hashing; i.e. prepare_projection_slot will null them out, so
1607  * there'd be no point storing them. Use prepare_projection_slot's
1608  * logic to determine which.
1609  */
1610  if (aggstate->phases[0].grouped_cols)
1611  {
1612  Bitmapset *grouped_cols = aggstate->phases[0].grouped_cols[j];
1613  ListCell *lc;
1614 
1615  foreach(lc, aggstate->all_grouped_cols)
1616  {
1617  int attnum = lfirst_int(lc);
1618 
1619  if (!bms_is_member(attnum, grouped_cols))
1620  colnos = bms_del_member(colnos, attnum);
1621  }
1622  }
1623 
1624  /*
1625  * Compute maximum number of input columns accounting for possible
1626  * duplications in the grpColIdx array, which can happen in some edge
1627  * cases where HashAggregate was generated as part of a semijoin or a
1628  * DISTINCT.
1629  */
1630  maxCols = bms_num_members(colnos) + perhash->numCols;
1631 
1632  perhash->hashGrpColIdxInput =
1633  palloc(maxCols * sizeof(AttrNumber));
1634  perhash->hashGrpColIdxHash =
1635  palloc(perhash->numCols * sizeof(AttrNumber));
1636 
1637  /* Add all the grouping columns to colnos */
1638  for (i = 0; i < perhash->numCols; i++)
1639  colnos = bms_add_member(colnos, grpColIdx[i]);
1640 
1641  /*
1642  * First build mapping for columns directly hashed. These are the
1643  * first, because they'll be accessed when computing hash values and
1644  * comparing tuples for exact matches. We also build simple mapping
1645  * for execGrouping, so it knows where to find the to-be-hashed /
1646  * compared columns in the input.
1647  */
1648  for (i = 0; i < perhash->numCols; i++)
1649  {
1650  perhash->hashGrpColIdxInput[i] = grpColIdx[i];
1651  perhash->hashGrpColIdxHash[i] = i + 1;
1652  perhash->numhashGrpCols++;
1653  /* delete already mapped columns */
1654  bms_del_member(colnos, grpColIdx[i]);
1655  }
1656 
1657  /* and add the remaining columns */
1658  while ((i = bms_first_member(colnos)) >= 0)
1659  {
1660  perhash->hashGrpColIdxInput[perhash->numhashGrpCols] = i;
1661  perhash->numhashGrpCols++;
1662  }
1663 
1664  /* and build a tuple descriptor for the hashtable */
1665  for (i = 0; i < perhash->numhashGrpCols; i++)
1666  {
1667  int varNumber = perhash->hashGrpColIdxInput[i] - 1;
1668 
1669  hashTlist = lappend(hashTlist, list_nth(outerTlist, varNumber));
1670  perhash->largestGrpColIdx =
1671  Max(varNumber + 1, perhash->largestGrpColIdx);
1672  }
1673 
1674  hashDesc = ExecTypeFromTL(hashTlist);
1675 
1676  execTuplesHashPrepare(perhash->numCols,
1677  perhash->aggnode->grpOperators,
1678  &perhash->eqfuncoids,
1679  &perhash->hashfunctions);
1680  perhash->hashslot =
1681  ExecAllocTableSlot(&estate->es_tupleTable, hashDesc,
1683 
1684  list_free(hashTlist);
1685  bms_free(colnos);
1686  }
1687 
1688  bms_free(base_colnos);
1689 }
1690 
1691 /*
1692  * Estimate per-hash-table-entry overhead.
1693  */
1694 Size
1695 hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace)
1696 {
1697  Size tupleChunkSize;
1698  Size pergroupChunkSize;
1699  Size transitionChunkSize;
1700  Size tupleSize = (MAXALIGN(SizeofMinimalTupleHeader) +
1701  tupleWidth);
1702  Size pergroupSize = numTrans * sizeof(AggStatePerGroupData);
1703 
1704  tupleChunkSize = CHUNKHDRSZ + tupleSize;
1705 
1706  if (pergroupSize > 0)
1707  pergroupChunkSize = CHUNKHDRSZ + pergroupSize;
1708  else
1709  pergroupChunkSize = 0;
1710 
1711  if (transitionSpace > 0)
1712  transitionChunkSize = CHUNKHDRSZ + transitionSpace;
1713  else
1714  transitionChunkSize = 0;
1715 
1716  return
1717  sizeof(TupleHashEntryData) +
1718  tupleChunkSize +
1719  pergroupChunkSize +
1720  transitionChunkSize;
1721 }
1722 
1723 /*
1724  * hashagg_recompile_expressions()
1725  *
1726  * Identifies the right phase, compiles the right expression given the
1727  * arguments, and then sets phase->evalfunc to that expression.
1728  *
1729  * Different versions of the compiled expression are needed depending on
1730  * whether hash aggregation has spilled or not, and whether it's reading from
1731  * the outer plan or a tape. Before spilling to disk, the expression reads
1732  * from the outer plan and does not need to perform a NULL check. After
1733  * HashAgg begins to spill, new groups will not be created in the hash table,
1734  * and the AggStatePerGroup array may be NULL; therefore we need to add a null
1735  * pointer check to the expression. Then, when reading spilled data from a
1736  * tape, we change the outer slot type to be a fixed minimal tuple slot.
1737  *
1738  * It would be wasteful to recompile every time, so cache the compiled
1739  * expressions in the AggStatePerPhase, and reuse when appropriate.
1740  */
1741 static void
1742 hashagg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck)
1743 {
1744  AggStatePerPhase phase;
1745  int i = minslot ? 1 : 0;
1746  int j = nullcheck ? 1 : 0;
1747 
1748  Assert(aggstate->aggstrategy == AGG_HASHED ||
1749  aggstate->aggstrategy == AGG_MIXED);
1750 
1751  if (aggstate->aggstrategy == AGG_HASHED)
1752  phase = &aggstate->phases[0];
1753  else /* AGG_MIXED */
1754  phase = &aggstate->phases[1];
1755 
1756  if (phase->evaltrans_cache[i][j] == NULL)
1757  {
1758  const TupleTableSlotOps *outerops = aggstate->ss.ps.outerops;
1759  bool outerfixed = aggstate->ss.ps.outeropsfixed;
1760  bool dohash = true;
1761  bool dosort = false;
1762 
1763  /*
1764  * If minslot is true, that means we are processing a spilled batch
1765  * (inside agg_refill_hash_table()), and we must not advance the
1766  * sorted grouping sets.
1767  */
1768  if (aggstate->aggstrategy == AGG_MIXED && !minslot)
1769  dosort = true;
1770 
1771  /* temporarily change the outerops while compiling the expression */
1772  if (minslot)
1773  {
1774  aggstate->ss.ps.outerops = &TTSOpsMinimalTuple;
1775  aggstate->ss.ps.outeropsfixed = true;
1776  }
1777 
1778  phase->evaltrans_cache[i][j] = ExecBuildAggTrans(aggstate, phase,
1779  dosort, dohash,
1780  nullcheck);
1781 
1782  /* change back */
1783  aggstate->ss.ps.outerops = outerops;
1784  aggstate->ss.ps.outeropsfixed = outerfixed;
1785  }
1786 
1787  phase->evaltrans = phase->evaltrans_cache[i][j];
1788 }
1789 
1790 /*
1791  * Set limits that trigger spilling to avoid exceeding hash_mem. Consider the
1792  * number of partitions we expect to create (if we do spill).
1793  *
1794  * There are two limits: a memory limit, and also an ngroups limit. The
1795  * ngroups limit becomes important when we expect transition values to grow
1796  * substantially larger than the initial value.
1797  */
1798 void
1799 hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
1800  Size *mem_limit, uint64 *ngroups_limit,
1801  int *num_partitions)
1802 {
1803  int npartitions;
1804  Size partition_mem;
1805  Size hash_mem_limit = get_hash_memory_limit();
1806 
1807  /* if not expected to spill, use all of hash_mem */
1808  if (input_groups * hashentrysize <= hash_mem_limit)
1809  {
1810  if (num_partitions != NULL)
1811  *num_partitions = 0;
1812  *mem_limit = hash_mem_limit;
1813  *ngroups_limit = hash_mem_limit / hashentrysize;
1814  return;
1815  }
1816 
1817  /*
1818  * Calculate expected memory requirements for spilling, which is the size
1819  * of the buffers needed for all the tapes that need to be open at once.
1820  * Then, subtract that from the memory available for holding hash tables.
1821  */
1822  npartitions = hash_choose_num_partitions(input_groups,
1823  hashentrysize,
1824  used_bits,
1825  NULL);
1826  if (num_partitions != NULL)
1827  *num_partitions = npartitions;
1828 
1829  partition_mem =
1831  HASHAGG_WRITE_BUFFER_SIZE * npartitions;
1832 
1833  /*
1834  * Don't set the limit below 3/4 of hash_mem. In that case, we are at the
1835  * minimum number of partitions, so we aren't going to dramatically exceed
1836  * work mem anyway.
1837  */
1838  if (hash_mem_limit > 4 * partition_mem)
1839  *mem_limit = hash_mem_limit - partition_mem;
1840  else
1841  *mem_limit = hash_mem_limit * 0.75;
1842 
1843  if (*mem_limit > hashentrysize)
1844  *ngroups_limit = *mem_limit / hashentrysize;
1845  else
1846  *ngroups_limit = 1;
1847 }
1848 
1849 /*
1850  * hash_agg_check_limits
1851  *
1852  * After adding a new group to the hash table, check whether we need to enter
1853  * spill mode. Allocations may happen without adding new groups (for instance,
1854  * if the transition state size grows), so this check is imperfect.
1855  */
1856 static void
1858 {
1859  uint64 ngroups = aggstate->hash_ngroups_current;
1860  Size meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt,
1861  true);
1863  true);
1864 
1865  /*
1866  * Don't spill unless there's at least one group in the hash table so we
1867  * can be sure to make progress even in edge cases.
1868  */
1869  if (aggstate->hash_ngroups_current > 0 &&
1870  (meta_mem + hashkey_mem > aggstate->hash_mem_limit ||
1871  ngroups > aggstate->hash_ngroups_limit))
1872  {
1873  hash_agg_enter_spill_mode(aggstate);
1874  }
1875 }
1876 
1877 /*
1878  * Enter "spill mode", meaning that no new groups are added to any of the hash
1879  * tables. Tuples that would create a new group are instead spilled, and
1880  * processed later.
1881  */
1882 static void
1884 {
1885  aggstate->hash_spill_mode = true;
1886  hashagg_recompile_expressions(aggstate, aggstate->table_filled, true);
1887 
1888  if (!aggstate->hash_ever_spilled)
1889  {
1890  Assert(aggstate->hash_tapeinfo == NULL);
1891  Assert(aggstate->hash_spills == NULL);
1892 
1893  aggstate->hash_ever_spilled = true;
1894 
1895  hashagg_tapeinfo_init(aggstate);
1896 
1897  aggstate->hash_spills = palloc(sizeof(HashAggSpill) * aggstate->num_hashes);
1898 
1899  for (int setno = 0; setno < aggstate->num_hashes; setno++)
1900  {
1901  AggStatePerHash perhash = &aggstate->perhash[setno];
1902  HashAggSpill *spill = &aggstate->hash_spills[setno];
1903 
1904  hashagg_spill_init(spill, aggstate->hash_tapeinfo, 0,
1905  perhash->aggnode->numGroups,
1906  aggstate->hashentrysize);
1907  }
1908  }
1909 }
1910 
1911 /*
1912  * Update metrics after filling the hash table.
1913  *
1914  * If reading from the outer plan, from_tape should be false; if reading from
1915  * another tape, from_tape should be true.
1916  */
1917 static void
1918 hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
1919 {
1920  Size meta_mem;
1921  Size hashkey_mem;
1922  Size buffer_mem;
1923  Size total_mem;
1924 
1925  if (aggstate->aggstrategy != AGG_MIXED &&
1926  aggstate->aggstrategy != AGG_HASHED)
1927  return;
1928 
1929  /* memory for the hash table itself */
1930  meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true);
1931 
1932  /* memory for the group keys and transition states */
1933  hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, true);
1934 
1935  /* memory for read/write tape buffers, if spilled */
1936  buffer_mem = npartitions * HASHAGG_WRITE_BUFFER_SIZE;
1937  if (from_tape)
1938  buffer_mem += HASHAGG_READ_BUFFER_SIZE;
1939 
1940  /* update peak mem */
1941  total_mem = meta_mem + hashkey_mem + buffer_mem;
1942  if (total_mem > aggstate->hash_mem_peak)
1943  aggstate->hash_mem_peak = total_mem;
1944 
1945  /* update disk usage */
1946  if (aggstate->hash_tapeinfo != NULL)
1947  {
1948  uint64 disk_used = LogicalTapeSetBlocks(aggstate->hash_tapeinfo->tapeset) * (BLCKSZ / 1024);
1949 
1950  if (aggstate->hash_disk_used < disk_used)
1951  aggstate->hash_disk_used = disk_used;
1952  }
1953 
1954  /* update hashentrysize estimate based on contents */
1955  if (aggstate->hash_ngroups_current > 0)
1956  {
1957  aggstate->hashentrysize =
1958  sizeof(TupleHashEntryData) +
1959  (hashkey_mem / (double) aggstate->hash_ngroups_current);
1960  }
1961 }
1962 
1963 /*
1964  * Choose a reasonable number of buckets for the initial hash table size.
1965  */
1966 static long
1967 hash_choose_num_buckets(double hashentrysize, long ngroups, Size memory)
1968 {
1969  long max_nbuckets;
1970  long nbuckets = ngroups;
1971 
1972  max_nbuckets = memory / hashentrysize;
1973 
1974  /*
1975  * Underestimating is better than overestimating. Too many buckets crowd
1976  * out space for group keys and transition state values.
1977  */
1978  max_nbuckets >>= 1;
1979 
1980  if (nbuckets > max_nbuckets)
1981  nbuckets = max_nbuckets;
1982 
1983  return Max(nbuckets, 1);
1984 }
1985 
1986 /*
1987  * Determine the number of partitions to create when spilling, which will
1988  * always be a power of two. If log2_npartitions is non-NULL, set
1989  * *log2_npartitions to the log2() of the number of partitions.
1990  */
1991 static int
1992 hash_choose_num_partitions(double input_groups, double hashentrysize,
1993  int used_bits, int *log2_npartitions)
1994 {
1995  Size hash_mem_limit = get_hash_memory_limit();
1996  double partition_limit;
1997  double mem_wanted;
1998  double dpartitions;
1999  int npartitions;
2000  int partition_bits;
2001 
2002  /*
2003  * Avoid creating so many partitions that the memory requirements of the
2004  * open partition files are greater than 1/4 of hash_mem.
2005  */
2006  partition_limit =
2007  (hash_mem_limit * 0.25 - HASHAGG_READ_BUFFER_SIZE) /
2009 
2010  mem_wanted = HASHAGG_PARTITION_FACTOR * input_groups * hashentrysize;
2011 
2012  /* make enough partitions so that each one is likely to fit in memory */
2013  dpartitions = 1 + (mem_wanted / hash_mem_limit);
2014 
2015  if (dpartitions > partition_limit)
2016  dpartitions = partition_limit;
2017 
2018  if (dpartitions < HASHAGG_MIN_PARTITIONS)
2019  dpartitions = HASHAGG_MIN_PARTITIONS;
2020  if (dpartitions > HASHAGG_MAX_PARTITIONS)
2021  dpartitions = HASHAGG_MAX_PARTITIONS;
2022 
2023  /* HASHAGG_MAX_PARTITIONS limit makes this safe */
2024  npartitions = (int) dpartitions;
2025 
2026  /* ceil(log2(npartitions)) */
2027  partition_bits = my_log2(npartitions);
2028 
2029  /* make sure that we don't exhaust the hash bits */
2030  if (partition_bits + used_bits >= 32)
2031  partition_bits = 32 - used_bits;
2032 
2033  if (log2_npartitions != NULL)
2034  *log2_npartitions = partition_bits;
2035 
2036  /* number of partitions will be a power of two */
2037  npartitions = 1 << partition_bits;
2038 
2039  return npartitions;
2040 }
2041 
2042 /*
2043  * Initialize a freshly-created TupleHashEntry.
2044  */
2045 static void
2047  TupleHashEntry entry)
2048 {
2049  AggStatePerGroup pergroup;
2050  int transno;
2051 
2052  aggstate->hash_ngroups_current++;
2053  hash_agg_check_limits(aggstate);
2054 
2055  /* no need to allocate or initialize per-group state */
2056  if (aggstate->numtrans == 0)
2057  return;
2058 
2059  pergroup = (AggStatePerGroup)
2060  MemoryContextAlloc(hashtable->tablecxt,
2061  sizeof(AggStatePerGroupData) * aggstate->numtrans);
2062 
2063  entry->additional = pergroup;
2064 
2065  /*
2066  * Initialize aggregates for new tuple group, lookup_hash_entries()
2067  * already has selected the relevant grouping set.
2068  */
2069  for (transno = 0; transno < aggstate->numtrans; transno++)
2070  {
2071  AggStatePerTrans pertrans = &aggstate->pertrans[transno];
2072  AggStatePerGroup pergroupstate = &pergroup[transno];
2073 
2074  initialize_aggregate(aggstate, pertrans, pergroupstate);
2075  }
2076 }
2077 
2078 /*
2079  * Look up hash entries for the current tuple in all hashed grouping sets.
2080  *
2081  * Be aware that lookup_hash_entry can reset the tmpcontext.
2082  *
2083  * Some entries may be left NULL if we are in "spill mode". The same tuple
2084  * will belong to different groups for each grouping set, so may match a group
2085  * already in memory for one set and match a group not in memory for another
2086  * set. When in "spill mode", the tuple will be spilled for each grouping set
2087  * where it doesn't match a group in memory.
2088  *
2089  * NB: It's possible to spill the same tuple for several different grouping
2090  * sets. This may seem wasteful, but it's actually a trade-off: if we spill
2091  * the tuple multiple times for multiple grouping sets, it can be partitioned
2092  * for each grouping set, making the refilling of the hash table very
2093  * efficient.
2094  */
2095 static void
2097 {
2098  AggStatePerGroup *pergroup = aggstate->hash_pergroup;
2099  TupleTableSlot *outerslot = aggstate->tmpcontext->ecxt_outertuple;
2100  int setno;
2101 
2102  for (setno = 0; setno < aggstate->num_hashes; setno++)
2103  {
2104  AggStatePerHash perhash = &aggstate->perhash[setno];
2105  TupleHashTable hashtable = perhash->hashtable;
2106  TupleTableSlot *hashslot = perhash->hashslot;
2107  TupleHashEntry entry;
2108  uint32 hash;
2109  bool isnew = false;
2110  bool *p_isnew;
2111 
2112  /* if hash table already spilled, don't create new entries */
2113  p_isnew = aggstate->hash_spill_mode ? NULL : &isnew;
2114 
2115  select_current_set(aggstate, setno, true);
2116  prepare_hash_slot(perhash,
2117  outerslot,
2118  hashslot);
2119 
2120  entry = LookupTupleHashEntry(hashtable, hashslot,
2121  p_isnew, &hash);
2122 
2123  if (entry != NULL)
2124  {
2125  if (isnew)
2126  initialize_hash_entry(aggstate, hashtable, entry);
2127  pergroup[setno] = entry->additional;
2128  }
2129  else
2130  {
2131  HashAggSpill *spill = &aggstate->hash_spills[setno];
2132  TupleTableSlot *slot = aggstate->tmpcontext->ecxt_outertuple;
2133 
2134  if (spill->partitions == NULL)
2135  hashagg_spill_init(spill, aggstate->hash_tapeinfo, 0,
2136  perhash->aggnode->numGroups,
2137  aggstate->hashentrysize);
2138 
2139  hashagg_spill_tuple(aggstate, spill, slot, hash);
2140  pergroup[setno] = NULL;
2141  }
2142  }
2143 }
2144 
2145 /*
2146  * ExecAgg -
2147  *
2148  * ExecAgg receives tuples from its outer subplan and aggregates over
2149  * the appropriate attribute for each aggregate function use (Aggref
2150  * node) appearing in the targetlist or qual of the node. The number
2151  * of tuples to aggregate over depends on whether grouped or plain
2152  * aggregation is selected. In grouped aggregation, we produce a result
2153  * row for each group; in plain aggregation there's a single result row
2154  * for the whole query. In either case, the value of each aggregate is
2155  * stored in the expression context to be used when ExecProject evaluates
2156  * the result tuple.
2157  */
2158 static TupleTableSlot *
2160 {
2161  AggState *node = castNode(AggState, pstate);
2162  TupleTableSlot *result = NULL;
2163 
2165 
2166  if (!node->agg_done)
2167  {
2168  /* Dispatch based on strategy */
2169  switch (node->phase->aggstrategy)
2170  {
2171  case AGG_HASHED:
2172  if (!node->table_filled)
2173  agg_fill_hash_table(node);
2174  /* FALLTHROUGH */
2175  case AGG_MIXED:
2176  result = agg_retrieve_hash_table(node);
2177  break;
2178  case AGG_PLAIN:
2179  case AGG_SORTED:
2180  result = agg_retrieve_direct(node);
2181  break;
2182  }
2183 
2184  if (!TupIsNull(result))
2185  return result;
2186  }
2187 
2188  return NULL;
2189 }
2190 
2191 /*
2192  * ExecAgg for non-hashed case
2193  */
2194 static TupleTableSlot *
2196 {
2197  Agg *node = aggstate->phase->aggnode;
2198  ExprContext *econtext;
2199  ExprContext *tmpcontext;
2200  AggStatePerAgg peragg;
2201  AggStatePerGroup *pergroups;
2202  TupleTableSlot *outerslot;
2203  TupleTableSlot *firstSlot;
2204  TupleTableSlot *result;
2205  bool hasGroupingSets = aggstate->phase->numsets > 0;
2206  int numGroupingSets = Max(aggstate->phase->numsets, 1);
2207  int currentSet;
2208  int nextSetSize;
2209  int numReset;
2210  int i;
2211 
2212  /*
2213  * get state info from node
2214  *
2215  * econtext is the per-output-tuple expression context
2216  *
2217  * tmpcontext is the per-input-tuple expression context
2218  */
2219  econtext = aggstate->ss.ps.ps_ExprContext;
2220  tmpcontext = aggstate->tmpcontext;
2221 
2222  peragg = aggstate->peragg;
2223  pergroups = aggstate->pergroups;
2224  firstSlot = aggstate->ss.ss_ScanTupleSlot;
2225 
2226  /*
2227  * We loop retrieving groups until we find one matching
2228  * aggstate->ss.ps.qual
2229  *
2230  * For grouping sets, we have the invariant that aggstate->projected_set
2231  * is either -1 (initial call) or the index (starting from 0) in
2232  * gset_lengths for the group we just completed (either by projecting a
2233  * row or by discarding it in the qual).
2234  */
2235  while (!aggstate->agg_done)
2236  {
2237  /*
2238  * Clear the per-output-tuple context for each group, as well as
2239  * aggcontext (which contains any pass-by-ref transvalues of the old
2240  * group). Some aggregate functions store working state in child
2241  * contexts; those now get reset automatically without us needing to
2242  * do anything special.
2243  *
2244  * We use ReScanExprContext not just ResetExprContext because we want
2245  * any registered shutdown callbacks to be called. That allows
2246  * aggregate functions to ensure they've cleaned up any non-memory
2247  * resources.
2248  */
2249  ReScanExprContext(econtext);
2250 
2251  /*
2252  * Determine how many grouping sets need to be reset at this boundary.
2253  */
2254  if (aggstate->projected_set >= 0 &&
2255  aggstate->projected_set < numGroupingSets)
2256  numReset = aggstate->projected_set + 1;
2257  else
2258  numReset = numGroupingSets;
2259 
2260  /*
2261  * numReset can change on a phase boundary, but that's OK; we want to
2262  * reset the contexts used in _this_ phase, and later, after possibly
2263  * changing phase, initialize the right number of aggregates for the
2264  * _new_ phase.
2265  */
2266 
2267  for (i = 0; i < numReset; i++)
2268  {
2269  ReScanExprContext(aggstate->aggcontexts[i]);
2270  }
2271 
2272  /*
2273  * Check if input is complete and there are no more groups to project
2274  * in this phase; move to next phase or mark as done.
2275  */
2276  if (aggstate->input_done == true &&
2277  aggstate->projected_set >= (numGroupingSets - 1))
2278  {
2279  if (aggstate->current_phase < aggstate->numphases - 1)
2280  {
2281  initialize_phase(aggstate, aggstate->current_phase + 1);
2282  aggstate->input_done = false;
2283  aggstate->projected_set = -1;
2284  numGroupingSets = Max(aggstate->phase->numsets, 1);
2285  node = aggstate->phase->aggnode;
2286  numReset = numGroupingSets;
2287  }
2288  else if (aggstate->aggstrategy == AGG_MIXED)
2289  {
2290  /*
2291  * Mixed mode; we've output all the grouped stuff and have
2292  * full hashtables, so switch to outputting those.
2293  */
2294  initialize_phase(aggstate, 0);
2295  aggstate->table_filled = true;
2297  &aggstate->perhash[0].hashiter);
2298  select_current_set(aggstate, 0, true);
2299  return agg_retrieve_hash_table(aggstate);
2300  }
2301  else
2302  {
2303  aggstate->agg_done = true;
2304  break;
2305  }
2306  }
2307 
2308  /*
2309  * Get the number of columns in the next grouping set after the last
2310  * projected one (if any). This is the number of columns to compare to
2311  * see if we reached the boundary of that set too.
2312  */
2313  if (aggstate->projected_set >= 0 &&
2314  aggstate->projected_set < (numGroupingSets - 1))
2315  nextSetSize = aggstate->phase->gset_lengths[aggstate->projected_set + 1];
2316  else
2317  nextSetSize = 0;
2318 
2319  /*----------
2320  * If a subgroup for the current grouping set is present, project it.
2321  *
2322  * We have a new group if:
2323  * - we're out of input but haven't projected all grouping sets
2324  * (checked above)
2325  * OR
2326  * - we already projected a row that wasn't from the last grouping
2327  * set
2328  * AND
2329  * - the next grouping set has at least one grouping column (since
2330  * empty grouping sets project only once input is exhausted)
2331  * AND
2332  * - the previous and pending rows differ on the grouping columns
2333  * of the next grouping set
2334  *----------
2335  */
2336  tmpcontext->ecxt_innertuple = econtext->ecxt_outertuple;
2337  if (aggstate->input_done ||
2338  (node->aggstrategy != AGG_PLAIN &&
2339  aggstate->projected_set != -1 &&
2340  aggstate->projected_set < (numGroupingSets - 1) &&
2341  nextSetSize > 0 &&
2342  !ExecQualAndReset(aggstate->phase->eqfunctions[nextSetSize - 1],
2343  tmpcontext)))
2344  {
2345  aggstate->projected_set += 1;
2346 
2347  Assert(aggstate->projected_set < numGroupingSets);
2348  Assert(nextSetSize > 0 || aggstate->input_done);
2349  }
2350  else
2351  {
2352  /*
2353  * We no longer care what group we just projected, the next
2354  * projection will always be the first (or only) grouping set
2355  * (unless the input proves to be empty).
2356  */
2357  aggstate->projected_set = 0;
2358 
2359  /*
2360  * If we don't already have the first tuple of the new group,
2361  * fetch it from the outer plan.
2362  */
2363  if (aggstate->grp_firstTuple == NULL)
2364  {
2365  outerslot = fetch_input_tuple(aggstate);
2366  if (!TupIsNull(outerslot))
2367  {
2368  /*
2369  * Make a copy of the first input tuple; we will use this
2370  * for comparisons (in group mode) and for projection.
2371  */
2372  aggstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot);
2373  }
2374  else
2375  {
2376  /* outer plan produced no tuples at all */
2377  if (hasGroupingSets)
2378  {
2379  /*
2380  * If there was no input at all, we need to project
2381  * rows only if there are grouping sets of size 0.
2382  * Note that this implies that there can't be any
2383  * references to ungrouped Vars, which would otherwise
2384  * cause issues with the empty output slot.
2385  *
2386  * XXX: This is no longer true, we currently deal with
2387  * this in finalize_aggregates().
2388  */
2389  aggstate->input_done = true;
2390 
2391  while (aggstate->phase->gset_lengths[aggstate->projected_set] > 0)
2392  {
2393  aggstate->projected_set += 1;
2394  if (aggstate->projected_set >= numGroupingSets)
2395  {
2396  /*
2397  * We can't set agg_done here because we might
2398  * have more phases to do, even though the
2399  * input is empty. So we need to restart the
2400  * whole outer loop.
2401  */
2402  break;
2403  }
2404  }
2405 
2406  if (aggstate->projected_set >= numGroupingSets)
2407  continue;
2408  }
2409  else
2410  {
2411  aggstate->agg_done = true;
2412  /* If we are grouping, we should produce no tuples too */
2413  if (node->aggstrategy != AGG_PLAIN)
2414  return NULL;
2415  }
2416  }
2417  }
2418 
2419  /*
2420  * Initialize working state for a new input tuple group.
2421  */
2422  initialize_aggregates(aggstate, pergroups, numReset);
2423 
2424  if (aggstate->grp_firstTuple != NULL)
2425  {
2426  /*
2427  * Store the copied first input tuple in the tuple table slot
2428  * reserved for it. The tuple will be deleted when it is
2429  * cleared from the slot.
2430  */
2432  firstSlot, true);
2433  aggstate->grp_firstTuple = NULL; /* don't keep two pointers */
2434 
2435  /* set up for first advance_aggregates call */
2436  tmpcontext->ecxt_outertuple = firstSlot;
2437 
2438  /*
2439  * Process each outer-plan tuple, and then fetch the next one,
2440  * until we exhaust the outer plan or cross a group boundary.
2441  */
2442  for (;;)
2443  {
2444  /*
2445  * During phase 1 only of a mixed agg, we need to update
2446  * hashtables as well in advance_aggregates.
2447  */
2448  if (aggstate->aggstrategy == AGG_MIXED &&
2449  aggstate->current_phase == 1)
2450  {
2451  lookup_hash_entries(aggstate);
2452  }
2453 
2454  /* Advance the aggregates (or combine functions) */
2455  advance_aggregates(aggstate);
2456 
2457  /* Reset per-input-tuple context after each tuple */
2458  ResetExprContext(tmpcontext);
2459 
2460  outerslot = fetch_input_tuple(aggstate);
2461  if (TupIsNull(outerslot))
2462  {
2463  /* no more outer-plan tuples available */
2464 
2465  /* if we built hash tables, finalize any spills */
2466  if (aggstate->aggstrategy == AGG_MIXED &&
2467  aggstate->current_phase == 1)
2469 
2470  if (hasGroupingSets)
2471  {
2472  aggstate->input_done = true;
2473  break;
2474  }
2475  else
2476  {
2477  aggstate->agg_done = true;
2478  break;
2479  }
2480  }
2481  /* set up for next advance_aggregates call */
2482  tmpcontext->ecxt_outertuple = outerslot;
2483 
2484  /*
2485  * If we are grouping, check whether we've crossed a group
2486  * boundary.
2487  */
2488  if (node->aggstrategy != AGG_PLAIN)
2489  {
2490  tmpcontext->ecxt_innertuple = firstSlot;
2491  if (!ExecQual(aggstate->phase->eqfunctions[node->numCols - 1],
2492  tmpcontext))
2493  {
2494  aggstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot);
2495  break;
2496  }
2497  }
2498  }
2499  }
2500 
2501  /*
2502  * Use the representative input tuple for any references to
2503  * non-aggregated input columns in aggregate direct args, the node
2504  * qual, and the tlist. (If we are not grouping, and there are no
2505  * input rows at all, we will come here with an empty firstSlot
2506  * ... but if not grouping, there can't be any references to
2507  * non-aggregated input columns, so no problem.)
2508  */
2509  econtext->ecxt_outertuple = firstSlot;
2510  }
2511 
2512  Assert(aggstate->projected_set >= 0);
2513 
2514  currentSet = aggstate->projected_set;
2515 
2516  prepare_projection_slot(aggstate, econtext->ecxt_outertuple, currentSet);
2517 
2518  select_current_set(aggstate, currentSet, false);
2519 
2520  finalize_aggregates(aggstate,
2521  peragg,
2522  pergroups[currentSet]);
2523 
2524  /*
2525  * If there's no row to project right now, we must continue rather
2526  * than returning a null since there might be more groups.
2527  */
2528  result = project_aggregates(aggstate);
2529  if (result)
2530  return result;
2531  }
2532 
2533  /* No more groups */
2534  return NULL;
2535 }
2536 
2537 /*
2538  * ExecAgg for hashed case: read input and build hash table
2539  */
2540 static void
2542 {
2543  TupleTableSlot *outerslot;
2544  ExprContext *tmpcontext = aggstate->tmpcontext;
2545 
2546  /*
2547  * Process each outer-plan tuple, and then fetch the next one, until we
2548  * exhaust the outer plan.
2549  */
2550  for (;;)
2551  {
2552  outerslot = fetch_input_tuple(aggstate);
2553  if (TupIsNull(outerslot))
2554  break;
2555 
2556  /* set up for lookup_hash_entries and advance_aggregates */
2557  tmpcontext->ecxt_outertuple = outerslot;
2558 
2559  /* Find or build hashtable entries */
2560  lookup_hash_entries(aggstate);
2561 
2562  /* Advance the aggregates (or combine functions) */
2563  advance_aggregates(aggstate);
2564 
2565  /*
2566  * Reset per-input-tuple context after each tuple, but note that the
2567  * hash lookups do this too
2568  */
2569  ResetExprContext(aggstate->tmpcontext);
2570  }
2571 
2572  /* finalize spills, if any */
2574 
2575  aggstate->table_filled = true;
2576  /* Initialize to walk the first hash table */
2577  select_current_set(aggstate, 0, true);
2579  &aggstate->perhash[0].hashiter);
2580 }
2581 
2582 /*
2583  * If any data was spilled during hash aggregation, reset the hash table and
2584  * reprocess one batch of spilled data. After reprocessing a batch, the hash
2585  * table will again contain data, ready to be consumed by
2586  * agg_retrieve_hash_table_in_memory().
2587  *
2588  * Should only be called after all in memory hash table entries have been
2589  * finalized and emitted.
2590  *
2591  * Return false when input is exhausted and there's no more work to be done;
2592  * otherwise return true.
2593  */
2594 static bool
2596 {
2597  HashAggBatch *batch;
2598  AggStatePerHash perhash;
2599  HashAggSpill spill;
2600  HashTapeInfo *tapeinfo = aggstate->hash_tapeinfo;
2601  bool spill_initialized = false;
2602 
2603  if (aggstate->hash_batches == NIL)
2604  return false;
2605 
2606  batch = linitial(aggstate->hash_batches);
2607  aggstate->hash_batches = list_delete_first(aggstate->hash_batches);
2608 
2609  hash_agg_set_limits(aggstate->hashentrysize, batch->input_card,
2610  batch->used_bits, &aggstate->hash_mem_limit,
2611  &aggstate->hash_ngroups_limit, NULL);
2612 
2613  /*
2614  * Each batch only processes one grouping set; set the rest to NULL so
2615  * that advance_aggregates() knows to ignore them. We don't touch
2616  * pergroups for sorted grouping sets here, because they will be needed if
2617  * we rescan later. The expressions for sorted grouping sets will not be
2618  * evaluated after we recompile anyway.
2619  */
2620  MemSet(aggstate->hash_pergroup, 0,
2621  sizeof(AggStatePerGroup) * aggstate->num_hashes);
2622 
2623  /* free memory and reset hash tables */
2624  ReScanExprContext(aggstate->hashcontext);
2625  for (int setno = 0; setno < aggstate->num_hashes; setno++)
2626  ResetTupleHashTable(aggstate->perhash[setno].hashtable);
2627 
2628  aggstate->hash_ngroups_current = 0;
2629 
2630  /*
2631  * In AGG_MIXED mode, hash aggregation happens in phase 1 and the output
2632  * happens in phase 0. So, we switch to phase 1 when processing a batch,
2633  * and back to phase 0 after the batch is done.
2634  */
2635  Assert(aggstate->current_phase == 0);
2636  if (aggstate->phase->aggstrategy == AGG_MIXED)
2637  {
2638  aggstate->current_phase = 1;
2639  aggstate->phase = &aggstate->phases[aggstate->current_phase];
2640  }
2641 
2642  select_current_set(aggstate, batch->setno, true);
2643 
2644  perhash = &aggstate->perhash[aggstate->current_set];
2645 
2646  /*
2647  * Spilled tuples are always read back as MinimalTuples, which may be
2648  * different from the outer plan, so recompile the aggregate expressions.
2649  *
2650  * We still need the NULL check, because we are only processing one
2651  * grouping set at a time and the rest will be NULL.
2652  */
2653  hashagg_recompile_expressions(aggstate, true, true);
2654 
2655  for (;;)
2656  {
2657  TupleTableSlot *spillslot = aggstate->hash_spill_rslot;
2658  TupleTableSlot *hashslot = perhash->hashslot;
2659  TupleHashEntry entry;
2660  MinimalTuple tuple;
2661  uint32 hash;
2662  bool isnew = false;
2663  bool *p_isnew = aggstate->hash_spill_mode ? NULL : &isnew;
2664 
2666 
2667  tuple = hashagg_batch_read(batch, &hash);
2668  if (tuple == NULL)
2669  break;
2670 
2671  ExecStoreMinimalTuple(tuple, spillslot, true);
2672  aggstate->tmpcontext->ecxt_outertuple = spillslot;
2673 
2674  prepare_hash_slot(perhash,
2675  aggstate->tmpcontext->ecxt_outertuple,
2676  hashslot);
2677  entry = LookupTupleHashEntryHash(
2678  perhash->hashtable, hashslot, p_isnew, hash);
2679 
2680  if (entry != NULL)
2681  {
2682  if (isnew)
2683  initialize_hash_entry(aggstate, perhash->hashtable, entry);
2684  aggstate->hash_pergroup[batch->setno] = entry->additional;
2685  advance_aggregates(aggstate);
2686  }
2687  else
2688  {
2689  if (!spill_initialized)
2690  {
2691  /*
2692  * Avoid initializing the spill until we actually need it so
2693  * that we don't assign tapes that will never be used.
2694  */
2695  spill_initialized = true;
2696  hashagg_spill_init(&spill, tapeinfo, batch->used_bits,
2697  batch->input_card, aggstate->hashentrysize);
2698  }
2699  /* no memory for a new group, spill */
2700  hashagg_spill_tuple(aggstate, &spill, spillslot, hash);
2701 
2702  aggstate->hash_pergroup[batch->setno] = NULL;
2703  }
2704 
2705  /*
2706  * Reset per-input-tuple context after each tuple, but note that the
2707  * hash lookups do this too
2708  */
2709  ResetExprContext(aggstate->tmpcontext);
2710  }
2711 
2712  hashagg_tapeinfo_release(tapeinfo, batch->input_tapenum);
2713 
2714  /* change back to phase 0 */
2715  aggstate->current_phase = 0;
2716  aggstate->phase = &aggstate->phases[aggstate->current_phase];
2717 
2718  if (spill_initialized)
2719  {
2720  hashagg_spill_finish(aggstate, &spill, batch->setno);
2721  hash_agg_update_metrics(aggstate, true, spill.npartitions);
2722  }
2723  else
2724  hash_agg_update_metrics(aggstate, true, 0);
2725 
2726  aggstate->hash_spill_mode = false;
2727 
2728  /* prepare to walk the first hash table */
2729  select_current_set(aggstate, batch->setno, true);
2730  ResetTupleHashIterator(aggstate->perhash[batch->setno].hashtable,
2731  &aggstate->perhash[batch->setno].hashiter);
2732 
2733  pfree(batch);
2734 
2735  return true;
2736 }
2737 
2738 /*
2739  * ExecAgg for hashed case: retrieving groups from hash table
2740  *
2741  * After exhausting in-memory tuples, also try refilling the hash table using
2742  * previously-spilled tuples. Only returns NULL after all in-memory and
2743  * spilled tuples are exhausted.
2744  */
2745 static TupleTableSlot *
2747 {
2748  TupleTableSlot *result = NULL;
2749 
2750  while (result == NULL)
2751  {
2752  result = agg_retrieve_hash_table_in_memory(aggstate);
2753  if (result == NULL)
2754  {
2755  if (!agg_refill_hash_table(aggstate))
2756  {
2757  aggstate->agg_done = true;
2758  break;
2759  }
2760  }
2761  }
2762 
2763  return result;
2764 }
2765 
2766 /*
2767  * Retrieve the groups from the in-memory hash tables without considering any
2768  * spilled tuples.
2769  */
2770 static TupleTableSlot *
2772 {
2773  ExprContext *econtext;
2774  AggStatePerAgg peragg;
2775  AggStatePerGroup pergroup;
2776  TupleHashEntryData *entry;
2777  TupleTableSlot *firstSlot;
2778  TupleTableSlot *result;
2779  AggStatePerHash perhash;
2780 
2781  /*
2782  * get state info from node.
2783  *
2784  * econtext is the per-output-tuple expression context.
2785  */
2786  econtext = aggstate->ss.ps.ps_ExprContext;
2787  peragg = aggstate->peragg;
2788  firstSlot = aggstate->ss.ss_ScanTupleSlot;
2789 
2790  /*
2791  * Note that perhash (and therefore anything accessed through it) can
2792  * change inside the loop, as we change between grouping sets.
2793  */
2794  perhash = &aggstate->perhash[aggstate->current_set];
2795 
2796  /*
2797  * We loop retrieving groups until we find one satisfying
2798  * aggstate->ss.ps.qual
2799  */
2800  for (;;)
2801  {
2802  TupleTableSlot *hashslot = perhash->hashslot;
2803  int i;
2804 
2806 
2807  /*
2808  * Find the next entry in the hash table
2809  */
2810  entry = ScanTupleHashTable(perhash->hashtable, &perhash->hashiter);
2811  if (entry == NULL)
2812  {
2813  int nextset = aggstate->current_set + 1;
2814 
2815  if (nextset < aggstate->num_hashes)
2816  {
2817  /*
2818  * Switch to next grouping set, reinitialize, and restart the
2819  * loop.
2820  */
2821  select_current_set(aggstate, nextset, true);
2822 
2823  perhash = &aggstate->perhash[aggstate->current_set];
2824 
2825  ResetTupleHashIterator(perhash->hashtable, &perhash->hashiter);
2826 
2827  continue;
2828  }
2829  else
2830  {
2831  return NULL;
2832  }
2833  }
2834 
2835  /*
2836  * Clear the per-output-tuple context for each group
2837  *
2838  * We intentionally don't use ReScanExprContext here; if any aggs have
2839  * registered shutdown callbacks, they mustn't be called yet, since we
2840  * might not be done with that agg.
2841  */
2842  ResetExprContext(econtext);
2843 
2844  /*
2845  * Transform representative tuple back into one with the right
2846  * columns.
2847  */
2848  ExecStoreMinimalTuple(entry->firstTuple, hashslot, false);
2849  slot_getallattrs(hashslot);
2850 
2851  ExecClearTuple(firstSlot);
2852  memset(firstSlot->tts_isnull, true,
2853  firstSlot->tts_tupleDescriptor->natts * sizeof(bool));
2854 
2855  for (i = 0; i < perhash->numhashGrpCols; i++)
2856  {
2857  int varNumber = perhash->hashGrpColIdxInput[i] - 1;
2858 
2859  firstSlot->tts_values[varNumber] = hashslot->tts_values[i];
2860  firstSlot->tts_isnull[varNumber] = hashslot->tts_isnull[i];
2861  }
2862  ExecStoreVirtualTuple(firstSlot);
2863 
2864  pergroup = (AggStatePerGroup) entry->additional;
2865 
2866  /*
2867  * Use the representative input tuple for any references to
2868  * non-aggregated input columns in the qual and tlist.
2869  */
2870  econtext->ecxt_outertuple = firstSlot;
2871 
2872  prepare_projection_slot(aggstate,
2873  econtext->ecxt_outertuple,
2874  aggstate->current_set);
2875 
2876  finalize_aggregates(aggstate, peragg, pergroup);
2877 
2878  result = project_aggregates(aggstate);
2879  if (result)
2880  return result;
2881  }
2882 
2883  /* No more groups */
2884  return NULL;
2885 }
2886 
2887 /*
2888  * Initialize HashTapeInfo
2889  */
2890 static void
2892 {
2893  HashTapeInfo *tapeinfo = palloc(sizeof(HashTapeInfo));
2894  int init_tapes = 16; /* expanded dynamically */
2895 
2896  tapeinfo->tapeset = LogicalTapeSetCreate(init_tapes, true, NULL, NULL, -1);
2897  tapeinfo->ntapes = init_tapes;
2898  tapeinfo->nfreetapes = init_tapes;
2899  tapeinfo->freetapes_alloc = init_tapes;
2900  tapeinfo->freetapes = palloc(init_tapes * sizeof(int));
2901  for (int i = 0; i < init_tapes; i++)
2902  tapeinfo->freetapes[i] = i;
2903 
2904  aggstate->hash_tapeinfo = tapeinfo;
2905 }
2906 
2907 /*
2908  * Assign unused tapes to spill partitions, extending the tape set if
2909  * necessary.
2910  */
2911 static void
2913  int npartitions)
2914 {
2915  int partidx = 0;
2916 
2917  /* use free tapes if available */
2918  while (partidx < npartitions && tapeinfo->nfreetapes > 0)
2919  partitions[partidx++] = tapeinfo->freetapes[--tapeinfo->nfreetapes];
2920 
2921  if (partidx < npartitions)
2922  {
2923  LogicalTapeSetExtend(tapeinfo->tapeset, npartitions - partidx);
2924 
2925  while (partidx < npartitions)
2926  partitions[partidx++] = tapeinfo->ntapes++;
2927  }
2928 }
2929 
2930 /*
2931  * After a tape has already been written to and then read, this function
2932  * rewinds it for writing and adds it to the free list.
2933  */
2934 static void
2936 {
2937  /* rewinding frees the buffer while not in use */
2938  LogicalTapeRewindForWrite(tapeinfo->tapeset, tapenum);
2939  if (tapeinfo->freetapes_alloc == tapeinfo->nfreetapes)
2940  {
2941  tapeinfo->freetapes_alloc <<= 1;
2942  tapeinfo->freetapes = repalloc(tapeinfo->freetapes,
2943  tapeinfo->freetapes_alloc * sizeof(int));
2944  }
2945  tapeinfo->freetapes[tapeinfo->nfreetapes++] = tapenum;
2946 }
2947 
2948 /*
2949  * hashagg_spill_init
2950  *
2951  * Called after we determined that spilling is necessary. Chooses the number
2952  * of partitions to create, and initializes them.
2953  */
2954 static void
2955 hashagg_spill_init(HashAggSpill *spill, HashTapeInfo *tapeinfo, int used_bits,
2956  double input_groups, double hashentrysize)
2957 {
2958  int npartitions;
2959  int partition_bits;
2960 
2961  npartitions = hash_choose_num_partitions(input_groups, hashentrysize,
2962  used_bits, &partition_bits);
2963 
2964  spill->partitions = palloc0(sizeof(int) * npartitions);
2965  spill->ntuples = palloc0(sizeof(int64) * npartitions);
2966  spill->hll_card = palloc0(sizeof(hyperLogLogState) * npartitions);
2967 
2968  hashagg_tapeinfo_assign(tapeinfo, spill->partitions, npartitions);
2969 
2970  spill->tapeset = tapeinfo->tapeset;
2971  spill->shift = 32 - used_bits - partition_bits;
2972  spill->mask = (npartitions - 1) << spill->shift;
2973  spill->npartitions = npartitions;
2974 
2975  for (int i = 0; i < npartitions; i++)
2977 }
2978 
2979 /*
2980  * hashagg_spill_tuple
2981  *
2982  * No room for new groups in the hash table. Save for later in the appropriate
2983  * partition.
2984  */
2985 static Size
2987  TupleTableSlot *inputslot, uint32 hash)
2988 {
2989  LogicalTapeSet *tapeset = spill->tapeset;
2990  TupleTableSlot *spillslot;
2991  int partition;
2992  MinimalTuple tuple;
2993  int tapenum;
2994  int total_written = 0;
2995  bool shouldFree;
2996 
2997  Assert(spill->partitions != NULL);
2998 
2999  /* spill only attributes that we actually need */
3000  if (!aggstate->all_cols_needed)
3001  {
3002  spillslot = aggstate->hash_spill_wslot;
3003  slot_getsomeattrs(inputslot, aggstate->max_colno_needed);
3004  ExecClearTuple(spillslot);
3005  for (int i = 0; i < spillslot->tts_tupleDescriptor->natts; i++)
3006  {
3007  if (bms_is_member(i + 1, aggstate->colnos_needed))
3008  {
3009  spillslot->tts_values[i] = inputslot->tts_values[i];
3010  spillslot->tts_isnull[i] = inputslot->tts_isnull[i];
3011  }
3012  else
3013  spillslot->tts_isnull[i] = true;
3014  }
3015  ExecStoreVirtualTuple(spillslot);
3016  }
3017  else
3018  spillslot = inputslot;
3019 
3020  tuple = ExecFetchSlotMinimalTuple(spillslot, &shouldFree);
3021 
3022  partition = (hash & spill->mask) >> spill->shift;
3023  spill->ntuples[partition]++;
3024 
3025  /*
3026  * All hash values destined for a given partition have some bits in
3027  * common, which causes bad HLL cardinality estimates. Hash the hash to
3028  * get a more uniform distribution.
3029  */
3030  addHyperLogLog(&spill->hll_card[partition], hash_bytes_uint32(hash));
3031 
3032  tapenum = spill->partitions[partition];
3033 
3034  LogicalTapeWrite(tapeset, tapenum, (void *) &hash, sizeof(uint32));
3035  total_written += sizeof(uint32);
3036 
3037  LogicalTapeWrite(tapeset, tapenum, (void *) tuple, tuple->t_len);
3038  total_written += tuple->t_len;
3039 
3040  if (shouldFree)
3041  pfree(tuple);
3042 
3043  return total_written;
3044 }
3045 
3046 /*
3047  * hashagg_batch_new
3048  *
3049  * Construct a HashAggBatch item, which represents one iteration of HashAgg to
3050  * be done.
3051  */
3052 static HashAggBatch *
3053 hashagg_batch_new(LogicalTapeSet *tapeset, int tapenum, int setno,
3054  int64 input_tuples, double input_card, int used_bits)
3055 {
3056  HashAggBatch *batch = palloc0(sizeof(HashAggBatch));
3057 
3058  batch->setno = setno;
3059  batch->used_bits = used_bits;
3060  batch->tapeset = tapeset;
3061  batch->input_tapenum = tapenum;
3062  batch->input_tuples = input_tuples;
3063  batch->input_card = input_card;
3064 
3065  return batch;
3066 }
3067 
3068 /*
3069  * read_spilled_tuple
3070  * read the next tuple from a batch's tape. Return NULL if no more.
3071  */
3072 static MinimalTuple
3074 {
3075  LogicalTapeSet *tapeset = batch->tapeset;
3076  int tapenum = batch->input_tapenum;
3077  MinimalTuple tuple;
3078  uint32 t_len;
3079  size_t nread;
3080  uint32 hash;
3081 
3082  nread = LogicalTapeRead(tapeset, tapenum, &hash, sizeof(uint32));
3083  if (nread == 0)
3084  return NULL;
3085  if (nread != sizeof(uint32))
3086  ereport(ERROR,
3088  errmsg("unexpected EOF for tape %d: requested %zu bytes, read %zu bytes",
3089  tapenum, sizeof(uint32), nread)));
3090  if (hashp != NULL)
3091  *hashp = hash;
3092 
3093  nread = LogicalTapeRead(tapeset, tapenum, &t_len, sizeof(t_len));
3094  if (nread != sizeof(uint32))
3095  ereport(ERROR,
3097  errmsg("unexpected EOF for tape %d: requested %zu bytes, read %zu bytes",
3098  tapenum, sizeof(uint32), nread)));
3099 
3100  tuple = (MinimalTuple) palloc(t_len);
3101  tuple->t_len = t_len;
3102 
3103  nread = LogicalTapeRead(tapeset, tapenum,
3104  (void *) ((char *) tuple + sizeof(uint32)),
3105  t_len - sizeof(uint32));
3106  if (nread != t_len - sizeof(uint32))
3107  ereport(ERROR,
3109  errmsg("unexpected EOF for tape %d: requested %zu bytes, read %zu bytes",
3110  tapenum, t_len - sizeof(uint32), nread)));
3111 
3112  return tuple;
3113 }
3114 
3115 /*
3116  * hashagg_finish_initial_spills
3117  *
3118  * After a HashAggBatch has been processed, it may have spilled tuples to
3119  * disk. If so, turn the spilled partitions into new batches that must later
3120  * be executed.
3121  */
3122 static void
3124 {
3125  int setno;
3126  int total_npartitions = 0;
3127 
3128  if (aggstate->hash_spills != NULL)
3129  {
3130  for (setno = 0; setno < aggstate->num_hashes; setno++)
3131  {
3132  HashAggSpill *spill = &aggstate->hash_spills[setno];
3133 
3134  total_npartitions += spill->npartitions;
3135  hashagg_spill_finish(aggstate, spill, setno);
3136  }
3137 
3138  /*
3139  * We're not processing tuples from outer plan any more; only
3140  * processing batches of spilled tuples. The initial spill structures
3141  * are no longer needed.
3142  */
3143  pfree(aggstate->hash_spills);
3144  aggstate->hash_spills = NULL;
3145  }
3146 
3147  hash_agg_update_metrics(aggstate, false, total_npartitions);
3148  aggstate->hash_spill_mode = false;
3149 }
3150 
3151 /*
3152  * hashagg_spill_finish
3153  *
3154  * Transform spill partitions into new batches.
3155  */
3156 static void
3157 hashagg_spill_finish(AggState *aggstate, HashAggSpill *spill, int setno)
3158 {
3159  int i;
3160  int used_bits = 32 - spill->shift;
3161 
3162  if (spill->npartitions == 0)
3163  return; /* didn't spill */
3164 
3165  for (i = 0; i < spill->npartitions; i++)
3166  {
3168  int tapenum = spill->partitions[i];
3169  HashAggBatch *new_batch;
3170  double cardinality;
3171 
3172  /* if the partition is empty, don't create a new batch of work */
3173  if (spill->ntuples[i] == 0)
3174  continue;
3175 
3176  cardinality = estimateHyperLogLog(&spill->hll_card[i]);
3177  freeHyperLogLog(&spill->hll_card[i]);
3178 
3179  /* rewinding frees the buffer while not in use */
3180  LogicalTapeRewindForRead(tapeset, tapenum,
3182 
3183  new_batch = hashagg_batch_new(tapeset, tapenum, setno,
3184  spill->ntuples[i], cardinality,
3185  used_bits);
3186  aggstate->hash_batches = lcons(new_batch, aggstate->hash_batches);
3187  aggstate->hash_batches_used++;
3188  }
3189 
3190  pfree(spill->ntuples);
3191  pfree(spill->hll_card);
3192  pfree(spill->partitions);
3193 }
3194 
3195 /*
3196  * Free resources related to a spilled HashAgg.
3197  */
3198 static void
3200 {
3201  ListCell *lc;
3202 
3203  /* free spills from initial pass */
3204  if (aggstate->hash_spills != NULL)
3205  {
3206  int setno;
3207 
3208  for (setno = 0; setno < aggstate->num_hashes; setno++)
3209  {
3210  HashAggSpill *spill = &aggstate->hash_spills[setno];
3211 
3212  pfree(spill->ntuples);
3213  pfree(spill->partitions);
3214  }
3215  pfree(aggstate->hash_spills);
3216  aggstate->hash_spills = NULL;
3217  }
3218 
3219  /* free batches */
3220  foreach(lc, aggstate->hash_batches)
3221  {
3222  HashAggBatch *batch = (HashAggBatch *) lfirst(lc);
3223 
3224  pfree(batch);
3225  }
3226  list_free(aggstate->hash_batches);
3227  aggstate->hash_batches = NIL;
3228 
3229  /* close tape set */
3230  if (aggstate->hash_tapeinfo != NULL)
3231  {
3232  HashTapeInfo *tapeinfo = aggstate->hash_tapeinfo;
3233 
3234  LogicalTapeSetClose(tapeinfo->tapeset);
3235  pfree(tapeinfo->freetapes);
3236  pfree(tapeinfo);
3237  aggstate->hash_tapeinfo = NULL;
3238  }
3239 }
3240 
3241 
3242 /* -----------------
3243  * ExecInitAgg
3244  *
3245  * Creates the run-time information for the agg node produced by the
3246  * planner and initializes its outer subtree.
3247  *
3248  * -----------------
3249  */
3250 AggState *
3251 ExecInitAgg(Agg *node, EState *estate, int eflags)
3252 {
3253  AggState *aggstate;
3254  AggStatePerAgg peraggs;
3255  AggStatePerTrans pertransstates;
3256  AggStatePerGroup *pergroups;
3257  Plan *outerPlan;
3258  ExprContext *econtext;
3259  TupleDesc scanDesc;
3260  int max_aggno;
3261  int max_transno;
3262  int numaggrefs;
3263  int numaggs;
3264  int numtrans;
3265  int phase;
3266  int phaseidx;
3267  ListCell *l;
3268  Bitmapset *all_grouped_cols = NULL;
3269  int numGroupingSets = 1;
3270  int numPhases;
3271  int numHashes;
3272  int i = 0;
3273  int j = 0;
3274  bool use_hashing = (node->aggstrategy == AGG_HASHED ||
3275  node->aggstrategy == AGG_MIXED);
3276 
3277  /* check for unsupported flags */
3278  Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
3279 
3280  /*
3281  * create state structure
3282  */
3283  aggstate = makeNode(AggState);
3284  aggstate->ss.ps.plan = (Plan *) node;
3285  aggstate->ss.ps.state = estate;
3286  aggstate->ss.ps.ExecProcNode = ExecAgg;
3287 
3288  aggstate->aggs = NIL;
3289  aggstate->numaggs = 0;
3290  aggstate->numtrans = 0;
3291  aggstate->aggstrategy = node->aggstrategy;
3292  aggstate->aggsplit = node->aggsplit;
3293  aggstate->maxsets = 0;
3294  aggstate->projected_set = -1;
3295  aggstate->current_set = 0;
3296  aggstate->peragg = NULL;
3297  aggstate->pertrans = NULL;
3298  aggstate->curperagg = NULL;
3299  aggstate->curpertrans = NULL;
3300  aggstate->input_done = false;
3301  aggstate->agg_done = false;
3302  aggstate->pergroups = NULL;
3303  aggstate->grp_firstTuple = NULL;
3304  aggstate->sort_in = NULL;
3305  aggstate->sort_out = NULL;
3306 
3307  /*
3308  * phases[0] always exists, but is dummy in sorted/plain mode
3309  */
3310  numPhases = (use_hashing ? 1 : 2);
3311  numHashes = (use_hashing ? 1 : 0);
3312 
3313  /*
3314  * Calculate the maximum number of grouping sets in any phase; this
3315  * determines the size of some allocations. Also calculate the number of
3316  * phases, since all hashed/mixed nodes contribute to only a single phase.
3317  */
3318  if (node->groupingSets)
3319  {
3320  numGroupingSets = list_length(node->groupingSets);
3321 
3322  foreach(l, node->chain)
3323  {
3324  Agg *agg = lfirst(l);
3325 
3326  numGroupingSets = Max(numGroupingSets,
3327  list_length(agg->groupingSets));
3328 
3329  /*
3330  * additional AGG_HASHED aggs become part of phase 0, but all
3331  * others add an extra phase.
3332  */
3333  if (agg->aggstrategy != AGG_HASHED)
3334  ++numPhases;
3335  else
3336  ++numHashes;
3337  }
3338  }
3339 
3340  aggstate->maxsets = numGroupingSets;
3341  aggstate->numphases = numPhases;
3342 
3343  aggstate->aggcontexts = (ExprContext **)
3344  palloc0(sizeof(ExprContext *) * numGroupingSets);
3345 
3346  /*
3347  * Create expression contexts. We need three or more, one for
3348  * per-input-tuple processing, one for per-output-tuple processing, one
3349  * for all the hashtables, and one for each grouping set. The per-tuple
3350  * memory context of the per-grouping-set ExprContexts (aggcontexts)
3351  * replaces the standalone memory context formerly used to hold transition
3352  * values. We cheat a little by using ExecAssignExprContext() to build
3353  * all of them.
3354  *
3355  * NOTE: the details of what is stored in aggcontexts and what is stored
3356  * in the regular per-query memory context are driven by a simple
3357  * decision: we want to reset the aggcontext at group boundaries (if not
3358  * hashing) and in ExecReScanAgg to recover no-longer-wanted space.
3359  */
3360  ExecAssignExprContext(estate, &aggstate->ss.ps);
3361  aggstate->tmpcontext = aggstate->ss.ps.ps_ExprContext;
3362 
3363  for (i = 0; i < numGroupingSets; ++i)
3364  {
3365  ExecAssignExprContext(estate, &aggstate->ss.ps);
3366  aggstate->aggcontexts[i] = aggstate->ss.ps.ps_ExprContext;
3367  }
3368 
3369  if (use_hashing)
3370  aggstate->hashcontext = CreateWorkExprContext(estate);
3371 
3372  ExecAssignExprContext(estate, &aggstate->ss.ps);
3373 
3374  /*
3375  * Initialize child nodes.
3376  *
3377  * If we are doing a hashed aggregation then the child plan does not need
3378  * to handle REWIND efficiently; see ExecReScanAgg.
3379  */
3380  if (node->aggstrategy == AGG_HASHED)
3381  eflags &= ~EXEC_FLAG_REWIND;
3382  outerPlan = outerPlan(node);
3383  outerPlanState(aggstate) = ExecInitNode(outerPlan, estate, eflags);
3384 
3385  /*
3386  * initialize source tuple type.
3387  */
3388  aggstate->ss.ps.outerops =
3390  &aggstate->ss.ps.outeropsfixed);
3391  aggstate->ss.ps.outeropsset = true;
3392 
3393  ExecCreateScanSlotFromOuterPlan(estate, &aggstate->ss,
3394  aggstate->ss.ps.outerops);
3395  scanDesc = aggstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor;
3396 
3397  /*
3398  * If there are more than two phases (including a potential dummy phase
3399  * 0), input will be resorted using tuplesort. Need a slot for that.
3400  */
3401  if (numPhases > 2)
3402  {
3403  aggstate->sort_slot = ExecInitExtraTupleSlot(estate, scanDesc,
3405 
3406  /*
3407  * The output of the tuplesort, and the output from the outer child
3408  * might not use the same type of slot. In most cases the child will
3409  * be a Sort, and thus return a TTSOpsMinimalTuple type slot - but the
3410  * input can also be presorted due an index, in which case it could be
3411  * a different type of slot.
3412  *
3413  * XXX: For efficiency it would be good to instead/additionally
3414  * generate expressions with corresponding settings of outerops* for
3415  * the individual phases - deforming is often a bottleneck for
3416  * aggregations with lots of rows per group. If there's multiple
3417  * sorts, we know that all but the first use TTSOpsMinimalTuple (via
3418  * the nodeAgg.c internal tuplesort).
3419  */
3420  if (aggstate->ss.ps.outeropsfixed &&
3421  aggstate->ss.ps.outerops != &TTSOpsMinimalTuple)
3422  aggstate->ss.ps.outeropsfixed = false;
3423  }
3424 
3425  /*
3426  * Initialize result type, slot and projection.
3427  */
3429  ExecAssignProjectionInfo(&aggstate->ss.ps, NULL);
3430 
3431  /*
3432  * initialize child expressions
3433  *
3434  * We expect the parser to have checked that no aggs contain other agg
3435  * calls in their arguments (and just to be sure, we verify it again while
3436  * initializing the plan node). This would make no sense under SQL
3437  * semantics, and it's forbidden by the spec. Because it is true, we
3438  * don't need to worry about evaluating the aggs in any particular order.
3439  *
3440  * Note: execExpr.c finds Aggrefs for us, and adds them to aggstate->aggs.
3441  * Aggrefs in the qual are found here; Aggrefs in the targetlist are found
3442  * during ExecAssignProjectionInfo, above.
3443  */
3444  aggstate->ss.ps.qual =
3445  ExecInitQual(node->plan.qual, (PlanState *) aggstate);
3446 
3447  /*
3448  * We should now have found all Aggrefs in the targetlist and quals.
3449  */
3450  numaggrefs = list_length(aggstate->aggs);
3451  max_aggno = -1;
3452  max_transno = -1;
3453  foreach(l, aggstate->aggs)
3454  {
3455  Aggref *aggref = (Aggref *) lfirst(l);
3456 
3457  max_aggno = Max(max_aggno, aggref->aggno);
3458  max_transno = Max(max_transno, aggref->aggtransno);
3459  }
3460  numaggs = max_aggno + 1;
3461  numtrans = max_transno + 1;
3462 
3463  /*
3464  * For each phase, prepare grouping set data and fmgr lookup data for
3465  * compare functions. Accumulate all_grouped_cols in passing.
3466  */
3467  aggstate->phases = palloc0(numPhases * sizeof(AggStatePerPhaseData));
3468 
3469  aggstate->num_hashes = numHashes;
3470  if (numHashes)
3471  {
3472  aggstate->perhash = palloc0(sizeof(AggStatePerHashData) * numHashes);
3473  aggstate->phases[0].numsets = 0;
3474  aggstate->phases[0].gset_lengths = palloc(numHashes * sizeof(int));
3475  aggstate->phases[0].grouped_cols = palloc(numHashes * sizeof(Bitmapset *));
3476  }
3477 
3478  phase = 0;
3479  for (phaseidx = 0; phaseidx <= list_length(node->chain); ++phaseidx)
3480  {
3481  Agg *aggnode;
3482  Sort *sortnode;
3483 
3484  if (phaseidx > 0)
3485  {
3486  aggnode = list_nth_node(Agg, node->chain, phaseidx - 1);
3487  sortnode = castNode(Sort, aggnode->plan.lefttree);
3488  }
3489  else
3490  {
3491  aggnode = node;
3492  sortnode = NULL;
3493  }
3494 
3495  Assert(phase <= 1 || sortnode);
3496 
3497  if (aggnode->aggstrategy == AGG_HASHED
3498  || aggnode->aggstrategy == AGG_MIXED)
3499  {
3500  AggStatePerPhase phasedata = &aggstate->phases[0];
3501  AggStatePerHash perhash;
3502  Bitmapset *cols = NULL;
3503 
3504  Assert(phase == 0);
3505  i = phasedata->numsets++;
3506  perhash = &aggstate->perhash[i];
3507 
3508  /* phase 0 always points to the "real" Agg in the hash case */
3509  phasedata->aggnode = node;
3510  phasedata->aggstrategy = node->aggstrategy;
3511 
3512  /* but the actual Agg node representing this hash is saved here */
3513  perhash->aggnode = aggnode;
3514 
3515  phasedata->gset_lengths[i] = perhash->numCols = aggnode->numCols;
3516 
3517  for (j = 0; j < aggnode->numCols; ++j)
3518  cols = bms_add_member(cols, aggnode->grpColIdx[j]);
3519 
3520  phasedata->grouped_cols[i] = cols;
3521 
3522  all_grouped_cols = bms_add_members(all_grouped_cols, cols);
3523  continue;
3524  }
3525  else
3526  {
3527  AggStatePerPhase phasedata = &aggstate->phases[++phase];
3528  int num_sets;
3529 
3530  phasedata->numsets = num_sets = list_length(aggnode->groupingSets);
3531 
3532  if (num_sets)
3533  {
3534  phasedata->gset_lengths = palloc(num_sets * sizeof(int));
3535  phasedata->grouped_cols = palloc(num_sets * sizeof(Bitmapset *));
3536 
3537  i = 0;
3538  foreach(l, aggnode->groupingSets)
3539  {
3540  int current_length = list_length(lfirst(l));
3541  Bitmapset *cols = NULL;
3542 
3543  /* planner forces this to be correct */
3544  for (j = 0; j < current_length; ++j)
3545  cols = bms_add_member(cols, aggnode->grpColIdx[j]);
3546 
3547  phasedata->grouped_cols[i] = cols;
3548  phasedata->gset_lengths[i] = current_length;
3549 
3550  ++i;
3551  }
3552 
3553  all_grouped_cols = bms_add_members(all_grouped_cols,
3554  phasedata->grouped_cols[0]);
3555  }
3556  else
3557  {
3558  Assert(phaseidx == 0);
3559 
3560  phasedata->gset_lengths = NULL;
3561  phasedata->grouped_cols = NULL;
3562  }
3563 
3564  /*
3565  * If we are grouping, precompute fmgr lookup data for inner loop.
3566  */
3567  if (aggnode->aggstrategy == AGG_SORTED)
3568  {
3569  int i = 0;
3570 
3571  Assert(aggnode->numCols > 0);
3572 
3573  /*
3574  * Build a separate function for each subset of columns that
3575  * need to be compared.
3576  */
3577  phasedata->eqfunctions =
3578  (ExprState **) palloc0(aggnode->numCols * sizeof(ExprState *));
3579 
3580  /* for each grouping set */
3581  for (i = 0; i < phasedata->numsets; i++)
3582  {
3583  int length = phasedata->gset_lengths[i];
3584 
3585  if (phasedata->eqfunctions[length - 1] != NULL)
3586  continue;
3587 
3588  phasedata->eqfunctions[length - 1] =
3589  execTuplesMatchPrepare(scanDesc,
3590  length,
3591  aggnode->grpColIdx,
3592  aggnode->grpOperators,
3593  aggnode->grpCollations,
3594  (PlanState *) aggstate);
3595  }
3596 
3597  /* and for all grouped columns, unless already computed */
3598  if (phasedata->eqfunctions[aggnode->numCols - 1] == NULL)
3599  {
3600  phasedata->eqfunctions[aggnode->numCols - 1] =
3601  execTuplesMatchPrepare(scanDesc,
3602  aggnode->numCols,
3603  aggnode->grpColIdx,
3604  aggnode->grpOperators,
3605  aggnode->grpCollations,
3606  (PlanState *) aggstate);
3607  }
3608  }
3609 
3610  phasedata->aggnode = aggnode;
3611  phasedata->aggstrategy = aggnode->aggstrategy;
3612  phasedata->sortnode = sortnode;
3613  }
3614  }
3615 
3616  /*
3617  * Convert all_grouped_cols to a descending-order list.
3618  */
3619  i = -1;
3620  while ((i = bms_next_member(all_grouped_cols, i)) >= 0)
3621  aggstate->all_grouped_cols = lcons_int(i, aggstate->all_grouped_cols);
3622 
3623  /*
3624  * Set up aggregate-result storage in the output expr context, and also
3625  * allocate my private per-agg working storage
3626  */
3627  econtext = aggstate->ss.ps.ps_ExprContext;
3628  econtext->ecxt_aggvalues = (Datum *) palloc0(sizeof(Datum) * numaggs);
3629  econtext->ecxt_aggnulls = (bool *) palloc0(sizeof(bool) * numaggs);
3630 
3631  peraggs = (AggStatePerAgg) palloc0(sizeof(AggStatePerAggData) * numaggs);
3632  pertransstates = (AggStatePerTrans) palloc0(sizeof(AggStatePerTransData) * numtrans);
3633 
3634  aggstate->peragg = peraggs;
3635  aggstate->pertrans = pertransstates;
3636 
3637 
3638  aggstate->all_pergroups =
3640  * (numGroupingSets + numHashes));
3641  pergroups = aggstate->all_pergroups;
3642 
3643  if (node->aggstrategy != AGG_HASHED)
3644  {
3645  for (i = 0; i < numGroupingSets; i++)
3646  {
3647  pergroups[i] = (AggStatePerGroup) palloc0(sizeof(AggStatePerGroupData)
3648  * numaggs);
3649  }
3650 
3651  aggstate->pergroups = pergroups;
3652  pergroups += numGroupingSets;
3653  }
3654 
3655  /*
3656  * Hashing can only appear in the initial phase.
3657  */
3658  if (use_hashing)
3659  {
3660  Plan *outerplan = outerPlan(node);
3661  uint64 totalGroups = 0;
3662  int i;
3663 
3664  aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt,
3665  "HashAgg meta context",
3667  aggstate->hash_spill_rslot = ExecInitExtraTupleSlot(estate, scanDesc,
3669  aggstate->hash_spill_wslot = ExecInitExtraTupleSlot(estate, scanDesc,
3670  &TTSOpsVirtual);
3671 
3672  /* this is an array of pointers, not structures */
3673  aggstate->hash_pergroup = pergroups;
3674 
3675  aggstate->hashentrysize = hash_agg_entry_size(aggstate->numtrans,
3676  outerplan->plan_width,
3677  node->transitionSpace);
3678 
3679  /*
3680  * Consider all of the grouping sets together when setting the limits
3681  * and estimating the number of partitions. This can be inaccurate
3682  * when there is more than one grouping set, but should still be
3683  * reasonable.
3684  */
3685  for (i = 0; i < aggstate->num_hashes; i++)
3686  totalGroups += aggstate->perhash[i].aggnode->numGroups;
3687 
3688  hash_agg_set_limits(aggstate->hashentrysize, totalGroups, 0,
3689  &aggstate->hash_mem_limit,
3690  &aggstate->hash_ngroups_limit,
3691  &aggstate->hash_planned_partitions);
3692  find_hash_columns(aggstate);
3693 
3694  /* Skip massive memory allocation if we are just doing EXPLAIN */
3695  if (!(eflags & EXEC_FLAG_EXPLAIN_ONLY))
3696  build_hash_tables(aggstate);
3697 
3698  aggstate->table_filled = false;
3699 
3700  /* Initialize this to 1, meaning nothing spilled, yet */
3701  aggstate->hash_batches_used = 1;
3702  }
3703 
3704  /*
3705  * Initialize current phase-dependent values to initial phase. The initial
3706  * phase is 1 (first sort pass) for all strategies that use sorting (if
3707  * hashing is being done too, then phase 0 is processed last); but if only
3708  * hashing is being done, then phase 0 is all there is.
3709  */
3710  if (node->aggstrategy == AGG_HASHED)
3711  {
3712  aggstate->current_phase = 0;
3713  initialize_phase(aggstate, 0);
3714  select_current_set(aggstate, 0, true);
3715  }
3716  else
3717  {
3718  aggstate->current_phase = 1;
3719  initialize_phase(aggstate, 1);
3720  select_current_set(aggstate, 0, false);
3721  }
3722 
3723  /*
3724  * Perform lookups of aggregate function info, and initialize the
3725  * unchanging fields of the per-agg and per-trans data.
3726  */
3727  foreach(l, aggstate->aggs)
3728  {
3729  Aggref *aggref = lfirst(l);
3730  AggStatePerAgg peragg;
3731  AggStatePerTrans pertrans;
3732  Oid aggTransFnInputTypes[FUNC_MAX_ARGS];
3733  int numAggTransFnArgs;
3734  int numDirectArgs;
3735  HeapTuple aggTuple;
3736  Form_pg_aggregate aggform;
3737  AclResult aclresult;
3738  Oid finalfn_oid;
3739  Oid serialfn_oid,
3740  deserialfn_oid;
3741  Oid aggOwner;
3742  Expr *finalfnexpr;
3743  Oid aggtranstype;
3744 
3745  /* Planner should have assigned aggregate to correct level */
3746  Assert(aggref->agglevelsup == 0);
3747  /* ... and the split mode should match */
3748  Assert(aggref->aggsplit == aggstate->aggsplit);
3749 
3750  peragg = &peraggs[aggref->aggno];
3751 
3752  /* Check if we initialized the state for this aggregate already. */
3753  if (peragg->aggref != NULL)
3754  continue;
3755 
3756  peragg->aggref = aggref;
3757  peragg->transno = aggref->aggtransno;
3758 
3759  /* Fetch the pg_aggregate row */
3760  aggTuple = SearchSysCache1(AGGFNOID,
3761  ObjectIdGetDatum(aggref->aggfnoid));
3762  if (!HeapTupleIsValid(aggTuple))
3763  elog(ERROR, "cache lookup failed for aggregate %u",
3764  aggref->aggfnoid);
3765  aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple);
3766 
3767  /* Check permission to call aggregate function */
3768  aclresult = pg_proc_aclcheck(aggref->aggfnoid, GetUserId(),
3769  ACL_EXECUTE);
3770  if (aclresult != ACLCHECK_OK)
3771  aclcheck_error(aclresult, OBJECT_AGGREGATE,
3772  get_func_name(aggref->aggfnoid));
3774 
3775  /* planner recorded transition state type in the Aggref itself */
3776  aggtranstype = aggref->aggtranstype;
3777  Assert(OidIsValid(aggtranstype));
3778 
3779  /* Final function only required if we're finalizing the aggregates */
3780  if (DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit))
3781  peragg->finalfn_oid = finalfn_oid = InvalidOid;
3782  else
3783  peragg->finalfn_oid = finalfn_oid = aggform->aggfinalfn;
3784 
3785  serialfn_oid = InvalidOid;
3786  deserialfn_oid = InvalidOid;
3787 
3788  /*
3789  * Check if serialization/deserialization is required. We only do it
3790  * for aggregates that have transtype INTERNAL.
3791  */
3792  if (aggtranstype == INTERNALOID)
3793  {
3794  /*
3795  * The planner should only have generated a serialize agg node if
3796  * every aggregate with an INTERNAL state has a serialization
3797  * function. Verify that.
3798  */
3799  if (DO_AGGSPLIT_SERIALIZE(aggstate->aggsplit))
3800  {
3801  /* serialization only valid when not running finalfn */
3802  Assert(DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit));
3803 
3804  if (!OidIsValid(aggform->aggserialfn))
3805  elog(ERROR, "serialfunc not provided for serialization aggregation");
3806  serialfn_oid = aggform->aggserialfn;
3807  }
3808 
3809  /* Likewise for deserialization functions */
3810  if (DO_AGGSPLIT_DESERIALIZE(aggstate->aggsplit))
3811  {
3812  /* deserialization only valid when combining states */
3813  Assert(DO_AGGSPLIT_COMBINE(aggstate->aggsplit));
3814 
3815  if (!OidIsValid(aggform->aggdeserialfn))
3816  elog(ERROR, "deserialfunc not provided for deserialization aggregation");
3817  deserialfn_oid = aggform->aggdeserialfn;
3818  }
3819  }
3820 
3821  /* Check that aggregate owner has permission to call component fns */
3822  {
3823  HeapTuple procTuple;
3824 
3825  procTuple = SearchSysCache1(PROCOID,
3826  ObjectIdGetDatum(aggref->aggfnoid));
3827  if (!HeapTupleIsValid(procTuple))
3828  elog(ERROR, "cache lookup failed for function %u",
3829  aggref->aggfnoid);
3830  aggOwner = ((Form_pg_proc) GETSTRUCT(procTuple))->proowner;
3831  ReleaseSysCache(procTuple);
3832 
3833  if (OidIsValid(finalfn_oid))
3834  {
3835  aclresult = pg_proc_aclcheck(finalfn_oid, aggOwner,
3836  ACL_EXECUTE);
3837  if (aclresult != ACLCHECK_OK)
3838  aclcheck_error(aclresult, OBJECT_FUNCTION,
3839  get_func_name(finalfn_oid));
3840  InvokeFunctionExecuteHook(finalfn_oid);
3841  }
3842  if (OidIsValid(serialfn_oid))
3843  {
3844  aclresult = pg_proc_aclcheck(serialfn_oid, aggOwner,
3845  ACL_EXECUTE);
3846  if (aclresult != ACLCHECK_OK)
3847  aclcheck_error(aclresult, OBJECT_FUNCTION,
3848  get_func_name(serialfn_oid));
3849  InvokeFunctionExecuteHook(serialfn_oid);
3850  }
3851  if (OidIsValid(deserialfn_oid))
3852  {
3853  aclresult = pg_proc_aclcheck(deserialfn_oid, aggOwner,
3854  ACL_EXECUTE);
3855  if (aclresult != ACLCHECK_OK)
3856  aclcheck_error(aclresult, OBJECT_FUNCTION,
3857  get_func_name(deserialfn_oid));
3858  InvokeFunctionExecuteHook(deserialfn_oid);
3859  }
3860  }
3861 
3862  /*
3863  * Get actual datatypes of the (nominal) aggregate inputs. These
3864  * could be different from the agg's declared input types, when the
3865  * agg accepts ANY or a polymorphic type.
3866  */
3867  numAggTransFnArgs = get_aggregate_argtypes(aggref,
3868  aggTransFnInputTypes);
3869 
3870  /* Count the "direct" arguments, if any */
3871  numDirectArgs = list_length(aggref->aggdirectargs);
3872 
3873  /* Detect how many arguments to pass to the finalfn */
3874  if (aggform->aggfinalextra)
3875  peragg->numFinalArgs = numAggTransFnArgs + 1;
3876  else
3877  peragg->numFinalArgs = numDirectArgs + 1;
3878 
3879  /* Initialize any direct-argument expressions */
3880  peragg->aggdirectargs = ExecInitExprList(aggref->aggdirectargs,
3881  (PlanState *) aggstate);
3882 
3883  /*
3884  * build expression trees using actual argument & result types for the
3885  * finalfn, if it exists and is required.
3886  */
3887  if (OidIsValid(finalfn_oid))
3888  {
3889  build_aggregate_finalfn_expr(aggTransFnInputTypes,
3890  peragg->numFinalArgs,
3891  aggtranstype,
3892  aggref->aggtype,
3893  aggref->inputcollid,
3894  finalfn_oid,
3895  &finalfnexpr);
3896  fmgr_info(finalfn_oid, &peragg->finalfn);
3897  fmgr_info_set_expr((Node *) finalfnexpr, &peragg->finalfn);
3898  }
3899 
3900  /* get info about the output value's datatype */
3901  get_typlenbyval(aggref->aggtype,
3902  &peragg->resulttypeLen,
3903  &peragg->resulttypeByVal);
3904 
3905  /*
3906  * Build working state for invoking the transition function, if we
3907  * haven't done it already.
3908  */
3909  pertrans = &pertransstates[aggref->aggtransno];
3910  if (pertrans->aggref == NULL)
3911  {
3912  Datum textInitVal;
3913  Datum initValue;
3914  bool initValueIsNull;
3915  Oid transfn_oid;
3916 
3917  /*
3918  * If this aggregation is performing state combines, then instead
3919  * of using the transition function, we'll use the combine
3920  * function.
3921  */
3922  if (DO_AGGSPLIT_COMBINE(aggstate->aggsplit))
3923  {
3924  transfn_oid = aggform->aggcombinefn;
3925 
3926  /* If not set then the planner messed up */
3927  if (!OidIsValid(transfn_oid))
3928  elog(ERROR, "combinefn not set for aggregate function");
3929  }
3930  else
3931  transfn_oid = aggform->aggtransfn;
3932 
3933  aclresult = pg_proc_aclcheck(transfn_oid, aggOwner, ACL_EXECUTE);
3934  if (aclresult != ACLCHECK_OK)
3935  aclcheck_error(aclresult, OBJECT_FUNCTION,
3936  get_func_name(transfn_oid));
3937  InvokeFunctionExecuteHook(transfn_oid);
3938 
3939  /*
3940  * initval is potentially null, so don't try to access it as a
3941  * struct field. Must do it the hard way with SysCacheGetAttr.
3942  */
3943  textInitVal = SysCacheGetAttr(AGGFNOID, aggTuple,
3944  Anum_pg_aggregate_agginitval,
3945  &initValueIsNull);
3946  if (initValueIsNull)
3947  initValue = (Datum) 0;
3948  else
3949  initValue = GetAggInitVal(textInitVal, aggtranstype);
3950 
3951  if (DO_AGGSPLIT_COMBINE(aggstate->aggsplit))
3952  {
3953  Oid combineFnInputTypes[] = {aggtranstype,
3954  aggtranstype};
3955 
3956  /*
3957  * When combining there's only one input, the to-be-combined
3958  * transition value. The transition value is not counted
3959  * here.
3960  */
3961  pertrans->numTransInputs = 1;
3962 
3963  /* aggcombinefn always has two arguments of aggtranstype */
3964  build_pertrans_for_aggref(pertrans, aggstate, estate,
3965  aggref, transfn_oid, aggtranstype,
3966  serialfn_oid, deserialfn_oid,
3967  initValue, initValueIsNull,
3968  combineFnInputTypes, 2);
3969 
3970  /*
3971  * Ensure that a combine function to combine INTERNAL states
3972  * is not strict. This should have been checked during CREATE
3973  * AGGREGATE, but the strict property could have been changed
3974  * since then.
3975  */
3976  if (pertrans->transfn.fn_strict && aggtranstype == INTERNALOID)
3977  ereport(ERROR,
3978  (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
3979  errmsg("combine function with transition type %s must not be declared STRICT",
3980  format_type_be(aggtranstype))));
3981  }
3982  else
3983  {
3984  /* Detect how many arguments to pass to the transfn */
3985  if (AGGKIND_IS_ORDERED_SET(aggref->aggkind))
3986  pertrans->numTransInputs = list_length(aggref->args);
3987  else
3988  pertrans->numTransInputs = numAggTransFnArgs;
3989 
3990  build_pertrans_for_aggref(pertrans, aggstate, estate,
3991  aggref, transfn_oid, aggtranstype,
3992  serialfn_oid, deserialfn_oid,
3993  initValue, initValueIsNull,
3994  aggTransFnInputTypes,
3995  numAggTransFnArgs);
3996 
3997  /*
3998  * If the transfn is strict and the initval is NULL, make sure
3999  * input type and transtype are the same (or at least
4000  * binary-compatible), so that it's OK to use the first
4001  * aggregated input value as the initial transValue. This
4002  * should have been checked at agg definition time, but we
4003  * must check again in case the transfn's strictness property
4004  * has been changed.
4005  */
4006  if (pertrans->transfn.fn_strict && pertrans->initValueIsNull)
4007  {
4008  if (numAggTransFnArgs <= numDirectArgs ||
4009  !IsBinaryCoercible(aggTransFnInputTypes[numDirectArgs],
4010  aggtranstype))
4011  ereport(ERROR,
4012  (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
4013  errmsg("aggregate %u needs to have compatible input type and transition type",
4014  aggref->aggfnoid)));
4015  }
4016  }
4017  }
4018  else
4019  pertrans->aggshared = true;
4020  ReleaseSysCache(aggTuple);
4021  }
4022 
4023  /*
4024  * Update aggstate->numaggs to be the number of unique aggregates found.
4025  * Also set numstates to the number of unique transition states found.
4026  */
4027  aggstate->numaggs = numaggs;
4028  aggstate->numtrans = numtrans;
4029 
4030  /*
4031  * Last, check whether any more aggregates got added onto the node while
4032  * we processed the expressions for the aggregate arguments (including not
4033  * only the regular arguments and FILTER expressions handled immediately
4034  * above, but any direct arguments we might've handled earlier). If so,
4035  * we have nested aggregate functions, which is semantically nonsensical,
4036  * so complain. (This should have been caught by the parser, so we don't
4037  * need to work hard on a helpful error message; but we defend against it
4038  * here anyway, just to be sure.)
4039  */
4040  if (numaggrefs != list_length(aggstate->aggs))
4041  ereport(ERROR,
4042  (errcode(ERRCODE_GROUPING_ERROR),
4043  errmsg("aggregate function calls cannot be nested")));
4044 
4045  /*
4046  * Build expressions doing all the transition work at once. We build a
4047  * different one for each phase, as the number of transition function
4048  * invocation can differ between phases. Note this'll work both for
4049  * transition and combination functions (although there'll only be one
4050  * phase in the latter case).
4051  */
4052  for (phaseidx = 0; phaseidx < aggstate->numphases; phaseidx++)
4053  {
4054  AggStatePerPhase phase = &aggstate->phases[phaseidx];
4055  bool dohash = false;
4056  bool dosort = false;
4057 
4058  /* phase 0 doesn't necessarily exist */
4059  if (!phase->aggnode)
4060  continue;
4061 
4062  if (aggstate->aggstrategy == AGG_MIXED && phaseidx == 1)
4063  {
4064  /*
4065  * Phase one, and only phase one, in a mixed agg performs both
4066  * sorting and aggregation.
4067  */
4068  dohash = true;
4069  dosort = true;
4070  }
4071  else if (aggstate->aggstrategy == AGG_MIXED && phaseidx == 0)
4072  {
4073  /*
4074  * No need to compute a transition function for an AGG_MIXED phase
4075  * 0 - the contents of the hashtables will have been computed
4076  * during phase 1.
4077  */
4078  continue;
4079  }
4080  else if (phase->aggstrategy == AGG_PLAIN ||
4081  phase->aggstrategy == AGG_SORTED)
4082  {
4083  dohash = false;
4084  dosort = true;
4085  }
4086  else if (phase->aggstrategy == AGG_HASHED)
4087  {
4088  dohash = true;
4089  dosort = false;
4090  }
4091  else
4092  Assert(false);
4093 
4094  phase->evaltrans = ExecBuildAggTrans(aggstate, phase, dosort, dohash,
4095  false);
4096 
4097  /* cache compiled expression for outer slot without NULL check */
4098  phase->evaltrans_cache[0][0] = phase->evaltrans;
4099  }
4100 
4101  return aggstate;
4102 }
4103 
4104 /*
4105  * Build the state needed to calculate a state value for an aggregate.
4106  *
4107  * This initializes all the fields in 'pertrans'. 'aggref' is the aggregate
4108  * to initialize the state for. 'transfn_oid', 'aggtranstype', and the rest
4109  * of the arguments could be calculated from 'aggref', but the caller has
4110  * calculated them already, so might as well pass them.
4111  *
4112  * 'transfn_oid' may be either the Oid of the aggtransfn or the aggcombinefn.
4113  */
4114 static void
4116  AggState *aggstate, EState *estate,
4117  Aggref *aggref,
4118  Oid transfn_oid, Oid aggtranstype,
4119  Oid aggserialfn, Oid aggdeserialfn,
4120  Datum initValue, bool initValueIsNull,
4121  Oid *inputTypes, int numArguments)
4122 {
4123  int numGroupingSets = Max(aggstate->maxsets, 1);
4124  Expr *transfnexpr;
4125  int numTransArgs;
4126  Expr *serialfnexpr = NULL;
4127  Expr *deserialfnexpr = NULL;
4128  ListCell *lc;
4129  int numInputs;
4130  int numDirectArgs;
4131  List *sortlist;
4132  int numSortCols;
4133  int numDistinctCols;
4134  int i;
4135 
4136  /* Begin filling in the pertrans data */
4137  pertrans->aggref = aggref;
4138  pertrans->aggshared = false;
4139  pertrans->aggCollation = aggref->inputcollid;
4140  pertrans->transfn_oid = transfn_oid;
4141  pertrans->serialfn_oid = aggserialfn;
4142  pertrans->deserialfn_oid = aggdeserialfn;
4143  pertrans->initValue = initValue;
4144  pertrans->initValueIsNull = initValueIsNull;
4145 
4146  /* Count the "direct" arguments, if any */
4147  numDirectArgs = list_length(aggref->aggdirectargs);
4148 
4149  /* Count the number of aggregated input columns */
4150  pertrans->numInputs = numInputs = list_length(aggref->args);
4151 
4152  pertrans->aggtranstype = aggtranstype;
4153 
4154  /* account for the current transition state */
4155  numTransArgs = pertrans->numTransInputs + 1;
4156 
4157  /*
4158  * Set up infrastructure for calling the transfn. Note that invtrans is
4159  * not needed here.
4160  */
4161  build_aggregate_transfn_expr(inputTypes,
4162  numArguments,
4163  numDirectArgs,
4164  aggref->aggvariadic,
4165  aggtranstype,
4166  aggref->inputcollid,
4167  transfn_oid,
4168  InvalidOid,
4169  &transfnexpr,
4170  NULL);
4171 
4172  fmgr_info(transfn_oid, &pertrans->transfn);
4173  fmgr_info_set_expr((Node *) transfnexpr, &pertrans->transfn);
4174 
4175  pertrans->transfn_fcinfo =
4178  &pertrans->transfn,
4179  numTransArgs,
4180  pertrans->aggCollation,
4181  (void *) aggstate, NULL);
4182 
4183  /* get info about the state value's datatype */
4184  get_typlenbyval(aggtranstype,
4185  &pertrans->transtypeLen,
4186  &pertrans->transtypeByVal);
4187 
4188  if (OidIsValid(aggserialfn))
4189  {
4190  build_aggregate_serialfn_expr(aggserialfn,
4191  &serialfnexpr);
4192  fmgr_info(aggserialfn, &pertrans->serialfn);
4193  fmgr_info_set_expr((Node *) serialfnexpr, &pertrans->serialfn);
4194 
4195  pertrans->serialfn_fcinfo =
4198  &pertrans->serialfn,
4199  1,
4200  InvalidOid,
4201  (void *) aggstate, NULL);
4202  }
4203 
4204  if (OidIsValid(aggdeserialfn))
4205  {
4206  build_aggregate_deserialfn_expr(aggdeserialfn,
4207  &deserialfnexpr);
4208  fmgr_info(aggdeserialfn, &pertrans->deserialfn);
4209  fmgr_info_set_expr((Node *) deserialfnexpr, &pertrans->deserialfn);
4210 
4211  pertrans->deserialfn_fcinfo =
4214  &pertrans->deserialfn,
4215  2,
4216  InvalidOid,
4217  (void *) aggstate, NULL);
4218 
4219  }
4220 
4221  /*
4222  * If we're doing either DISTINCT or ORDER BY for a plain agg, then we
4223  * have a list of SortGroupClause nodes; fish out the data in them and
4224  * stick them into arrays. We ignore ORDER BY for an ordered-set agg,
4225  * however; the agg's transfn and finalfn are responsible for that.
4226  *
4227  * Note that by construction, if there is a DISTINCT clause then the ORDER
4228  * BY clause is a prefix of it (see transformDistinctClause).
4229  */
4230  if (AGGKIND_IS_ORDERED_SET(aggref->aggkind))
4231  {
4232  sortlist = NIL;
4233  numSortCols = numDistinctCols = 0;
4234  }
4235  else if (aggref->aggdistinct)
4236  {
4237  sortlist = aggref->aggdistinct;
4238  numSortCols = numDistinctCols = list_length(sortlist);
4239  Assert(numSortCols >= list_length(aggref->aggorder));
4240  }
4241  else
4242  {
4243  sortlist = aggref->aggorder;
4244  numSortCols = list_length(sortlist);
4245  numDistinctCols = 0;
4246  }
4247 
4248  pertrans->numSortCols = numSortCols;
4249  pertrans->numDistinctCols = numDistinctCols;
4250 
4251  /*
4252  * If we have either sorting or filtering to do, create a tupledesc and
4253  * slot corresponding to the aggregated inputs (including sort
4254  * expressions) of the agg.
4255  */
4256  if (numSortCols > 0 || aggref->aggfilter)
4257  {
4258  pertrans->sortdesc = ExecTypeFromTL(aggref->args);
4259  pertrans->sortslot =
4260  ExecInitExtraTupleSlot(estate, pertrans->sortdesc,
4262  }
4263 
4264  if (numSortCols > 0)
4265  {
4266  /*
4267  * We don't implement DISTINCT or ORDER BY aggs in the HASHED case
4268  * (yet)
4269  */
4270  Assert(aggstate->aggstrategy != AGG_HASHED && aggstate->aggstrategy != AGG_MIXED);
4271 
4272  /* ORDER BY aggregates are not supported with partial aggregation */
4273  Assert(!DO_AGGSPLIT_COMBINE(aggstate->aggsplit));
4274 
4275  /* If we have only one input, we need its len/byval info. */
4276  if (numInputs == 1)
4277  {
4278  get_typlenbyval(inputTypes[numDirectArgs],
4279  &pertrans->inputtypeLen,
4280  &pertrans->inputtypeByVal);
4281  }
4282  else if (numDistinctCols > 0)
4283  {
4284  /* we will need an extra slot to store prior values */
4285  pertrans->uniqslot =
4286  ExecInitExtraTupleSlot(estate, pertrans->sortdesc,
4288  }
4289 
4290  /* Extract the sort information for use later */
4291  pertrans->sortColIdx =
4292  (AttrNumber *) palloc(numSortCols * sizeof(AttrNumber));
4293  pertrans->sortOperators =
4294  (Oid *) palloc(numSortCols * sizeof(Oid));
4295  pertrans->sortCollations =
4296  (Oid *) palloc(numSortCols * sizeof(Oid));
4297  pertrans->sortNullsFirst =
4298  (bool *) palloc(numSortCols * sizeof(bool));
4299 
4300  i = 0;
4301  foreach(lc, sortlist)
4302  {
4303  SortGroupClause *sortcl = (SortGroupClause *) lfirst(lc);
4304  TargetEntry *tle = get_sortgroupclause_tle(sortcl, aggref->args);
4305 
4306  /* the parser should have made sure of this */
4307  Assert(OidIsValid(sortcl->sortop));
4308 
4309  pertrans->sortColIdx[i] = tle->resno;
4310  pertrans->sortOperators[i] = sortcl->sortop;
4311  pertrans->sortCollations[i] = exprCollation((Node *) tle->expr);
4312  pertrans->sortNullsFirst[i] = sortcl->nulls_first;
4313  i++;
4314  }
4315  Assert(i == numSortCols);
4316  }
4317 
4318  if (aggref->aggdistinct)
4319  {
4320  Oid *ops;
4321 
4322  Assert(numArguments > 0);
4323  Assert(list_length(aggref->aggdistinct) == numDistinctCols);
4324 
4325  ops = palloc(numDistinctCols * sizeof(Oid));
4326 
4327  i = 0;
4328  foreach(lc, aggref->aggdistinct)
4329  ops[i++] = ((SortGroupClause *) lfirst(lc))->eqop;
4330 
4331  /* lookup / build the necessary comparators */
4332  if (numDistinctCols == 1)
4333  fmgr_info(get_opcode(ops[0]), &pertrans->equalfnOne);
4334  else
4335  pertrans->equalfnMulti =
4336  execTuplesMatchPrepare(pertrans->sortdesc,
4337  numDistinctCols,
4338  pertrans->sortColIdx,
4339  ops,
4340  pertrans->sortCollations,
4341  &aggstate->ss.ps);
4342  pfree(ops);
4343  }
4344 
4345  pertrans->sortstates = (Tuplesortstate **)
4346  palloc0(sizeof(Tuplesortstate *) * numGroupingSets);
4347 }
4348 
4349 
4350 static Datum
4351 GetAggInitVal(Datum textInitVal, Oid transtype)
4352 {
4353  Oid typinput,
4354  typioparam;
4355  char *strInitVal;
4356  Datum initVal;
4357 
4358  getTypeInputInfo(transtype, &typinput, &typioparam);
4359  strInitVal = TextDatumGetCString(textInitVal);
4360  initVal = OidInputFunctionCall(typinput, strInitVal,
4361  typioparam, -1);
4362  pfree(strInitVal);
4363  return initVal;
4364 }
4365 
4366 void
4368 {
4370  int transno;
4371  int numGroupingSets = Max(node->maxsets, 1);
4372  int setno;
4373 
4374  /*
4375  * When ending a parallel worker, copy the statistics gathered by the
4376  * worker back into shared memory so that it can be picked up by the main
4377  * process to report in EXPLAIN ANALYZE.
4378  */
4379  if (node->shared_info && IsParallelWorker())
4380  {
4382 
4383  Assert(ParallelWorkerNumber <= node->shared_info->num_workers);
4386  si->hash_disk_used = node->hash_disk_used;
4387  si->hash_mem_peak = node->hash_mem_peak;
4388  }
4389 
4390  /* Make sure we have closed any open tuplesorts */
4391 
4392  if (node->sort_in)
4393  tuplesort_end(node->sort_in);
4394  if (node->sort_out)
4395  tuplesort_end(node->sort_out);
4396 
4398 
4399  if (node->hash_metacxt != NULL)
4400  {
4402  node->hash_metacxt = NULL;
4403  }
4404 
4405  for (transno = 0; transno < node->numtrans; transno++)
4406  {
4407  AggStatePerTrans pertrans = &node->pertrans[transno];
4408 
4409  for (setno = 0; setno < numGroupingSets; setno++)
4410  {
4411  if (pertrans->sortstates[setno])
4412  tuplesort_end(pertrans->sortstates[setno]);
4413  }
4414  }
4415 
4416  /* And ensure any agg shutdown callbacks have been called */
4417  for (setno = 0; setno < numGroupingSets; setno++)
4418  ReScanExprContext(node->aggcontexts[setno]);
4419  if (node->hashcontext)
4421 
4422  /*
4423  * We don't actually free any ExprContexts here (see comment in
4424  * ExecFreeExprContext), just unlinking the output one from the plan node
4425  * suffices.
4426  */
4427  ExecFreeExprContext(&node->ss.ps);
4428 
4429  /* clean up tuple table */
4431 
4432  outerPlan = outerPlanState(node);
4433  ExecEndNode(outerPlan);
4434 }
4435 
4436 void
4438 {
4439  ExprContext *econtext = node->ss.ps.ps_ExprContext;
4441  Agg *aggnode = (Agg *) node->ss.ps.plan;
4442  int transno;
4443  int numGroupingSets = Max(node->maxsets, 1);
4444  int setno;
4445 
4446  node->agg_done = false;
4447 
4448  if (node->aggstrategy == AGG_HASHED)
4449  {
4450  /*
4451  * In the hashed case, if we haven't yet built the hash table then we
4452  * can just return; nothing done yet, so nothing to undo. If subnode's
4453  * chgParam is not NULL then it will be re-scanned by ExecProcNode,
4454  * else no reason to re-scan it at all.
4455  */
4456  if (!node->table_filled)
4457  return;
4458 
4459  /*
4460  * If we do have the hash table, and it never spilled, and the subplan
4461  * does not have any parameter changes, and none of our own parameter
4462  * changes affect input expressions of the aggregated functions, then
4463  * we can just rescan the existing hash table; no need to build it
4464  * again.
4465  */
4466  if (outerPlan->chgParam == NULL && !node->hash_ever_spilled &&
4467  !bms_overlap(node->ss.ps.chgParam, aggnode->aggParams))
4468  {
4470  &node->perhash[0].hashiter);
4471  select_current_set(node, 0, true);
4472  return;
4473  }
4474  }
4475 
4476  /* Make sure we have closed any open tuplesorts */
4477  for (transno = 0; transno < node->numtrans; transno++)
4478  {
4479  for (setno = 0; setno < numGroupingSets; setno++)
4480  {
4481  AggStatePerTrans pertrans = &node->pertrans[transno];
4482 
4483  if (pertrans->sortstates[setno])
4484  {
4485  tuplesort_end(pertrans->sortstates[setno]);
4486  pertrans->sortstates[setno] = NULL;
4487  }
4488  }
4489  }
4490 
4491  /*
4492  * We don't need to ReScanExprContext the output tuple context here;
4493  * ExecReScan already did it. But we do need to reset our per-grouping-set
4494  * contexts, which may have transvalues stored in them. (We use rescan
4495  * rather than just reset because transfns may have registered callbacks
4496  * that need to be run now.) For the AGG_HASHED case, see below.
4497  */
4498 
4499  for (setno = 0; setno < numGroupingSets; setno++)
4500  {
4501  ReScanExprContext(node->aggcontexts[setno]);
4502  }
4503 
4504  /* Release first tuple of group, if we have made a copy */
4505  if (node->grp_firstTuple != NULL)
4506  {
4508  node->grp_firstTuple = NULL;
4509  }
4511 
4512  /* Forget current agg values */
4513  MemSet(econtext->ecxt_aggvalues, 0, sizeof(Datum) * node->numaggs);
4514  MemSet(econtext->ecxt_aggnulls, 0, sizeof(bool) * node->numaggs);
4515 
4516  /*
4517  * With AGG_HASHED/MIXED, the hash table is allocated in a sub-context of
4518  * the hashcontext. This used to be an issue, but now, resetting a context
4519  * automatically deletes sub-contexts too.
4520  */
4521  if (node->aggstrategy == AGG_HASHED || node->aggstrategy == AGG_MIXED)
4522  {
4524 
4525  node->hash_ever_spilled = false;
4526  node->hash_spill_mode = false;
4527  node->hash_ngroups_current = 0;
4528 
4530  /* Rebuild an empty hash table */
4531  build_hash_tables(node);
4532  node->table_filled = false;
4533  /* iterator will be reset when the table is filled */
4534 
4535  hashagg_recompile_expressions(node, false, false);
4536  }
4537 
4538  if (node->aggstrategy != AGG_HASHED)
4539  {
4540  /*
4541  * Reset the per-group state (in particular, mark transvalues null)
4542  */
4543  for (setno = 0; setno < numGroupingSets; setno++)
4544  {
4545  MemSet(node->pergroups[setno], 0,
4546  sizeof(AggStatePerGroupData) * node->numaggs);
4547  }
4548 
4549  /* reset to phase 1 */
4550  initialize_phase(node, 1);
4551 
4552  node->input_done = false;
4553  node->projected_set = -1;
4554  }
4555 
4556  if (outerPlan->chgParam == NULL)
4557  ExecReScan(outerPlan);
4558 }
4559 
4560 
4561 /***********************************************************************
4562  * API exposed to aggregate functions
4563  ***********************************************************************/
4564 
4565 
4566 /*
4567  * AggCheckCallContext - test if a SQL function is being called as an aggregate
4568  *
4569  * The transition and/or final functions of an aggregate may want to verify
4570  * that they are being called as aggregates, rather than as plain SQL
4571  * functions. They should use this function to do so. The return value
4572  * is nonzero if being called as an aggregate, or zero if not. (Specific
4573  * nonzero values are AGG_CONTEXT_AGGREGATE or AGG_CONTEXT_WINDOW, but more
4574  * values could conceivably appear in future.)
4575  *
4576  * If aggcontext isn't NULL, the function also stores at *aggcontext the
4577  * identity of the memory context that aggregate transition values are being
4578  * stored in. Note that the same aggregate call site (flinfo) may be called
4579  * interleaved on different transition values in different contexts, so it's
4580  * not kosher to cache aggcontext under fn_extra. It is, however, kosher to
4581  * cache it in the transvalue itself (for internal-type transvalues).
4582  */
4583 int
4585 {
4586  if (fcinfo->context && IsA(fcinfo->context, AggState))
4587  {
4588  if (aggcontext)
4589  {
4590  AggState *aggstate = ((AggState *) fcinfo->context);
4591  ExprContext *cxt = aggstate->curaggcontext;
4592 
4593  *aggcontext = cxt->ecxt_per_tuple_memory;
4594  }
4595  return AGG_CONTEXT_AGGREGATE;
4596  }
4597  if (fcinfo->context && IsA(fcinfo->context, WindowAggState))
4598  {
4599  if (aggcontext)
4600  *aggcontext = ((WindowAggState *) fcinfo->context)->curaggcontext;
4601  return AGG_CONTEXT_WINDOW;
4602  }
4603 
4604  /* this is just to prevent "uninitialized variable" warnings */
4605  if (aggcontext)
4606  *aggcontext = NULL;
4607  return 0;
4608 }
4609 
4610 /*
4611  * AggGetAggref - allow an aggregate support function to get its Aggref
4612  *
4613  * If the function is being called as an aggregate support function,
4614  * return the Aggref node for the aggregate call. Otherwise, return NULL.
4615  *
4616  * Aggregates sharing the same inputs and transition functions can get
4617  * merged into a single transition calculation. If the transition function
4618  * calls AggGetAggref, it will get some one of the Aggrefs for which it is
4619  * executing. It must therefore not pay attention to the Aggref fields that
4620  * relate to the final function, as those are indeterminate. But if a final
4621  * function calls AggGetAggref, it will get a precise result.
4622  *
4623  * Note that if an aggregate is being used as a window function, this will
4624  * return NULL. We could provide a similar function to return the relevant
4625  * WindowFunc node in such cases, but it's not needed yet.
4626  */
4627 Aggref *
4629 {
4630  if (fcinfo->context && IsA(fcinfo->context, AggState))
4631  {
4632  AggState *aggstate = (AggState *) fcinfo->context;
4633  AggStatePerAgg curperagg;
4634  AggStatePerTrans curpertrans;
4635 
4636  /* check curperagg (valid when in a final function) */
4637  curperagg = aggstate->curperagg;
4638 
4639  if (curperagg)
4640  return curperagg->aggref;
4641 
4642  /* check curpertrans (valid when in a transition function) */
4643  curpertrans = aggstate->curpertrans;
4644 
4645  if (curpertrans)
4646  return curpertrans->aggref;
4647  }
4648  return NULL;
4649 }
4650 
4651 /*
4652  * AggGetTempMemoryContext - fetch short-term memory context for aggregates
4653  *
4654  * This is useful in agg final functions; the context returned is one that
4655  * the final function can safely reset as desired. This isn't useful for
4656  * transition functions, since the context returned MAY (we don't promise)
4657  * be the same as the context those are called in.
4658  *
4659  * As above, this is currently not useful for aggs called as window functions.
4660  */
4663 {
4664  if (fcinfo->context && IsA(fcinfo->context, AggState))
4665  {
4666  AggState *aggstate = (AggState *) fcinfo->context;
4667 
4668  return aggstate->tmpcontext->ecxt_per_tuple_memory;
4669  }
4670  return NULL;
4671 }
4672 
4673 /*
4674  * AggStateIsShared - find out whether transition state is shared
4675  *
4676  * If the function is being called as an aggregate support function,
4677  * return true if the aggregate's transition state is shared across
4678  * multiple aggregates, false if it is not.
4679  *
4680  * Returns true if not called as an aggregate support function.
4681  * This is intended as a conservative answer, ie "no you'd better not
4682  * scribble on your input". In particular, will return true if the
4683  * aggregate is being used as a window function, which is a scenario
4684  * in which changing the transition state is a bad idea. We might
4685  * want to refine the behavior for the window case in future.
4686  */
4687 bool
4689 {
4690  if (fcinfo->context && IsA(fcinfo->context, AggState))
4691  {
4692  AggState *aggstate = (AggState *) fcinfo->context;
4693  AggStatePerAgg curperagg;
4694  AggStatePerTrans curpertrans;
4695 
4696  /* check curperagg (valid when in a final function) */
4697  curperagg = aggstate->curperagg;
4698 
4699  if (curperagg)
4700  return aggstate->pertrans[curperagg->transno].aggshared;
4701 
4702  /* check curpertrans (valid when in a transition function) */
4703  curpertrans = aggstate->curpertrans;
4704 
4705  if (curpertrans)
4706  return curpertrans->aggshared;
4707  }
4708  return true;
4709 }
4710 
4711 /*
4712  * AggRegisterCallback - register a cleanup callback for an aggregate
4713  *
4714  * This is useful for aggs to register shutdown callbacks, which will ensure
4715  * that non-memory resources are freed. The callback will occur just before
4716  * the associated aggcontext (as returned by AggCheckCallContext) is reset,
4717  * either between groups or as a result of rescanning the query. The callback
4718  * will NOT be called on error paths. The typical use-case is for freeing of
4719  * tuplestores or tuplesorts maintained in aggcontext, or pins held by slots
4720  * created by the agg functions. (The callback will not be called until after
4721  * the result of the finalfn is no longer needed, so it's safe for the finalfn
4722  * to return data that will be freed by the callback.)
4723  *
4724  * As above, this is currently not useful for aggs called as window functions.
4725  */
4726 void
4729  Datum arg)
4730 {
4731  if (fcinfo->context && IsA(fcinfo->context, AggState))
4732  {
4733  AggState *aggstate = (AggState *) fcinfo->context;
4734  ExprContext *cxt = aggstate->curaggcontext;
4735 
4736  RegisterExprContextCallback(cxt, func, arg);
4737 
4738  return;
4739  }
4740  elog(ERROR, "aggregate function cannot register a callback in this context");
4741 }
4742 
4743 
4744 /* ----------------------------------------------------------------
4745  * Parallel Query Support
4746  * ----------------------------------------------------------------
4747  */
4748 
4749  /* ----------------------------------------------------------------
4750  * ExecAggEstimate
4751  *
4752  * Estimate space required to propagate aggregate statistics.
4753  * ----------------------------------------------------------------
4754  */
4755 void
4757 {
4758  Size size;
4759 
4760  /* don't need this if not instrumenting or no workers */
4761  if (!node->ss.ps.instrument || pcxt->nworkers == 0)
4762  return;
4763 
4764  size = mul_size(pcxt->nworkers, sizeof(AggregateInstrumentation));
4765  size = add_size(size, offsetof(SharedAggInfo, sinstrument));
4766  shm_toc_estimate_chunk(&pcxt->estimator, size);
4767  shm_toc_estimate_keys(&pcxt->estimator, 1);
4768 }
4769 
4770 /* ----------------------------------------------------------------
4771  * ExecAggInitializeDSM
4772  *
4773  * Initialize DSM space for aggregate statistics.
4774  * ----------------------------------------------------------------
4775  */
4776 void
4778 {
4779  Size size;
4780 
4781  /* don't need this if not instrumenting or no workers */
4782  if (!node->ss.ps.instrument || pcxt->nworkers == 0)
4783  return;
4784 
4785  size = offsetof(SharedAggInfo, sinstrument)
4786  + pcxt->nworkers * sizeof(AggregateInstrumentation);
4787  node->shared_info = shm_toc_allocate(pcxt->toc, size);
4788  /* ensure any unfilled slots will contain zeroes */
4789  memset(node->shared_info, 0, size);
4790  node->shared_info->num_workers = pcxt->nworkers;
4791  shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id,
4792  node->shared_info);
4793 }
4794 
4795 /* ----------------------------------------------------------------
4796  * ExecAggInitializeWorker
4797  *
4798  * Attach worker to DSM space for aggregate statistics.
4799  * ----------------------------------------------------------------
4800  */
4801 void
4803 {
4804  node->shared_info =
4805  shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true);
4806 }
4807 
4808 /* ----------------------------------------------------------------
4809  * ExecAggRetrieveInstrumentation
4810  *
4811  * Transfer aggregate statistics from DSM to private memory.
4812  * ----------------------------------------------------------------
4813  */
4814 void
4816 {
4817  Size size;
4818  SharedAggInfo *si;
4819 
4820  if (node->shared_info == NULL)
4821  return;
4822 
4823  size = offsetof(SharedAggInfo, sinstrument)
4825  si = palloc(size);
4826  memcpy(si, node->shared_info, size);
4827  node->shared_info = si;
4828 }
static void hashagg_reset_spill_state(AggState *aggstate)
Definition: nodeAgg.c:3199
List * aggdistinct
Definition: primnodes.h:332
struct AggStatePerTransData * AggStatePerTrans
Definition: execnodes.h:2269
ExprState ** eqfunctions
Definition: nodeAgg.h:278
struct HashAggSpill * hash_spills
Definition: execnodes.h:2320
AggStatePerGroup * hash_pergroup
Definition: execnodes.h:2340
LogicalTapeSet * LogicalTapeSetCreate(int ntapes, bool preallocate, TapeShare *shared, SharedFileSet *fileset, int worker)
Definition: logtape.c:685
#define NIL
Definition: pg_list.h:65
size_t LogicalTapeRead(LogicalTapeSet *lts, int tapenum, void *ptr, size_t size)
Definition: logtape.c:977
static TupleTableSlot * fetch_input_tuple(AggState *aggstate)
Definition: nodeAgg.c:568
struct AggStatePerGroupData * AggStatePerGroup
Definition: execnodes.h:2270
#define ScanTupleHashTable(htable, iter)
Definition: execnodes.h:771
static void select_current_set(AggState *aggstate, int setno, bool is_hash)
Definition: nodeAgg.c:476
int numCols
Definition: plannodes.h:862
static int partitions
Definition: pgbench.c:233
static int hash_choose_num_partitions(double input_groups, double hashentrysize, int used_bits, int *log2_npartittions)
Definition: nodeAgg.c:1992
List * qual
Definition: plannodes.h:142
bool tuplesort_getdatum(Tuplesortstate *state, bool forward, Datum *val, bool *isNull, Datum *abbrev)
Definition: tuplesort.c:2494
TupleHashTable BuildTupleHashTableExt(PlanState *parent, TupleDesc inputDesc, int numCols, AttrNumber *keyColIdx, const Oid *eqfuncoids, FmgrInfo *hashfunctions, Oid *collations, long nbuckets, Size additionalsize, MemoryContext metacxt, MemoryContext tablecxt, MemoryContext tempcxt, bool use_variable_hash_iv)
Definition: execGrouping.c:154
bool aggvariadic
Definition: primnodes.h:335
int bms_first_member(Bitmapset *a)
Definition: bitmapset.c:996
AggStatePerPhase phases
Definition: execnodes.h:2307
double hashentrysize
Definition: execnodes.h:2332
#define IsA(nodeptr, _type_)
Definition: nodes.h:590
void tuplesort_performsort(Tuplesortstate *state)
Definition: tuplesort.c:2040
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:218
static void hashagg_tapeinfo_release(HashTapeInfo *tapeinfo, int tapenum)
Definition: nodeAgg.c:2935
#define AllocSetContextCreate
Definition: memutils.h:173
AttrNumber * hashGrpColIdxInput
Definition: nodeAgg.h:311
Datum * ecxt_aggvalues
Definition: execnodes.h:245
static void hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
Definition: nodeAgg.c:1918
TupleHashEntry LookupTupleHashEntryHash(TupleHashTable hashtable, TupleTableSlot *slot, bool *isnew, uint32 hash)
Definition: execGrouping.c:361
uint64 hash_ngroups_limit
Definition: execnodes.h:2329
#define HASHAGG_MAX_PARTITIONS
Definition: nodeAgg.c:289
TupleTableSlot * ExecStoreMinimalTuple(MinimalTuple mtup, TupleTableSlot *slot, bool shouldFree)
Definition: execTuples.c:1446
static Datum ExecEvalExprSwitchContext(ExprState *state, ExprContext *econtext, bool *isNull)
Definition: executor.h:331
Index varlevelsup
Definition: primnodes.h:196
TargetEntry * get_sortgroupclause_tle(SortGroupClause *sgClause, List *targetList)
Definition: tlist.c:356
TupleTableSlot * ExecInitExtraTupleSlot(EState *estate, TupleDesc tupledesc, const TupleTableSlotOps *tts_ops)
Definition: execTuples.c:1831
#define GETSTRUCT(TUP)
Definition: htup_details.h:654
Bitmapset * bms_copy(const Bitmapset *a)
Definition: bitmapset.c:74
Tuplesortstate * tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, bool nullsFirstFlag, int workMem, SortCoordinate coordinate, bool randomAccess)
Definition: tuplesort.c:1247
static void hash_agg_check_limits(AggState *aggstate)
Definition: nodeAgg.c:1857
static long hash_choose_num_buckets(double hashentrysize, long estimated_nbuckets, Size memory)
Definition: nodeAgg.c:1967
AttrNumber * grpColIdx
Definition: plannodes.h:863
ProjectionInfo * ps_ProjInfo
Definition: execnodes.h:1006
uint64 transitionSpace
Definition: plannodes.h:867
Instrumentation * instrument
Definition: execnodes.h:976
static void agg_fill_hash_table(AggState *aggstate)
Definition: nodeAgg.c:2541
int aggtransno
Definition: primnodes.h:341
Bitmapset * colnos_needed
Definition: execnodes.h:2302
const TupleTableSlotOps * ExecGetResultSlotOps(PlanState *planstate, bool *isfixed)
Definition: execUtils.c:499
static TupleTableSlot * ExecClearTuple(TupleTableSlot *slot)
Definition: tuptable.h:425
List * lcons_int(int datum, List *list)
Definition: list.c:486
TupleTableSlot * ExecStoreAllNullTuple(TupleTableSlot *slot)
Definition: execTuples.c:1576
int numaggs
Definition: execnodes.h:2278
int nfreetapes
Definition: nodeAgg.c:331
Oid GetUserId(void)
Definition: miscinit.c:478
bool agg_done
Definition: execnodes.h:2296
#define castNode(_type_, nodeptr)
Definition: nodes.h:608
Oid * grpCollations
Definition: plannodes.h:865
void ExecEndNode(PlanState *node)
Definition: execProcnode.c:556
#define TTS_EMPTY(slot)
Definition: tuptable.h:97
TupleTableSlot * sort_slot
Definition: execnodes.h:2310
List * all_grouped_cols
Definition: execnodes.h:2301
Tuplesortstate * sort_out
Definition: execnodes.h:2309
MinimalTuple ExecFetchSlotMinimalTuple(TupleTableSlot *slot, bool *shouldFree)
Definition: execTuples.c:1692
#define TupleDescAttr(tupdesc, i)
Definition: tupdesc.h:92
static void finalize_partialaggregate(AggState *aggstate, AggStatePerAgg peragg, AggStatePerGroup pergroupstate, Datum *resultVal, bool *resultIsNull)
Definition: nodeAgg.c:1160
ScanState ss
Definition: execnodes.h:2276
FmgrInfo equalfnOne
Definition: nodeAgg.h:110
ExprContext * ps_ExprContext
Definition: execnodes.h:1005
MinimalTuple firstTuple
Definition: execnodes.h:724
shm_toc_estimator estimator
Definition: parallel.h:42
MemoryContext ecxt_per_tuple_memory
Definition: execnodes.h:234
static void hashagg_spill_finish(AggState *aggstate, HashAggSpill *spill, int setno)
Definition: nodeAgg.c:3157
ExprState * evaltrans
Definition: nodeAgg.h:283
#define SizeForFunctionCallInfo(nargs)
Definition: fmgr.h:102
int64 input_tuples
Definition: nodeAgg.c:370
void ExecReScan(PlanState *node)
Definition: execAmi.c:78
int bms_next_member(const Bitmapset *a, int prevbit)
Definition: bitmapset.c:1043
const TupleTableSlotOps TTSOpsVirtual
Definition: execTuples.c:83
int plan_node_id
Definition: plannodes.h:140
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:109
struct HashTapeInfo HashTapeInfo
Oid inputcollid
Definition: primnodes.h:326
int current_phase
Definition: execnodes.h:2284
static void hashagg_finish_initial_spills(AggState *aggstate)
Definition: nodeAgg.c:3123
static void slot_getsomeattrs(TupleTableSlot *slot, int attnum)
Definition: tuptable.h:341
Definition: nodes.h:539
AggSplit aggsplit
Definition: execnodes.h:2281
static TupleTableSlot * ExecAgg(PlanState *pstate)
Definition: nodeAgg.c:2159
bool * nullsFirst
Definition: plannodes.h:815
int errcode(int sqlerrcode)
Definition: elog.c:698
List * args
Definition: primnodes.h:330
#define MemSet(start, val, len)
Definition: c.h:1008
AttrNumber varattno
Definition: primnodes.h:191
char * format_type_be(Oid type_oid)
Definition: format_type.c:339
fmNodePtr context
Definition: fmgr.h:88
Datum * tts_values
Definition: tuptable.h:126
TupleTableSlot * ss_ScanTupleSlot
Definition: execnodes.h:1380
static void build_pertrans_for_aggref(AggStatePerTrans pertrans, AggState *aggstate, EState *estate, Aggref *aggref, Oid transfn_oid, Oid aggtranstype, Oid aggserialfn, Oid aggdeserialfn, Datum initValue, bool initValueIsNull, Oid *inputTypes, int numArguments)
Definition: nodeAgg.c:4115
void MemoryContextReset(MemoryContext context)
Definition: mcxt.c:143
void build_aggregate_deserialfn_expr(Oid deserialfn_oid, Expr **deserialfnexpr)
Definition: parse_agg.c:2057
static void finalize_aggregate(AggState *aggstate, AggStatePerAgg peragg, AggStatePerGroup pergroupstate, Datum *resultVal, bool *resultIsNull)
Definition: nodeAgg.c:1057
bool all_cols_needed
Definition: execnodes.h:2304
void build_aggregate_finalfn_expr(Oid *agg_input_types, int num_finalfn_inputs, Oid agg_state_type, Oid agg_result_type, Oid agg_input_collation, Oid finalfn_oid, Expr **finalfnexpr)
Definition: parse_agg.c:2081
AggregateInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER]
Definition: execnodes.h:2252
TupleTableSlot * hash_spill_rslot
Definition: execnodes.h:2322
AggStatePerTrans pertrans
Definition: execnodes.h:2286
EState * state
Definition: execnodes.h:968
int projected_set
Definition: execnodes.h:2297
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1148
void heap_freetuple(HeapTuple htup)
Definition: heaptuple.c:1338
unsigned int Oid
Definition: postgres_ext.h:31
uint32 hash_bytes_uint32(uint32 k)
Definition: hashfn.c:610
static bool ExecQual(ExprState *state, ExprContext *econtext)
Definition: executor.h:396
HeapTuple grp_firstTuple
Definition: execnodes.h:2314
#define shm_toc_estimate_chunk(e, sz)
Definition: shm_toc.h:51
Definition: primnodes.h:186
Aggref * aggref
Definition: nodeAgg.h:187
static TupleTableSlot * project_aggregates(AggState *aggstate)
Definition: nodeAgg.c:1373
static void advance_aggregates(AggState *aggstate)
Definition: nodeAgg.c:839
int current_set
Definition: execnodes.h:2299
uint32 mask
Definition: nodeAgg.c:350
#define OidIsValid(objectId)
Definition: c.h:710
#define DO_AGGSPLIT_COMBINE(as)
Definition: nodes.h:801
FunctionCallInfo transfn_fcinfo
Definition: nodeAgg.h:162
TupleHashEntry LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, bool *isnew, uint32 *hash)
Definition: execGrouping.c:306
void ExecAggInitializeDSM(AggState *node, ParallelContext *pcxt)
Definition: nodeAgg.c:4777
struct HashAggBatch HashAggBatch
void ExecFreeExprContext(PlanState *planstate)
Definition: execUtils.c:650
Datum ExecAggTransReparent(AggState *aggstate, AggStatePerTrans pertrans, Datum newValue, bool newValueIsNull, Datum oldValue, bool oldValueIsNull)
int numtrans
Definition: execnodes.h:2279
void ExecForceStoreHeapTuple(HeapTuple tuple, TupleTableSlot *slot, bool shouldFree)
Definition: execTuples.c:1469
static void hash_agg_enter_spill_mode(AggState *aggstate)
Definition: nodeAgg.c:1883
TupleDesc sortdesc
Definition: nodeAgg.h:138
static void hashagg_spill_init(HashAggSpill *spill, HashTapeInfo *tapeinfo, int used_bits, double input_groups, double hashentrysize)
Definition: nodeAgg.c:2955
Oid * sortOperators
Definition: plannodes.h:813
void LogicalTapeRewindForWrite(LogicalTapeSet *lts, int tapenum)
Definition: logtape.c:951
void execTuplesHashPrepare(int numCols, const Oid *eqOperators, Oid **eqFuncOids, FmgrInfo **hashFunctions)
Definition: execGrouping.c:96
ExprState * ExecInitQual(List *qual, PlanState *parent)
Definition: execExpr.c:209
void ResetTupleHashTable(TupleHashTable hashtable)
Definition: execGrouping.c:285
ExprContext * tmpcontext
Definition: execnodes.h:2289
FmgrInfo transfn
Definition: nodeAgg.h:81
#define HASHAGG_PARTITION_FACTOR
Definition: nodeAgg.c:287
static void build_hash_table(AggState *aggstate, int setno, long nbuckets)
Definition: nodeAgg.c:1505
int max_colno_needed
Definition: execnodes.h:2303
static void prepare_projection_slot(AggState *aggstate, TupleTableSlot *slot, int currentSet)
Definition: nodeAgg.c:1265
bool hash_spill_mode
Definition: execnodes.h:2326
#define FUNC_MAX_ARGS
static void hashagg_tapeinfo_init(AggState *aggstate)
Definition: nodeAgg.c:2891
List * hash_batches
Definition: execnodes.h:2324
Aggref * aggref
Definition: nodeAgg.h:44
Tuplesortstate * tuplesort_begin_heap(TupleDesc tupDesc, int nkeys, AttrNumber *attNums, Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags, int workMem, SortCoordinate coordinate, bool randomAccess)
Definition: tuplesort.c:897
#define linitial_int(l)
Definition: pg_list.h:175
Bitmapset ** grouped_cols
Definition: nodeAgg.h:277
PlanState ps
Definition: execnodes.h:1377
LogicalTapeSet * tapeset
Definition: nodeAgg.c:368
int maxsets
Definition: execnodes.h:2306
static bool agg_refill_hash_table(AggState *aggstate)
Definition: nodeAgg.c:2595
static bool find_cols_walker(Node *node, FindColsContext *context)
Definition: nodeAgg.c:1422
Size hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace)
Definition: nodeAgg.c:1695
void aclcheck_error(AclResult aclerr, ObjectType objtype, const char *objectname)
Definition: aclchk.c:3308
void initHyperLogLog(hyperLogLogState *cState, uint8 bwidth)
Definition: hyperloglog.c:66
#define DO_AGGSPLIT_SERIALIZE(as)
Definition: nodes.h:803
#define HASHAGG_MIN_PARTITIONS
Definition: nodeAgg.c:288
void pfree(void *pointer)
Definition: mcxt.c:1169
MemoryContext es_query_cxt
Definition: execnodes.h:600
AggStrategy aggstrategy
Definition: plannodes.h:860
AggState * ExecInitAgg(Agg *node, EState *estate, int eflags)
Definition: nodeAgg.c:3251
#define linitial(l)
Definition: pg_list.h:174
bool table_filled
Definition: execnodes.h:2316
AggStrategy aggstrategy
Definition: execnodes.h:2280
#define HASHAGG_HLL_BIT_WIDTH
Definition: nodeAgg.c:306
void LogicalTapeWrite(LogicalTapeSet *lts, int tapenum, void *ptr, size_t size)
Definition: logtape.c:775
static TupleTableSlot * agg_retrieve_hash_table(AggState *aggstate)
Definition: nodeAgg.c:2746
static void find_cols(AggState *aggstate, Bitmapset **aggregated, Bitmapset **unaggregated)
Definition: nodeAgg.c:1399
#define ObjectIdGetDatum(X)
Definition: postgres.h:551
#define ERROR
Definition: elog.h:46
bool fn_strict
Definition: fmgr.h:61
#define lfirst_int(lc)
Definition: pg_list.h:170
static void * list_nth(const List *list, int n)
Definition: pg_list.h:278
char * get_func_name(Oid funcid)
Definition: lsyscache.c:1579
MemoryContext hash_metacxt
Definition: execnodes.h:2318
NullableDatum args[FLEXIBLE_ARRAY_MEMBER]
Definition: fmgr.h:95
void fmgr_info(Oid functionId, FmgrInfo *finfo)
Definition: fmgr.c:126
static TupleTableSlot * agg_retrieve_direct(AggState *aggstate)
Definition: nodeAgg.c:2195
#define AGG_CONTEXT_AGGREGATE
Definition: fmgr.h:738
struct TupleHashEntryData TupleHashEntryData
static void slot_getallattrs(TupleTableSlot *slot)
Definition: tuptable.h:354
static void find_hash_columns(AggState *aggstate)
Definition: nodeAgg.c:1565
ExprState * equalfnMulti
Definition: nodeAgg.h:111
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:195
Tuplesortstate * sort_in
Definition: execnodes.h:2308
#define EXEC_FLAG_BACKWARD
Definition: executor.h:58
#define outerPlanState(node)
Definition: execnodes.h:1062
bool tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy, TupleTableSlot *slot, Datum *abbrev)
Definition: tuplesort.c:2408
int bms_num_members(const Bitmapset *a)
Definition: bitmapset.c:646
static void finalize_aggregates(AggState *aggstate, AggStatePerAgg peragg, AggStatePerGroup pergroup)
Definition: nodeAgg.c:1310
bool AggStateIsShared(FunctionCallInfo fcinfo)
Definition: nodeAgg.c:4688
static void hashagg_tapeinfo_assign(HashTapeInfo *tapeinfo, int *dest, int ndest)
Definition: nodeAgg.c:2912
#define list_nth_node(type, list, n)
Definition: pg_list.h:306
Tuplesortstate ** sortstates
Definition: nodeAgg.h:154
#define FunctionCallInvoke(fcinfo)
Definition: fmgr.h:172
Bitmapset * aggParams
Definition: plannodes.h:868
static int initValue(long lng_val)
Definition: informix.c:677
MemoryContext tablecxt
Definition: execnodes.h:746
void ExecAssignProjectionInfo(PlanState *planstate, TupleDesc inputDesc)
Definition: execUtils.c:535
bool * tts_isnull
Definition: tuptable.h:128
int npartitions
Definition: nodeAgg.c:347
static Datum ExecEvalExpr(ExprState *state, ExprContext *econtext, bool *isNull)
Definition: executor.h:316
MinimalTupleData * MinimalTuple
Definition: htup.h:27
static void process_ordered_aggregate_multi(AggState *aggstate, AggStatePerTrans pertrans, AggStatePerGroup pergroupstate)
Definition: nodeAgg.c:963
List * aggorder
Definition: primnodes.h:331
void ExecAggEstimate(AggState *node, ParallelContext *pcxt)
Definition: nodeAgg.c:4756
int errcode_for_file_access(void)
Definition: elog.c:721
#define fmgr_info_set_expr(expr, finfo)
Definition: fmgr.h:135
AttrNumber resno
Definition: primnodes.h:1455
#define DatumGetBool(X)
Definition: postgres.h:437
int ParallelWorkerNumber
Definition: parallel.c:112
static Size hashagg_spill_tuple(AggState *aggstate, HashAggSpill *spill, TupleTableSlot *slot, uint32 hash)
Definition: nodeAgg.c:2986
TupleTableSlot * ecxt_innertuple
Definition: execnodes.h:228
List * ExecInitExprList(List *nodes, PlanState *parent)
Definition: execExpr.c:318
#define MakeExpandedObjectReadOnly(d, isnull, typlen)
Index agglevelsup
Definition: primnodes.h:338
int used_bits
Definition: nodeAgg.c:367
struct AggregateInstrumentation AggregateInstrumentation
Bitmapset * unaggregated
Definition: nodeAgg.c:379
#define TupIsNull(slot)
Definition: tuptable.h:292
FormData_pg_attribute * Form_pg_attribute
Definition: pg_attribute.h:207
unsigned int uint32
Definition: c.h:441
List * aggdirectargs
Definition: primnodes.h:329
static Datum GetAggInitVal(Datum textInitVal, Oid transtype)
Definition: nodeAgg.c:4351
AggStatePerAgg curperagg
Definition: execnodes.h:2292
AttrNumber * sortColIdx
Definition: nodeAgg.h:100
struct AggStatePerGroupData AggStatePerGroupData
struct HashTapeInfo * hash_tapeinfo
Definition: execnodes.h:2319
AggStatePerHash perhash
Definition: execnodes.h:2339
bool outeropsset
Definition: execnodes.h:1049
MemoryContext CurrentMemoryContext
Definition: mcxt.c:42
static void initialize_aggregates(AggState *aggstate, AggStatePerGroup *pergroups, int numReset)
Definition: nodeAgg.c:686
AggStrategy aggstrategy
Definition: nodeAgg.h:274
ExprState * evaltrans_cache[2][2]
Definition: nodeAgg.h:291
#define InstrCountFiltered1(node, delta)
Definition: execnodes.h:1070
#define EXEC_FLAG_REWIND
Definition: executor.h:57
hyperLogLogState * hll_card
Definition: nodeAgg.c:352
void getTypeInputInfo(Oid type, Oid *typInput, Oid *typIOParam)
Definition: lsyscache.c:2821
Datum value
Definition: postgres.h:422
Bitmapset * grouped_cols
Definition: execnodes.h:2300
#define IsParallelWorker()
Definition: parallel.h:61
Datum datumCopy(Datum value, bool typByVal, int typLen)
Definition: datum.c:131
TupleTableSlot * ExecAllocTableSlot(List **tupleTable, TupleDesc desc, const TupleTableSlotOps *tts_ops)
Definition: execTuples.c:1171
void ExecAggRetrieveInstrumentation(AggState *node)
Definition: nodeAgg.c:4815
int hash_batches_used
Definition: execnodes.h:2337
MemoryContext AggGetTempMemoryContext(FunctionCallInfo fcinfo)
Definition: nodeAgg.c:4662
Bitmapset * chgParam
Definition: execnodes.h:998
#define InvokeFunctionExecuteHook(objectId)
Definition: objectaccess.h:191
int ntapes
Definition: nodeAgg.c:329
bool IsBinaryCoercible(Oid srctype, Oid targettype)
int my_log2(long num)
Definition: dynahash.c:1765
double input_card
Definition: nodeAgg.c:371
#define outerPlan(node)
Definition: plannodes.h:171
List * lappend(List *list, void *datum)
Definition: list.c:336
Bitmapset * aggregated
Definition: nodeAgg.c:378
TupleHashIterator hashiter
Definition: nodeAgg.h:304
int numCols
Definition: plannodes.h:811
Index varno
Definition: primnodes.h:189
static void initialize_aggregate(AggState *aggstate, AggStatePerTrans pertrans, AggStatePerGroup pergroupstate)
Definition: nodeAgg.c:599
int num_hashes
Definition: execnodes.h:2317
Plan plan
Definition: plannodes.h:859
AttrNumber * hashGrpColIdxHash
Definition: nodeAgg.h:312
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Definition: syscache.c:1127
bool input_done
Definition: execnodes.h:2295
#define SizeofMinimalTupleHeader
Definition: htup_details.h:648
TupleDesc tts_tupleDescriptor
Definition: tuptable.h:124
ExprContext * curaggcontext
Definition: execnodes.h:2291
ExprContext * hashcontext
Definition: execnodes.h:2287
bool * ecxt_aggnulls
Definition: execnodes.h:247
static bool ExecQualAndReset(ExprState *state, ExprContext *econtext)
Definition: executor.h:423
Size mul_size(Size s1, Size s2)
Definition: shmem.c:519
#define TextDatumGetCString(d)
Definition: builtins.h:83
List * es_tupleTable
Definition: execnodes.h:602
#define HASHAGG_READ_BUFFER_SIZE
Definition: nodeAgg.c:297
AggStatePerPhase phase
Definition: execnodes.h:2282
void * palloc0(Size size)
Definition: mcxt.c:1093
ExecProcNodeMtd ExecProcNode
Definition: execnodes.h:972
AclResult
Definition: acl.h:177
uintptr_t Datum
Definition: postgres.h:411
void ReleaseSysCache(HeapTuple tuple)
Definition: syscache.c:1175
struct FunctionCallInfoBaseData * FunctionCallInfo
Definition: fmgr.h:38
Size add_size(Size s1, Size s2)
Definition: shmem.c:502
static TupleTableSlot * ExecProcNode(PlanState *node)
Definition: executor.h:252
Datum SysCacheGetAttr(int cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull)
Definition: syscache.c:1388
FmgrInfo deserialfn
Definition: nodeAgg.h:87
int work_mem
Definition: globals.c:124
List * groupingSets
Definition: plannodes.h:870
int16 resulttypeLen
Definition: nodeAgg.h:216
static void initialize_phase(AggState *aggstate, int newphase)
Definition: nodeAgg.c:498
double estimateHyperLogLog(hyperLogLogState *cState)
Definition: hyperloglog.c:186
LogicalTapeSet * tapeset
Definition: nodeAgg.c:328
struct FindColsContext FindColsContext
FormData_pg_proc * Form_pg_proc
Definition: pg_proc.h:136
Plan * plan
Definition: execnodes.h:966
#define InvalidOid
Definition: postgres_ext.h:36
RegProcedure get_opcode(Oid opno)
Definition: lsyscache.c:1256
Oid aggfnoid
Definition: primnodes.h:323
int16 attnum
Definition: pg_attribute.h:83
#define ResetTupleHashIterator(htable, iter)
Definition: execnodes.h:769
#define ereport(elevel,...)
Definition: elog.h:157
static HeapTuple ExecCopySlotHeapTuple(TupleTableSlot *slot)
Definition: tuptable.h:452
static void advance_transition_function(AggState *aggstate, AggStatePerTrans pertrans, AggStatePerGroup pergroupstate)
Definition: nodeAgg.c:727
#define LOCAL_FCINFO(name, nargs)
Definition: fmgr.h:110
static void hashagg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck)
Definition: nodeAgg.c:1742
List * lcons(void *datum, List *list)
Definition: list.c:468
static void prepare_hash_slot(AggStatePerHash perhash, TupleTableSlot *inputslot, TupleTableSlot *hashslot)
Definition: nodeAgg.c:1220
int aggno
Definition: primnodes.h:340
uint64 hash_disk_used
Definition: execnodes.h:2336
Size MemoryContextMemAllocated(MemoryContext context, bool recurse)
Definition: mcxt.c:477
void bms_free(Bitmapset *a)
Definition: bitmapset.c:208
#define Max(x, y)
Definition: c.h:980
ExprContext ** aggcontexts
Definition: execnodes.h:2288
#define makeNode(_type_)
Definition: nodes.h:587
TupleTableSlot * ecxt_outertuple
Definition: execnodes.h:230
int plan_width
Definition: plannodes.h:124
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
FmgrInfo * hashfunctions
Definition: nodeAgg.h:306
#define Assert(condition)
Definition: c.h:804
#define lfirst(lc)
Definition: pg_list.h:169
void RegisterExprContextCallback(ExprContext *econtext, ExprContextCallbackFunction function, Datum arg)
Definition: execUtils.c:925
FmgrInfo serialfn
Definition: nodeAgg.h:84
ExprState * execTuplesMatchPrepare(TupleDesc desc, int numCols, const AttrNumber *keyColIdx, const Oid *eqOperators, const Oid *collations, PlanState *parent)
Definition: execGrouping.c:59
int input_tapenum
Definition: nodeAgg.c:369
FunctionCallInfo deserialfn_fcinfo
Definition: nodeAgg.h:167
#define EXEC_FLAG_MARK
Definition: executor.h:59
AggSplit aggsplit
Definition: plannodes.h:861
struct AggStatePerAggData * AggStatePerAgg
Definition: execnodes.h:2268
void ExecReScanAgg(AggState *node)
Definition: nodeAgg.c:4437
void build_aggregate_serialfn_expr(Oid serialfn_oid, Expr **serialfnexpr)
Definition: parse_agg.c:2034
FormData_pg_aggregate * Form_pg_aggregate
Definition: pg_aggregate.h:109
Expr * expr
Definition: primnodes.h:1454
AggSplit aggsplit
Definition: primnodes.h:339
bool MemoryContextContains(MemoryContext context, void *pointer)
Definition: mcxt.c:758
void(* ExprContextCallbackFunction)(Datum arg)
Definition: execnodes.h:188
void build_aggregate_transfn_expr(Oid *agg_input_types, int agg_num_inputs, int agg_num_direct_inputs, bool agg_variadic, Oid agg_state_type, Oid agg_input_collation, Oid transfn_oid, Oid invtransfn_oid, Expr **transfnexpr, Expr **invtransfnexpr)
Definition: parse_agg.c:1973
bool hash_ever_spilled
Definition: execnodes.h:2325
AggStatePerGroup * pergroups
Definition: execnodes.h:2312
void freeHyperLogLog(hyperLogLogState *cState)
Definition: hyperloglog.c:151
size_t Size
Definition: c.h:540
Bitmapset * bms_union(const Bitmapset *a, const Bitmapset *b)
Definition: bitmapset.c:225
void ExecAssignExprContext(EState *estate, PlanState *planstate)
Definition: execUtils.c:480
#define AGG_CONTEXT_WINDOW
Definition: fmgr.h:739
#define InitFunctionCallInfoData(Fcinfo, Flinfo, Nargs, Collation, Context, Resultinfo)
Definition: fmgr.h:150
FunctionCallInfo serialfn_fcinfo
Definition: nodeAgg.h:165
#define shm_toc_estimate_keys(e, cnt)
Definition: shm_toc.h:53
bool expression_tree_walker(Node *node, bool(*walker)(), void *context)
Definition: nodeFuncs.c:1904
static int list_length(const List *l)
Definition: pg_list.h:149
long numGroups
Definition: plannodes.h:866
Oid exprCollation(const Node *expr)
Definition: nodeFuncs.c:759
#define DO_AGGSPLIT_SKIPFINAL(as)
Definition: nodes.h:802
void get_typlenbyval(Oid typid, int16 *typlen, bool *typbyval)
Definition: lsyscache.c:2198
void addHyperLogLog(hyperLogLogState *cState, uint32 hash)
Definition: hyperloglog.c:167
Expr * aggfilter
Definition: primnodes.h:333
int AggCheckCallContext(FunctionCallInfo fcinfo, MemoryContext *aggcontext)
Definition: nodeAgg.c:4584
TupleDesc ExecTypeFromTL(List *targetList)
Definition: execTuples.c:1938
size_t get_hash_memory_limit(void)
Definition: nodeHash.c:3401
#define MAXALIGN(LEN)
Definition: c.h:757
void ExecInitResultTupleSlotTL(PlanState *planstate, const TupleTableSlotOps *tts_ops)
Definition: execTuples.c:1799
void ReScanExprContext(ExprContext *econtext)
Definition: execUtils.c:438
static TupleTableSlot * agg_retrieve_hash_table_in_memory(AggState *aggstate)
Definition: nodeAgg.c:2771
LogicalTapeSet * tapeset
Definition: nodeAgg.c:346
bool outeropsfixed
Definition: execnodes.h:1045
void * shm_toc_allocate(shm_toc *toc, Size nbytes)
Definition: shm_toc.c:88
Bitmapset * bms_add_member(Bitmapset *a, int x)
Definition: bitmapset.c:736
#define DO_AGGSPLIT_DESERIALIZE(as)
Definition: nodes.h:804
Size hash_mem_limit
Definition: execnodes.h:2328
struct Plan * lefttree
Definition: plannodes.h:143
TupleTableSlot * uniqslot
Definition: nodeAgg.h:137
int numphases
Definition: execnodes.h:2283
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1182
TupleDesc ExecGetResultType(PlanState *planstate)
Definition: execUtils.c:490
void LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size)
Definition: logtape.c:863
List * targetlist
Definition: plannodes.h:141
ExprState * qual
Definition: execnodes.h:987
void ExecAggInitializeWorker(AggState *node, ParallelWorkerContext *pwcxt)
Definition: nodeAgg.c:4802
#define DatumGetPointer(X)
Definition: postgres.h:593
AttrNumber * sortColIdx
Definition: plannodes.h:812
#define CHUNKHDRSZ
Definition: nodeAgg.c:312
#define HASHAGG_WRITE_BUFFER_SIZE
Definition: nodeAgg.c:298
bool bms_overlap(const Bitmapset *a, const Bitmapset *b)
Definition: bitmapset.c:494