PostgreSQL Source Code git master
nodeAgg.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * nodeAgg.c
4 * Routines to handle aggregate nodes.
5 *
6 * ExecAgg normally evaluates each aggregate in the following steps:
7 *
8 * transvalue = initcond
9 * foreach input_tuple do
10 * transvalue = transfunc(transvalue, input_value(s))
11 * result = finalfunc(transvalue, direct_argument(s))
12 *
13 * If a finalfunc is not supplied then the result is just the ending
14 * value of transvalue.
15 *
16 * Other behaviors can be selected by the "aggsplit" mode, which exists
17 * to support partial aggregation. It is possible to:
18 * * Skip running the finalfunc, so that the output is always the
19 * final transvalue state.
20 * * Substitute the combinefunc for the transfunc, so that transvalue
21 * states (propagated up from a child partial-aggregation step) are merged
22 * rather than processing raw input rows. (The statements below about
23 * the transfunc apply equally to the combinefunc, when it's selected.)
24 * * Apply the serializefunc to the output values (this only makes sense
25 * when skipping the finalfunc, since the serializefunc works on the
26 * transvalue data type).
27 * * Apply the deserializefunc to the input values (this only makes sense
28 * when using the combinefunc, for similar reasons).
29 * It is the planner's responsibility to connect up Agg nodes using these
30 * alternate behaviors in a way that makes sense, with partial aggregation
31 * results being fed to nodes that expect them.
32 *
33 * If a normal aggregate call specifies DISTINCT or ORDER BY, we sort the
34 * input tuples and eliminate duplicates (if required) before performing
35 * the above-depicted process. (However, we don't do that for ordered-set
36 * aggregates; their "ORDER BY" inputs are ordinary aggregate arguments
37 * so far as this module is concerned.) Note that partial aggregation
38 * is not supported in these cases, since we couldn't ensure global
39 * ordering or distinctness of the inputs.
40 *
41 * If transfunc is marked "strict" in pg_proc and initcond is NULL,
42 * then the first non-NULL input_value is assigned directly to transvalue,
43 * and transfunc isn't applied until the second non-NULL input_value.
44 * The agg's first input type and transtype must be the same in this case!
45 *
46 * If transfunc is marked "strict" then NULL input_values are skipped,
47 * keeping the previous transvalue. If transfunc is not strict then it
48 * is called for every input tuple and must deal with NULL initcond
49 * or NULL input_values for itself.
50 *
51 * If finalfunc is marked "strict" then it is not called when the
52 * ending transvalue is NULL, instead a NULL result is created
53 * automatically (this is just the usual handling of strict functions,
54 * of course). A non-strict finalfunc can make its own choice of
55 * what to return for a NULL ending transvalue.
56 *
57 * Ordered-set aggregates are treated specially in one other way: we
58 * evaluate any "direct" arguments and pass them to the finalfunc along
59 * with the transition value.
60 *
61 * A finalfunc can have additional arguments beyond the transvalue and
62 * any "direct" arguments, corresponding to the input arguments of the
63 * aggregate. These are always just passed as NULL. Such arguments may be
64 * needed to allow resolution of a polymorphic aggregate's result type.
65 *
66 * We compute aggregate input expressions and run the transition functions
67 * in a temporary econtext (aggstate->tmpcontext). This is reset at least
68 * once per input tuple, so when the transvalue datatype is
69 * pass-by-reference, we have to be careful to copy it into a longer-lived
70 * memory context, and free the prior value to avoid memory leakage. We
71 * store transvalues in another set of econtexts, aggstate->aggcontexts
72 * (one per grouping set, see below), which are also used for the hashtable
73 * structures in AGG_HASHED mode. These econtexts are rescanned, not just
74 * reset, at group boundaries so that aggregate transition functions can
75 * register shutdown callbacks via AggRegisterCallback.
76 *
77 * The node's regular econtext (aggstate->ss.ps.ps_ExprContext) is used to
78 * run finalize functions and compute the output tuple; this context can be
79 * reset once per output tuple.
80 *
81 * The executor's AggState node is passed as the fmgr "context" value in
82 * all transfunc and finalfunc calls. It is not recommended that the
83 * transition functions look at the AggState node directly, but they can
84 * use AggCheckCallContext() to verify that they are being called by
85 * nodeAgg.c (and not as ordinary SQL functions). The main reason a
86 * transition function might want to know this is so that it can avoid
87 * palloc'ing a fixed-size pass-by-ref transition value on every call:
88 * it can instead just scribble on and return its left input. Ordinarily
89 * it is completely forbidden for functions to modify pass-by-ref inputs,
90 * but in the aggregate case we know the left input is either the initial
91 * transition value or a previous function result, and in either case its
92 * value need not be preserved. See int8inc() for an example. Notice that
93 * the EEOP_AGG_PLAIN_TRANS step is coded to avoid a data copy step when
94 * the previous transition value pointer is returned. It is also possible
95 * to avoid repeated data copying when the transition value is an expanded
96 * object: to do that, the transition function must take care to return
97 * an expanded object that is in a child context of the memory context
98 * returned by AggCheckCallContext(). Also, some transition functions want
99 * to store working state in addition to the nominal transition value; they
100 * can use the memory context returned by AggCheckCallContext() to do that.
101 *
102 * Note: AggCheckCallContext() is available as of PostgreSQL 9.0. The
103 * AggState is available as context in earlier releases (back to 8.1),
104 * but direct examination of the node is needed to use it before 9.0.
105 *
106 * As of 9.4, aggregate transition functions can also use AggGetAggref()
107 * to get hold of the Aggref expression node for their aggregate call.
108 * This is mainly intended for ordered-set aggregates, which are not
109 * supported as window functions. (A regular aggregate function would
110 * need some fallback logic to use this, since there's no Aggref node
111 * for a window function.)
112 *
113 * Grouping sets:
114 *
115 * A list of grouping sets which is structurally equivalent to a ROLLUP
116 * clause (e.g. (a,b,c), (a,b), (a)) can be processed in a single pass over
117 * ordered data. We do this by keeping a separate set of transition values
118 * for each grouping set being concurrently processed; for each input tuple
119 * we update them all, and on group boundaries we reset those states
120 * (starting at the front of the list) whose grouping values have changed
121 * (the list of grouping sets is ordered from most specific to least
122 * specific).
123 *
124 * Where more complex grouping sets are used, we break them down into
125 * "phases", where each phase has a different sort order (except phase 0
126 * which is reserved for hashing). During each phase but the last, the
127 * input tuples are additionally stored in a tuplesort which is keyed to the
128 * next phase's sort order; during each phase but the first, the input
129 * tuples are drawn from the previously sorted data. (The sorting of the
130 * data for the first phase is handled by the planner, as it might be
131 * satisfied by underlying nodes.)
132 *
133 * Hashing can be mixed with sorted grouping. To do this, we have an
134 * AGG_MIXED strategy that populates the hashtables during the first sorted
135 * phase, and switches to reading them out after completing all sort phases.
136 * We can also support AGG_HASHED with multiple hash tables and no sorting
137 * at all.
138 *
139 * From the perspective of aggregate transition and final functions, the
140 * only issue regarding grouping sets is this: a single call site (flinfo)
141 * of an aggregate function may be used for updating several different
142 * transition values in turn. So the function must not cache in the flinfo
143 * anything which logically belongs as part of the transition value (most
144 * importantly, the memory context in which the transition value exists).
145 * The support API functions (AggCheckCallContext, AggRegisterCallback) are
146 * sensitive to the grouping set for which the aggregate function is
147 * currently being called.
148 *
149 * Plan structure:
150 *
151 * What we get from the planner is actually one "real" Agg node which is
152 * part of the plan tree proper, but which optionally has an additional list
153 * of Agg nodes hung off the side via the "chain" field. This is because an
154 * Agg node happens to be a convenient representation of all the data we
155 * need for grouping sets.
156 *
157 * For many purposes, we treat the "real" node as if it were just the first
158 * node in the chain. The chain must be ordered such that hashed entries
159 * come before sorted/plain entries; the real node is marked AGG_MIXED if
160 * there are both types present (in which case the real node describes one
161 * of the hashed groupings, other AGG_HASHED nodes may optionally follow in
162 * the chain, followed in turn by AGG_SORTED or (one) AGG_PLAIN node). If
163 * the real node is marked AGG_HASHED or AGG_SORTED, then all the chained
164 * nodes must be of the same type; if it is AGG_PLAIN, there can be no
165 * chained nodes.
166 *
167 * We collect all hashed nodes into a single "phase", numbered 0, and create
168 * a sorted phase (numbered 1..n) for each AGG_SORTED or AGG_PLAIN node.
169 * Phase 0 is allocated even if there are no hashes, but remains unused in
170 * that case.
171 *
172 * AGG_HASHED nodes actually refer to only a single grouping set each,
173 * because for each hashed grouping we need a separate grpColIdx and
174 * numGroups estimate. AGG_SORTED nodes represent a "rollup", a list of
175 * grouping sets that share a sort order. Each AGG_SORTED node other than
176 * the first one has an associated Sort node which describes the sort order
177 * to be used; the first sorted node takes its input from the outer subtree,
178 * which the planner has already arranged to provide ordered data.
179 *
180 * Memory and ExprContext usage:
181 *
182 * Because we're accumulating aggregate values across input rows, we need to
183 * use more memory contexts than just simple input/output tuple contexts.
184 * In fact, for a rollup, we need a separate context for each grouping set
185 * so that we can reset the inner (finer-grained) aggregates on their group
186 * boundaries while continuing to accumulate values for outer
187 * (coarser-grained) groupings. On top of this, we might be simultaneously
188 * populating hashtables; however, we only need one context for all the
189 * hashtables.
190 *
191 * So we create an array, aggcontexts, with an ExprContext for each grouping
192 * set in the largest rollup that we're going to process, and use the
193 * per-tuple memory context of those ExprContexts to store the aggregate
194 * transition values. hashcontext is the single context created to support
195 * all hash tables.
196 *
197 * Spilling To Disk
198 *
199 * When performing hash aggregation, if the hash table memory exceeds the
200 * limit (see hash_agg_check_limits()), we enter "spill mode". In spill
201 * mode, we advance the transition states only for groups already in the
202 * hash table. For tuples that would need to create new hash table
203 * entries (and initialize new transition states), we instead spill them to
204 * disk to be processed later. The tuples are spilled in a partitioned
205 * manner, so that subsequent batches are smaller and less likely to exceed
206 * hash_mem (if a batch does exceed hash_mem, it must be spilled
207 * recursively).
208 *
209 * Spilled data is written to logical tapes. These provide better control
210 * over memory usage, disk space, and the number of files than if we were
211 * to use a BufFile for each spill. We don't know the number of tapes needed
212 * at the start of the algorithm (because it can recurse), so a tape set is
213 * allocated at the beginning, and individual tapes are created as needed.
214 * As a particular tape is read, logtape.c recycles its disk space. When a
215 * tape is read to completion, it is destroyed entirely.
216 *
217 * Tapes' buffers can take up substantial memory when many tapes are open at
218 * once. We only need one tape open at a time in read mode (using a buffer
219 * that's a multiple of BLCKSZ); but we need one tape open in write mode (each
220 * requiring a buffer of size BLCKSZ) for each partition.
221 *
222 * Note that it's possible for transition states to start small but then
223 * grow very large; for instance in the case of ARRAY_AGG. In such cases,
224 * it's still possible to significantly exceed hash_mem. We try to avoid
225 * this situation by estimating what will fit in the available memory, and
226 * imposing a limit on the number of groups separately from the amount of
227 * memory consumed.
228 *
229 * Transition / Combine function invocation:
230 *
231 * For performance reasons transition functions, including combine
232 * functions, aren't invoked one-by-one from nodeAgg.c after computing
233 * arguments using the expression evaluation engine. Instead
234 * ExecBuildAggTrans() builds one large expression that does both argument
235 * evaluation and transition function invocation. That avoids performance
236 * issues due to repeated uses of expression evaluation, complications due
237 * to filter expressions having to be evaluated early, and allows JIT
238 * compilation of the entire expression into one native function.
239 *
240 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
241 * Portions Copyright (c) 1994, Regents of the University of California
242 *
243 * IDENTIFICATION
244 * src/backend/executor/nodeAgg.c
245 *
246 *-------------------------------------------------------------------------
247 */
248
249#include "postgres.h"
250
251#include "access/htup_details.h"
252#include "access/parallel.h"
253#include "catalog/objectaccess.h"
254#include "catalog/pg_aggregate.h"
255#include "catalog/pg_proc.h"
256#include "catalog/pg_type.h"
257#include "common/hashfn.h"
258#include "executor/execExpr.h"
259#include "executor/executor.h"
260#include "executor/nodeAgg.h"
261#include "lib/hyperloglog.h"
262#include "miscadmin.h"
263#include "nodes/nodeFuncs.h"
264#include "optimizer/optimizer.h"
265#include "parser/parse_agg.h"
266#include "parser/parse_coerce.h"
267#include "utils/acl.h"
268#include "utils/builtins.h"
269#include "utils/datum.h"
270#include "utils/dynahash.h"
271#include "utils/expandeddatum.h"
273#include "utils/logtape.h"
274#include "utils/lsyscache.h"
275#include "utils/memutils.h"
277#include "utils/syscache.h"
278#include "utils/tuplesort.h"
279
/*
 * Control how many partitions are created when spilling HashAgg to
 * disk.
 *
 * HASHAGG_PARTITION_FACTOR is multiplied by the estimated number of
 * partitions needed such that each partition will fit in memory. The factor
 * is set higher than one because there's not a high cost to having a few too
 * many partitions, and it makes it less likely that a partition will need to
 * be spilled recursively. Another benefit of having more, smaller partitions
 * is that small hash tables may perform better than large ones due to memory
 * caching effects.
 *
 * We also specify a min and max number of partitions per spill. Too few might
 * mean a lot of wasted I/O from repeated spilling of the same tuples. Too
 * many will result in lots of memory wasted buffering the spill files (which
 * could instead be spent on a larger hash table).
 */
#define HASHAGG_PARTITION_FACTOR 1.50
#define HASHAGG_MIN_PARTITIONS 4
#define HASHAGG_MAX_PARTITIONS 1024
300
/*
 * For reading from tapes, the buffer size must be a multiple of
 * BLCKSZ. Larger values help when reading from multiple tapes concurrently,
 * but that doesn't happen in HashAgg, so we simply use BLCKSZ. Writing to a
 * tape always uses a buffer of size BLCKSZ.
 */
#define HASHAGG_READ_BUFFER_SIZE BLCKSZ
#define HASHAGG_WRITE_BUFFER_SIZE BLCKSZ
309
/*
 * HyperLogLog is used for estimating the cardinality of the spilled tuples in
 * a given partition. 5 bits corresponds to a size of about 32 bytes and a
 * worst-case error of around 18%. That's effective enough to choose a
 * reasonable number of partitions when recursing.
 */
#define HASHAGG_HLL_BIT_WIDTH 5
317
/*
 * Assume the palloc overhead always uses sizeof(MemoryChunk) bytes.
 */
#define CHUNKHDRSZ sizeof(MemoryChunk)
322
323/*
324 * Represents partitioned spill data for a single hashtable. Contains the
325 * necessary information to route tuples to the correct partition, and to
326 * transform the spilled data into new batches.
327 *
328 * The high bits are used for partition selection (when recursing, we ignore
329 * the bits that have already been used for partition selection at an earlier
330 * level).
331 */
332typedef struct HashAggSpill
333{
334 int npartitions; /* number of partitions */
335 LogicalTape **partitions; /* spill partition tapes */
336 int64 *ntuples; /* number of tuples in each partition */
337 uint32 mask; /* mask to find partition from hash value */
338 int shift; /* after masking, shift by this amount */
339 hyperLogLogState *hll_card; /* cardinality estimate for contents */
341
342/*
343 * Represents work to be done for one pass of hash aggregation (with only one
344 * grouping set).
345 *
346 * Also tracks the bits of the hash already used for partition selection by
347 * earlier iterations, so that this batch can use new bits. If all bits have
348 * already been used, no partitioning will be done (any spilled data will go
349 * to a single output tape).
350 */
351typedef struct HashAggBatch
352{
353 int setno; /* grouping set */
354 int used_bits; /* number of bits of hash already used */
355 LogicalTape *input_tape; /* input partition tape */
356 int64 input_tuples; /* number of tuples in this batch */
357 double input_card; /* estimated group cardinality */
359
360/* used to find referenced colnos */
361typedef struct FindColsContext
362{
363 bool is_aggref; /* is under an aggref */
364 Bitmapset *aggregated; /* column references under an aggref */
365 Bitmapset *unaggregated; /* other column references */
367
368static void select_current_set(AggState *aggstate, int setno, bool is_hash);
369static void initialize_phase(AggState *aggstate, int newphase);
370static TupleTableSlot *fetch_input_tuple(AggState *aggstate);
371static void initialize_aggregates(AggState *aggstate,
372 AggStatePerGroup *pergroups,
373 int numReset);
374static void advance_transition_function(AggState *aggstate,
375 AggStatePerTrans pertrans,
376 AggStatePerGroup pergroupstate);
377static void advance_aggregates(AggState *aggstate);
378static void process_ordered_aggregate_single(AggState *aggstate,
379 AggStatePerTrans pertrans,
380 AggStatePerGroup pergroupstate);
381static void process_ordered_aggregate_multi(AggState *aggstate,
382 AggStatePerTrans pertrans,
383 AggStatePerGroup pergroupstate);
384static void finalize_aggregate(AggState *aggstate,
385 AggStatePerAgg peragg,
386 AggStatePerGroup pergroupstate,
387 Datum *resultVal, bool *resultIsNull);
388static void finalize_partialaggregate(AggState *aggstate,
389 AggStatePerAgg peragg,
390 AggStatePerGroup pergroupstate,
391 Datum *resultVal, bool *resultIsNull);
392static inline void prepare_hash_slot(AggStatePerHash perhash,
393 TupleTableSlot *inputslot,
394 TupleTableSlot *hashslot);
395static void prepare_projection_slot(AggState *aggstate,
396 TupleTableSlot *slot,
397 int currentSet);
398static void finalize_aggregates(AggState *aggstate,
399 AggStatePerAgg peraggs,
400 AggStatePerGroup pergroup);
402static void find_cols(AggState *aggstate, Bitmapset **aggregated,
403 Bitmapset **unaggregated);
404static bool find_cols_walker(Node *node, FindColsContext *context);
405static void build_hash_tables(AggState *aggstate);
406static void build_hash_table(AggState *aggstate, int setno, long nbuckets);
407static void hashagg_recompile_expressions(AggState *aggstate, bool minslot,
408 bool nullcheck);
409static long hash_choose_num_buckets(double hashentrysize,
410 long ngroups, Size memory);
411static int hash_choose_num_partitions(double input_groups,
412 double hashentrysize,
413 int used_bits,
414 int *log2_npartitions);
415static void initialize_hash_entry(AggState *aggstate,
416 TupleHashTable hashtable,
417 TupleHashEntry entry);
418static void lookup_hash_entries(AggState *aggstate);
420static void agg_fill_hash_table(AggState *aggstate);
421static bool agg_refill_hash_table(AggState *aggstate);
424static void hash_agg_check_limits(AggState *aggstate);
425static void hash_agg_enter_spill_mode(AggState *aggstate);
426static void hash_agg_update_metrics(AggState *aggstate, bool from_tape,
427 int npartitions);
428static void hashagg_finish_initial_spills(AggState *aggstate);
429static void hashagg_reset_spill_state(AggState *aggstate);
430static HashAggBatch *hashagg_batch_new(LogicalTape *input_tape, int setno,
431 int64 input_tuples, double input_card,
432 int used_bits);
434static void hashagg_spill_init(HashAggSpill *spill, LogicalTapeSet *tapeset,
435 int used_bits, double input_groups,
436 double hashentrysize);
437static Size hashagg_spill_tuple(AggState *aggstate, HashAggSpill *spill,
438 TupleTableSlot *inputslot, uint32 hash);
439static void hashagg_spill_finish(AggState *aggstate, HashAggSpill *spill,
440 int setno);
441static Datum GetAggInitVal(Datum textInitVal, Oid transtype);
443 AggState *aggstate, EState *estate,
444 Aggref *aggref, Oid transfn_oid,
445 Oid aggtranstype, Oid aggserialfn,
446 Oid aggdeserialfn, Datum initValue,
447 bool initValueIsNull, Oid *inputTypes,
448 int numArguments);
449
450
451/*
452 * Select the current grouping set; affects current_set and
453 * curaggcontext.
454 */
455static void
456select_current_set(AggState *aggstate, int setno, bool is_hash)
457{
458 /*
459 * When changing this, also adapt ExecAggPlainTransByVal() and
460 * ExecAggPlainTransByRef().
461 */
462 if (is_hash)
463 aggstate->curaggcontext = aggstate->hashcontext;
464 else
465 aggstate->curaggcontext = aggstate->aggcontexts[setno];
466
467 aggstate->current_set = setno;
468}
469
470/*
471 * Switch to phase "newphase", which must either be 0 or 1 (to reset) or
472 * current_phase + 1. Juggle the tuplesorts accordingly.
473 *
474 * Phase 0 is for hashing, which we currently handle last in the AGG_MIXED
475 * case, so when entering phase 0, all we need to do is drop open sorts.
476 */
477static void
478initialize_phase(AggState *aggstate, int newphase)
479{
480 Assert(newphase <= 1 || newphase == aggstate->current_phase + 1);
481
482 /*
483 * Whatever the previous state, we're now done with whatever input
484 * tuplesort was in use.
485 */
486 if (aggstate->sort_in)
487 {
488 tuplesort_end(aggstate->sort_in);
489 aggstate->sort_in = NULL;
490 }
491
492 if (newphase <= 1)
493 {
494 /*
495 * Discard any existing output tuplesort.
496 */
497 if (aggstate->sort_out)
498 {
499 tuplesort_end(aggstate->sort_out);
500 aggstate->sort_out = NULL;
501 }
502 }
503 else
504 {
505 /*
506 * The old output tuplesort becomes the new input one, and this is the
507 * right time to actually sort it.
508 */
509 aggstate->sort_in = aggstate->sort_out;
510 aggstate->sort_out = NULL;
511 Assert(aggstate->sort_in);
513 }
514
515 /*
516 * If this isn't the last phase, we need to sort appropriately for the
517 * next phase in sequence.
518 */
519 if (newphase > 0 && newphase < aggstate->numphases - 1)
520 {
521 Sort *sortnode = aggstate->phases[newphase + 1].sortnode;
522 PlanState *outerNode = outerPlanState(aggstate);
523 TupleDesc tupDesc = ExecGetResultType(outerNode);
524
525 aggstate->sort_out = tuplesort_begin_heap(tupDesc,
526 sortnode->numCols,
527 sortnode->sortColIdx,
528 sortnode->sortOperators,
529 sortnode->collations,
530 sortnode->nullsFirst,
531 work_mem,
532 NULL, TUPLESORT_NONE);
533 }
534
535 aggstate->current_phase = newphase;
536 aggstate->phase = &aggstate->phases[newphase];
537}
538
539/*
540 * Fetch a tuple from either the outer plan (for phase 1) or from the sorter
541 * populated by the previous phase. Copy it to the sorter for the next phase
542 * if any.
543 *
544 * Callers cannot rely on memory for tuple in returned slot remaining valid
545 * past any subsequently fetched tuple.
546 */
547static TupleTableSlot *
549{
550 TupleTableSlot *slot;
551
552 if (aggstate->sort_in)
553 {
554 /* make sure we check for interrupts in either path through here */
556 if (!tuplesort_gettupleslot(aggstate->sort_in, true, false,
557 aggstate->sort_slot, NULL))
558 return NULL;
559 slot = aggstate->sort_slot;
560 }
561 else
562 slot = ExecProcNode(outerPlanState(aggstate));
563
564 if (!TupIsNull(slot) && aggstate->sort_out)
565 tuplesort_puttupleslot(aggstate->sort_out, slot);
566
567 return slot;
568}
569
570/*
571 * (Re)Initialize an individual aggregate.
572 *
573 * This function handles only one grouping set, already set in
574 * aggstate->current_set.
575 *
576 * When called, CurrentMemoryContext should be the per-query context.
577 */
578static void
580 AggStatePerGroup pergroupstate)
581{
582 /*
583 * Start a fresh sort operation for each DISTINCT/ORDER BY aggregate.
584 */
585 if (pertrans->aggsortrequired)
586 {
587 /*
588 * In case of rescan, maybe there could be an uncompleted sort
589 * operation? Clean it up if so.
590 */
591 if (pertrans->sortstates[aggstate->current_set])
592 tuplesort_end(pertrans->sortstates[aggstate->current_set]);
593
594
595 /*
596 * We use a plain Datum sorter when there's a single input column;
597 * otherwise sort the full tuple. (See comments for
598 * process_ordered_aggregate_single.)
599 */
600 if (pertrans->numInputs == 1)
601 {
602 Form_pg_attribute attr = TupleDescAttr(pertrans->sortdesc, 0);
603
604 pertrans->sortstates[aggstate->current_set] =
605 tuplesort_begin_datum(attr->atttypid,
606 pertrans->sortOperators[0],
607 pertrans->sortCollations[0],
608 pertrans->sortNullsFirst[0],
609 work_mem, NULL, TUPLESORT_NONE);
610 }
611 else
612 pertrans->sortstates[aggstate->current_set] =
614 pertrans->numSortCols,
615 pertrans->sortColIdx,
616 pertrans->sortOperators,
617 pertrans->sortCollations,
618 pertrans->sortNullsFirst,
619 work_mem, NULL, TUPLESORT_NONE);
620 }
621
622 /*
623 * (Re)set transValue to the initial value.
624 *
625 * Note that when the initial value is pass-by-ref, we must copy it (into
626 * the aggcontext) since we will pfree the transValue later.
627 */
628 if (pertrans->initValueIsNull)
629 pergroupstate->transValue = pertrans->initValue;
630 else
631 {
632 MemoryContext oldContext;
633
635 pergroupstate->transValue = datumCopy(pertrans->initValue,
636 pertrans->transtypeByVal,
637 pertrans->transtypeLen);
638 MemoryContextSwitchTo(oldContext);
639 }
640 pergroupstate->transValueIsNull = pertrans->initValueIsNull;
641
642 /*
643 * If the initial value for the transition state doesn't exist in the
644 * pg_aggregate table then we will let the first non-NULL value returned
645 * from the outer procNode become the initial value. (This is useful for
646 * aggregates like max() and min().) The noTransValue flag signals that we
647 * still need to do this.
648 */
649 pergroupstate->noTransValue = pertrans->initValueIsNull;
650}
651
652/*
653 * Initialize all aggregate transition states for a new group of input values.
654 *
655 * If there are multiple grouping sets, we initialize only the first numReset
656 * of them (the grouping sets are ordered so that the most specific one, which
657 * is reset most often, is first). As a convenience, if numReset is 0, we
658 * reinitialize all sets.
659 *
660 * NB: This cannot be used for hash aggregates, as for those the grouping set
661 * number has to be specified from further up.
662 *
663 * When called, CurrentMemoryContext should be the per-query context.
664 */
665static void
667 AggStatePerGroup *pergroups,
668 int numReset)
669{
670 int transno;
671 int numGroupingSets = Max(aggstate->phase->numsets, 1);
672 int setno = 0;
673 int numTrans = aggstate->numtrans;
674 AggStatePerTrans transstates = aggstate->pertrans;
675
676 if (numReset == 0)
677 numReset = numGroupingSets;
678
679 for (setno = 0; setno < numReset; setno++)
680 {
681 AggStatePerGroup pergroup = pergroups[setno];
682
683 select_current_set(aggstate, setno, false);
684
685 for (transno = 0; transno < numTrans; transno++)
686 {
687 AggStatePerTrans pertrans = &transstates[transno];
688 AggStatePerGroup pergroupstate = &pergroup[transno];
689
690 initialize_aggregate(aggstate, pertrans, pergroupstate);
691 }
692 }
693}
694
695/*
696 * Given new input value(s), advance the transition function of one aggregate
697 * state within one grouping set only (already set in aggstate->current_set)
698 *
699 * The new values (and null flags) have been preloaded into argument positions
700 * 1 and up in pertrans->transfn_fcinfo, so that we needn't copy them again to
701 * pass to the transition function. We also expect that the static fields of
702 * the fcinfo are already initialized; that was done by ExecInitAgg().
703 *
704 * It doesn't matter which memory context this is called in.
705 */
706static void
708 AggStatePerTrans pertrans,
709 AggStatePerGroup pergroupstate)
710{
711 FunctionCallInfo fcinfo = pertrans->transfn_fcinfo;
712 MemoryContext oldContext;
713 Datum newVal;
714
715 if (pertrans->transfn.fn_strict)
716 {
717 /*
718 * For a strict transfn, nothing happens when there's a NULL input; we
719 * just keep the prior transValue.
720 */
721 int numTransInputs = pertrans->numTransInputs;
722 int i;
723
724 for (i = 1; i <= numTransInputs; i++)
725 {
726 if (fcinfo->args[i].isnull)
727 return;
728 }
729 if (pergroupstate->noTransValue)
730 {
731 /*
732 * transValue has not been initialized. This is the first non-NULL
733 * input value. We use it as the initial value for transValue. (We
734 * already checked that the agg's input type is binary-compatible
735 * with its transtype, so straight copy here is OK.)
736 *
737 * We must copy the datum into aggcontext if it is pass-by-ref. We
738 * do not need to pfree the old transValue, since it's NULL.
739 */
741 pergroupstate->transValue = datumCopy(fcinfo->args[1].value,
742 pertrans->transtypeByVal,
743 pertrans->transtypeLen);
744 pergroupstate->transValueIsNull = false;
745 pergroupstate->noTransValue = false;
746 MemoryContextSwitchTo(oldContext);
747 return;
748 }
749 if (pergroupstate->transValueIsNull)
750 {
751 /*
752 * Don't call a strict function with NULL inputs. Note it is
753 * possible to get here despite the above tests, if the transfn is
754 * strict *and* returned a NULL on a prior cycle. If that happens
755 * we will propagate the NULL all the way to the end.
756 */
757 return;
758 }
759 }
760
761 /* We run the transition functions in per-input-tuple memory context */
763
764 /* set up aggstate->curpertrans for AggGetAggref() */
765 aggstate->curpertrans = pertrans;
766
767 /*
768 * OK to call the transition function
769 */
770 fcinfo->args[0].value = pergroupstate->transValue;
771 fcinfo->args[0].isnull = pergroupstate->transValueIsNull;
772 fcinfo->isnull = false; /* just in case transfn doesn't set it */
773
774 newVal = FunctionCallInvoke(fcinfo);
775
776 aggstate->curpertrans = NULL;
777
778 /*
779 * If pass-by-ref datatype, must copy the new value into aggcontext and
780 * free the prior transValue. But if transfn returned a pointer to its
781 * first input, we don't need to do anything.
782 *
783 * It's safe to compare newVal with pergroup->transValue without regard
784 * for either being NULL, because ExecAggCopyTransValue takes care to set
785 * transValue to 0 when NULL. Otherwise we could end up accidentally not
786 * reparenting, when the transValue has the same numerical value as
787 * newValue, despite being NULL. This is a somewhat hot path, making it
788 * undesirable to instead solve this with another branch for the common
789 * case of the transition function returning its (modified) input
790 * argument.
791 */
792 if (!pertrans->transtypeByVal &&
793 DatumGetPointer(newVal) != DatumGetPointer(pergroupstate->transValue))
794 newVal = ExecAggCopyTransValue(aggstate, pertrans,
795 newVal, fcinfo->isnull,
796 pergroupstate->transValue,
797 pergroupstate->transValueIsNull);
798
799 pergroupstate->transValue = newVal;
800 pergroupstate->transValueIsNull = fcinfo->isnull;
801
802 MemoryContextSwitchTo(oldContext);
803}
804
805/*
806 * Advance each aggregate transition state for one input tuple. The input
807 * tuple has been stored in tmpcontext->ecxt_outertuple, so that it is
808 * accessible to ExecEvalExpr.
809 *
810 * We have two sets of transition states to handle: one for sorted aggregation
811 * and one for hashed; we do them both here, to avoid multiple evaluation of
812 * the inputs.
813 *
814 * When called, CurrentMemoryContext should be the per-query context.
815 */
816static void
818{
819 bool dummynull;
820
822 aggstate->tmpcontext,
823 &dummynull);
824}
825
826/*
827 * Run the transition function for a DISTINCT or ORDER BY aggregate
828 * with only one input. This is called after we have completed
829 * entering all the input values into the sort object. We complete the
830 * sort, read out the values in sorted order, and run the transition
831 * function on each value (applying DISTINCT if appropriate).
832 *
833 * Note that the strictness of the transition function was checked when
834 * entering the values into the sort, so we don't check it again here;
835 * we just apply standard SQL DISTINCT logic.
836 *
837 * The one-input case is handled separately from the multi-input case
838 * for performance reasons: for single by-value inputs, such as the
839 * common case of count(distinct id), the tuplesort_getdatum code path
840 * is around 300% faster. (The speedup for by-reference types is less
841 * but still noticeable.)
842 *
843 * This function handles only one grouping set (already set in
844 * aggstate->current_set).
845 *
846 * When called, CurrentMemoryContext should be the per-query context.
847 */
848static void
850 AggStatePerTrans pertrans,
851 AggStatePerGroup pergroupstate)
852{
853 Datum oldVal = (Datum) 0;
854 bool oldIsNull = true;
855 bool haveOldVal = false;
856 MemoryContext workcontext = aggstate->tmpcontext->ecxt_per_tuple_memory;
857 MemoryContext oldContext;
858 bool isDistinct = (pertrans->numDistinctCols > 0);
859 Datum newAbbrevVal = (Datum) 0;
860 Datum oldAbbrevVal = (Datum) 0;
861 FunctionCallInfo fcinfo = pertrans->transfn_fcinfo;
862 Datum *newVal;
863 bool *isNull;
864
865 Assert(pertrans->numDistinctCols < 2);
866
867 tuplesort_performsort(pertrans->sortstates[aggstate->current_set]);
868
869 /* Load the column into argument 1 (arg 0 will be transition value) */
870 newVal = &fcinfo->args[1].value;
871 isNull = &fcinfo->args[1].isnull;
872
873 /*
874 * Note: if input type is pass-by-ref, the datums returned by the sort are
875 * freshly palloc'd in the per-query context, so we must be careful to
876 * pfree them when they are no longer needed.
877 */
878
879 while (tuplesort_getdatum(pertrans->sortstates[aggstate->current_set],
880 true, false, newVal, isNull, &newAbbrevVal))
881 {
882 /*
883 * Clear and select the working context for evaluation of the equality
884 * function and transition function.
885 */
886 MemoryContextReset(workcontext);
887 oldContext = MemoryContextSwitchTo(workcontext);
888
889 /*
890 * If DISTINCT mode, and not distinct from prior, skip it.
891 */
892 if (isDistinct &&
893 haveOldVal &&
894 ((oldIsNull && *isNull) ||
895 (!oldIsNull && !*isNull &&
896 oldAbbrevVal == newAbbrevVal &&
898 pertrans->aggCollation,
899 oldVal, *newVal)))))
900 {
901 MemoryContextSwitchTo(oldContext);
902 continue;
903 }
904 else
905 {
906 advance_transition_function(aggstate, pertrans, pergroupstate);
907
908 MemoryContextSwitchTo(oldContext);
909
910 /*
911 * Forget the old value, if any, and remember the new one for
912 * subsequent equality checks.
913 */
914 if (!pertrans->inputtypeByVal)
915 {
916 if (!oldIsNull)
917 pfree(DatumGetPointer(oldVal));
918 if (!*isNull)
919 oldVal = datumCopy(*newVal, pertrans->inputtypeByVal,
920 pertrans->inputtypeLen);
921 }
922 else
923 oldVal = *newVal;
924 oldAbbrevVal = newAbbrevVal;
925 oldIsNull = *isNull;
926 haveOldVal = true;
927 }
928 }
929
930 if (!oldIsNull && !pertrans->inputtypeByVal)
931 pfree(DatumGetPointer(oldVal));
932
933 tuplesort_end(pertrans->sortstates[aggstate->current_set]);
934 pertrans->sortstates[aggstate->current_set] = NULL;
935}
936
937/*
938 * Run the transition function for a DISTINCT or ORDER BY aggregate
939 * with more than one input. This is called after we have completed
940 * entering all the input values into the sort object. We complete the
941 * sort, read out the values in sorted order, and run the transition
942 * function on each value (applying DISTINCT if appropriate).
943 *
944 * This function handles only one grouping set (already set in
945 * aggstate->current_set).
946 *
947 * When called, CurrentMemoryContext should be the per-query context.
948 */
949static void
951 AggStatePerTrans pertrans,
952 AggStatePerGroup pergroupstate)
953{
954 ExprContext *tmpcontext = aggstate->tmpcontext;
955 FunctionCallInfo fcinfo = pertrans->transfn_fcinfo;
956 TupleTableSlot *slot1 = pertrans->sortslot;
957 TupleTableSlot *slot2 = pertrans->uniqslot;
958 int numTransInputs = pertrans->numTransInputs;
959 int numDistinctCols = pertrans->numDistinctCols;
960 Datum newAbbrevVal = (Datum) 0;
961 Datum oldAbbrevVal = (Datum) 0;
962 bool haveOldValue = false;
963 TupleTableSlot *save = aggstate->tmpcontext->ecxt_outertuple;
964 int i;
965
966 tuplesort_performsort(pertrans->sortstates[aggstate->current_set]);
967
968 ExecClearTuple(slot1);
969 if (slot2)
970 ExecClearTuple(slot2);
971
972 while (tuplesort_gettupleslot(pertrans->sortstates[aggstate->current_set],
973 true, true, slot1, &newAbbrevVal))
974 {
976
977 tmpcontext->ecxt_outertuple = slot1;
978 tmpcontext->ecxt_innertuple = slot2;
979
980 if (numDistinctCols == 0 ||
981 !haveOldValue ||
982 newAbbrevVal != oldAbbrevVal ||
983 !ExecQual(pertrans->equalfnMulti, tmpcontext))
984 {
985 /*
986 * Extract the first numTransInputs columns as datums to pass to
987 * the transfn.
988 */
989 slot_getsomeattrs(slot1, numTransInputs);
990
991 /* Load values into fcinfo */
992 /* Start from 1, since the 0th arg will be the transition value */
993 for (i = 0; i < numTransInputs; i++)
994 {
995 fcinfo->args[i + 1].value = slot1->tts_values[i];
996 fcinfo->args[i + 1].isnull = slot1->tts_isnull[i];
997 }
998
999 advance_transition_function(aggstate, pertrans, pergroupstate);
1000
1001 if (numDistinctCols > 0)
1002 {
1003 /* swap the slot pointers to retain the current tuple */
1004 TupleTableSlot *tmpslot = slot2;
1005
1006 slot2 = slot1;
1007 slot1 = tmpslot;
1008 /* avoid ExecQual() calls by reusing abbreviated keys */
1009 oldAbbrevVal = newAbbrevVal;
1010 haveOldValue = true;
1011 }
1012 }
1013
1014 /* Reset context each time */
1015 ResetExprContext(tmpcontext);
1016
1017 ExecClearTuple(slot1);
1018 }
1019
1020 if (slot2)
1021 ExecClearTuple(slot2);
1022
1023 tuplesort_end(pertrans->sortstates[aggstate->current_set]);
1024 pertrans->sortstates[aggstate->current_set] = NULL;
1025
1026 /* restore previous slot, potentially in use for grouping sets */
1027 tmpcontext->ecxt_outertuple = save;
1028}
1029
1030/*
1031 * Compute the final value of one aggregate.
1032 *
1033 * This function handles only one grouping set (already set in
1034 * aggstate->current_set).
1035 *
1036 * The finalfn will be run, and the result delivered, in the
1037 * output-tuple context; caller's CurrentMemoryContext does not matter.
1038 * (But note that in some cases, such as when there is no finalfn, the
1039 * result might be a pointer to or into the agg's transition value.)
1040 *
1041 * The finalfn uses the state as set in the transno. This also might be
1042 * being used by another aggregate function, so it's important that we do
1043 * nothing destructive here. Moreover, the aggregate's final value might
1044 * get used in multiple places, so we mustn't return a R/W expanded datum.
1045 */
1046static void
1048 AggStatePerAgg peragg,
1049 AggStatePerGroup pergroupstate,
1050 Datum *resultVal, bool *resultIsNull)
1051{
1052 LOCAL_FCINFO(fcinfo, FUNC_MAX_ARGS);
1053 bool anynull = false;
1054 MemoryContext oldContext;
1055 int i;
1056 ListCell *lc;
1057 AggStatePerTrans pertrans = &aggstate->pertrans[peragg->transno];
1058
1060
1061 /*
1062 * Evaluate any direct arguments. We do this even if there's no finalfn
1063 * (which is unlikely anyway), so that side-effects happen as expected.
1064 * The direct arguments go into arg positions 1 and up, leaving position 0
1065 * for the transition state value.
1066 */
1067 i = 1;
1068 foreach(lc, peragg->aggdirectargs)
1069 {
1070 ExprState *expr = (ExprState *) lfirst(lc);
1071
1072 fcinfo->args[i].value = ExecEvalExpr(expr,
1073 aggstate->ss.ps.ps_ExprContext,
1074 &fcinfo->args[i].isnull);
1075 anynull |= fcinfo->args[i].isnull;
1076 i++;
1077 }
1078
1079 /*
1080 * Apply the agg's finalfn if one is provided, else return transValue.
1081 */
1082 if (OidIsValid(peragg->finalfn_oid))
1083 {
1084 int numFinalArgs = peragg->numFinalArgs;
1085
1086 /* set up aggstate->curperagg for AggGetAggref() */
1087 aggstate->curperagg = peragg;
1088
1089 InitFunctionCallInfoData(*fcinfo, &peragg->finalfn,
1090 numFinalArgs,
1091 pertrans->aggCollation,
1092 (Node *) aggstate, NULL);
1093
1094 /* Fill in the transition state value */
1095 fcinfo->args[0].value =
1097 pergroupstate->transValueIsNull,
1098 pertrans->transtypeLen);
1099 fcinfo->args[0].isnull = pergroupstate->transValueIsNull;
1100 anynull |= pergroupstate->transValueIsNull;
1101
1102 /* Fill any remaining argument positions with nulls */
1103 for (; i < numFinalArgs; i++)
1104 {
1105 fcinfo->args[i].value = (Datum) 0;
1106 fcinfo->args[i].isnull = true;
1107 anynull = true;
1108 }
1109
1110 if (fcinfo->flinfo->fn_strict && anynull)
1111 {
1112 /* don't call a strict function with NULL inputs */
1113 *resultVal = (Datum) 0;
1114 *resultIsNull = true;
1115 }
1116 else
1117 {
1118 Datum result;
1119
1120 result = FunctionCallInvoke(fcinfo);
1121 *resultIsNull = fcinfo->isnull;
1122 *resultVal = MakeExpandedObjectReadOnly(result,
1123 fcinfo->isnull,
1124 peragg->resulttypeLen);
1125 }
1126 aggstate->curperagg = NULL;
1127 }
1128 else
1129 {
1130 *resultVal =
1132 pergroupstate->transValueIsNull,
1133 pertrans->transtypeLen);
1134 *resultIsNull = pergroupstate->transValueIsNull;
1135 }
1136
1137 MemoryContextSwitchTo(oldContext);
1138}
1139
1140/*
1141 * Compute the output value of one partial aggregate.
1142 *
1143 * The serialization function will be run, and the result delivered, in the
1144 * output-tuple context; caller's CurrentMemoryContext does not matter.
1145 */
1146static void
1148 AggStatePerAgg peragg,
1149 AggStatePerGroup pergroupstate,
1150 Datum *resultVal, bool *resultIsNull)
1151{
1152 AggStatePerTrans pertrans = &aggstate->pertrans[peragg->transno];
1153 MemoryContext oldContext;
1154
1156
1157 /*
1158 * serialfn_oid will be set if we must serialize the transvalue before
1159 * returning it
1160 */
1161 if (OidIsValid(pertrans->serialfn_oid))
1162 {
1163 /* Don't call a strict serialization function with NULL input. */
1164 if (pertrans->serialfn.fn_strict && pergroupstate->transValueIsNull)
1165 {
1166 *resultVal = (Datum) 0;
1167 *resultIsNull = true;
1168 }
1169 else
1170 {
1171 FunctionCallInfo fcinfo = pertrans->serialfn_fcinfo;
1172 Datum result;
1173
1174 fcinfo->args[0].value =
1176 pergroupstate->transValueIsNull,
1177 pertrans->transtypeLen);
1178 fcinfo->args[0].isnull = pergroupstate->transValueIsNull;
1179 fcinfo->isnull = false;
1180
1181 result = FunctionCallInvoke(fcinfo);
1182 *resultIsNull = fcinfo->isnull;
1183 *resultVal = MakeExpandedObjectReadOnly(result,
1184 fcinfo->isnull,
1185 peragg->resulttypeLen);
1186 }
1187 }
1188 else
1189 {
1190 *resultVal =
1192 pergroupstate->transValueIsNull,
1193 pertrans->transtypeLen);
1194 *resultIsNull = pergroupstate->transValueIsNull;
1195 }
1196
1197 MemoryContextSwitchTo(oldContext);
1198}
1199
1200/*
1201 * Extract the attributes that make up the grouping key into the
1202 * hashslot. This is necessary to compute the hash or perform a lookup.
1203 */
1204static inline void
1206 TupleTableSlot *inputslot,
1207 TupleTableSlot *hashslot)
1208{
1209 int i;
1210
1211 /* transfer just the needed columns into hashslot */
1212 slot_getsomeattrs(inputslot, perhash->largestGrpColIdx);
1213 ExecClearTuple(hashslot);
1214
1215 for (i = 0; i < perhash->numhashGrpCols; i++)
1216 {
1217 int varNumber = perhash->hashGrpColIdxInput[i] - 1;
1218
1219 hashslot->tts_values[i] = inputslot->tts_values[varNumber];
1220 hashslot->tts_isnull[i] = inputslot->tts_isnull[varNumber];
1221 }
1222 ExecStoreVirtualTuple(hashslot);
1223}
1224
1225/*
1226 * Prepare to finalize and project based on the specified representative tuple
1227 * slot and grouping set.
1228 *
1229 * In the specified tuple slot, force to null all attributes that should be
1230 * read as null in the context of the current grouping set. Also stash the
1231 * current group bitmap where GroupingExpr can get at it.
1232 *
1233 * This relies on three conditions:
1234 *
1235 * 1) Nothing is ever going to try and extract the whole tuple from this slot,
1236 * only reference it in evaluations, which will only access individual
1237 * attributes.
1238 *
1239 * 2) No system columns are going to need to be nulled. (If a system column is
1240 * referenced in a group clause, it is actually projected in the outer plan
1241 * tlist.)
1242 *
1243 * 3) Within a given phase, we never need to recover the value of an attribute
1244 * once it has been set to null.
1245 *
1246 * Poking into the slot this way is a bit ugly, but the consensus is that the
1247 * alternative was worse.
1248 */
1249static void
1250prepare_projection_slot(AggState *aggstate, TupleTableSlot *slot, int currentSet)
1251{
1252 if (aggstate->phase->grouped_cols)
1253 {
1254 Bitmapset *grouped_cols = aggstate->phase->grouped_cols[currentSet];
1255
1256 aggstate->grouped_cols = grouped_cols;
1257
1258 if (TTS_EMPTY(slot))
1259 {
1260 /*
1261 * Force all values to be NULL if working on an empty input tuple
1262 * (i.e. an empty grouping set for which no input rows were
1263 * supplied).
1264 */
1266 }
1267 else if (aggstate->all_grouped_cols)
1268 {
1269 ListCell *lc;
1270
1271 /* all_grouped_cols is arranged in desc order */
1273
1274 foreach(lc, aggstate->all_grouped_cols)
1275 {
1276 int attnum = lfirst_int(lc);
1277
1278 if (!bms_is_member(attnum, grouped_cols))
1279 slot->tts_isnull[attnum - 1] = true;
1280 }
1281 }
1282 }
1283}
1284
1285/*
1286 * Compute the final value of all aggregates for one group.
1287 *
1288 * This function handles only one grouping set at a time, which the caller must
1289 * have selected. It's also the caller's responsibility to adjust the supplied
1290 * pergroup parameter to point to the current set's transvalues.
1291 *
1292 * Results are stored in the output econtext aggvalues/aggnulls.
1293 */
1294static void
1296 AggStatePerAgg peraggs,
1297 AggStatePerGroup pergroup)
1298{
1299 ExprContext *econtext = aggstate->ss.ps.ps_ExprContext;
1300 Datum *aggvalues = econtext->ecxt_aggvalues;
1301 bool *aggnulls = econtext->ecxt_aggnulls;
1302 int aggno;
1303
1304 /*
1305 * If there were any DISTINCT and/or ORDER BY aggregates, sort their
1306 * inputs and run the transition functions.
1307 */
1308 for (int transno = 0; transno < aggstate->numtrans; transno++)
1309 {
1310 AggStatePerTrans pertrans = &aggstate->pertrans[transno];
1311 AggStatePerGroup pergroupstate;
1312
1313 pergroupstate = &pergroup[transno];
1314
1315 if (pertrans->aggsortrequired)
1316 {
1317 Assert(aggstate->aggstrategy != AGG_HASHED &&
1318 aggstate->aggstrategy != AGG_MIXED);
1319
1320 if (pertrans->numInputs == 1)
1322 pertrans,
1323 pergroupstate);
1324 else
1326 pertrans,
1327 pergroupstate);
1328 }
1329 else if (pertrans->numDistinctCols > 0 && pertrans->haslast)
1330 {
1331 pertrans->haslast = false;
1332
1333 if (pertrans->numDistinctCols == 1)
1334 {
1335 if (!pertrans->inputtypeByVal && !pertrans->lastisnull)
1336 pfree(DatumGetPointer(pertrans->lastdatum));
1337
1338 pertrans->lastisnull = false;
1339 pertrans->lastdatum = (Datum) 0;
1340 }
1341 else
1342 ExecClearTuple(pertrans->uniqslot);
1343 }
1344 }
1345
1346 /*
1347 * Run the final functions.
1348 */
1349 for (aggno = 0; aggno < aggstate->numaggs; aggno++)
1350 {
1351 AggStatePerAgg peragg = &peraggs[aggno];
1352 int transno = peragg->transno;
1353 AggStatePerGroup pergroupstate;
1354
1355 pergroupstate = &pergroup[transno];
1356
1357 if (DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit))
1358 finalize_partialaggregate(aggstate, peragg, pergroupstate,
1359 &aggvalues[aggno], &aggnulls[aggno]);
1360 else
1361 finalize_aggregate(aggstate, peragg, pergroupstate,
1362 &aggvalues[aggno], &aggnulls[aggno]);
1363 }
1364}
1365
1366/*
1367 * Project the result of a group (whose aggs have already been calculated by
1368 * finalize_aggregates). Returns the result slot, or NULL if no row is
1369 * projected (suppressed by qual).
1370 */
1371static TupleTableSlot *
1373{
1374 ExprContext *econtext = aggstate->ss.ps.ps_ExprContext;
1375
1376 /*
1377 * Check the qual (HAVING clause); if the group does not match, ignore it.
1378 */
1379 if (ExecQual(aggstate->ss.ps.qual, econtext))
1380 {
1381 /*
1382 * Form and return projection tuple using the aggregate results and
1383 * the representative input tuple.
1384 */
1385 return ExecProject(aggstate->ss.ps.ps_ProjInfo);
1386 }
1387 else
1388 InstrCountFiltered1(aggstate, 1);
1389
1390 return NULL;
1391}
1392
1393/*
1394 * Find input-tuple columns that are needed, dividing them into
1395 * aggregated and unaggregated sets.
1396 */
1397static void
1398find_cols(AggState *aggstate, Bitmapset **aggregated, Bitmapset **unaggregated)
1399{
1400 Agg *agg = (Agg *) aggstate->ss.ps.plan;
1401 FindColsContext context;
1402
1403 context.is_aggref = false;
1404 context.aggregated = NULL;
1405 context.unaggregated = NULL;
1406
1407 /* Examine tlist and quals */
1408 (void) find_cols_walker((Node *) agg->plan.targetlist, &context);
1409 (void) find_cols_walker((Node *) agg->plan.qual, &context);
1410
1411 /* In some cases, grouping columns will not appear in the tlist */
1412 for (int i = 0; i < agg->numCols; i++)
1413 context.unaggregated = bms_add_member(context.unaggregated,
1414 agg->grpColIdx[i]);
1415
1416 *aggregated = context.aggregated;
1417 *unaggregated = context.unaggregated;
1418}
1419
1420static bool
1422{
1423 if (node == NULL)
1424 return false;
1425 if (IsA(node, Var))
1426 {
1427 Var *var = (Var *) node;
1428
1429 /* setrefs.c should have set the varno to OUTER_VAR */
1430 Assert(var->varno == OUTER_VAR);
1431 Assert(var->varlevelsup == 0);
1432 if (context->is_aggref)
1433 context->aggregated = bms_add_member(context->aggregated,
1434 var->varattno);
1435 else
1436 context->unaggregated = bms_add_member(context->unaggregated,
1437 var->varattno);
1438 return false;
1439 }
1440 if (IsA(node, Aggref))
1441 {
1442 Assert(!context->is_aggref);
1443 context->is_aggref = true;
1445 context->is_aggref = false;
1446 return false;
1447 }
1448 return expression_tree_walker(node, find_cols_walker, context);
1449}
1450
1451/*
1452 * (Re-)initialize the hash table(s) to empty.
1453 *
1454 * To implement hashed aggregation, we need a hashtable that stores a
1455 * representative tuple and an array of AggStatePerGroup structs for each
1456 * distinct set of GROUP BY column values. We compute the hash key from the
1457 * GROUP BY columns. The per-group data is allocated in lookup_hash_entry(),
1458 * for each entry.
1459 *
1460 * We have a separate hashtable and associated perhash data structure for each
1461 * grouping set for which we're doing hashing.
1462 *
1463 * The contents of the hash tables always live in the hashcontext's per-tuple
1464 * memory context (there is only one of these for all tables together, since
1465 * they are all reset at the same time).
1466 */
1467static void
1469{
1470 int setno;
1471
1472 for (setno = 0; setno < aggstate->num_hashes; ++setno)
1473 {
1474 AggStatePerHash perhash = &aggstate->perhash[setno];
1475 long nbuckets;
1476 Size memory;
1477
1478 if (perhash->hashtable != NULL)
1479 {
1481 continue;
1482 }
1483
1484 Assert(perhash->aggnode->numGroups > 0);
1485
1486 memory = aggstate->hash_mem_limit / aggstate->num_hashes;
1487
1488 /* choose reasonable number of buckets per hashtable */
1489 nbuckets = hash_choose_num_buckets(aggstate->hashentrysize,
1490 perhash->aggnode->numGroups,
1491 memory);
1492
1493#ifdef USE_INJECTION_POINTS
1494 if (IS_INJECTION_POINT_ATTACHED("hash-aggregate-oversize-table"))
1495 {
1496 nbuckets = memory / sizeof(TupleHashEntryData);
1497 INJECTION_POINT_CACHED("hash-aggregate-oversize-table");
1498 }
1499#endif
1500
1501 build_hash_table(aggstate, setno, nbuckets);
1502 }
1503
1504 aggstate->hash_ngroups_current = 0;
1505}
1506
1507/*
1508 * Build a single hashtable for this grouping set.
1509 */
1510static void
1511build_hash_table(AggState *aggstate, int setno, long nbuckets)
1512{
1513 AggStatePerHash perhash = &aggstate->perhash[setno];
1514 MemoryContext metacxt = aggstate->hash_metacxt;
1515 MemoryContext hashcxt = aggstate->hashcontext->ecxt_per_tuple_memory;
1517 Size additionalsize;
1518
1519 Assert(aggstate->aggstrategy == AGG_HASHED ||
1520 aggstate->aggstrategy == AGG_MIXED);
1521
1522 /*
1523 * Used to make sure initial hash table allocation does not exceed
1524 * hash_mem. Note that the estimate does not include space for
1525 * pass-by-reference transition data values, nor for the representative
1526 * tuple of each group.
1527 */
1528 additionalsize = aggstate->numtrans * sizeof(AggStatePerGroupData);
1529
1530 perhash->hashtable = BuildTupleHashTable(&aggstate->ss.ps,
1531 perhash->hashslot->tts_tupleDescriptor,
1532 perhash->hashslot->tts_ops,
1533 perhash->numCols,
1534 perhash->hashGrpColIdxHash,
1535 perhash->eqfuncoids,
1536 perhash->hashfunctions,
1537 perhash->aggnode->grpCollations,
1538 nbuckets,
1539 additionalsize,
1540 metacxt,
1541 hashcxt,
1542 tmpcxt,
1543 DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit));
1544}
1545
1546/*
1547 * Compute columns that actually need to be stored in hashtable entries. The
1548 * incoming tuples from the child plan node will contain grouping columns,
1549 * other columns referenced in our targetlist and qual, columns used to
1550 * compute the aggregate functions, and perhaps just junk columns we don't use
1551 * at all. Only columns of the first two types need to be stored in the
1552 * hashtable, and getting rid of the others can make the table entries
1553 * significantly smaller. The hashtable only contains the relevant columns,
1554 * and is packed/unpacked in lookup_hash_entry() / agg_retrieve_hash_table()
1555 * into the format of the normal input descriptor.
1556 *
1557 * Additional columns, in addition to the columns grouped by, come from two
1558 * sources: Firstly functionally dependent columns that we don't need to group
1559 * by themselves, and secondly ctids for row-marks.
1560 *
1561 * To eliminate duplicates, we build a bitmapset of the needed columns, and
1562 * then build an array of the columns included in the hashtable. We might
1563 * still have duplicates if the passed-in grpColIdx has them, which can happen
1564 * in edge cases from semijoins/distinct; these can't always be removed,
1565 * because it's not certain that the duplicate cols will be using the same
1566 * hash function.
1567 *
1568 * Note that the array is preserved over ExecReScanAgg, so we allocate it in
1569 * the per-query context (unlike the hash table itself).
1570 */
1571static void
1573{
1574 Bitmapset *base_colnos;
1575 Bitmapset *aggregated_colnos;
1576 TupleDesc scanDesc = aggstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor;
1577 List *outerTlist = outerPlanState(aggstate)->plan->targetlist;
1578 int numHashes = aggstate->num_hashes;
1579 EState *estate = aggstate->ss.ps.state;
1580 int j;
1581
1582 /* Find Vars that will be needed in tlist and qual */
1583 find_cols(aggstate, &aggregated_colnos, &base_colnos);
1584 aggstate->colnos_needed = bms_union(base_colnos, aggregated_colnos);
1585 aggstate->max_colno_needed = 0;
1586 aggstate->all_cols_needed = true;
1587
1588 for (int i = 0; i < scanDesc->natts; i++)
1589 {
1590 int colno = i + 1;
1591
1592 if (bms_is_member(colno, aggstate->colnos_needed))
1593 aggstate->max_colno_needed = colno;
1594 else
1595 aggstate->all_cols_needed = false;
1596 }
1597
1598 for (j = 0; j < numHashes; ++j)
1599 {
1600 AggStatePerHash perhash = &aggstate->perhash[j];
1601 Bitmapset *colnos = bms_copy(base_colnos);
1602 AttrNumber *grpColIdx = perhash->aggnode->grpColIdx;
1603 List *hashTlist = NIL;
1604 TupleDesc hashDesc;
1605 int maxCols;
1606 int i;
1607
1608 perhash->largestGrpColIdx = 0;
1609
1610 /*
1611 * If we're doing grouping sets, then some Vars might be referenced in
1612 * tlist/qual for the benefit of other grouping sets, but not needed
1613 * when hashing; i.e. prepare_projection_slot will null them out, so
1614 * there'd be no point storing them. Use prepare_projection_slot's
1615 * logic to determine which.
1616 */
1617 if (aggstate->phases[0].grouped_cols)
1618 {
1619 Bitmapset *grouped_cols = aggstate->phases[0].grouped_cols[j];
1620 ListCell *lc;
1621
1622 foreach(lc, aggstate->all_grouped_cols)
1623 {
1624 int attnum = lfirst_int(lc);
1625
1626 if (!bms_is_member(attnum, grouped_cols))
1627 colnos = bms_del_member(colnos, attnum);
1628 }
1629 }
1630
1631 /*
1632 * Compute maximum number of input columns accounting for possible
1633 * duplications in the grpColIdx array, which can happen in some edge
1634 * cases where HashAggregate was generated as part of a semijoin or a
1635 * DISTINCT.
1636 */
1637 maxCols = bms_num_members(colnos) + perhash->numCols;
1638
1639 perhash->hashGrpColIdxInput =
1640 palloc(maxCols * sizeof(AttrNumber));
1641 perhash->hashGrpColIdxHash =
1642 palloc(perhash->numCols * sizeof(AttrNumber));
1643
1644 /* Add all the grouping columns to colnos */
1645 for (i = 0; i < perhash->numCols; i++)
1646 colnos = bms_add_member(colnos, grpColIdx[i]);
1647
1648 /*
1649 * First build mapping for columns directly hashed. These are the
1650 * first, because they'll be accessed when computing hash values and
1651 * comparing tuples for exact matches. We also build simple mapping
1652 * for execGrouping, so it knows where to find the to-be-hashed /
1653 * compared columns in the input.
1654 */
1655 for (i = 0; i < perhash->numCols; i++)
1656 {
1657 perhash->hashGrpColIdxInput[i] = grpColIdx[i];
1658 perhash->hashGrpColIdxHash[i] = i + 1;
1659 perhash->numhashGrpCols++;
1660 /* delete already mapped columns */
1661 colnos = bms_del_member(colnos, grpColIdx[i]);
1662 }
1663
1664 /* and add the remaining columns */
1665 i = -1;
1666 while ((i = bms_next_member(colnos, i)) >= 0)
1667 {
1668 perhash->hashGrpColIdxInput[perhash->numhashGrpCols] = i;
1669 perhash->numhashGrpCols++;
1670 }
1671
1672 /* and build a tuple descriptor for the hashtable */
1673 for (i = 0; i < perhash->numhashGrpCols; i++)
1674 {
1675 int varNumber = perhash->hashGrpColIdxInput[i] - 1;
1676
1677 hashTlist = lappend(hashTlist, list_nth(outerTlist, varNumber));
1678 perhash->largestGrpColIdx =
1679 Max(varNumber + 1, perhash->largestGrpColIdx);
1680 }
1681
1682 hashDesc = ExecTypeFromTL(hashTlist);
1683
1685 perhash->aggnode->grpOperators,
1686 &perhash->eqfuncoids,
1687 &perhash->hashfunctions);
1688 perhash->hashslot =
1689 ExecAllocTableSlot(&estate->es_tupleTable, hashDesc,
1691
1692 list_free(hashTlist);
1693 bms_free(colnos);
1694 }
1695
1696 bms_free(base_colnos);
1697}
1698
1699/*
1700 * Estimate per-hash-table-entry overhead.
1701 */
1702Size
1703hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace)
1704{
1705 Size tupleChunkSize;
1706 Size pergroupChunkSize;
1707 Size transitionChunkSize;
1708 Size tupleSize = (MAXALIGN(SizeofMinimalTupleHeader) +
1709 tupleWidth);
1710 Size pergroupSize = numTrans * sizeof(AggStatePerGroupData);
1711
1712 tupleChunkSize = CHUNKHDRSZ + tupleSize;
1713
1714 if (pergroupSize > 0)
1715 pergroupChunkSize = CHUNKHDRSZ + pergroupSize;
1716 else
1717 pergroupChunkSize = 0;
1718
1719 if (transitionSpace > 0)
1720 transitionChunkSize = CHUNKHDRSZ + transitionSpace;
1721 else
1722 transitionChunkSize = 0;
1723
1724 return
1725 sizeof(TupleHashEntryData) +
1726 tupleChunkSize +
1727 pergroupChunkSize +
1728 transitionChunkSize;
1729}
1730
1731/*
1732 * hashagg_recompile_expressions()
1733 *
1734 * Identifies the right phase, compiles the right expression given the
1735 * arguments, and then sets phase->evalfunc to that expression.
1736 *
1737 * Different versions of the compiled expression are needed depending on
1738 * whether hash aggregation has spilled or not, and whether it's reading from
1739 * the outer plan or a tape. Before spilling to disk, the expression reads
1740 * from the outer plan and does not need to perform a NULL check. After
1741 * HashAgg begins to spill, new groups will not be created in the hash table,
1742 * and the AggStatePerGroup array may be NULL; therefore we need to add a null
1743 * pointer check to the expression. Then, when reading spilled data from a
1744 * tape, we change the outer slot type to be a fixed minimal tuple slot.
1745 *
1746 * It would be wasteful to recompile every time, so cache the compiled
1747 * expressions in the AggStatePerPhase, and reuse when appropriate.
1748 */
1749static void
1750hashagg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck)
1751{
1752 AggStatePerPhase phase;
1753 int i = minslot ? 1 : 0;
1754 int j = nullcheck ? 1 : 0;
1755
1756 Assert(aggstate->aggstrategy == AGG_HASHED ||
1757 aggstate->aggstrategy == AGG_MIXED);
1758
1759 if (aggstate->aggstrategy == AGG_HASHED)
1760 phase = &aggstate->phases[0];
1761 else /* AGG_MIXED */
1762 phase = &aggstate->phases[1];
1763
1764 if (phase->evaltrans_cache[i][j] == NULL)
1765 {
1766 const TupleTableSlotOps *outerops = aggstate->ss.ps.outerops;
1767 bool outerfixed = aggstate->ss.ps.outeropsfixed;
1768 bool dohash = true;
1769 bool dosort = false;
1770
1771 /*
1772 * If minslot is true, that means we are processing a spilled batch
1773 * (inside agg_refill_hash_table()), and we must not advance the
1774 * sorted grouping sets.
1775 */
1776 if (aggstate->aggstrategy == AGG_MIXED && !minslot)
1777 dosort = true;
1778
1779 /* temporarily change the outerops while compiling the expression */
1780 if (minslot)
1781 {
1782 aggstate->ss.ps.outerops = &TTSOpsMinimalTuple;
1783 aggstate->ss.ps.outeropsfixed = true;
1784 }
1785
1786 phase->evaltrans_cache[i][j] = ExecBuildAggTrans(aggstate, phase,
1787 dosort, dohash,
1788 nullcheck);
1789
1790 /* change back */
1791 aggstate->ss.ps.outerops = outerops;
1792 aggstate->ss.ps.outeropsfixed = outerfixed;
1793 }
1794
1795 phase->evaltrans = phase->evaltrans_cache[i][j];
1796}
1797
1798/*
1799 * Set limits that trigger spilling to avoid exceeding hash_mem. Consider the
1800 * number of partitions we expect to create (if we do spill).
1801 *
1802 * There are two limits: a memory limit, and also an ngroups limit. The
1803 * ngroups limit becomes important when we expect transition values to grow
1804 * substantially larger than the initial value.
1805 */
1806void
1807hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
1808 Size *mem_limit, uint64 *ngroups_limit,
1809 int *num_partitions)
1810{
1811 int npartitions;
1812 Size partition_mem;
1813 Size hash_mem_limit = get_hash_memory_limit();
1814
1815 /* if not expected to spill, use all of hash_mem */
1816 if (input_groups * hashentrysize <= hash_mem_limit)
1817 {
1818 if (num_partitions != NULL)
1819 *num_partitions = 0;
1820 *mem_limit = hash_mem_limit;
1821 *ngroups_limit = hash_mem_limit / hashentrysize;
1822 return;
1823 }
1824
1825 /*
1826 * Calculate expected memory requirements for spilling, which is the size
1827 * of the buffers needed for all the tapes that need to be open at once.
1828 * Then, subtract that from the memory available for holding hash tables.
1829 */
1830 npartitions = hash_choose_num_partitions(input_groups,
1831 hashentrysize,
1832 used_bits,
1833 NULL);
1834 if (num_partitions != NULL)
1835 *num_partitions = npartitions;
1836
1837 partition_mem =
1839 HASHAGG_WRITE_BUFFER_SIZE * npartitions;
1840
1841 /*
1842 * Don't set the limit below 3/4 of hash_mem. In that case, we are at the
1843 * minimum number of partitions, so we aren't going to dramatically exceed
1844 * work mem anyway.
1845 */
1846 if (hash_mem_limit > 4 * partition_mem)
1847 *mem_limit = hash_mem_limit - partition_mem;
1848 else
1849 *mem_limit = hash_mem_limit * 0.75;
1850
1851 if (*mem_limit > hashentrysize)
1852 *ngroups_limit = *mem_limit / hashentrysize;
1853 else
1854 *ngroups_limit = 1;
1855}
1856
1857/*
1858 * hash_agg_check_limits
1859 *
1860 * After adding a new group to the hash table, check whether we need to enter
1861 * spill mode. Allocations may happen without adding new groups (for instance,
1862 * if the transition state size grows), so this check is imperfect.
1863 */
1864static void
1866{
1867 uint64 ngroups = aggstate->hash_ngroups_current;
1868 Size meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt,
1869 true);
1871 true);
1872 bool do_spill = false;
1873
1874#ifdef USE_INJECTION_POINTS
1875 if (ngroups >= 1000)
1876 {
1877 if (IS_INJECTION_POINT_ATTACHED("hash-aggregate-spill-1000"))
1878 {
1879 do_spill = true;
1880 INJECTION_POINT_CACHED("hash-aggregate-spill-1000");
1881 }
1882 }
1883#endif
1884
1885 /*
1886 * Don't spill unless there's at least one group in the hash table so we
1887 * can be sure to make progress even in edge cases.
1888 */
1889 if (aggstate->hash_ngroups_current > 0 &&
1890 (meta_mem + hashkey_mem > aggstate->hash_mem_limit ||
1891 ngroups > aggstate->hash_ngroups_limit))
1892 {
1893 do_spill = true;
1894 }
1895
1896 if (do_spill)
1897 hash_agg_enter_spill_mode(aggstate);
1898}
1899
1900/*
1901 * Enter "spill mode", meaning that no new groups are added to any of the hash
1902 * tables. Tuples that would create a new group are instead spilled, and
1903 * processed later.
1904 */
1905static void
1907{
1908 INJECTION_POINT("hash-aggregate-enter-spill-mode");
1909 aggstate->hash_spill_mode = true;
1910 hashagg_recompile_expressions(aggstate, aggstate->table_filled, true);
1911
1912 if (!aggstate->hash_ever_spilled)
1913 {
1914 Assert(aggstate->hash_tapeset == NULL);
1915 Assert(aggstate->hash_spills == NULL);
1916
1917 aggstate->hash_ever_spilled = true;
1918
1919 aggstate->hash_tapeset = LogicalTapeSetCreate(true, NULL, -1);
1920
1921 aggstate->hash_spills = palloc(sizeof(HashAggSpill) * aggstate->num_hashes);
1922
1923 for (int setno = 0; setno < aggstate->num_hashes; setno++)
1924 {
1925 AggStatePerHash perhash = &aggstate->perhash[setno];
1926 HashAggSpill *spill = &aggstate->hash_spills[setno];
1927
1928 hashagg_spill_init(spill, aggstate->hash_tapeset, 0,
1929 perhash->aggnode->numGroups,
1930 aggstate->hashentrysize);
1931 }
1932 }
1933}
1934
1935/*
1936 * Update metrics after filling the hash table.
1937 *
1938 * If reading from the outer plan, from_tape should be false; if reading from
1939 * another tape, from_tape should be true.
1940 */
1941static void
1942hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
1943{
1944 Size meta_mem;
1945 Size hashkey_mem;
1946 Size buffer_mem;
1947 Size total_mem;
1948
1949 if (aggstate->aggstrategy != AGG_MIXED &&
1950 aggstate->aggstrategy != AGG_HASHED)
1951 return;
1952
1953 /* memory for the hash table itself */
1954 meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true);
1955
1956 /* memory for the group keys and transition states */
1957 hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, true);
1958
1959 /* memory for read/write tape buffers, if spilled */
1960 buffer_mem = npartitions * HASHAGG_WRITE_BUFFER_SIZE;
1961 if (from_tape)
1962 buffer_mem += HASHAGG_READ_BUFFER_SIZE;
1963
1964 /* update peak mem */
1965 total_mem = meta_mem + hashkey_mem + buffer_mem;
1966 if (total_mem > aggstate->hash_mem_peak)
1967 aggstate->hash_mem_peak = total_mem;
1968
1969 /* update disk usage */
1970 if (aggstate->hash_tapeset != NULL)
1971 {
1972 uint64 disk_used = LogicalTapeSetBlocks(aggstate->hash_tapeset) * (BLCKSZ / 1024);
1973
1974 if (aggstate->hash_disk_used < disk_used)
1975 aggstate->hash_disk_used = disk_used;
1976 }
1977
1978 /* update hashentrysize estimate based on contents */
1979 if (aggstate->hash_ngroups_current > 0)
1980 {
1981 aggstate->hashentrysize =
1982 sizeof(TupleHashEntryData) +
1983 (hashkey_mem / (double) aggstate->hash_ngroups_current);
1984 }
1985}
1986
1987/*
1988 * Choose a reasonable number of buckets for the initial hash table size.
1989 */
1990static long
1991hash_choose_num_buckets(double hashentrysize, long ngroups, Size memory)
1992{
1993 long max_nbuckets;
1994 long nbuckets = ngroups;
1995
1996 max_nbuckets = memory / hashentrysize;
1997
1998 /*
1999 * Underestimating is better than overestimating. Too many buckets crowd
2000 * out space for group keys and transition state values.
2001 */
2002 max_nbuckets >>= 1;
2003
2004 if (nbuckets > max_nbuckets)
2005 nbuckets = max_nbuckets;
2006
2007 return Max(nbuckets, 1);
2008}
2009
2010/*
2011 * Determine the number of partitions to create when spilling, which will
2012 * always be a power of two. If log2_npartitions is non-NULL, set
2013 * *log2_npartitions to the log2() of the number of partitions.
2014 */
2015static int
2016hash_choose_num_partitions(double input_groups, double hashentrysize,
2017 int used_bits, int *log2_npartitions)
2018{
2019 Size hash_mem_limit = get_hash_memory_limit();
2020 double partition_limit;
2021 double mem_wanted;
2022 double dpartitions;
2023 int npartitions;
2024 int partition_bits;
2025
2026 /*
2027 * Avoid creating so many partitions that the memory requirements of the
2028 * open partition files are greater than 1/4 of hash_mem.
2029 */
2030 partition_limit =
2031 (hash_mem_limit * 0.25 - HASHAGG_READ_BUFFER_SIZE) /
2033
2034 mem_wanted = HASHAGG_PARTITION_FACTOR * input_groups * hashentrysize;
2035
2036 /* make enough partitions so that each one is likely to fit in memory */
2037 dpartitions = 1 + (mem_wanted / hash_mem_limit);
2038
2039 if (dpartitions > partition_limit)
2040 dpartitions = partition_limit;
2041
2042 if (dpartitions < HASHAGG_MIN_PARTITIONS)
2043 dpartitions = HASHAGG_MIN_PARTITIONS;
2044 if (dpartitions > HASHAGG_MAX_PARTITIONS)
2045 dpartitions = HASHAGG_MAX_PARTITIONS;
2046
2047 /* HASHAGG_MAX_PARTITIONS limit makes this safe */
2048 npartitions = (int) dpartitions;
2049
2050 /* ceil(log2(npartitions)) */
2051 partition_bits = my_log2(npartitions);
2052
2053 /* make sure that we don't exhaust the hash bits */
2054 if (partition_bits + used_bits >= 32)
2055 partition_bits = 32 - used_bits;
2056
2057 if (log2_npartitions != NULL)
2058 *log2_npartitions = partition_bits;
2059
2060 /* number of partitions will be a power of two */
2061 npartitions = 1 << partition_bits;
2062
2063 return npartitions;
2064}
2065
2066/*
2067 * Initialize a freshly-created TupleHashEntry.
2068 */
2069static void
2071 TupleHashEntry entry)
2072{
2073 AggStatePerGroup pergroup;
2074 int transno;
2075
2076 aggstate->hash_ngroups_current++;
2077 hash_agg_check_limits(aggstate);
2078
2079 /* no need to allocate or initialize per-group state */
2080 if (aggstate->numtrans == 0)
2081 return;
2082
2083 pergroup = (AggStatePerGroup)
2084 MemoryContextAlloc(hashtable->tablecxt,
2085 sizeof(AggStatePerGroupData) * aggstate->numtrans);
2086
2087 entry->additional = pergroup;
2088
2089 /*
2090 * Initialize aggregates for new tuple group, lookup_hash_entries()
2091 * already has selected the relevant grouping set.
2092 */
2093 for (transno = 0; transno < aggstate->numtrans; transno++)
2094 {
2095 AggStatePerTrans pertrans = &aggstate->pertrans[transno];
2096 AggStatePerGroup pergroupstate = &pergroup[transno];
2097
2098 initialize_aggregate(aggstate, pertrans, pergroupstate);
2099 }
2100}
2101
2102/*
2103 * Look up hash entries for the current tuple in all hashed grouping sets.
2104 *
2105 * Be aware that lookup_hash_entry can reset the tmpcontext.
2106 *
2107 * Some entries may be left NULL if we are in "spill mode". The same tuple
2108 * will belong to different groups for each grouping set, so may match a group
2109 * already in memory for one set and match a group not in memory for another
2110 * set. When in "spill mode", the tuple will be spilled for each grouping set
2111 * where it doesn't match a group in memory.
2112 *
2113 * NB: It's possible to spill the same tuple for several different grouping
2114 * sets. This may seem wasteful, but it's actually a trade-off: if we spill
2115 * the tuple multiple times for multiple grouping sets, it can be partitioned
2116 * for each grouping set, making the refilling of the hash table very
2117 * efficient.
2118 */
2119static void
2121{
2122 AggStatePerGroup *pergroup = aggstate->hash_pergroup;
2123 TupleTableSlot *outerslot = aggstate->tmpcontext->ecxt_outertuple;
2124 int setno;
2125
2126 for (setno = 0; setno < aggstate->num_hashes; setno++)
2127 {
2128 AggStatePerHash perhash = &aggstate->perhash[setno];
2129 TupleHashTable hashtable = perhash->hashtable;
2130 TupleTableSlot *hashslot = perhash->hashslot;
2131 TupleHashEntry entry;
2132 uint32 hash;
2133 bool isnew = false;
2134 bool *p_isnew;
2135
2136 /* if hash table already spilled, don't create new entries */
2137 p_isnew = aggstate->hash_spill_mode ? NULL : &isnew;
2138
2139 select_current_set(aggstate, setno, true);
2140 prepare_hash_slot(perhash,
2141 outerslot,
2142 hashslot);
2143
2144 entry = LookupTupleHashEntry(hashtable, hashslot,
2145 p_isnew, &hash);
2146
2147 if (entry != NULL)
2148 {
2149 if (isnew)
2150 initialize_hash_entry(aggstate, hashtable, entry);
2151 pergroup[setno] = entry->additional;
2152 }
2153 else
2154 {
2155 HashAggSpill *spill = &aggstate->hash_spills[setno];
2156 TupleTableSlot *slot = aggstate->tmpcontext->ecxt_outertuple;
2157
2158 if (spill->partitions == NULL)
2159 hashagg_spill_init(spill, aggstate->hash_tapeset, 0,
2160 perhash->aggnode->numGroups,
2161 aggstate->hashentrysize);
2162
2163 hashagg_spill_tuple(aggstate, spill, slot, hash);
2164 pergroup[setno] = NULL;
2165 }
2166 }
2167}
2168
2169/*
2170 * ExecAgg -
2171 *
2172 * ExecAgg receives tuples from its outer subplan and aggregates over
2173 * the appropriate attribute for each aggregate function use (Aggref
2174 * node) appearing in the targetlist or qual of the node. The number
2175 * of tuples to aggregate over depends on whether grouped or plain
2176 * aggregation is selected. In grouped aggregation, we produce a result
2177 * row for each group; in plain aggregation there's a single result row
2178 * for the whole query. In either case, the value of each aggregate is
2179 * stored in the expression context to be used when ExecProject evaluates
2180 * the result tuple.
2181 */
2182static TupleTableSlot *
2184{
2185 AggState *node = castNode(AggState, pstate);
2186 TupleTableSlot *result = NULL;
2187
2189
2190 if (!node->agg_done)
2191 {
2192 /* Dispatch based on strategy */
2193 switch (node->phase->aggstrategy)
2194 {
2195 case AGG_HASHED:
2196 if (!node->table_filled)
2197 agg_fill_hash_table(node);
2198 /* FALLTHROUGH */
2199 case AGG_MIXED:
2200 result = agg_retrieve_hash_table(node);
2201 break;
2202 case AGG_PLAIN:
2203 case AGG_SORTED:
2204 result = agg_retrieve_direct(node);
2205 break;
2206 }
2207
2208 if (!TupIsNull(result))
2209 return result;
2210 }
2211
2212 return NULL;
2213}
2214
2215/*
2216 * ExecAgg for non-hashed case
2217 */
2218static TupleTableSlot *
2220{
2221 Agg *node = aggstate->phase->aggnode;
2222 ExprContext *econtext;
2223 ExprContext *tmpcontext;
2224 AggStatePerAgg peragg;
2225 AggStatePerGroup *pergroups;
2226 TupleTableSlot *outerslot;
2227 TupleTableSlot *firstSlot;
2228 TupleTableSlot *result;
2229 bool hasGroupingSets = aggstate->phase->numsets > 0;
2230 int numGroupingSets = Max(aggstate->phase->numsets, 1);
2231 int currentSet;
2232 int nextSetSize;
2233 int numReset;
2234 int i;
2235
2236 /*
2237 * get state info from node
2238 *
2239 * econtext is the per-output-tuple expression context
2240 *
2241 * tmpcontext is the per-input-tuple expression context
2242 */
2243 econtext = aggstate->ss.ps.ps_ExprContext;
2244 tmpcontext = aggstate->tmpcontext;
2245
2246 peragg = aggstate->peragg;
2247 pergroups = aggstate->pergroups;
2248 firstSlot = aggstate->ss.ss_ScanTupleSlot;
2249
2250 /*
2251 * We loop retrieving groups until we find one matching
2252 * aggstate->ss.ps.qual
2253 *
2254 * For grouping sets, we have the invariant that aggstate->projected_set
2255 * is either -1 (initial call) or the index (starting from 0) in
2256 * gset_lengths for the group we just completed (either by projecting a
2257 * row or by discarding it in the qual).
2258 */
2259 while (!aggstate->agg_done)
2260 {
2261 /*
2262 * Clear the per-output-tuple context for each group, as well as
2263 * aggcontext (which contains any pass-by-ref transvalues of the old
2264 * group). Some aggregate functions store working state in child
2265 * contexts; those now get reset automatically without us needing to
2266 * do anything special.
2267 *
2268 * We use ReScanExprContext not just ResetExprContext because we want
2269 * any registered shutdown callbacks to be called. That allows
2270 * aggregate functions to ensure they've cleaned up any non-memory
2271 * resources.
2272 */
2273 ReScanExprContext(econtext);
2274
2275 /*
2276 * Determine how many grouping sets need to be reset at this boundary.
2277 */
2278 if (aggstate->projected_set >= 0 &&
2279 aggstate->projected_set < numGroupingSets)
2280 numReset = aggstate->projected_set + 1;
2281 else
2282 numReset = numGroupingSets;
2283
2284 /*
2285 * numReset can change on a phase boundary, but that's OK; we want to
2286 * reset the contexts used in _this_ phase, and later, after possibly
2287 * changing phase, initialize the right number of aggregates for the
2288 * _new_ phase.
2289 */
2290
2291 for (i = 0; i < numReset; i++)
2292 {
2293 ReScanExprContext(aggstate->aggcontexts[i]);
2294 }
2295
2296 /*
2297 * Check if input is complete and there are no more groups to project
2298 * in this phase; move to next phase or mark as done.
2299 */
2300 if (aggstate->input_done == true &&
2301 aggstate->projected_set >= (numGroupingSets - 1))
2302 {
2303 if (aggstate->current_phase < aggstate->numphases - 1)
2304 {
2305 initialize_phase(aggstate, aggstate->current_phase + 1);
2306 aggstate->input_done = false;
2307 aggstate->projected_set = -1;
2308 numGroupingSets = Max(aggstate->phase->numsets, 1);
2309 node = aggstate->phase->aggnode;
2310 numReset = numGroupingSets;
2311 }
2312 else if (aggstate->aggstrategy == AGG_MIXED)
2313 {
2314 /*
2315 * Mixed mode; we've output all the grouped stuff and have
2316 * full hashtables, so switch to outputting those.
2317 */
2318 initialize_phase(aggstate, 0);
2319 aggstate->table_filled = true;
2321 &aggstate->perhash[0].hashiter);
2322 select_current_set(aggstate, 0, true);
2323 return agg_retrieve_hash_table(aggstate);
2324 }
2325 else
2326 {
2327 aggstate->agg_done = true;
2328 break;
2329 }
2330 }
2331
2332 /*
2333 * Get the number of columns in the next grouping set after the last
2334 * projected one (if any). This is the number of columns to compare to
2335 * see if we reached the boundary of that set too.
2336 */
2337 if (aggstate->projected_set >= 0 &&
2338 aggstate->projected_set < (numGroupingSets - 1))
2339 nextSetSize = aggstate->phase->gset_lengths[aggstate->projected_set + 1];
2340 else
2341 nextSetSize = 0;
2342
2343 /*----------
2344 * If a subgroup for the current grouping set is present, project it.
2345 *
2346 * We have a new group if:
2347 * - we're out of input but haven't projected all grouping sets
2348 * (checked above)
2349 * OR
2350 * - we already projected a row that wasn't from the last grouping
2351 * set
2352 * AND
2353 * - the next grouping set has at least one grouping column (since
2354 * empty grouping sets project only once input is exhausted)
2355 * AND
2356 * - the previous and pending rows differ on the grouping columns
2357 * of the next grouping set
2358 *----------
2359 */
2360 tmpcontext->ecxt_innertuple = econtext->ecxt_outertuple;
2361 if (aggstate->input_done ||
2362 (node->aggstrategy != AGG_PLAIN &&
2363 aggstate->projected_set != -1 &&
2364 aggstate->projected_set < (numGroupingSets - 1) &&
2365 nextSetSize > 0 &&
2366 !ExecQualAndReset(aggstate->phase->eqfunctions[nextSetSize - 1],
2367 tmpcontext)))
2368 {
2369 aggstate->projected_set += 1;
2370
2371 Assert(aggstate->projected_set < numGroupingSets);
2372 Assert(nextSetSize > 0 || aggstate->input_done);
2373 }
2374 else
2375 {
2376 /*
2377 * We no longer care what group we just projected, the next
2378 * projection will always be the first (or only) grouping set
2379 * (unless the input proves to be empty).
2380 */
2381 aggstate->projected_set = 0;
2382
2383 /*
2384 * If we don't already have the first tuple of the new group,
2385 * fetch it from the outer plan.
2386 */
2387 if (aggstate->grp_firstTuple == NULL)
2388 {
2389 outerslot = fetch_input_tuple(aggstate);
2390 if (!TupIsNull(outerslot))
2391 {
2392 /*
2393 * Make a copy of the first input tuple; we will use this
2394 * for comparisons (in group mode) and for projection.
2395 */
2396 aggstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot);
2397 }
2398 else
2399 {
2400 /* outer plan produced no tuples at all */
2401 if (hasGroupingSets)
2402 {
2403 /*
2404 * If there was no input at all, we need to project
2405 * rows only if there are grouping sets of size 0.
2406 * Note that this implies that there can't be any
2407 * references to ungrouped Vars, which would otherwise
2408 * cause issues with the empty output slot.
2409 *
2410 * XXX: This is no longer true, we currently deal with
2411 * this in finalize_aggregates().
2412 */
2413 aggstate->input_done = true;
2414
2415 while (aggstate->phase->gset_lengths[aggstate->projected_set] > 0)
2416 {
2417 aggstate->projected_set += 1;
2418 if (aggstate->projected_set >= numGroupingSets)
2419 {
2420 /*
2421 * We can't set agg_done here because we might
2422 * have more phases to do, even though the
2423 * input is empty. So we need to restart the
2424 * whole outer loop.
2425 */
2426 break;
2427 }
2428 }
2429
2430 if (aggstate->projected_set >= numGroupingSets)
2431 continue;
2432 }
2433 else
2434 {
2435 aggstate->agg_done = true;
2436 /* If we are grouping, we should produce no tuples too */
2437 if (node->aggstrategy != AGG_PLAIN)
2438 return NULL;
2439 }
2440 }
2441 }
2442
2443 /*
2444 * Initialize working state for a new input tuple group.
2445 */
2446 initialize_aggregates(aggstate, pergroups, numReset);
2447
2448 if (aggstate->grp_firstTuple != NULL)
2449 {
2450 /*
2451 * Store the copied first input tuple in the tuple table slot
2452 * reserved for it. The tuple will be deleted when it is
2453 * cleared from the slot.
2454 */
2456 firstSlot, true);
2457 aggstate->grp_firstTuple = NULL; /* don't keep two pointers */
2458
2459 /* set up for first advance_aggregates call */
2460 tmpcontext->ecxt_outertuple = firstSlot;
2461
2462 /*
2463 * Process each outer-plan tuple, and then fetch the next one,
2464 * until we exhaust the outer plan or cross a group boundary.
2465 */
2466 for (;;)
2467 {
2468 /*
2469 * During phase 1 only of a mixed agg, we need to update
2470 * hashtables as well in advance_aggregates.
2471 */
2472 if (aggstate->aggstrategy == AGG_MIXED &&
2473 aggstate->current_phase == 1)
2474 {
2475 lookup_hash_entries(aggstate);
2476 }
2477
2478 /* Advance the aggregates (or combine functions) */
2479 advance_aggregates(aggstate);
2480
2481 /* Reset per-input-tuple context after each tuple */
2482 ResetExprContext(tmpcontext);
2483
2484 outerslot = fetch_input_tuple(aggstate);
2485 if (TupIsNull(outerslot))
2486 {
2487 /* no more outer-plan tuples available */
2488
2489 /* if we built hash tables, finalize any spills */
2490 if (aggstate->aggstrategy == AGG_MIXED &&
2491 aggstate->current_phase == 1)
2493
2494 if (hasGroupingSets)
2495 {
2496 aggstate->input_done = true;
2497 break;
2498 }
2499 else
2500 {
2501 aggstate->agg_done = true;
2502 break;
2503 }
2504 }
2505 /* set up for next advance_aggregates call */
2506 tmpcontext->ecxt_outertuple = outerslot;
2507
2508 /*
2509 * If we are grouping, check whether we've crossed a group
2510 * boundary.
2511 */
2512 if (node->aggstrategy != AGG_PLAIN && node->numCols > 0)
2513 {
2514 tmpcontext->ecxt_innertuple = firstSlot;
2515 if (!ExecQual(aggstate->phase->eqfunctions[node->numCols - 1],
2516 tmpcontext))
2517 {
2518 aggstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot);
2519 break;
2520 }
2521 }
2522 }
2523 }
2524
2525 /*
2526 * Use the representative input tuple for any references to
2527 * non-aggregated input columns in aggregate direct args, the node
2528 * qual, and the tlist. (If we are not grouping, and there are no
2529 * input rows at all, we will come here with an empty firstSlot
2530 * ... but if not grouping, there can't be any references to
2531 * non-aggregated input columns, so no problem.)
2532 */
2533 econtext->ecxt_outertuple = firstSlot;
2534 }
2535
2536 Assert(aggstate->projected_set >= 0);
2537
2538 currentSet = aggstate->projected_set;
2539
2540 prepare_projection_slot(aggstate, econtext->ecxt_outertuple, currentSet);
2541
2542 select_current_set(aggstate, currentSet, false);
2543
2544 finalize_aggregates(aggstate,
2545 peragg,
2546 pergroups[currentSet]);
2547
2548 /*
2549 * If there's no row to project right now, we must continue rather
2550 * than returning a null since there might be more groups.
2551 */
2552 result = project_aggregates(aggstate);
2553 if (result)
2554 return result;
2555 }
2556
2557 /* No more groups */
2558 return NULL;
2559}
2560
2561/*
2562 * ExecAgg for hashed case: read input and build hash table
2563 */
2564static void
2566{
2567 TupleTableSlot *outerslot;
2568 ExprContext *tmpcontext = aggstate->tmpcontext;
2569
2570 /*
2571 * Process each outer-plan tuple, and then fetch the next one, until we
2572 * exhaust the outer plan.
2573 */
2574 for (;;)
2575 {
2576 outerslot = fetch_input_tuple(aggstate);
2577 if (TupIsNull(outerslot))
2578 break;
2579
2580 /* set up for lookup_hash_entries and advance_aggregates */
2581 tmpcontext->ecxt_outertuple = outerslot;
2582
2583 /* Find or build hashtable entries */
2584 lookup_hash_entries(aggstate);
2585
2586 /* Advance the aggregates (or combine functions) */
2587 advance_aggregates(aggstate);
2588
2589 /*
2590 * Reset per-input-tuple context after each tuple, but note that the
2591 * hash lookups do this too
2592 */
2593 ResetExprContext(aggstate->tmpcontext);
2594 }
2595
2596 /* finalize spills, if any */
2598
2599 aggstate->table_filled = true;
2600 /* Initialize to walk the first hash table */
2601 select_current_set(aggstate, 0, true);
2603 &aggstate->perhash[0].hashiter);
2604}
2605
2606/*
2607 * If any data was spilled during hash aggregation, reset the hash table and
2608 * reprocess one batch of spilled data. After reprocessing a batch, the hash
2609 * table will again contain data, ready to be consumed by
2610 * agg_retrieve_hash_table_in_memory().
2611 *
2612 * Should only be called after all in memory hash table entries have been
2613 * finalized and emitted.
2614 *
2615 * Return false when input is exhausted and there's no more work to be done;
2616 * otherwise return true.
2617 */
2618static bool
2620{
2621 HashAggBatch *batch;
2622 AggStatePerHash perhash;
2623 HashAggSpill spill;
2624 LogicalTapeSet *tapeset = aggstate->hash_tapeset;
2625 bool spill_initialized = false;
2626
2627 if (aggstate->hash_batches == NIL)
2628 return false;
2629
2630 /* hash_batches is a stack, with the top item at the end of the list */
2631 batch = llast(aggstate->hash_batches);
2632 aggstate->hash_batches = list_delete_last(aggstate->hash_batches);
2633
2635 batch->used_bits, &aggstate->hash_mem_limit,
2636 &aggstate->hash_ngroups_limit, NULL);
2637
2638 /*
2639 * Each batch only processes one grouping set; set the rest to NULL so
2640 * that advance_aggregates() knows to ignore them. We don't touch
2641 * pergroups for sorted grouping sets here, because they will be needed if
2642 * we rescan later. The expressions for sorted grouping sets will not be
2643 * evaluated after we recompile anyway.
2644 */
2645 MemSet(aggstate->hash_pergroup, 0,
2646 sizeof(AggStatePerGroup) * aggstate->num_hashes);
2647
2648 /* free memory and reset hash tables */
2649 ReScanExprContext(aggstate->hashcontext);
2650 for (int setno = 0; setno < aggstate->num_hashes; setno++)
2651 ResetTupleHashTable(aggstate->perhash[setno].hashtable);
2652
2653 aggstate->hash_ngroups_current = 0;
2654
2655 /*
2656 * In AGG_MIXED mode, hash aggregation happens in phase 1 and the output
2657 * happens in phase 0. So, we switch to phase 1 when processing a batch,
2658 * and back to phase 0 after the batch is done.
2659 */
2660 Assert(aggstate->current_phase == 0);
2661 if (aggstate->phase->aggstrategy == AGG_MIXED)
2662 {
2663 aggstate->current_phase = 1;
2664 aggstate->phase = &aggstate->phases[aggstate->current_phase];
2665 }
2666
2667 select_current_set(aggstate, batch->setno, true);
2668
2669 perhash = &aggstate->perhash[aggstate->current_set];
2670
2671 /*
2672 * Spilled tuples are always read back as MinimalTuples, which may be
2673 * different from the outer plan, so recompile the aggregate expressions.
2674 *
2675 * We still need the NULL check, because we are only processing one
2676 * grouping set at a time and the rest will be NULL.
2677 */
2678 hashagg_recompile_expressions(aggstate, true, true);
2679
2680 INJECTION_POINT("hash-aggregate-process-batch");
2681 for (;;)
2682 {
2683 TupleTableSlot *spillslot = aggstate->hash_spill_rslot;
2684 TupleTableSlot *hashslot = perhash->hashslot;
2685 TupleHashEntry entry;
2686 MinimalTuple tuple;
2687 uint32 hash;
2688 bool isnew = false;
2689 bool *p_isnew = aggstate->hash_spill_mode ? NULL : &isnew;
2690
2692
2693 tuple = hashagg_batch_read(batch, &hash);
2694 if (tuple == NULL)
2695 break;
2696
2697 ExecStoreMinimalTuple(tuple, spillslot, true);
2698 aggstate->tmpcontext->ecxt_outertuple = spillslot;
2699
2700 prepare_hash_slot(perhash,
2701 aggstate->tmpcontext->ecxt_outertuple,
2702 hashslot);
2703 entry = LookupTupleHashEntryHash(perhash->hashtable, hashslot,
2704 p_isnew, hash);
2705
2706 if (entry != NULL)
2707 {
2708 if (isnew)
2709 initialize_hash_entry(aggstate, perhash->hashtable, entry);
2710 aggstate->hash_pergroup[batch->setno] = entry->additional;
2711 advance_aggregates(aggstate);
2712 }
2713 else
2714 {
2715 if (!spill_initialized)
2716 {
2717 /*
2718 * Avoid initializing the spill until we actually need it so
2719 * that we don't assign tapes that will never be used.
2720 */
2721 spill_initialized = true;
2722 hashagg_spill_init(&spill, tapeset, batch->used_bits,
2723 batch->input_card, aggstate->hashentrysize);
2724 }
2725 /* no memory for a new group, spill */
2726 hashagg_spill_tuple(aggstate, &spill, spillslot, hash);
2727
2728 aggstate->hash_pergroup[batch->setno] = NULL;
2729 }
2730
2731 /*
2732 * Reset per-input-tuple context after each tuple, but note that the
2733 * hash lookups do this too
2734 */
2735 ResetExprContext(aggstate->tmpcontext);
2736 }
2737
2739
2740 /* change back to phase 0 */
2741 aggstate->current_phase = 0;
2742 aggstate->phase = &aggstate->phases[aggstate->current_phase];
2743
2744 if (spill_initialized)
2745 {
2746 hashagg_spill_finish(aggstate, &spill, batch->setno);
2747 hash_agg_update_metrics(aggstate, true, spill.npartitions);
2748 }
2749 else
2750 hash_agg_update_metrics(aggstate, true, 0);
2751
2752 aggstate->hash_spill_mode = false;
2753
2754 /* prepare to walk the first hash table */
2755 select_current_set(aggstate, batch->setno, true);
2757 &aggstate->perhash[batch->setno].hashiter);
2758
2759 pfree(batch);
2760
2761 return true;
2762}
2763
2764/*
2765 * ExecAgg for hashed case: retrieving groups from hash table
2766 *
2767 * After exhausting in-memory tuples, also try refilling the hash table using
2768 * previously-spilled tuples. Only returns NULL after all in-memory and
2769 * spilled tuples are exhausted.
2770 */
2771static TupleTableSlot *
2773{
2774 TupleTableSlot *result = NULL;
2775
2776 while (result == NULL)
2777 {
2778 result = agg_retrieve_hash_table_in_memory(aggstate);
2779 if (result == NULL)
2780 {
2781 if (!agg_refill_hash_table(aggstate))
2782 {
2783 aggstate->agg_done = true;
2784 break;
2785 }
2786 }
2787 }
2788
2789 return result;
2790}
2791
2792/*
2793 * Retrieve the groups from the in-memory hash tables without considering any
2794 * spilled tuples.
2795 */
2796static TupleTableSlot *
2798{
2799 ExprContext *econtext;
2800 AggStatePerAgg peragg;
2801 AggStatePerGroup pergroup;
2802 TupleHashEntryData *entry;
2803 TupleTableSlot *firstSlot;
2804 TupleTableSlot *result;
2805 AggStatePerHash perhash;
2806
2807 /*
2808 * get state info from node.
2809 *
2810 * econtext is the per-output-tuple expression context.
2811 */
2812 econtext = aggstate->ss.ps.ps_ExprContext;
2813 peragg = aggstate->peragg;
2814 firstSlot = aggstate->ss.ss_ScanTupleSlot;
2815
2816 /*
2817 * Note that perhash (and therefore anything accessed through it) can
2818 * change inside the loop, as we change between grouping sets.
2819 */
2820 perhash = &aggstate->perhash[aggstate->current_set];
2821
2822 /*
2823 * We loop retrieving groups until we find one satisfying
2824 * aggstate->ss.ps.qual
2825 */
2826 for (;;)
2827 {
2828 TupleTableSlot *hashslot = perhash->hashslot;
2829 int i;
2830
2832
2833 /*
2834 * Find the next entry in the hash table
2835 */
2836 entry = ScanTupleHashTable(perhash->hashtable, &perhash->hashiter);
2837 if (entry == NULL)
2838 {
2839 int nextset = aggstate->current_set + 1;
2840
2841 if (nextset < aggstate->num_hashes)
2842 {
2843 /*
2844 * Switch to next grouping set, reinitialize, and restart the
2845 * loop.
2846 */
2847 select_current_set(aggstate, nextset, true);
2848
2849 perhash = &aggstate->perhash[aggstate->current_set];
2850
2851 ResetTupleHashIterator(perhash->hashtable, &perhash->hashiter);
2852
2853 continue;
2854 }
2855 else
2856 {
2857 return NULL;
2858 }
2859 }
2860
2861 /*
2862 * Clear the per-output-tuple context for each group
2863 *
2864 * We intentionally don't use ReScanExprContext here; if any aggs have
2865 * registered shutdown callbacks, they mustn't be called yet, since we
2866 * might not be done with that agg.
2867 */
2868 ResetExprContext(econtext);
2869
2870 /*
2871 * Transform representative tuple back into one with the right
2872 * columns.
2873 */
2874 ExecStoreMinimalTuple(entry->firstTuple, hashslot, false);
2875 slot_getallattrs(hashslot);
2876
2877 ExecClearTuple(firstSlot);
2878 memset(firstSlot->tts_isnull, true,
2879 firstSlot->tts_tupleDescriptor->natts * sizeof(bool));
2880
2881 for (i = 0; i < perhash->numhashGrpCols; i++)
2882 {
2883 int varNumber = perhash->hashGrpColIdxInput[i] - 1;
2884
2885 firstSlot->tts_values[varNumber] = hashslot->tts_values[i];
2886 firstSlot->tts_isnull[varNumber] = hashslot->tts_isnull[i];
2887 }
2888 ExecStoreVirtualTuple(firstSlot);
2889
2890 pergroup = (AggStatePerGroup) entry->additional;
2891
2892 /*
2893 * Use the representative input tuple for any references to
2894 * non-aggregated input columns in the qual and tlist.
2895 */
2896 econtext->ecxt_outertuple = firstSlot;
2897
2898 prepare_projection_slot(aggstate,
2899 econtext->ecxt_outertuple,
2900 aggstate->current_set);
2901
2902 finalize_aggregates(aggstate, peragg, pergroup);
2903
2904 result = project_aggregates(aggstate);
2905 if (result)
2906 return result;
2907 }
2908
2909 /* No more groups */
2910 return NULL;
2911}
2912
2913/*
2914 * hashagg_spill_init
2915 *
2916 * Called after we determined that spilling is necessary. Chooses the number
2917 * of partitions to create, and initializes them.
2918 */
2919static void
2920hashagg_spill_init(HashAggSpill *spill, LogicalTapeSet *tapeset, int used_bits,
2921 double input_groups, double hashentrysize)
2922{
2923 int npartitions;
2924 int partition_bits;
2925
2926 npartitions = hash_choose_num_partitions(input_groups, hashentrysize,
2927 used_bits, &partition_bits);
2928
2929#ifdef USE_INJECTION_POINTS
2930 if (IS_INJECTION_POINT_ATTACHED("hash-aggregate-single-partition"))
2931 {
2932 npartitions = 1;
2933 partition_bits = 0;
2934 INJECTION_POINT_CACHED("hash-aggregate-single-partition");
2935 }
2936#endif
2937
2938 spill->partitions = palloc0(sizeof(LogicalTape *) * npartitions);
2939 spill->ntuples = palloc0(sizeof(int64) * npartitions);
2940 spill->hll_card = palloc0(sizeof(hyperLogLogState) * npartitions);
2941
2942 for (int i = 0; i < npartitions; i++)
2943 spill->partitions[i] = LogicalTapeCreate(tapeset);
2944
2945 spill->shift = 32 - used_bits - partition_bits;
2946 if (spill->shift < 32)
2947 spill->mask = (npartitions - 1) << spill->shift;
2948 else
2949 spill->mask = 0;
2950 spill->npartitions = npartitions;
2951
2952 for (int i = 0; i < npartitions; i++)
2954}
2955
2956/*
2957 * hashagg_spill_tuple
2958 *
2959 * No room for new groups in the hash table. Save for later in the appropriate
2960 * partition.
2961 */
2962static Size
2964 TupleTableSlot *inputslot, uint32 hash)
2965{
2966 TupleTableSlot *spillslot;
2967 int partition;
2968 MinimalTuple tuple;
2969 LogicalTape *tape;
2970 int total_written = 0;
2971 bool shouldFree;
2972
2973 Assert(spill->partitions != NULL);
2974
2975 /* spill only attributes that we actually need */
2976 if (!aggstate->all_cols_needed)
2977 {
2978 spillslot = aggstate->hash_spill_wslot;
2979 slot_getsomeattrs(inputslot, aggstate->max_colno_needed);
2980 ExecClearTuple(spillslot);
2981 for (int i = 0; i < spillslot->tts_tupleDescriptor->natts; i++)
2982 {
2983 if (bms_is_member(i + 1, aggstate->colnos_needed))
2984 {
2985 spillslot->tts_values[i] = inputslot->tts_values[i];
2986 spillslot->tts_isnull[i] = inputslot->tts_isnull[i];
2987 }
2988 else
2989 spillslot->tts_isnull[i] = true;
2990 }
2991 ExecStoreVirtualTuple(spillslot);
2992 }
2993 else
2994 spillslot = inputslot;
2995
2996 tuple = ExecFetchSlotMinimalTuple(spillslot, &shouldFree);
2997
2998 if (spill->shift < 32)
2999 partition = (hash & spill->mask) >> spill->shift;
3000 else
3001 partition = 0;
3002
3003 spill->ntuples[partition]++;
3004
3005 /*
3006 * All hash values destined for a given partition have some bits in
3007 * common, which causes bad HLL cardinality estimates. Hash the hash to
3008 * get a more uniform distribution.
3009 */
3010 addHyperLogLog(&spill->hll_card[partition], hash_bytes_uint32(hash));
3011
3012 tape = spill->partitions[partition];
3013
3014 LogicalTapeWrite(tape, &hash, sizeof(uint32));
3015 total_written += sizeof(uint32);
3016
3017 LogicalTapeWrite(tape, tuple, tuple->t_len);
3018 total_written += tuple->t_len;
3019
3020 if (shouldFree)
3021 pfree(tuple);
3022
3023 return total_written;
3024}
3025
3026/*
3027 * hashagg_batch_new
3028 *
3029 * Construct a HashAggBatch item, which represents one iteration of HashAgg to
3030 * be done.
3031 */
3032static HashAggBatch *
3033hashagg_batch_new(LogicalTape *input_tape, int setno,
3034 int64 input_tuples, double input_card, int used_bits)
3035{
3036 HashAggBatch *batch = palloc0(sizeof(HashAggBatch));
3037
3038 batch->setno = setno;
3039 batch->used_bits = used_bits;
3040 batch->input_tape = input_tape;
3041 batch->input_tuples = input_tuples;
3042 batch->input_card = input_card;
3043
3044 return batch;
3045}
3046
3047/*
3048 * hashagg_batch_read
3049 * read the next tuple from a batch's tape. Return NULL if no more.
3050 */
3051static MinimalTuple
3053{
3054 LogicalTape *tape = batch->input_tape;
3055 MinimalTuple tuple;
3056 uint32 t_len;
3057 size_t nread;
3058 uint32 hash;
3059
3060 nread = LogicalTapeRead(tape, &hash, sizeof(uint32));
3061 if (nread == 0)
3062 return NULL;
3063 if (nread != sizeof(uint32))
3064 ereport(ERROR,
3066 errmsg_internal("unexpected EOF for tape %p: requested %zu bytes, read %zu bytes",
3067 tape, sizeof(uint32), nread)));
3068 if (hashp != NULL)
3069 *hashp = hash;
3070
3071 nread = LogicalTapeRead(tape, &t_len, sizeof(t_len));
3072 if (nread != sizeof(uint32))
3073 ereport(ERROR,
3075 errmsg_internal("unexpected EOF for tape %p: requested %zu bytes, read %zu bytes",
3076 tape, sizeof(uint32), nread)));
3077
3078 tuple = (MinimalTuple) palloc(t_len);
3079 tuple->t_len = t_len;
3080
3081 nread = LogicalTapeRead(tape,
3082 (char *) tuple + sizeof(uint32),
3083 t_len - sizeof(uint32));
3084 if (nread != t_len - sizeof(uint32))
3085 ereport(ERROR,
3087 errmsg_internal("unexpected EOF for tape %p: requested %zu bytes, read %zu bytes",
3088 tape, t_len - sizeof(uint32), nread)));
3089
3090 return tuple;
3091}
3092
3093/*
3094 * hashagg_finish_initial_spills
3095 *
3096 * After a HashAggBatch has been processed, it may have spilled tuples to
3097 * disk. If so, turn the spilled partitions into new batches that must later
3098 * be executed.
3099 */
3100static void
3102{
3103 int setno;
3104 int total_npartitions = 0;
3105
3106 if (aggstate->hash_spills != NULL)
3107 {
3108 for (setno = 0; setno < aggstate->num_hashes; setno++)
3109 {
3110 HashAggSpill *spill = &aggstate->hash_spills[setno];
3111
3112 total_npartitions += spill->npartitions;
3113 hashagg_spill_finish(aggstate, spill, setno);
3114 }
3115
3116 /*
3117 * We're not processing tuples from outer plan any more; only
3118 * processing batches of spilled tuples. The initial spill structures
3119 * are no longer needed.
3120 */
3121 pfree(aggstate->hash_spills);
3122 aggstate->hash_spills = NULL;
3123 }
3124
3125 hash_agg_update_metrics(aggstate, false, total_npartitions);
3126 aggstate->hash_spill_mode = false;
3127}
3128
3129/*
3130 * hashagg_spill_finish
3131 *
3132 * Transform spill partitions into new batches.
3133 */
3134static void
3135hashagg_spill_finish(AggState *aggstate, HashAggSpill *spill, int setno)
3136{
3137 int i;
3138 int used_bits = 32 - spill->shift;
3139
3140 if (spill->npartitions == 0)
3141 return; /* didn't spill */
3142
3143 for (i = 0; i < spill->npartitions; i++)
3144 {
3145 LogicalTape *tape = spill->partitions[i];
3146 HashAggBatch *new_batch;
3147 double cardinality;
3148
3149 /* if the partition is empty, don't create a new batch of work */
3150 if (spill->ntuples[i] == 0)
3151 continue;
3152
3153 cardinality = estimateHyperLogLog(&spill->hll_card[i]);
3154 freeHyperLogLog(&spill->hll_card[i]);
3155
3156 /* rewinding frees the buffer while not in use */
3158
3159 new_batch = hashagg_batch_new(tape, setno,
3160 spill->ntuples[i], cardinality,
3161 used_bits);
3162 aggstate->hash_batches = lappend(aggstate->hash_batches, new_batch);
3163 aggstate->hash_batches_used++;
3164 }
3165
3166 pfree(spill->ntuples);
3167 pfree(spill->hll_card);
3168 pfree(spill->partitions);
3169}
3170
3171/*
3172 * Free resources related to a spilled HashAgg.
3173 */
3174static void
3176{
3177 /* free spills from initial pass */
3178 if (aggstate->hash_spills != NULL)
3179 {
3180 int setno;
3181
3182 for (setno = 0; setno < aggstate->num_hashes; setno++)
3183 {
3184 HashAggSpill *spill = &aggstate->hash_spills[setno];
3185
3186 pfree(spill->ntuples);
3187 pfree(spill->partitions);
3188 }
3189 pfree(aggstate->hash_spills);
3190 aggstate->hash_spills = NULL;
3191 }
3192
3193 /* free batches */
3194 list_free_deep(aggstate->hash_batches);
3195 aggstate->hash_batches = NIL;
3196
3197 /* close tape set */
3198 if (aggstate->hash_tapeset != NULL)
3199 {
3201 aggstate->hash_tapeset = NULL;
3202 }
3203}
3204
3205
3206/* -----------------
3207 * ExecInitAgg
3208 *
3209 * Creates the run-time information for the agg node produced by the
3210 * planner and initializes its outer subtree.
3211 *
3212 * -----------------
3213 */
3214AggState *
3215ExecInitAgg(Agg *node, EState *estate, int eflags)
3216{
3217 AggState *aggstate;
3218 AggStatePerAgg peraggs;
3219 AggStatePerTrans pertransstates;
3220 AggStatePerGroup *pergroups;
3221 Plan *outerPlan;
3222 ExprContext *econtext;
3223 TupleDesc scanDesc;
3224 int max_aggno;
3225 int max_transno;
3226 int numaggrefs;
3227 int numaggs;
3228 int numtrans;
3229 int phase;
3230 int phaseidx;
3231 ListCell *l;
3232 Bitmapset *all_grouped_cols = NULL;
3233 int numGroupingSets = 1;
3234 int numPhases;
3235 int numHashes;
3236 int i = 0;
3237 int j = 0;
3238 bool use_hashing = (node->aggstrategy == AGG_HASHED ||
3239 node->aggstrategy == AGG_MIXED);
3240
3241 /* check for unsupported flags */
3242 Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
3243
3244 /*
3245 * create state structure
3246 */
3247 aggstate = makeNode(AggState);
3248 aggstate->ss.ps.plan = (Plan *) node;
3249 aggstate->ss.ps.state = estate;
3250 aggstate->ss.ps.ExecProcNode = ExecAgg;
3251
3252 aggstate->aggs = NIL;
3253 aggstate->numaggs = 0;
3254 aggstate->numtrans = 0;
3255 aggstate->aggstrategy = node->aggstrategy;
3256 aggstate->aggsplit = node->aggsplit;
3257 aggstate->maxsets = 0;
3258 aggstate->projected_set = -1;
3259 aggstate->current_set = 0;
3260 aggstate->peragg = NULL;
3261 aggstate->pertrans = NULL;
3262 aggstate->curperagg = NULL;
3263 aggstate->curpertrans = NULL;
3264 aggstate->input_done = false;
3265 aggstate->agg_done = false;
3266 aggstate->pergroups = NULL;
3267 aggstate->grp_firstTuple = NULL;
3268 aggstate->sort_in = NULL;
3269 aggstate->sort_out = NULL;
3270
3271 /*
3272 * phases[0] always exists, but is dummy in sorted/plain mode
3273 */
3274 numPhases = (use_hashing ? 1 : 2);
3275 numHashes = (use_hashing ? 1 : 0);
3276
3277 /*
3278 * Calculate the maximum number of grouping sets in any phase; this
3279 * determines the size of some allocations. Also calculate the number of
3280 * phases, since all hashed/mixed nodes contribute to only a single phase.
3281 */
3282 if (node->groupingSets)
3283 {
3284 numGroupingSets = list_length(node->groupingSets);
3285
3286 foreach(l, node->chain)
3287 {
3288 Agg *agg = lfirst(l);
3289
3290 numGroupingSets = Max(numGroupingSets,
3292
3293 /*
3294 * additional AGG_HASHED aggs become part of phase 0, but all
3295 * others add an extra phase.
3296 */
3297 if (agg->aggstrategy != AGG_HASHED)
3298 ++numPhases;
3299 else
3300 ++numHashes;
3301 }
3302 }
3303
3304 aggstate->maxsets = numGroupingSets;
3305 aggstate->numphases = numPhases;
3306
3307 aggstate->aggcontexts = (ExprContext **)
3308 palloc0(sizeof(ExprContext *) * numGroupingSets);
3309
3310 /*
3311 * Create expression contexts. We need three or more, one for
3312 * per-input-tuple processing, one for per-output-tuple processing, one
3313 * for all the hashtables, and one for each grouping set. The per-tuple
3314 * memory context of the per-grouping-set ExprContexts (aggcontexts)
3315 * replaces the standalone memory context formerly used to hold transition
3316 * values. We cheat a little by using ExecAssignExprContext() to build
3317 * all of them.
3318 *
3319 * NOTE: the details of what is stored in aggcontexts and what is stored
3320 * in the regular per-query memory context are driven by a simple
3321 * decision: we want to reset the aggcontext at group boundaries (if not
3322 * hashing) and in ExecReScanAgg to recover no-longer-wanted space.
3323 */
3324 ExecAssignExprContext(estate, &aggstate->ss.ps);
3325 aggstate->tmpcontext = aggstate->ss.ps.ps_ExprContext;
3326
3327 for (i = 0; i < numGroupingSets; ++i)
3328 {
3329 ExecAssignExprContext(estate, &aggstate->ss.ps);
3330 aggstate->aggcontexts[i] = aggstate->ss.ps.ps_ExprContext;
3331 }
3332
3333 if (use_hashing)
3334 aggstate->hashcontext = CreateWorkExprContext(estate);
3335
3336 ExecAssignExprContext(estate, &aggstate->ss.ps);
3337
3338 /*
3339 * Initialize child nodes.
3340 *
3341 * If we are doing a hashed aggregation then the child plan does not need
3342 * to handle REWIND efficiently; see ExecReScanAgg.
3343 */
3344 if (node->aggstrategy == AGG_HASHED)
3345 eflags &= ~EXEC_FLAG_REWIND;
3346 outerPlan = outerPlan(node);
3347 outerPlanState(aggstate) = ExecInitNode(outerPlan, estate, eflags);
3348
3349 /*
3350 * initialize source tuple type.
3351 */
3352 aggstate->ss.ps.outerops =
3354 &aggstate->ss.ps.outeropsfixed);
3355 aggstate->ss.ps.outeropsset = true;
3356
3357 ExecCreateScanSlotFromOuterPlan(estate, &aggstate->ss,
3358 aggstate->ss.ps.outerops);
3359 scanDesc = aggstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor;
3360
3361 /*
3362 * If there are more than two phases (including a potential dummy phase
3363 * 0), input will be resorted using tuplesort. Need a slot for that.
3364 */
3365 if (numPhases > 2)
3366 {
3367 aggstate->sort_slot = ExecInitExtraTupleSlot(estate, scanDesc,
3369
3370 /*
3371 * The output of the tuplesort, and the output from the outer child
3372 * might not use the same type of slot. In most cases the child will
3373 * be a Sort, and thus return a TTSOpsMinimalTuple type slot - but the
3374 * input can also be presorted due to an index, in which case it could be
3375 * a different type of slot.
3376 *
3377 * XXX: For efficiency it would be good to instead/additionally
3378 * generate expressions with corresponding settings of outerops* for
3379 * the individual phases - deforming is often a bottleneck for
3380 * aggregations with lots of rows per group. If there's multiple
3381 * sorts, we know that all but the first use TTSOpsMinimalTuple (via
3382 * the nodeAgg.c internal tuplesort).
3383 */
3384 if (aggstate->ss.ps.outeropsfixed &&
3385 aggstate->ss.ps.outerops != &TTSOpsMinimalTuple)
3386 aggstate->ss.ps.outeropsfixed = false;
3387 }
3388
3389 /*
3390 * Initialize result type, slot and projection.
3391 */
3393 ExecAssignProjectionInfo(&aggstate->ss.ps, NULL);
3394
3395 /*
3396 * initialize child expressions
3397 *
3398 * We expect the parser to have checked that no aggs contain other agg
3399 * calls in their arguments (and just to be sure, we verify it again while
3400 * initializing the plan node). This would make no sense under SQL
3401 * semantics, and it's forbidden by the spec. Because it is true, we
3402 * don't need to worry about evaluating the aggs in any particular order.
3403 *
3404 * Note: execExpr.c finds Aggrefs for us, and adds them to aggstate->aggs.
3405 * Aggrefs in the qual are found here; Aggrefs in the targetlist are found
3406 * during ExecAssignProjectionInfo, above.
3407 */
3408 aggstate->ss.ps.qual =
3409 ExecInitQual(node->plan.qual, (PlanState *) aggstate);
3410
3411 /*
3412 * We should now have found all Aggrefs in the targetlist and quals.
3413 */
3414 numaggrefs = list_length(aggstate->aggs);
3415 max_aggno = -1;
3416 max_transno = -1;
3417 foreach(l, aggstate->aggs)
3418 {
3419 Aggref *aggref = (Aggref *) lfirst(l);
3420
3421 max_aggno = Max(max_aggno, aggref->aggno);
3422 max_transno = Max(max_transno, aggref->aggtransno);
3423 }
3424 aggstate->numaggs = numaggs = max_aggno + 1;
3425 aggstate->numtrans = numtrans = max_transno + 1;
3426
3427 /*
3428 * For each phase, prepare grouping set data and fmgr lookup data for
3429 * compare functions. Accumulate all_grouped_cols in passing.
3430 */
3431 aggstate->phases = palloc0(numPhases * sizeof(AggStatePerPhaseData));
3432
3433 aggstate->num_hashes = numHashes;
3434 if (numHashes)
3435 {
3436 aggstate->perhash = palloc0(sizeof(AggStatePerHashData) * numHashes);
3437 aggstate->phases[0].numsets = 0;
3438 aggstate->phases[0].gset_lengths = palloc(numHashes * sizeof(int));
3439 aggstate->phases[0].grouped_cols = palloc(numHashes * sizeof(Bitmapset *));
3440 }
3441
3442 phase = 0;
3443 for (phaseidx = 0; phaseidx <= list_length(node->chain); ++phaseidx)
3444 {
3445 Agg *aggnode;
3446 Sort *sortnode;
3447
3448 if (phaseidx > 0)
3449 {
3450 aggnode = list_nth_node(Agg, node->chain, phaseidx - 1);
3451 sortnode = castNode(Sort, outerPlan(aggnode));
3452 }
3453 else
3454 {
3455 aggnode = node;
3456 sortnode = NULL;
3457 }
3458
3459 Assert(phase <= 1 || sortnode);
3460
3461 if (aggnode->aggstrategy == AGG_HASHED
3462 || aggnode->aggstrategy == AGG_MIXED)
3463 {
3464 AggStatePerPhase phasedata = &aggstate->phases[0];
3465 AggStatePerHash perhash;
3466 Bitmapset *cols = NULL;
3467
3468 Assert(phase == 0);
3469 i = phasedata->numsets++;
3470 perhash = &aggstate->perhash[i];
3471
3472 /* phase 0 always points to the "real" Agg in the hash case */
3473 phasedata->aggnode = node;
3474 phasedata->aggstrategy = node->aggstrategy;
3475
3476 /* but the actual Agg node representing this hash is saved here */
3477 perhash->aggnode = aggnode;
3478
3479 phasedata->gset_lengths[i] = perhash->numCols = aggnode->numCols;
3480
3481 for (j = 0; j < aggnode->numCols; ++j)
3482 cols = bms_add_member(cols, aggnode->grpColIdx[j]);
3483
3484 phasedata->grouped_cols[i] = cols;
3485
3486 all_grouped_cols = bms_add_members(all_grouped_cols, cols);
3487 continue;
3488 }
3489 else
3490 {
3491 AggStatePerPhase phasedata = &aggstate->phases[++phase];
3492 int num_sets;
3493
3494 phasedata->numsets = num_sets = list_length(aggnode->groupingSets);
3495
3496 if (num_sets)
3497 {
3498 phasedata->gset_lengths = palloc(num_sets * sizeof(int));
3499 phasedata->grouped_cols = palloc(num_sets * sizeof(Bitmapset *));
3500
3501 i = 0;
3502 foreach(l, aggnode->groupingSets)
3503 {
3504 int current_length = list_length(lfirst(l));
3505 Bitmapset *cols = NULL;
3506
3507 /* planner forces this to be correct */
3508 for (j = 0; j < current_length; ++j)
3509 cols = bms_add_member(cols, aggnode->grpColIdx[j]);
3510
3511 phasedata->grouped_cols[i] = cols;
3512 phasedata->gset_lengths[i] = current_length;
3513
3514 ++i;
3515 }
3516
3517 all_grouped_cols = bms_add_members(all_grouped_cols,
3518 phasedata->grouped_cols[0]);
3519 }
3520 else
3521 {
3522 Assert(phaseidx == 0);
3523
3524 phasedata->gset_lengths = NULL;
3525 phasedata->grouped_cols = NULL;
3526 }
3527
3528 /*
3529 * If we are grouping, precompute fmgr lookup data for inner loop.
3530 */
3531 if (aggnode->aggstrategy == AGG_SORTED)
3532 {
3533 /*
3534 * Build a separate function for each subset of columns that
3535 * need to be compared.
3536 */
3537 phasedata->eqfunctions =
3538 (ExprState **) palloc0(aggnode->numCols * sizeof(ExprState *));
3539
3540 /* for each grouping set */
3541 for (int k = 0; k < phasedata->numsets; k++)
3542 {
3543 int length = phasedata->gset_lengths[k];
3544
3545 /* nothing to do for empty grouping set */
3546 if (length == 0)
3547 continue;
3548
3549 /* if we already had one of this length, it'll do */
3550 if (phasedata->eqfunctions[length - 1] != NULL)
3551 continue;
3552
3553 phasedata->eqfunctions[length - 1] =
3554 execTuplesMatchPrepare(scanDesc,
3555 length,
3556 aggnode->grpColIdx,
3557 aggnode->grpOperators,
3558 aggnode->grpCollations,
3559 (PlanState *) aggstate);
3560 }
3561
3562 /* and for all grouped columns, unless already computed */
3563 if (aggnode->numCols > 0 &&
3564 phasedata->eqfunctions[aggnode->numCols - 1] == NULL)
3565 {
3566 phasedata->eqfunctions[aggnode->numCols - 1] =
3567 execTuplesMatchPrepare(scanDesc,
3568 aggnode->numCols,
3569 aggnode->grpColIdx,
3570 aggnode->grpOperators,
3571 aggnode->grpCollations,
3572 (PlanState *) aggstate);
3573 }
3574 }
3575
3576 phasedata->aggnode = aggnode;
3577 phasedata->aggstrategy = aggnode->aggstrategy;
3578 phasedata->sortnode = sortnode;
3579 }
3580 }
3581
3582 /*
3583 * Convert all_grouped_cols to a descending-order list.
3584 */
3585 i = -1;
3586 while ((i = bms_next_member(all_grouped_cols, i)) >= 0)
3587 aggstate->all_grouped_cols = lcons_int(i, aggstate->all_grouped_cols);
3588
3589 /*
3590 * Set up aggregate-result storage in the output expr context, and also
3591 * allocate my private per-agg working storage
3592 */
3593 econtext = aggstate->ss.ps.ps_ExprContext;
3594 econtext->ecxt_aggvalues = (Datum *) palloc0(sizeof(Datum) * numaggs);
3595 econtext->ecxt_aggnulls = (bool *) palloc0(sizeof(bool) * numaggs);
3596
3597 peraggs = (AggStatePerAgg) palloc0(sizeof(AggStatePerAggData) * numaggs);
3598 pertransstates = (AggStatePerTrans) palloc0(sizeof(AggStatePerTransData) * numtrans);
3599
3600 aggstate->peragg = peraggs;
3601 aggstate->pertrans = pertransstates;
3602
3603
3604 aggstate->all_pergroups =
3606 * (numGroupingSets + numHashes));
3607 pergroups = aggstate->all_pergroups;
3608
3609 if (node->aggstrategy != AGG_HASHED)
3610 {
3611 for (i = 0; i < numGroupingSets; i++)
3612 {
3613 pergroups[i] = (AggStatePerGroup) palloc0(sizeof(AggStatePerGroupData)
3614 * numaggs);
3615 }
3616
3617 aggstate->pergroups = pergroups;
3618 pergroups += numGroupingSets;
3619 }
3620
3621 /*
3622 * Hashing can only appear in the initial phase.
3623 */
3624 if (use_hashing)
3625 {
3626 Plan *outerplan = outerPlan(node);
3627 uint64 totalGroups = 0;
3628
3630 "HashAgg meta context",
3632 aggstate->hash_spill_rslot = ExecInitExtraTupleSlot(estate, scanDesc,
3634 aggstate->hash_spill_wslot = ExecInitExtraTupleSlot(estate, scanDesc,
3635 &TTSOpsVirtual);
3636
3637 /* this is an array of pointers, not structures */
3638 aggstate->hash_pergroup = pergroups;
3639
3640 aggstate->hashentrysize = hash_agg_entry_size(aggstate->numtrans,
3641 outerplan->plan_width,
3642 node->transitionSpace);
3643
3644 /*
3645 * Consider all of the grouping sets together when setting the limits
3646 * and estimating the number of partitions. This can be inaccurate
3647 * when there is more than one grouping set, but should still be
3648 * reasonable.
3649 */
3650 for (int k = 0; k < aggstate->num_hashes; k++)
3651 totalGroups += aggstate->perhash[k].aggnode->numGroups;
3652
3653 hash_agg_set_limits(aggstate->hashentrysize, totalGroups, 0,
3654 &aggstate->hash_mem_limit,
3655 &aggstate->hash_ngroups_limit,
3656 &aggstate->hash_planned_partitions);
3657 find_hash_columns(aggstate);
3658
3659 /* Skip massive memory allocation if we are just doing EXPLAIN */
3660 if (!(eflags & EXEC_FLAG_EXPLAIN_ONLY))
3661 build_hash_tables(aggstate);
3662
3663 aggstate->table_filled = false;
3664
3665 /* Initialize this to 1, meaning nothing spilled, yet */
3666 aggstate->hash_batches_used = 1;
3667 }
3668
3669 /*
3670 * Initialize current phase-dependent values to initial phase. The initial
3671 * phase is 1 (first sort pass) for all strategies that use sorting (if
3672 * hashing is being done too, then phase 0 is processed last); but if only
3673 * hashing is being done, then phase 0 is all there is.
3674 */
3675 if (node->aggstrategy == AGG_HASHED)
3676 {
3677 aggstate->current_phase = 0;
3678 initialize_phase(aggstate, 0);
3679 select_current_set(aggstate, 0, true);
3680 }
3681 else
3682 {
3683 aggstate->current_phase = 1;
3684 initialize_phase(aggstate, 1);
3685 select_current_set(aggstate, 0, false);
3686 }
3687
3688 /*
3689 * Perform lookups of aggregate function info, and initialize the
3690 * unchanging fields of the per-agg and per-trans data.
3691 */
3692 foreach(l, aggstate->aggs)
3693 {
3694 Aggref *aggref = lfirst(l);
3695 AggStatePerAgg peragg;
3696 AggStatePerTrans pertrans;
3697 Oid aggTransFnInputTypes[FUNC_MAX_ARGS];
3698 int numAggTransFnArgs;
3699 int numDirectArgs;
3700 HeapTuple aggTuple;
3701 Form_pg_aggregate aggform;
3702 AclResult aclresult;
3703 Oid finalfn_oid;
3704 Oid serialfn_oid,
3705 deserialfn_oid;
3706 Oid aggOwner;
3707 Expr *finalfnexpr;
3708 Oid aggtranstype;
3709
3710 /* Planner should have assigned aggregate to correct level */
3711 Assert(aggref->agglevelsup == 0);
3712 /* ... and the split mode should match */
3713 Assert(aggref->aggsplit == aggstate->aggsplit);
3714
3715 peragg = &peraggs[aggref->aggno];
3716
3717 /* Check if we initialized the state for this aggregate already. */
3718 if (peragg->aggref != NULL)
3719 continue;
3720
3721 peragg->aggref = aggref;
3722 peragg->transno = aggref->aggtransno;
3723
3724 /* Fetch the pg_aggregate row */
3725 aggTuple = SearchSysCache1(AGGFNOID,
3726 ObjectIdGetDatum(aggref->aggfnoid));
3727 if (!HeapTupleIsValid(aggTuple))
3728 elog(ERROR, "cache lookup failed for aggregate %u",
3729 aggref->aggfnoid);
3730 aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple);
3731
3732 /* Check permission to call aggregate function */
3733 aclresult = object_aclcheck(ProcedureRelationId, aggref->aggfnoid, GetUserId(),
3734 ACL_EXECUTE);
3735 if (aclresult != ACLCHECK_OK)
3737 get_func_name(aggref->aggfnoid));
3739
3740 /* planner recorded transition state type in the Aggref itself */
3741 aggtranstype = aggref->aggtranstype;
3742 Assert(OidIsValid(aggtranstype));
3743
3744 /* Final function only required if we're finalizing the aggregates */
3745 if (DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit))
3746 peragg->finalfn_oid = finalfn_oid = InvalidOid;
3747 else
3748 peragg->finalfn_oid = finalfn_oid = aggform->aggfinalfn;
3749
3750 serialfn_oid = InvalidOid;
3751 deserialfn_oid = InvalidOid;
3752
3753 /*
3754 * Check if serialization/deserialization is required. We only do it
3755 * for aggregates that have transtype INTERNAL.
3756 */
3757 if (aggtranstype == INTERNALOID)
3758 {
3759 /*
3760 * The planner should only have generated a serialize agg node if
3761 * every aggregate with an INTERNAL state has a serialization
3762 * function. Verify that.
3763 */
3764 if (DO_AGGSPLIT_SERIALIZE(aggstate->aggsplit))
3765 {
3766 /* serialization only valid when not running finalfn */
3768
3769 if (!OidIsValid(aggform->aggserialfn))
3770 elog(ERROR, "serialfunc not provided for serialization aggregation");
3771 serialfn_oid = aggform->aggserialfn;
3772 }
3773
3774 /* Likewise for deserialization functions */
3775 if (DO_AGGSPLIT_DESERIALIZE(aggstate->aggsplit))
3776 {
3777 /* deserialization only valid when combining states */
3779
3780 if (!OidIsValid(aggform->aggdeserialfn))
3781 elog(ERROR, "deserialfunc not provided for deserialization aggregation");
3782 deserialfn_oid = aggform->aggdeserialfn;
3783 }
3784 }
3785
3786 /* Check that aggregate owner has permission to call component fns */
3787 {
3788 HeapTuple procTuple;
3789
3790 procTuple = SearchSysCache1(PROCOID,
3791 ObjectIdGetDatum(aggref->aggfnoid));
3792 if (!HeapTupleIsValid(procTuple))
3793 elog(ERROR, "cache lookup failed for function %u",
3794 aggref->aggfnoid);
3795 aggOwner = ((Form_pg_proc) GETSTRUCT(procTuple))->proowner;
3796 ReleaseSysCache(procTuple);
3797
3798 if (OidIsValid(finalfn_oid))
3799 {
3800 aclresult = object_aclcheck(ProcedureRelationId, finalfn_oid, aggOwner,
3801 ACL_EXECUTE);
3802 if (aclresult != ACLCHECK_OK)
3804 get_func_name(finalfn_oid));
3805 InvokeFunctionExecuteHook(finalfn_oid);
3806 }
3807 if (OidIsValid(serialfn_oid))
3808 {
3809 aclresult = object_aclcheck(ProcedureRelationId, serialfn_oid, aggOwner,
3810 ACL_EXECUTE);
3811 if (aclresult != ACLCHECK_OK)
3813 get_func_name(serialfn_oid));
3814 InvokeFunctionExecuteHook(serialfn_oid);
3815 }
3816 if (OidIsValid(deserialfn_oid))
3817 {
3818 aclresult = object_aclcheck(ProcedureRelationId, deserialfn_oid, aggOwner,
3819 ACL_EXECUTE);
3820 if (aclresult != ACLCHECK_OK)
3822 get_func_name(deserialfn_oid));
3823 InvokeFunctionExecuteHook(deserialfn_oid);
3824 }
3825 }
3826
3827 /*
3828 * Get actual datatypes of the (nominal) aggregate inputs. These
3829 * could be different from the agg's declared input types, when the
3830 * agg accepts ANY or a polymorphic type.
3831 */
3832 numAggTransFnArgs = get_aggregate_argtypes(aggref,
3833 aggTransFnInputTypes);
3834
3835 /* Count the "direct" arguments, if any */
3836 numDirectArgs = list_length(aggref->aggdirectargs);
3837
3838 /* Detect how many arguments to pass to the finalfn */
3839 if (aggform->aggfinalextra)
3840 peragg->numFinalArgs = numAggTransFnArgs + 1;
3841 else
3842 peragg->numFinalArgs = numDirectArgs + 1;
3843
3844 /* Initialize any direct-argument expressions */
3846 (PlanState *) aggstate);
3847
3848 /*
3849 * build expression trees using actual argument & result types for the
3850 * finalfn, if it exists and is required.
3851 */
3852 if (OidIsValid(finalfn_oid))
3853 {
3854 build_aggregate_finalfn_expr(aggTransFnInputTypes,
3855 peragg->numFinalArgs,
3856 aggtranstype,
3857 aggref->aggtype,
3858 aggref->inputcollid,
3859 finalfn_oid,
3860 &finalfnexpr);
3861 fmgr_info(finalfn_oid, &peragg->finalfn);
3862 fmgr_info_set_expr((Node *) finalfnexpr, &peragg->finalfn);
3863 }
3864
3865 /* get info about the output value's datatype */
3866 get_typlenbyval(aggref->aggtype,
3867 &peragg->resulttypeLen,
3868 &peragg->resulttypeByVal);
3869
3870 /*
3871 * Build working state for invoking the transition function, if we
3872 * haven't done it already.
3873 */
3874 pertrans = &pertransstates[aggref->aggtransno];
3875 if (pertrans->aggref == NULL)
3876 {
3877 Datum textInitVal;
3879 bool initValueIsNull;
3880 Oid transfn_oid;
3881
3882 /*
3883 * If this aggregation is performing state combines, then instead
3884 * of using the transition function, we'll use the combine
3885 * function.
3886 */
3887 if (DO_AGGSPLIT_COMBINE(aggstate->aggsplit))
3888 {
3889 transfn_oid = aggform->aggcombinefn;
3890
3891 /* If not set then the planner messed up */
3892 if (!OidIsValid(transfn_oid))
3893 elog(ERROR, "combinefn not set for aggregate function");
3894 }
3895 else
3896 transfn_oid = aggform->aggtransfn;
3897
3898 aclresult = object_aclcheck(ProcedureRelationId, transfn_oid, aggOwner, ACL_EXECUTE);
3899 if (aclresult != ACLCHECK_OK)
3901 get_func_name(transfn_oid));
3902 InvokeFunctionExecuteHook(transfn_oid);
3903
3904 /*
3905 * initval is potentially null, so don't try to access it as a
3906 * struct field. Must do it the hard way with SysCacheGetAttr.
3907 */
3908 textInitVal = SysCacheGetAttr(AGGFNOID, aggTuple,
3909 Anum_pg_aggregate_agginitval,
3910 &initValueIsNull);
3911 if (initValueIsNull)
3912 initValue = (Datum) 0;
3913 else
3914 initValue = GetAggInitVal(textInitVal, aggtranstype);
3915
3916 if (DO_AGGSPLIT_COMBINE(aggstate->aggsplit))
3917 {
3918 Oid combineFnInputTypes[] = {aggtranstype,
3919 aggtranstype};
3920
3921 /*
3922 * When combining there's only one input, the to-be-combined
3923 * transition value. The transition value is not counted
3924 * here.
3925 */
3926 pertrans->numTransInputs = 1;
3927
3928 /* aggcombinefn always has two arguments of aggtranstype */
3929 build_pertrans_for_aggref(pertrans, aggstate, estate,
3930 aggref, transfn_oid, aggtranstype,
3931 serialfn_oid, deserialfn_oid,
3932 initValue, initValueIsNull,
3933 combineFnInputTypes, 2);
3934
3935 /*
3936 * Ensure that a combine function to combine INTERNAL states
3937 * is not strict. This should have been checked during CREATE
3938 * AGGREGATE, but the strict property could have been changed
3939 * since then.
3940 */
3941 if (pertrans->transfn.fn_strict && aggtranstype == INTERNALOID)
3942 ereport(ERROR,
3943 (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
3944 errmsg("combine function with transition type %s must not be declared STRICT",
3945 format_type_be(aggtranstype))));
3946 }
3947 else
3948 {
3949 /* Detect how many arguments to pass to the transfn */
3950 if (AGGKIND_IS_ORDERED_SET(aggref->aggkind))
3951 pertrans->numTransInputs = list_length(aggref->args);
3952 else
3953 pertrans->numTransInputs = numAggTransFnArgs;
3954
3955 build_pertrans_for_aggref(pertrans, aggstate, estate,
3956 aggref, transfn_oid, aggtranstype,
3957 serialfn_oid, deserialfn_oid,
3958 initValue, initValueIsNull,
3959 aggTransFnInputTypes,
3960 numAggTransFnArgs);
3961
3962 /*
3963 * If the transfn is strict and the initval is NULL, make sure
3964 * input type and transtype are the same (or at least
3965 * binary-compatible), so that it's OK to use the first
3966 * aggregated input value as the initial transValue. This
3967 * should have been checked at agg definition time, but we
3968 * must check again in case the transfn's strictness property
3969 * has been changed.
3970 */
3971 if (pertrans->transfn.fn_strict && pertrans->initValueIsNull)
3972 {
3973 if (numAggTransFnArgs <= numDirectArgs ||
3974 !IsBinaryCoercible(aggTransFnInputTypes[numDirectArgs],
3975 aggtranstype))
3976 ereport(ERROR,
3977 (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
3978 errmsg("aggregate %u needs to have compatible input type and transition type",
3979 aggref->aggfnoid)));
3980 }
3981 }
3982 }
3983 else
3984 pertrans->aggshared = true;
3985 ReleaseSysCache(aggTuple);
3986 }
3987
3988 /*
3989 * Last, check whether any more aggregates got added onto the node while
3990 * we processed the expressions for the aggregate arguments (including not
3991 * only the regular arguments and FILTER expressions handled immediately
3992 * above, but any direct arguments we might've handled earlier). If so,
3993 * we have nested aggregate functions, which is semantically nonsensical,
3994 * so complain. (This should have been caught by the parser, so we don't
3995 * need to work hard on a helpful error message; but we defend against it
3996 * here anyway, just to be sure.)
3997 */
3998 if (numaggrefs != list_length(aggstate->aggs))
3999 ereport(ERROR,
4000 (errcode(ERRCODE_GROUPING_ERROR),
4001 errmsg("aggregate function calls cannot be nested")));
4002
4003 /*
4004 * Build expressions doing all the transition work at once. We build a
4005 * different one for each phase, as the number of transition function
4006 * invocation can differ between phases. Note this'll work both for
4007 * transition and combination functions (although there'll only be one
4008 * phase in the latter case).
4009 */
4010 for (phaseidx = 0; phaseidx < aggstate->numphases; phaseidx++)
4011 {
4012 AggStatePerPhase phase = &aggstate->phases[phaseidx];
4013 bool dohash = false;
4014 bool dosort = false;
4015
4016 /* phase 0 doesn't necessarily exist */
4017 if (!phase->aggnode)
4018 continue;
4019
4020 if (aggstate->aggstrategy == AGG_MIXED && phaseidx == 1)
4021 {
4022 /*
4023 * Phase one, and only phase one, in a mixed agg performs both
4024 * sorting and aggregation.
4025 */
4026 dohash = true;
4027 dosort = true;
4028 }
4029 else if (aggstate->aggstrategy == AGG_MIXED && phaseidx == 0)
4030 {
4031 /*
4032 * No need to compute a transition function for an AGG_MIXED phase
4033 * 0 - the contents of the hashtables will have been computed
4034 * during phase 1.
4035 */
4036 continue;
4037 }
4038 else if (phase->aggstrategy == AGG_PLAIN ||
4039 phase->aggstrategy == AGG_SORTED)
4040 {
4041 dohash = false;
4042 dosort = true;
4043 }
4044 else if (phase->aggstrategy == AGG_HASHED)
4045 {
4046 dohash = true;
4047 dosort = false;
4048 }
4049 else
4050 Assert(false);
4051
4052 phase->evaltrans = ExecBuildAggTrans(aggstate, phase, dosort, dohash,
4053 false);
4054
4055 /* cache compiled expression for outer slot without NULL check */
4056 phase->evaltrans_cache[0][0] = phase->evaltrans;
4057 }
4058
4059 return aggstate;
4060}
4061
4062/*
4063 * Build the state needed to calculate a state value for an aggregate.
4064 *
4065 * This initializes all the fields in 'pertrans'. 'aggref' is the aggregate
4066 * to initialize the state for. 'transfn_oid', 'aggtranstype', and the rest
4067 * of the arguments could be calculated from 'aggref', but the caller has
4068 * calculated them already, so might as well pass them.
4069 *
4070 * 'transfn_oid' may be either the Oid of the aggtransfn or the aggcombinefn.
4071 */
4072static void
4074 AggState *aggstate, EState *estate,
4075 Aggref *aggref,
4076 Oid transfn_oid, Oid aggtranstype,
4077 Oid aggserialfn, Oid aggdeserialfn,
4078 Datum initValue, bool initValueIsNull,
4079 Oid *inputTypes, int numArguments)
4080{
4081 int numGroupingSets = Max(aggstate->maxsets, 1);
4082 Expr *transfnexpr;
4083 int numTransArgs;
4084 Expr *serialfnexpr = NULL;
4085 Expr *deserialfnexpr = NULL;
4086 ListCell *lc;
4087 int numInputs;
4088 int numDirectArgs;
4089 List *sortlist;
4090 int numSortCols;
4091 int numDistinctCols;
4092 int i;
4093
4094 /* Begin filling in the pertrans data */
4095 pertrans->aggref = aggref;
4096 pertrans->aggshared = false;
4097 pertrans->aggCollation = aggref->inputcollid;
4098 pertrans->transfn_oid = transfn_oid;
4099 pertrans->serialfn_oid = aggserialfn;
4100 pertrans->deserialfn_oid = aggdeserialfn;
4101 pertrans->initValue = initValue;
4102 pertrans->initValueIsNull = initValueIsNull;
4103
4104 /* Count the "direct" arguments, if any */
4105 numDirectArgs = list_length(aggref->aggdirectargs);
4106
4107 /* Count the number of aggregated input columns */
4108 pertrans->numInputs = numInputs = list_length(aggref->args);
4109
4110 pertrans->aggtranstype = aggtranstype;
4111
4112 /* account for the current transition state */
4113 numTransArgs = pertrans->numTransInputs + 1;
4114
4115 /*
4116 * Set up infrastructure for calling the transfn. Note that invtransfn is
4117 * not needed here.
4118 */
4120 numArguments,
4121 numDirectArgs,
4122 aggref->aggvariadic,
4123 aggtranstype,
4124 aggref->inputcollid,
4125 transfn_oid,
4126 InvalidOid,
4127 &transfnexpr,
4128 NULL);
4129
4130 fmgr_info(transfn_oid, &pertrans->transfn);
4131 fmgr_info_set_expr((Node *) transfnexpr, &pertrans->transfn);
4132
4133 pertrans->transfn_fcinfo =
4136 &pertrans->transfn,
4137 numTransArgs,
4138 pertrans->aggCollation,
4139 (Node *) aggstate, NULL);
4140
4141 /* get info about the state value's datatype */
4142 get_typlenbyval(aggtranstype,
4143 &pertrans->transtypeLen,
4144 &pertrans->transtypeByVal);
4145
4146 if (OidIsValid(aggserialfn))
4147 {
4149 &serialfnexpr);
4150 fmgr_info(aggserialfn, &pertrans->serialfn);
4151 fmgr_info_set_expr((Node *) serialfnexpr, &pertrans->serialfn);
4152
4153 pertrans->serialfn_fcinfo =
4156 &pertrans->serialfn,
4157 1,
4158 InvalidOid,
4159 (Node *) aggstate, NULL);
4160 }
4161
4162 if (OidIsValid(aggdeserialfn))
4163 {
4164 build_aggregate_deserialfn_expr(aggdeserialfn,
4165 &deserialfnexpr);
4166 fmgr_info(aggdeserialfn, &pertrans->deserialfn);
4167 fmgr_info_set_expr((Node *) deserialfnexpr, &pertrans->deserialfn);
4168
4169 pertrans->deserialfn_fcinfo =
4172 &pertrans->deserialfn,
4173 2,
4174 InvalidOid,
4175 (Node *) aggstate, NULL);
4176 }
4177
4178 /*
4179 * If we're doing either DISTINCT or ORDER BY for a plain agg, then we
4180 * have a list of SortGroupClause nodes; fish out the data in them and
4181 * stick them into arrays. We ignore ORDER BY for an ordered-set agg,
4182 * however; the agg's transfn and finalfn are responsible for that.
4183 *
4184 * When the planner has set the aggpresorted flag, the input to the
4185 * aggregate is already correctly sorted. For ORDER BY aggregates we can
4186 * simply treat these as normal aggregates. For presorted DISTINCT
4187 * aggregates an extra step must be added to remove duplicate consecutive
4188 * inputs.
4189 *
4190 * Note that by construction, if there is a DISTINCT clause then the ORDER
4191 * BY clause is a prefix of it (see transformDistinctClause).
4192 */
4193 if (AGGKIND_IS_ORDERED_SET(aggref->aggkind))
4194 {
4195 sortlist = NIL;
4196 numSortCols = numDistinctCols = 0;
4197 pertrans->aggsortrequired = false;
4198 }
4199 else if (aggref->aggpresorted && aggref->aggdistinct == NIL)
4200 {
4201 sortlist = NIL;
4202 numSortCols = numDistinctCols = 0;
4203 pertrans->aggsortrequired = false;
4204 }
4205 else if (aggref->aggdistinct)
4206 {
4207 sortlist = aggref->aggdistinct;
4208 numSortCols = numDistinctCols = list_length(sortlist);
4209 Assert(numSortCols >= list_length(aggref->aggorder));
4210 pertrans->aggsortrequired = !aggref->aggpresorted;
4211 }
4212 else
4213 {
4214 sortlist = aggref->aggorder;
4215 numSortCols = list_length(sortlist);
4216 numDistinctCols = 0;
4217 pertrans->aggsortrequired = (numSortCols > 0);
4218 }
4219
4220 pertrans->numSortCols = numSortCols;
4221 pertrans->numDistinctCols = numDistinctCols;
4222
4223 /*
4224 * If we have either sorting or filtering to do, create a tupledesc and
4225 * slot corresponding to the aggregated inputs (including sort
4226 * expressions) of the agg.
4227 */
4228 if (numSortCols > 0 || aggref->aggfilter)
4229 {
4230 pertrans->sortdesc = ExecTypeFromTL(aggref->args);
4231 pertrans->sortslot =
4232 ExecInitExtraTupleSlot(estate, pertrans->sortdesc,
4234 }
4235
4236 if (numSortCols > 0)
4237 {
4238 /*
4239 * We don't implement DISTINCT or ORDER BY aggs in the HASHED case
4240 * (yet)
4241 */
4242 Assert(aggstate->aggstrategy != AGG_HASHED && aggstate->aggstrategy != AGG_MIXED);
4243
4244 /* ORDER BY aggregates are not supported with partial aggregation */
4245 Assert(!DO_AGGSPLIT_COMBINE(aggstate->aggsplit));
4246
4247 /* If we have only one input, we need its len/byval info. */
4248 if (numInputs == 1)
4249 {
4250 get_typlenbyval(inputTypes[numDirectArgs],
4251 &pertrans->inputtypeLen,
4252 &pertrans->inputtypeByVal);
4253 }
4254 else if (numDistinctCols > 0)
4255 {
4256 /* we will need an extra slot to store prior values */
4257 pertrans->uniqslot =
4258 ExecInitExtraTupleSlot(estate, pertrans->sortdesc,
4260 }
4261
4262 /* Extract the sort information for use later */
4263 pertrans->sortColIdx =
4264 (AttrNumber *) palloc(numSortCols * sizeof(AttrNumber));
4265 pertrans->sortOperators =
4266 (Oid *) palloc(numSortCols * sizeof(Oid));
4267 pertrans->sortCollations =
4268 (Oid *) palloc(numSortCols * sizeof(Oid));
4269 pertrans->sortNullsFirst =
4270 (bool *) palloc(numSortCols * sizeof(bool));
4271
4272 i = 0;
4273 foreach(lc, sortlist)
4274 {
4275 SortGroupClause *sortcl = (SortGroupClause *) lfirst(lc);
4276 TargetEntry *tle = get_sortgroupclause_tle(sortcl, aggref->args);
4277
4278 /* the parser should have made sure of this */
4279 Assert(OidIsValid(sortcl->sortop));
4280
4281 pertrans->sortColIdx[i] = tle->resno;
4282 pertrans->sortOperators[i] = sortcl->sortop;
4283 pertrans->sortCollations[i] = exprCollation((Node *) tle->expr);
4284 pertrans->sortNullsFirst[i] = sortcl->nulls_first;
4285 i++;
4286 }
4287 Assert(i == numSortCols);
4288 }
4289
4290 if (aggref->aggdistinct)
4291 {
4292 Oid *ops;
4293
4294 Assert(numArguments > 0);
4295 Assert(list_length(aggref->aggdistinct) == numDistinctCols);
4296
4297 ops = palloc(numDistinctCols * sizeof(Oid));
4298
4299 i = 0;
4300 foreach(lc, aggref->aggdistinct)
4301 ops[i++] = ((SortGroupClause *) lfirst(lc))->eqop;
4302
4303 /* lookup / build the necessary comparators */
4304 if (numDistinctCols == 1)
4305 fmgr_info(get_opcode(ops[0]), &pertrans->equalfnOne);
4306 else
4307 pertrans->equalfnMulti =
4309 numDistinctCols,
4310 pertrans->sortColIdx,
4311 ops,
4312 pertrans->sortCollations,
4313 &aggstate->ss.ps);
4314 pfree(ops);
4315 }
4316
4317 pertrans->sortstates = (Tuplesortstate **)
4318 palloc0(sizeof(Tuplesortstate *) * numGroupingSets);
4319}
4320
4321
4322static Datum
4323GetAggInitVal(Datum textInitVal, Oid transtype)
4324{
4325 Oid typinput,
4326 typioparam;
4327 char *strInitVal;
4328 Datum initVal;
4329
4330 getTypeInputInfo(transtype, &typinput, &typioparam);
4331 strInitVal = TextDatumGetCString(textInitVal);
4332 initVal = OidInputFunctionCall(typinput, strInitVal,
4333 typioparam, -1);
4334 pfree(strInitVal);
4335 return initVal;
4336}
4337
4338void
4340{
4342 int transno;
4343 int numGroupingSets = Max(node->maxsets, 1);
4344 int setno;
4345
4346 /*
4347 * When ending a parallel worker, copy the statistics gathered by the
4348 * worker back into shared memory so that it can be picked up by the main
4349 * process to report in EXPLAIN ANALYZE.
4350 */
4351 if (node->shared_info && IsParallelWorker())
4352 {
4354
4355 Assert(ParallelWorkerNumber <= node->shared_info->num_workers);
4358 si->hash_disk_used = node->hash_disk_used;
4359 si->hash_mem_peak = node->hash_mem_peak;
4360 }
4361
4362 /* Make sure we have closed any open tuplesorts */
4363
4364 if (node->sort_in)
4365 tuplesort_end(node->sort_in);
4366 if (node->sort_out)
4367 tuplesort_end(node->sort_out);
4368
4370
4371 if (node->hash_metacxt != NULL)
4372 {
4374 node->hash_metacxt = NULL;
4375 }
4376
4377 for (transno = 0; transno < node->numtrans; transno++)
4378 {
4379 AggStatePerTrans pertrans = &node->pertrans[transno];
4380
4381 for (setno = 0; setno < numGroupingSets; setno++)
4382 {
4383 if (pertrans->sortstates[setno])
4384 tuplesort_end(pertrans->sortstates[setno]);
4385 }
4386 }
4387
4388 /* And ensure any agg shutdown callbacks have been called */
4389 for (setno = 0; setno < numGroupingSets; setno++)
4390 ReScanExprContext(node->aggcontexts[setno]);
4391 if (node->hashcontext)
4393
4394 outerPlan = outerPlanState(node);
4396}
4397
4398void
4400{
4401 ExprContext *econtext = node->ss.ps.ps_ExprContext;
4403 Agg *aggnode = (Agg *) node->ss.ps.plan;
4404 int transno;
4405 int numGroupingSets = Max(node->maxsets, 1);
4406 int setno;
4407
4408 node->agg_done = false;
4409
4410 if (node->aggstrategy == AGG_HASHED)
4411 {
4412 /*
4413 * In the hashed case, if we haven't yet built the hash table then we
4414 * can just return; nothing done yet, so nothing to undo. If subnode's
4415 * chgParam is not NULL then it will be re-scanned by ExecProcNode,
4416 * else no reason to re-scan it at all.
4417 */
4418 if (!node->table_filled)
4419 return;
4420
4421 /*
4422 * If we do have the hash table, and it never spilled, and the subplan
4423 * does not have any parameter changes, and none of our own parameter
4424 * changes affect input expressions of the aggregated functions, then
4425 * we can just rescan the existing hash table; no need to build it
4426 * again.
4427 */
4428 if (outerPlan->chgParam == NULL && !node->hash_ever_spilled &&
4429 !bms_overlap(node->ss.ps.chgParam, aggnode->aggParams))
4430 {
4432 &node->perhash[0].hashiter);
4433 select_current_set(node, 0, true);
4434 return;
4435 }
4436 }
4437
4438 /* Make sure we have closed any open tuplesorts */
4439 for (transno = 0; transno < node->numtrans; transno++)
4440 {
4441 for (setno = 0; setno < numGroupingSets; setno++)
4442 {
4443 AggStatePerTrans pertrans = &node->pertrans[transno];
4444
4445 if (pertrans->sortstates[setno])
4446 {
4447 tuplesort_end(pertrans->sortstates[setno]);
4448 pertrans->sortstates[setno] = NULL;
4449 }
4450 }
4451 }
4452
4453 /*
4454 * We don't need to ReScanExprContext the output tuple context here;
4455 * ExecReScan already did it. But we do need to reset our per-grouping-set
4456 * contexts, which may have transvalues stored in them. (We use rescan
4457 * rather than just reset because transfns may have registered callbacks
4458 * that need to be run now.) For the AGG_HASHED case, see below.
4459 */
4460
4461 for (setno = 0; setno < numGroupingSets; setno++)
4462 {
4463 ReScanExprContext(node->aggcontexts[setno]);
4464 }
4465
4466 /* Release first tuple of group, if we have made a copy */
4467 if (node->grp_firstTuple != NULL)
4468 {
4470 node->grp_firstTuple = NULL;
4471 }
4473
4474 /* Forget current agg values */
4475 MemSet(econtext->ecxt_aggvalues, 0, sizeof(Datum) * node->numaggs);
4476 MemSet(econtext->ecxt_aggnulls, 0, sizeof(bool) * node->numaggs);
4477
4478 /*
4479 * With AGG_HASHED/MIXED, the hash table is allocated in a sub-context of
4480 * the hashcontext. This used to be an issue, but now, resetting a context
4481 * automatically deletes sub-contexts too.
4482 */
4483 if (node->aggstrategy == AGG_HASHED || node->aggstrategy == AGG_MIXED)
4484 {
4486
4487 node->hash_ever_spilled = false;
4488 node->hash_spill_mode = false;
4489 node->hash_ngroups_current = 0;
4490
4492 /* Rebuild an empty hash table */
4493 build_hash_tables(node);
4494 node->table_filled = false;
4495 /* iterator will be reset when the table is filled */
4496
4497 hashagg_recompile_expressions(node, false, false);
4498 }
4499
4500 if (node->aggstrategy != AGG_HASHED)
4501 {
4502 /*
4503 * Reset the per-group state (in particular, mark transvalues null)
4504 */
4505 for (setno = 0; setno < numGroupingSets; setno++)
4506 {
4507 MemSet(node->pergroups[setno], 0,
4508 sizeof(AggStatePerGroupData) * node->numaggs);
4509 }
4510
4511 /* reset to phase 1 */
4512 initialize_phase(node, 1);
4513
4514 node->input_done = false;
4515 node->projected_set = -1;
4516 }
4517
4518 if (outerPlan->chgParam == NULL)
4520}
4521
4522
4523/***********************************************************************
4524 * API exposed to aggregate functions
4525 ***********************************************************************/
4526
4527
4528/*
4529 * AggCheckCallContext - test if a SQL function is being called as an aggregate
4530 *
4531 * The transition and/or final functions of an aggregate may want to verify
4532 * that they are being called as aggregates, rather than as plain SQL
4533 * functions. They should use this function to do so. The return value
4534 * is nonzero if being called as an aggregate, or zero if not. (Specific
4535 * nonzero values are AGG_CONTEXT_AGGREGATE or AGG_CONTEXT_WINDOW, but more
4536 * values could conceivably appear in future.)
4537 *
4538 * If aggcontext isn't NULL, the function also stores at *aggcontext the
4539 * identity of the memory context that aggregate transition values are being
4540 * stored in. Note that the same aggregate call site (flinfo) may be called
4541 * interleaved on different transition values in different contexts, so it's
4542 * not kosher to cache aggcontext under fn_extra. It is, however, kosher to
4543 * cache it in the transvalue itself (for internal-type transvalues).
4544 */
4545int
4547{
4548 if (fcinfo->context && IsA(fcinfo->context, AggState))
4549 {
4550 if (aggcontext)
4551 {
4552 AggState *aggstate = ((AggState *) fcinfo->context);
4553 ExprContext *cxt = aggstate->curaggcontext;
4554
4555 *aggcontext = cxt->ecxt_per_tuple_memory;
4556 }
4557 return AGG_CONTEXT_AGGREGATE;
4558 }
4559 if (fcinfo->context && IsA(fcinfo->context, WindowAggState))
4560 {
4561 if (aggcontext)
4562 *aggcontext = ((WindowAggState *) fcinfo->context)->curaggcontext;
4563 return AGG_CONTEXT_WINDOW;
4564 }
4565
4566 /* this is just to prevent "uninitialized variable" warnings */
4567 if (aggcontext)
4568 *aggcontext = NULL;
4569 return 0;
4570}
4571
4572/*
4573 * AggGetAggref - allow an aggregate support function to get its Aggref
4574 *
4575 * If the function is being called as an aggregate support function,
4576 * return the Aggref node for the aggregate call. Otherwise, return NULL.
4577 *
4578 * Aggregates sharing the same inputs and transition functions can get
4579 * merged into a single transition calculation. If the transition function
4580 * calls AggGetAggref, it will get some one of the Aggrefs for which it is
4581 * executing. It must therefore not pay attention to the Aggref fields that
4582 * relate to the final function, as those are indeterminate. But if a final
4583 * function calls AggGetAggref, it will get a precise result.
4584 *
4585 * Note that if an aggregate is being used as a window function, this will
4586 * return NULL. We could provide a similar function to return the relevant
4587 * WindowFunc node in such cases, but it's not needed yet.
4588 */
4589Aggref *
4591{
4592 if (fcinfo->context && IsA(fcinfo->context, AggState))
4593 {
4594 AggState *aggstate = (AggState *) fcinfo->context;
4595 AggStatePerAgg curperagg;
4596 AggStatePerTrans curpertrans;
4597
4598 /* check curperagg (valid when in a final function) */
4599 curperagg = aggstate->curperagg;
4600
4601 if (curperagg)
4602 return curperagg->aggref;
4603
4604 /* check curpertrans (valid when in a transition function) */
4605 curpertrans = aggstate->curpertrans;
4606
4607 if (curpertrans)
4608 return curpertrans->aggref;
4609 }
4610 return NULL;
4611}
4612
4613/*
4614 * AggGetTempMemoryContext - fetch short-term memory context for aggregates
4615 *
4616 * This is useful in agg final functions; the context returned is one that
4617 * the final function can safely reset as desired. This isn't useful for
4618 * transition functions, since the context returned MAY (we don't promise)
4619 * be the same as the context those are called in.
4620 *
4621 * As above, this is currently not useful for aggs called as window functions.
4622 */
4625{
4626 if (fcinfo->context && IsA(fcinfo->context, AggState))
4627 {
4628 AggState *aggstate = (AggState *) fcinfo->context;
4629
4630 return aggstate->tmpcontext->ecxt_per_tuple_memory;
4631 }
4632 return NULL;
4633}
4634
4635/*
4636 * AggStateIsShared - find out whether transition state is shared
4637 *
4638 * If the function is being called as an aggregate support function,
4639 * return true if the aggregate's transition state is shared across
4640 * multiple aggregates, false if it is not.
4641 *
4642 * Returns true if not called as an aggregate support function.
4643 * This is intended as a conservative answer, ie "no you'd better not
4644 * scribble on your input". In particular, will return true if the
4645 * aggregate is being used as a window function, which is a scenario
4646 * in which changing the transition state is a bad idea. We might
4647 * want to refine the behavior for the window case in future.
4648 */
4649bool
4651{
4652 if (fcinfo->context && IsA(fcinfo->context, AggState))
4653 {
4654 AggState *aggstate = (AggState *) fcinfo->context;
4655 AggStatePerAgg curperagg;
4656 AggStatePerTrans curpertrans;
4657
4658 /* check curperagg (valid when in a final function) */
4659 curperagg = aggstate->curperagg;
4660
4661 if (curperagg)
4662 return aggstate->pertrans[curperagg->transno].aggshared;
4663
4664 /* check curpertrans (valid when in a transition function) */
4665 curpertrans = aggstate->curpertrans;
4666
4667 if (curpertrans)
4668 return curpertrans->aggshared;
4669 }
4670 return true;
4671}
4672
4673/*
4674 * AggRegisterCallback - register a cleanup callback for an aggregate
4675 *
4676 * This is useful for aggs to register shutdown callbacks, which will ensure
4677 * that non-memory resources are freed. The callback will occur just before
4678 * the associated aggcontext (as returned by AggCheckCallContext) is reset,
4679 * either between groups or as a result of rescanning the query. The callback
4680 * will NOT be called on error paths. The typical use-case is for freeing of
4681 * tuplestores or tuplesorts maintained in aggcontext, or pins held by slots
4682 * created by the agg functions. (The callback will not be called until after
4683 * the result of the finalfn is no longer needed, so it's safe for the finalfn
4684 * to return data that will be freed by the callback.)
4685 *
4686 * As above, this is currently not useful for aggs called as window functions.
4687 */
4688void
4691 Datum arg)
4692{
4693 if (fcinfo->context && IsA(fcinfo->context, AggState))
4694 {
4695 AggState *aggstate = (AggState *) fcinfo->context;
4696 ExprContext *cxt = aggstate->curaggcontext;
4697
4698 RegisterExprContextCallback(cxt, func, arg);
4699
4700 return;
4701 }
4702 elog(ERROR, "aggregate function cannot register a callback in this context");
4703}
4704
4705
4706/* ----------------------------------------------------------------
4707 * Parallel Query Support
4708 * ----------------------------------------------------------------
4709 */
4710
4711 /* ----------------------------------------------------------------
4712 * ExecAggEstimate
4713 *
4714 * Estimate space required to propagate aggregate statistics.
4715 * ----------------------------------------------------------------
4716 */
4717void
4719{
4720 Size size;
4721
4722 /* don't need this if not instrumenting or no workers */
4723 if (!node->ss.ps.instrument || pcxt->nworkers == 0)
4724 return;
4725
4727 size = add_size(size, offsetof(SharedAggInfo, sinstrument));
4730}
4731
4732/* ----------------------------------------------------------------
4733 * ExecAggInitializeDSM
4734 *
4735 * Initialize DSM space for aggregate statistics.
4736 * ----------------------------------------------------------------
4737 */
4738void
4740{
4741 Size size;
4742
4743 /* don't need this if not instrumenting or no workers */
4744 if (!node->ss.ps.instrument || pcxt->nworkers == 0)
4745 return;
4746
4747 size = offsetof(SharedAggInfo, sinstrument)
4748 + pcxt->nworkers * sizeof(AggregateInstrumentation);
4749 node->shared_info = shm_toc_allocate(pcxt->toc, size);
4750 /* ensure any unfilled slots will contain zeroes */
4751 memset(node->shared_info, 0, size);
4752 node->shared_info->num_workers = pcxt->nworkers;
4753 shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id,
4754 node->shared_info);
4755}
4756
4757/* ----------------------------------------------------------------
4758 * ExecAggInitializeWorker
4759 *
4760 * Attach worker to DSM space for aggregate statistics.
4761 * ----------------------------------------------------------------
4762 */
4763void
4765{
4766 node->shared_info =
4767 shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true);
4768}
4769
4770/* ----------------------------------------------------------------
4771 * ExecAggRetrieveInstrumentation
4772 *
4773 * Transfer aggregate statistics from DSM to private memory.
4774 * ----------------------------------------------------------------
4775 */
4776void
4778{
4779 Size size;
4780 SharedAggInfo *si;
4781
4782 if (node->shared_info == NULL)
4783 return;
4784
4785 size = offsetof(SharedAggInfo, sinstrument)
4787 si = palloc(size);
4788 memcpy(si, node->shared_info, size);
4789 node->shared_info = si;
4790}
AclResult
Definition: acl.h:182
@ ACLCHECK_OK
Definition: acl.h:183
void aclcheck_error(AclResult aclerr, ObjectType objtype, const char *objectname)
Definition: aclchk.c:2622
AclResult object_aclcheck(Oid classid, Oid objectid, Oid roleid, AclMode mode)
Definition: aclchk.c:3804
int16 AttrNumber
Definition: attnum.h:21
int ParallelWorkerNumber
Definition: parallel.c:114
int bms_next_member(const Bitmapset *a, int prevbit)
Definition: bitmapset.c:1306
Bitmapset * bms_del_member(Bitmapset *a, int x)
Definition: bitmapset.c:868
void bms_free(Bitmapset *a)
Definition: bitmapset.c:239
int bms_num_members(const Bitmapset *a)
Definition: bitmapset.c:751
bool bms_is_member(int x, const Bitmapset *a)
Definition: bitmapset.c:510
Bitmapset * bms_add_member(Bitmapset *a, int x)
Definition: bitmapset.c:815
Bitmapset * bms_add_members(Bitmapset *a, const Bitmapset *b)
Definition: bitmapset.c:917
Bitmapset * bms_union(const Bitmapset *a, const Bitmapset *b)
Definition: bitmapset.c:251
bool bms_overlap(const Bitmapset *a, const Bitmapset *b)
Definition: bitmapset.c:582
Bitmapset * bms_copy(const Bitmapset *a)
Definition: bitmapset.c:122
#define TextDatumGetCString(d)
Definition: builtins.h:98
#define MAXALIGN(LEN)
Definition: c.h:768
#define Max(x, y)
Definition: c.h:955
#define Assert(condition)
Definition: c.h:815
int64_t int64
Definition: c.h:485
uint64_t uint64
Definition: c.h:489
uint32_t uint32
Definition: c.h:488
#define MemSet(start, val, len)
Definition: c.h:977
#define OidIsValid(objectId)
Definition: c.h:732
size_t Size
Definition: c.h:562
Datum datumCopy(Datum value, bool typByVal, int typLen)
Definition: datum.c:132
int my_log2(long num)
Definition: dynahash.c:1794
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1157
int errcode_for_file_access(void)
Definition: elog.c:876
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
void ExecReScan(PlanState *node)
Definition: execAmi.c:76
Datum ExecAggCopyTransValue(AggState *aggstate, AggStatePerTrans pertrans, Datum newValue, bool newValueIsNull, Datum oldValue, bool oldValueIsNull)
ExprState * ExecInitQual(List *qual, PlanState *parent)
Definition: execExpr.c:229
List * ExecInitExprList(List *nodes, PlanState *parent)
Definition: execExpr.c:335
ExprState * ExecBuildAggTrans(AggState *aggstate, AggStatePerPhase phase, bool doSort, bool doHash, bool nullcheck)
Definition: execExpr.c:3669
ExprState * execTuplesMatchPrepare(TupleDesc desc, int numCols, const AttrNumber *keyColIdx, const Oid *eqOperators, const Oid *collations, PlanState *parent)
Definition: execGrouping.c:58
void execTuplesHashPrepare(int numCols, const Oid *eqOperators, Oid **eqFuncOids, FmgrInfo **hashFunctions)
Definition: execGrouping.c:97
TupleHashEntry LookupTupleHashEntryHash(TupleHashTable hashtable, TupleTableSlot *slot, bool *isnew, uint32 hash)
Definition: execGrouping.c:347
TupleHashEntry LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, bool *isnew, uint32 *hash)
Definition: execGrouping.c:292
TupleHashTable BuildTupleHashTable(PlanState *parent, TupleDesc inputDesc, const TupleTableSlotOps *inputOps, int numCols, AttrNumber *keyColIdx, const Oid *eqfuncoids, FmgrInfo *hashfunctions, Oid *collations, long nbuckets, Size additionalsize, MemoryContext metacxt, MemoryContext tablecxt, MemoryContext tempcxt, bool use_variable_hash_iv)
Definition: execGrouping.c:161
void ResetTupleHashTable(TupleHashTable hashtable)
Definition: execGrouping.c:271
void ExecEndNode(PlanState *node)
Definition: execProcnode.c:562
PlanState * ExecInitNode(Plan *node, EState *estate, int eflags)
Definition: execProcnode.c:142
const TupleTableSlotOps TTSOpsVirtual
Definition: execTuples.c:84
TupleTableSlot * ExecStoreVirtualTuple(TupleTableSlot *slot)
Definition: execTuples.c:1739
TupleTableSlot * ExecAllocTableSlot(List **tupleTable, TupleDesc desc, const TupleTableSlotOps *tts_ops)
Definition: execTuples.c:1358
MinimalTuple ExecFetchSlotMinimalTuple(TupleTableSlot *slot, bool *shouldFree)
Definition: execTuples.c:1879
TupleTableSlot * ExecStoreMinimalTuple(MinimalTuple mtup, TupleTableSlot *slot, bool shouldFree)
Definition: execTuples.c:1633
TupleTableSlot * ExecInitExtraTupleSlot(EState *estate, TupleDesc tupledesc, const TupleTableSlotOps *tts_ops)
Definition: execTuples.c:2018
void ExecInitResultTupleSlotTL(PlanState *planstate, const TupleTableSlotOps *tts_ops)
Definition: execTuples.c:1986
const TupleTableSlotOps TTSOpsMinimalTuple
Definition: execTuples.c:86
TupleTableSlot * ExecStoreAllNullTuple(TupleTableSlot *slot)
Definition: execTuples.c:1763
TupleDesc ExecTypeFromTL(List *targetList)
Definition: execTuples.c:2125
void ExecForceStoreHeapTuple(HeapTuple tuple, TupleTableSlot *slot, bool shouldFree)
Definition: execTuples.c:1656
TupleDesc ExecGetResultType(PlanState *planstate)
Definition: execUtils.c:496
void ReScanExprContext(ExprContext *econtext)
Definition: execUtils.c:444
void ExecCreateScanSlotFromOuterPlan(EState *estate, ScanState *scanstate, const TupleTableSlotOps *tts_ops)
Definition: execUtils.c:705
void ExecAssignExprContext(EState *estate, PlanState *planstate)
Definition: execUtils.c:486
void ExecAssignProjectionInfo(PlanState *planstate, TupleDesc inputDesc)
Definition: execUtils.c:584
void RegisterExprContextCallback(ExprContext *econtext, ExprContextCallbackFunction function, Datum arg)
Definition: execUtils.c:953
ExprContext * CreateWorkExprContext(EState *estate)
Definition: execUtils.c:322
const TupleTableSlotOps * ExecGetResultSlotOps(PlanState *planstate, bool *isfixed)
Definition: execUtils.c:505
void(* ExprContextCallbackFunction)(Datum arg)
Definition: execnodes.h:229
#define InstrCountFiltered1(node, delta)
Definition: execnodes.h:1254
#define outerPlanState(node)
Definition: execnodes.h:1246
#define ScanTupleHashTable(htable, iter)
Definition: execnodes.h:884
#define ResetTupleHashIterator(htable, iter)
Definition: execnodes.h:882
struct AggStatePerGroupData * AggStatePerGroup
Definition: execnodes.h:2518
struct AggStatePerTransData * AggStatePerTrans
Definition: execnodes.h:2517
struct TupleHashEntryData TupleHashEntryData
struct AggregateInstrumentation AggregateInstrumentation
struct AggStatePerAggData * AggStatePerAgg
Definition: execnodes.h:2516
#define EXEC_FLAG_BACKWARD
Definition: executor.h:68
static TupleTableSlot * ExecProject(ProjectionInfo *projInfo)
Definition: executor.h:389
#define ResetExprContext(econtext)
Definition: executor.h:557
static bool ExecQual(ExprState *state, ExprContext *econtext)
Definition: executor.h:426
static bool ExecQualAndReset(ExprState *state, ExprContext *econtext)
Definition: executor.h:453
static TupleTableSlot * ExecProcNode(PlanState *node)
Definition: executor.h:267
static Datum ExecEvalExpr(ExprState *state, ExprContext *econtext, bool *isNull)
Definition: executor.h:346
static Datum ExecEvalExprSwitchContext(ExprState *state, ExprContext *econtext, bool *isNull)
Definition: executor.h:361
#define EXEC_FLAG_EXPLAIN_ONLY
Definition: executor.h:65
#define EXEC_FLAG_MARK
Definition: executor.h:69
#define MakeExpandedObjectReadOnly(d, isnull, typlen)
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1149
void fmgr_info(Oid functionId, FmgrInfo *finfo)
Definition: fmgr.c:127
Datum OidInputFunctionCall(Oid functionId, char *str, Oid typioparam, int32 typmod)
Definition: fmgr.c:1754
#define SizeForFunctionCallInfo(nargs)
Definition: fmgr.h:102
#define InitFunctionCallInfoData(Fcinfo, Flinfo, Nargs, Collation, Context, Resultinfo)
Definition: fmgr.h:150
#define AGG_CONTEXT_WINDOW
Definition: fmgr.h:761
#define LOCAL_FCINFO(name, nargs)
Definition: fmgr.h:110
#define AGG_CONTEXT_AGGREGATE
Definition: fmgr.h:760
struct FunctionCallInfoBaseData * FunctionCallInfo
Definition: fmgr.h:38
#define FunctionCallInvoke(fcinfo)
Definition: fmgr.h:172
#define fmgr_info_set_expr(expr, finfo)
Definition: fmgr.h:135
char * format_type_be(Oid type_oid)
Definition: format_type.c:343
int work_mem
Definition: globals.c:130
uint32 hash_bytes_uint32(uint32 k)
Definition: hashfn.c:610
for(;;)
void heap_freetuple(HeapTuple htup)
Definition: heaptuple.c:1435
MinimalTupleData * MinimalTuple
Definition: htup.h:27
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
#define SizeofMinimalTupleHeader
Definition: htup_details.h:699
static void * GETSTRUCT(const HeapTupleData *tuple)
Definition: htup_details.h:728
void initHyperLogLog(hyperLogLogState *cState, uint8 bwidth)
Definition: hyperloglog.c:66
double estimateHyperLogLog(hyperLogLogState *cState)
Definition: hyperloglog.c:186
void addHyperLogLog(hyperLogLogState *cState, uint32 hash)
Definition: hyperloglog.c:167
void freeHyperLogLog(hyperLogLogState *cState)
Definition: hyperloglog.c:151
#define IsParallelWorker()
Definition: parallel.h:60
static int initValue(long lng_val)
Definition: informix.c:702
#define INJECTION_POINT(name)
#define INJECTION_POINT_CACHED(name)
#define IS_INJECTION_POINT_ATTACHED(name)
int j
Definition: isn.c:73
int i
Definition: isn.c:72
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:76
List * lappend(List *list, void *datum)
Definition: list.c:339
List * lcons_int(int datum, List *list)
Definition: list.c:513
List * list_delete_last(List *list)
Definition: list.c:957
void list_free(List *list)
Definition: list.c:1546
void list_free_deep(List *list)
Definition: list.c:1560
void LogicalTapeRewindForRead(LogicalTape *lt, size_t buffer_size)
Definition: logtape.c:846
size_t LogicalTapeRead(LogicalTape *lt, void *ptr, size_t size)
Definition: logtape.c:928
int64 LogicalTapeSetBlocks(LogicalTapeSet *lts)
Definition: logtape.c:1181
void LogicalTapeClose(LogicalTape *lt)
Definition: logtape.c:733
void LogicalTapeSetClose(LogicalTapeSet *lts)
Definition: logtape.c:667
LogicalTapeSet * LogicalTapeSetCreate(bool preallocate, SharedFileSet *fileset, int worker)
Definition: logtape.c:556
void LogicalTapeWrite(LogicalTape *lt, const void *ptr, size_t size)
Definition: logtape.c:761
LogicalTape * LogicalTapeCreate(LogicalTapeSet *lts)
Definition: logtape.c:680
void get_typlenbyval(Oid typid, int16 *typlen, bool *typbyval)
Definition: lsyscache.c:2278
RegProcedure get_opcode(Oid opno)
Definition: lsyscache.c:1312
void getTypeInputInfo(Oid type, Oid *typInput, Oid *typIOParam)
Definition: lsyscache.c:2901
char * get_func_name(Oid funcid)
Definition: lsyscache.c:1635
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1181
void MemoryContextReset(MemoryContext context)
Definition: mcxt.c:383
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc0(Size size)
Definition: mcxt.c:1347
void * palloc(Size size)
Definition: mcxt.c:1317
Size MemoryContextMemAllocated(MemoryContext context, bool recurse)
Definition: mcxt.c:762
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:454
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:160
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
Oid GetUserId(void)
Definition: miscinit.c:517
static void hashagg_finish_initial_spills(AggState *aggstate)
Definition: nodeAgg.c:3101
static long hash_choose_num_buckets(double hashentrysize, long ngroups, Size memory)
Definition: nodeAgg.c:1991
static void hash_agg_check_limits(AggState *aggstate)
Definition: nodeAgg.c:1865
static void initialize_hash_entry(AggState *aggstate, TupleHashTable hashtable, TupleHashEntry entry)
Definition: nodeAgg.c:2070
static void find_hash_columns(AggState *aggstate)
Definition: nodeAgg.c:1572
static bool agg_refill_hash_table(AggState *aggstate)
Definition: nodeAgg.c:2619
static void build_hash_table(AggState *aggstate, int setno, long nbuckets)
Definition: nodeAgg.c:1511
void ExecAggEstimate(AggState *node, ParallelContext *pcxt)
Definition: nodeAgg.c:4718
struct FindColsContext FindColsContext
static void hash_agg_enter_spill_mode(AggState *aggstate)
Definition: nodeAgg.c:1906
struct HashAggBatch HashAggBatch
static Datum GetAggInitVal(Datum textInitVal, Oid transtype)
Definition: nodeAgg.c:4323
static void find_cols(AggState *aggstate, Bitmapset **aggregated, Bitmapset **unaggregated)
Definition: nodeAgg.c:1398
void AggRegisterCallback(FunctionCallInfo fcinfo, ExprContextCallbackFunction func, Datum arg)
Definition: nodeAgg.c:4689
#define HASHAGG_HLL_BIT_WIDTH
Definition: nodeAgg.c:316
static void agg_fill_hash_table(AggState *aggstate)
Definition: nodeAgg.c:2565
Aggref * AggGetAggref(FunctionCallInfo fcinfo)
Definition: nodeAgg.c:4590
static void initialize_aggregate(AggState *aggstate, AggStatePerTrans pertrans, AggStatePerGroup pergroupstate)
Definition: nodeAgg.c:579
static TupleTableSlot * fetch_input_tuple(AggState *aggstate)
Definition: nodeAgg.c:548
static void hashagg_spill_finish(AggState *aggstate, HashAggSpill *spill, int setno)
Definition: nodeAgg.c:3135
static bool find_cols_walker(Node *node, FindColsContext *context)
Definition: nodeAgg.c:1421
void ExecAggInitializeWorker(AggState *node, ParallelWorkerContext *pwcxt)
Definition: nodeAgg.c:4764
void ExecAggRetrieveInstrumentation(AggState *node)
Definition: nodeAgg.c:4777
static TupleTableSlot * project_aggregates(AggState *aggstate)
Definition: nodeAgg.c:1372
static MinimalTuple hashagg_batch_read(HashAggBatch *batch, uint32 *hashp)
Definition: nodeAgg.c:3052
struct HashAggSpill HashAggSpill
static void process_ordered_aggregate_multi(AggState *aggstate, AggStatePerTrans pertrans, AggStatePerGroup pergroupstate)
Definition: nodeAgg.c:950
void ExecReScanAgg(AggState *node)
Definition: nodeAgg.c:4399
int AggCheckCallContext(FunctionCallInfo fcinfo, MemoryContext *aggcontext)
Definition: nodeAgg.c:4546
static void advance_transition_function(AggState *aggstate, AggStatePerTrans pertrans, AggStatePerGroup pergroupstate)
Definition: nodeAgg.c:707
static void hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
Definition: nodeAgg.c:1942
static void finalize_aggregates(AggState *aggstate, AggStatePerAgg peraggs, AggStatePerGroup pergroup)
Definition: nodeAgg.c:1295
static void initialize_phase(AggState *aggstate, int newphase)
Definition: nodeAgg.c:478
Size hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace)
Definition: nodeAgg.c:1703
static void initialize_aggregates(AggState *aggstate, AggStatePerGroup *pergroups, int numReset)
Definition: nodeAgg.c:666
static TupleTableSlot * agg_retrieve_hash_table_in_memory(AggState *aggstate)
Definition: nodeAgg.c:2797
void ExecAggInitializeDSM(AggState *node, ParallelContext *pcxt)
Definition: nodeAgg.c:4739
static void finalize_aggregate(AggState *aggstate, AggStatePerAgg peragg, AggStatePerGroup pergroupstate, Datum *resultVal, bool *resultIsNull)
Definition: nodeAgg.c:1047
#define HASHAGG_MAX_PARTITIONS
Definition: nodeAgg.c:299
static void lookup_hash_entries(AggState *aggstate)
Definition: nodeAgg.c:2120
static TupleTableSlot * agg_retrieve_direct(AggState *aggstate)
Definition: nodeAgg.c:2219
static void hashagg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck)
Definition: nodeAgg.c:1750
static void prepare_projection_slot(AggState *aggstate, TupleTableSlot *slot, int currentSet)
Definition: nodeAgg.c:1250
bool AggStateIsShared(FunctionCallInfo fcinfo)
Definition: nodeAgg.c:4650
static void build_pertrans_for_aggref(AggStatePerTrans pertrans, AggState *aggstate, EState *estate, Aggref *aggref, Oid transfn_oid, Oid aggtranstype, Oid aggserialfn, Oid aggdeserialfn, Datum initValue, bool initValueIsNull, Oid *inputTypes, int numArguments)
Definition: nodeAgg.c:4073
#define CHUNKHDRSZ
Definition: nodeAgg.c:321
static TupleTableSlot * agg_retrieve_hash_table(AggState *aggstate)
Definition: nodeAgg.c:2772
static void process_ordered_aggregate_single(AggState *aggstate, AggStatePerTrans pertrans, AggStatePerGroup pergroupstate)
Definition: nodeAgg.c:849
static void advance_aggregates(AggState *aggstate)
Definition: nodeAgg.c:817
static TupleTableSlot * ExecAgg(PlanState *pstate)
Definition: nodeAgg.c:2183
static void prepare_hash_slot(AggStatePerHash perhash, TupleTableSlot *inputslot, TupleTableSlot *hashslot)
Definition: nodeAgg.c:1205
static void build_hash_tables(AggState *aggstate)
Definition: nodeAgg.c:1468
void ExecEndAgg(AggState *node)
Definition: nodeAgg.c:4339
#define HASHAGG_READ_BUFFER_SIZE
Definition: nodeAgg.c:307
static void hashagg_reset_spill_state(AggState *aggstate)
Definition: nodeAgg.c:3175
static Size hashagg_spill_tuple(AggState *aggstate, HashAggSpill *spill, TupleTableSlot *inputslot, uint32 hash)
Definition: nodeAgg.c:2963
static void select_current_set(AggState *aggstate, int setno, bool is_hash)
Definition: nodeAgg.c:456
static void finalize_partialaggregate(AggState *aggstate, AggStatePerAgg peragg, AggStatePerGroup pergroupstate, Datum *resultVal, bool *resultIsNull)
Definition: nodeAgg.c:1147
AggState * ExecInitAgg(Agg *node, EState *estate, int eflags)
Definition: nodeAgg.c:3215
static void hashagg_spill_init(HashAggSpill *spill, LogicalTapeSet *tapeset, int used_bits, double input_groups, double hashentrysize)
Definition: nodeAgg.c:2920
#define HASHAGG_MIN_PARTITIONS
Definition: nodeAgg.c:298
void hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits, Size *mem_limit, uint64 *ngroups_limit, int *num_partitions)
Definition: nodeAgg.c:1807
MemoryContext AggGetTempMemoryContext(FunctionCallInfo fcinfo)
Definition: nodeAgg.c:4624
#define HASHAGG_PARTITION_FACTOR
Definition: nodeAgg.c:297
static HashAggBatch * hashagg_batch_new(LogicalTape *input_tape, int setno, int64 input_tuples, double input_card, int used_bits)
Definition: nodeAgg.c:3033
#define HASHAGG_WRITE_BUFFER_SIZE
Definition: nodeAgg.c:308
static int hash_choose_num_partitions(double input_groups, double hashentrysize, int used_bits, int *log2_npartitions)
Definition: nodeAgg.c:2016
struct AggStatePerGroupData AggStatePerGroupData
Oid exprCollation(const Node *expr)
Definition: nodeFuncs.c:821
#define expression_tree_walker(n, w, c)
Definition: nodeFuncs.h:153
size_t get_hash_memory_limit(void)
Definition: nodeHash.c:3487
#define DO_AGGSPLIT_SKIPFINAL(as)
Definition: nodes.h:386
#define IsA(nodeptr, _type_)
Definition: nodes.h:158
#define DO_AGGSPLIT_DESERIALIZE(as)
Definition: nodes.h:388
#define DO_AGGSPLIT_COMBINE(as)
Definition: nodes.h:385
@ AGG_SORTED
Definition: nodes.h:355
@ AGG_HASHED
Definition: nodes.h:356
@ AGG_MIXED
Definition: nodes.h:357
@ AGG_PLAIN
Definition: nodes.h:354
#define DO_AGGSPLIT_SERIALIZE(as)
Definition: nodes.h:387
#define makeNode(_type_)
Definition: nodes.h:155
#define castNode(_type_, nodeptr)
Definition: nodes.h:176
#define InvokeFunctionExecuteHook(objectId)
Definition: objectaccess.h:213
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:124
void build_aggregate_finalfn_expr(Oid *agg_input_types, int num_finalfn_inputs, Oid agg_state_type, Oid agg_result_type, Oid agg_input_collation, Oid finalfn_oid, Expr **finalfnexpr)
Definition: parse_agg.c:2225
void build_aggregate_deserialfn_expr(Oid deserialfn_oid, Expr **deserialfnexpr)
Definition: parse_agg.c:2201
void build_aggregate_transfn_expr(Oid *agg_input_types, int agg_num_inputs, int agg_num_direct_inputs, bool agg_variadic, Oid agg_state_type, Oid agg_input_collation, Oid transfn_oid, Oid invtransfn_oid, Expr **transfnexpr, Expr **invtransfnexpr)
Definition: parse_agg.c:2117
int get_aggregate_argtypes(Aggref *aggref, Oid *inputTypes)
Definition: parse_agg.c:1997
void build_aggregate_serialfn_expr(Oid serialfn_oid, Expr **serialfnexpr)
Definition: parse_agg.c:2178
bool IsBinaryCoercible(Oid srctype, Oid targettype)
@ OBJECT_AGGREGATE
Definition: parsenodes.h:2313
@ OBJECT_FUNCTION
Definition: parsenodes.h:2331
#define ACL_EXECUTE
Definition: parsenodes.h:83
FormData_pg_aggregate * Form_pg_aggregate
Definition: pg_aggregate.h:109
int16 attnum
Definition: pg_attribute.h:74
FormData_pg_attribute * Form_pg_attribute
Definition: pg_attribute.h:200
void * arg
#define FUNC_MAX_ARGS
#define lfirst(lc)
Definition: pg_list.h:172
#define llast(l)
Definition: pg_list.h:198
static int list_length(const List *l)
Definition: pg_list.h:152
#define NIL
Definition: pg_list.h:68
#define lfirst_int(lc)
Definition: pg_list.h:173
#define linitial_int(l)
Definition: pg_list.h:179
static void * list_nth(const List *list, int n)
Definition: pg_list.h:299
#define list_nth_node(type, list, n)
Definition: pg_list.h:327
FormData_pg_proc * Form_pg_proc
Definition: pg_proc.h:136
#define outerPlan(node)
Definition: plannodes.h:231
static bool DatumGetBool(Datum X)
Definition: postgres.h:95
uintptr_t Datum
Definition: postgres.h:69
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:257
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:317
#define InvalidOid
Definition: postgres_ext.h:37
unsigned int Oid
Definition: postgres_ext.h:32
#define OUTER_VAR
Definition: primnodes.h:243
static unsigned hash(unsigned *uv, int n)
Definition: rege_dfa.c:715
void * shm_toc_allocate(shm_toc *toc, Size nbytes)
Definition: shm_toc.c:88
void shm_toc_insert(shm_toc *toc, uint64 key, void *address)
Definition: shm_toc.c:171
void * shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
Definition: shm_toc.c:232
#define shm_toc_estimate_chunk(e, sz)
Definition: shm_toc.h:51
#define shm_toc_estimate_keys(e, cnt)
Definition: shm_toc.h:53