PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
execGrouping.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * execGrouping.c
4 * executor utility routines for grouping, hashing, and aggregation
5 *
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/executor/execGrouping.c
12 *
13 *-------------------------------------------------------------------------
14 */
15#include "postgres.h"
16
17#include "access/parallel.h"
18#include "common/hashfn.h"
19#include "executor/executor.h"
20#include "miscadmin.h"
21#include "utils/lsyscache.h"
22
23static int TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2);
24static inline uint32 TupleHashTableHash_internal(struct tuplehash_hash *tb,
25 const MinimalTuple tuple);
27 TupleTableSlot *slot,
28 bool *isnew, uint32 hash);
29
30/*
31 * Define parameters for tuple hash table code generation. The interface is
32 * *also* declared in execnodes.h (to generate the types, which are externally
33 * visible).
34 */
35#define SH_PREFIX tuplehash
36#define SH_ELEMENT_TYPE TupleHashEntryData
37#define SH_KEY_TYPE MinimalTuple
38#define SH_KEY firstTuple
39#define SH_HASH_KEY(tb, key) TupleHashTableHash_internal(tb, key)
40#define SH_EQUAL(tb, a, b) TupleHashTableMatch(tb, a, b) == 0
41#define SH_SCOPE extern
42#define SH_STORE_HASH
43#define SH_GET_HASH(tb, a) a->hash
44#define SH_DEFINE
45#include "lib/simplehash.h"
46
47
48/*****************************************************************************
49 * Utility routines for grouping tuples together
50 *****************************************************************************/
51
52/*
53 * execTuplesMatchPrepare
54 * Build expression that can be evaluated using ExecQual(), returning
55 * whether an ExprContext's inner/outer tuples are NOT DISTINCT
56 */
59 int numCols,
60 const AttrNumber *keyColIdx,
61 const Oid *eqOperators,
62 const Oid *collations,
63 PlanState *parent)
64{
65 Oid *eqFunctions;
66 int i;
67 ExprState *expr;
68
69 if (numCols == 0)
70 return NULL;
71
72 eqFunctions = (Oid *) palloc(numCols * sizeof(Oid));
73
74 /* lookup equality functions */
75 for (i = 0; i < numCols; i++)
76 eqFunctions[i] = get_opcode(eqOperators[i]);
77
78 /* build actual expression */
79 expr = ExecBuildGroupingEqual(desc, desc, NULL, NULL,
80 numCols, keyColIdx, eqFunctions, collations,
81 parent);
82
83 return expr;
84}
85
86/*
87 * execTuplesHashPrepare
88 * Look up the equality and hashing functions needed for a TupleHashTable.
89 *
90 * This is similar to execTuplesMatchPrepare, but we also need to find the
91 * hash functions associated with the equality operators. *eqFunctions and
92 * *hashFunctions receive the palloc'd result arrays.
93 *
94 * Note: we expect that the given operators are not cross-type comparisons.
95 */
96void
98 const Oid *eqOperators,
99 Oid **eqFuncOids,
100 FmgrInfo **hashFunctions)
101{
102 int i;
103
104 *eqFuncOids = (Oid *) palloc(numCols * sizeof(Oid));
105 *hashFunctions = (FmgrInfo *) palloc(numCols * sizeof(FmgrInfo));
106
107 for (i = 0; i < numCols; i++)
108 {
109 Oid eq_opr = eqOperators[i];
110 Oid eq_function;
111 Oid left_hash_function;
112 Oid right_hash_function;
113
114 eq_function = get_opcode(eq_opr);
115 if (!get_op_hash_functions(eq_opr,
116 &left_hash_function, &right_hash_function))
117 elog(ERROR, "could not find hash function for hash operator %u",
118 eq_opr);
119 /* We're not supporting cross-type cases here */
120 Assert(left_hash_function == right_hash_function);
121 (*eqFuncOids)[i] = eq_function;
122 fmgr_info(right_hash_function, &(*hashFunctions)[i]);
123 }
124}
125
126
127/*****************************************************************************
128 * Utility routines for all-in-memory hash tables
129 *
130 * These routines build hash tables for grouping tuples together (eg, for
131 * hash aggregation). There is one entry for each not-distinct set of tuples
132 * presented.
133 *****************************************************************************/
134
135/*
136 * Construct an empty TupleHashTable
137 *
138 * parent: PlanState node that will own this hash table
139 * inputDesc: tuple descriptor for input tuples
140 * inputOps: slot ops for input tuples, or NULL if unknown or not fixed
141 * numCols: number of columns to be compared (length of next 4 arrays)
142 * keyColIdx: indexes of tuple columns to compare
143 * eqfuncoids: OIDs of equality comparison functions to use
144 * hashfunctions: FmgrInfos of datatype-specific hashing functions to use
145 * collations: collations to use in comparisons
146 * nbuckets: initial estimate of hashtable size
147 * additionalsize: size of data stored in ->additional
148 * metacxt: memory context for long-lived allocation, but not per-entry data
149 * tablecxt: memory context in which to store table entries
150 * tempcxt: short-lived context for evaluation hash and comparison functions
151 * use_variable_hash_iv: if true, adjust hash IV per-parallel-worker
152 *
153 * The hashfunctions array may be made with execTuplesHashPrepare(). Note they
154 * are not cross-type functions, but expect to see the table datatype(s)
155 * on both sides.
156 *
157 * Note that the keyColIdx, hashfunctions, and collations arrays must be
158 * allocated in storage that will live as long as the hashtable does.
159 */
162 TupleDesc inputDesc,
163 const TupleTableSlotOps *inputOps,
164 int numCols,
165 AttrNumber *keyColIdx,
166 const Oid *eqfuncoids,
167 FmgrInfo *hashfunctions,
168 Oid *collations,
169 long nbuckets,
170 Size additionalsize,
171 MemoryContext metacxt,
172 MemoryContext tablecxt,
173 MemoryContext tempcxt,
174 bool use_variable_hash_iv)
175{
176 TupleHashTable hashtable;
177 Size entrysize = sizeof(TupleHashEntryData) + additionalsize;
178 Size hash_mem_limit;
179 MemoryContext oldcontext;
180 bool allow_jit;
181 uint32 hash_iv = 0;
182
183 Assert(nbuckets > 0);
184
185 /* Limit initial table size request to not more than hash_mem */
186 hash_mem_limit = get_hash_memory_limit() / entrysize;
187 if (nbuckets > hash_mem_limit)
188 nbuckets = hash_mem_limit;
189
190 oldcontext = MemoryContextSwitchTo(metacxt);
191
192 hashtable = (TupleHashTable) palloc(sizeof(TupleHashTableData));
193
194 hashtable->numCols = numCols;
195 hashtable->keyColIdx = keyColIdx;
196 hashtable->tab_collations = collations;
197 hashtable->tablecxt = tablecxt;
198 hashtable->tempcxt = tempcxt;
199 hashtable->tableslot = NULL; /* will be made on first lookup */
200 hashtable->inputslot = NULL;
201 hashtable->in_hash_expr = NULL;
202 hashtable->cur_eq_func = NULL;
203
204 /*
205 * If parallelism is in use, even if the leader backend is performing the
206 * scan itself, we don't want to create the hashtable exactly the same way
207 * in all workers. As hashtables are iterated over in keyspace-order,
208 * doing so in all processes in the same way is likely to lead to
209 * "unbalanced" hashtables when the table size initially is
210 * underestimated.
211 */
212 if (use_variable_hash_iv)
214
215 hashtable->hashtab = tuplehash_create(metacxt, nbuckets, hashtable);
216
217 /*
218 * We copy the input tuple descriptor just for safety --- we assume all
219 * input tuples will have equivalent descriptors.
220 */
223
224 /*
225 * If the caller fails to make the metacxt different from the tablecxt,
226 * allowing JIT would lead to the generated functions to a) live longer
227 * than the query or b) be re-generated each time the table is being
228 * reset. Therefore prevent JIT from being used in that case, by not
229 * providing a parent node (which prevents accessing the JitContext in the
230 * EState).
231 */
232 allow_jit = (metacxt != tablecxt);
233
234 /* build hash ExprState for all columns */
235 hashtable->tab_hash_expr = ExecBuildHash32FromAttrs(inputDesc,
236 inputOps,
237 hashfunctions,
238 collations,
239 numCols,
240 keyColIdx,
241 allow_jit ? parent : NULL,
242 hash_iv);
243
244 /* build comparator for all columns */
245 hashtable->tab_eq_func = ExecBuildGroupingEqual(inputDesc, inputDesc,
246 inputOps,
248 numCols,
249 keyColIdx, eqfuncoids, collations,
250 allow_jit ? parent : NULL);
251
252 /*
253 * While not pretty, it's ok to not shut down this context, but instead
254 * rely on the containing memory context being reset, as
255 * ExecBuildGroupingEqual() only builds a very simple expression calling
256 * functions (i.e. nothing that'd employ RegisterExprContextCallback()).
257 */
259
260 MemoryContextSwitchTo(oldcontext);
261
262 return hashtable;
263}
264
265/*
266 * Reset contents of the hashtable to be empty, preserving all the non-content
267 * state. Note that the tablecxt passed to BuildTupleHashTable() should
268 * also be reset, otherwise there will be leaks.
269 */
270void
272{
273 tuplehash_reset(hashtable->hashtab);
274}
275
276/*
277 * Find or create a hashtable entry for the tuple group containing the
278 * given tuple. The tuple must be the same type as the hashtable entries.
279 *
280 * If isnew is NULL, we do not create new entries; we return NULL if no
281 * match is found.
282 *
283 * If hash is not NULL, we set it to the calculated hash value. This allows
284 * callers access to the hash value even if no entry is returned.
285 *
286 * If isnew isn't NULL, then a new entry is created if no existing entry
287 * matches. On return, *isnew is true if the entry is newly created,
288 * false if it existed already. ->additional_data in the new entry has
289 * been zeroed.
290 */
293 bool *isnew, uint32 *hash)
294{
295 TupleHashEntry entry;
296 MemoryContext oldContext;
297 uint32 local_hash;
298
299 /* Need to run the hash functions in short-lived context */
300 oldContext = MemoryContextSwitchTo(hashtable->tempcxt);
301
302 /* set up data needed by hash and match functions */
303 hashtable->inputslot = slot;
304 hashtable->in_hash_expr = hashtable->tab_hash_expr;
305 hashtable->cur_eq_func = hashtable->tab_eq_func;
306
307 local_hash = TupleHashTableHash_internal(hashtable->hashtab, NULL);
308 entry = LookupTupleHashEntry_internal(hashtable, slot, isnew, local_hash);
309
310 if (hash != NULL)
311 *hash = local_hash;
312
313 Assert(entry == NULL || entry->hash == local_hash);
314
315 MemoryContextSwitchTo(oldContext);
316
317 return entry;
318}
319
320/*
321 * Compute the hash value for a tuple
322 */
323uint32
325{
326 MemoryContext oldContext;
327 uint32 hash;
328
329 hashtable->inputslot = slot;
330 hashtable->in_hash_expr = hashtable->tab_hash_expr;
331
332 /* Need to run the hash functions in short-lived context */
333 oldContext = MemoryContextSwitchTo(hashtable->tempcxt);
334
335 hash = TupleHashTableHash_internal(hashtable->hashtab, NULL);
336
337 MemoryContextSwitchTo(oldContext);
338
339 return hash;
340}
341
342/*
343 * A variant of LookupTupleHashEntry for callers that have already computed
344 * the hash value.
345 */
348 bool *isnew, uint32 hash)
349{
350 TupleHashEntry entry;
351 MemoryContext oldContext;
352
353 /* Need to run the hash functions in short-lived context */
354 oldContext = MemoryContextSwitchTo(hashtable->tempcxt);
355
356 /* set up data needed by hash and match functions */
357 hashtable->inputslot = slot;
358 hashtable->in_hash_expr = hashtable->tab_hash_expr;
359 hashtable->cur_eq_func = hashtable->tab_eq_func;
360
361 entry = LookupTupleHashEntry_internal(hashtable, slot, isnew, hash);
362 Assert(entry == NULL || entry->hash == hash);
363
364 MemoryContextSwitchTo(oldContext);
365
366 return entry;
367}
368
369/*
370 * Search for a hashtable entry matching the given tuple. No entry is
371 * created if there's not a match. This is similar to the non-creating
372 * case of LookupTupleHashEntry, except that it supports cross-type
373 * comparisons, in which the given tuple is not of the same type as the
374 * table entries. The caller must provide the hash ExprState to use for
375 * the input tuple, as well as the equality ExprState, since these may be
376 * different from the table's internal functions.
377 */
380 ExprState *eqcomp,
381 ExprState *hashexpr)
382{
383 TupleHashEntry entry;
384 MemoryContext oldContext;
386
387 /* Need to run the hash functions in short-lived context */
388 oldContext = MemoryContextSwitchTo(hashtable->tempcxt);
389
390 /* Set up data needed by hash and match functions */
391 hashtable->inputslot = slot;
392 hashtable->in_hash_expr = hashexpr;
393 hashtable->cur_eq_func = eqcomp;
394
395 /* Search the hash table */
396 key = NULL; /* flag to reference inputslot */
397 entry = tuplehash_lookup(hashtable->hashtab, key);
398 MemoryContextSwitchTo(oldContext);
399
400 return entry;
401}
402
403/*
404 * If tuple is NULL, use the input slot instead. This convention avoids the
405 * need to materialize virtual input tuples unless they actually need to get
406 * copied into the table.
407 *
408 * Also, the caller must select an appropriate memory context for running
409 * the hash functions.
410 */
411static uint32
412TupleHashTableHash_internal(struct tuplehash_hash *tb,
413 const MinimalTuple tuple)
414{
415 TupleHashTable hashtable = (TupleHashTable) tb->private_data;
416 uint32 hashkey;
417 TupleTableSlot *slot;
418 bool isnull;
419
420 if (tuple == NULL)
421 {
422 /* Process the current input tuple for the table */
423 hashtable->exprcontext->ecxt_innertuple = hashtable->inputslot;
424 hashkey = DatumGetUInt32(ExecEvalExpr(hashtable->in_hash_expr,
425 hashtable->exprcontext,
426 &isnull));
427 }
428 else
429 {
430 /*
431 * Process a tuple already stored in the table.
432 *
433 * (this case never actually occurs due to the way simplehash.h is
434 * used, as the hash-value is stored in the entries)
435 */
436 slot = hashtable->exprcontext->ecxt_innertuple = hashtable->tableslot;
437 ExecStoreMinimalTuple(tuple, slot, false);
438 hashkey = DatumGetUInt32(ExecEvalExpr(hashtable->tab_hash_expr,
439 hashtable->exprcontext,
440 &isnull));
441 }
442
443 /*
444 * The hashing done above, even with an initial value, doesn't tend to
445 * result in good hash perturbation. Running the value produced above
446 * through murmurhash32 leads to near perfect hash perturbation.
447 */
448 return murmurhash32(hashkey);
449}
450
451/*
452 * Does the work of LookupTupleHashEntry and LookupTupleHashEntryHash. Useful
453 * so that we can avoid switching the memory context multiple times for
454 * LookupTupleHashEntry.
455 *
456 * NB: This function may or may not change the memory context. Caller is
457 * expected to change it back.
458 */
459static inline TupleHashEntry
461 bool *isnew, uint32 hash)
462{
463 TupleHashEntryData *entry;
464 bool found;
466
467 key = NULL; /* flag to reference inputslot */
468
469 if (isnew)
470 {
471 entry = tuplehash_insert_hash(hashtable->hashtab, key, hash, &found);
472
473 if (found)
474 {
475 /* found pre-existing entry */
476 *isnew = false;
477 }
478 else
479 {
480 /* created new entry */
481 *isnew = true;
482 /* zero caller data */
483 entry->additional = NULL;
485 /* Copy the first tuple into the table context */
487 }
488 }
489 else
490 {
491 entry = tuplehash_lookup_hash(hashtable->hashtab, key, hash);
492 }
493
494 return entry;
495}
496
497/*
498 * See whether two tuples (presumably of the same hash value) match
499 */
500static int
501TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2)
502{
503 TupleTableSlot *slot1;
504 TupleTableSlot *slot2;
505 TupleHashTable hashtable = (TupleHashTable) tb->private_data;
506 ExprContext *econtext = hashtable->exprcontext;
507
508 /*
509 * We assume that simplehash.h will only ever call us with the first
510 * argument being an actual table entry, and the second argument being
511 * LookupTupleHashEntry's dummy TupleHashEntryData. The other direction
512 * could be supported too, but is not currently required.
513 */
514 Assert(tuple1 != NULL);
515 slot1 = hashtable->tableslot;
516 ExecStoreMinimalTuple(tuple1, slot1, false);
517 Assert(tuple2 == NULL);
518 slot2 = hashtable->inputslot;
519
520 /* For crosstype comparisons, the inputslot must be first */
521 econtext->ecxt_innertuple = slot2;
522 econtext->ecxt_outertuple = slot1;
523 return !ExecQualAndReset(hashtable->cur_eq_func, econtext);
524}
int16 AttrNumber
Definition: attnum.h:21
int ParallelWorkerNumber
Definition: parallel.c:114
#define Assert(condition)
Definition: c.h:812
uint32_t uint32
Definition: c.h:485
size_t Size
Definition: c.h:559
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
ExprState * ExecBuildHash32FromAttrs(TupleDesc desc, const TupleTableSlotOps *ops, FmgrInfo *hashfunctions, Oid *collations, int numCols, AttrNumber *keyColIdx, PlanState *parent, uint32 init_value)
Definition: execExpr.c:3998
ExprState * ExecBuildGroupingEqual(TupleDesc ldesc, TupleDesc rdesc, const TupleTableSlotOps *lops, const TupleTableSlotOps *rops, int numCols, const AttrNumber *keyColIdx, const Oid *eqfunctions, const Oid *collations, PlanState *parent)
Definition: execExpr.c:4321
ExprState * execTuplesMatchPrepare(TupleDesc desc, int numCols, const AttrNumber *keyColIdx, const Oid *eqOperators, const Oid *collations, PlanState *parent)
Definition: execGrouping.c:58
void execTuplesHashPrepare(int numCols, const Oid *eqOperators, Oid **eqFuncOids, FmgrInfo **hashFunctions)
Definition: execGrouping.c:97
TupleHashEntry LookupTupleHashEntryHash(TupleHashTable hashtable, TupleTableSlot *slot, bool *isnew, uint32 hash)
Definition: execGrouping.c:347
TupleHashEntry LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, bool *isnew, uint32 *hash)
Definition: execGrouping.c:292
static uint32 TupleHashTableHash_internal(struct tuplehash_hash *tb, const MinimalTuple tuple)
Definition: execGrouping.c:412
uint32 TupleHashTableHash(TupleHashTable hashtable, TupleTableSlot *slot)
Definition: execGrouping.c:324
TupleHashEntry FindTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, ExprState *eqcomp, ExprState *hashexpr)
Definition: execGrouping.c:379
static int TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2)
Definition: execGrouping.c:501
TupleHashTable BuildTupleHashTable(PlanState *parent, TupleDesc inputDesc, const TupleTableSlotOps *inputOps, int numCols, AttrNumber *keyColIdx, const Oid *eqfuncoids, FmgrInfo *hashfunctions, Oid *collations, long nbuckets, Size additionalsize, MemoryContext metacxt, MemoryContext tablecxt, MemoryContext tempcxt, bool use_variable_hash_iv)
Definition: execGrouping.c:161
void ResetTupleHashTable(TupleHashTable hashtable)
Definition: execGrouping.c:271
static TupleHashEntry LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot, bool *isnew, uint32 hash)
Definition: execGrouping.c:460
TupleTableSlot * MakeSingleTupleTableSlot(TupleDesc tupdesc, const TupleTableSlotOps *tts_ops)
Definition: execTuples.c:1425
TupleTableSlot * ExecStoreMinimalTuple(MinimalTuple mtup, TupleTableSlot *slot, bool shouldFree)
Definition: execTuples.c:1633
const TupleTableSlotOps TTSOpsMinimalTuple
Definition: execTuples.c:86
ExprContext * CreateStandaloneExprContext(void)
Definition: execUtils.c:357
struct TupleHashTableData * TupleHashTable
Definition: execnodes.h:810
struct TupleHashEntryData TupleHashEntryData
static bool ExecQualAndReset(ExprState *state, ExprContext *econtext)
Definition: executor.h:453
static Datum ExecEvalExpr(ExprState *state, ExprContext *econtext, bool *isNull)
Definition: executor.h:346
void fmgr_info(Oid functionId, FmgrInfo *finfo)
Definition: fmgr.c:127
static uint32 murmurhash32(uint32 data)
Definition: hashfn.h:92
int i
Definition: isn.c:72
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:76
RegProcedure get_opcode(Oid opno)
Definition: lsyscache.c:1285
bool get_op_hash_functions(Oid opno, RegProcedure *lhs_procno, RegProcedure *rhs_procno)
Definition: lsyscache.c:510
void * palloc(Size size)
Definition: mcxt.c:1317
size_t get_hash_memory_limit(void)
Definition: nodeHash.c:3487
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:124
static uint32 DatumGetUInt32(Datum X)
Definition: postgres.h:222
unsigned int Oid
Definition: postgres_ext.h:31
static unsigned hash(unsigned *uv, int n)
Definition: rege_dfa.c:715
TupleTableSlot * ecxt_innertuple
Definition: execnodes.h:260
Definition: fmgr.h:57
MinimalTuple firstTuple
Definition: execnodes.h:814
AttrNumber * keyColIdx
Definition: execnodes.h:832
tuplehash_hash * hashtab
Definition: execnodes.h:830
ExprState * in_hash_expr
Definition: execnodes.h:841
ExprState * tab_hash_expr
Definition: execnodes.h:833
MemoryContext tempcxt
Definition: execnodes.h:837
ExprState * tab_eq_func
Definition: execnodes.h:834
TupleTableSlot * tableslot
Definition: execnodes.h:838
ExprContext * exprcontext
Definition: execnodes.h:843
MemoryContext tablecxt
Definition: execnodes.h:836
TupleTableSlot * inputslot
Definition: execnodes.h:840
ExprState * cur_eq_func
Definition: execnodes.h:842
TupleDesc CreateTupleDescCopy(TupleDesc tupdesc)
Definition: tupdesc.c:235
static MinimalTuple ExecCopySlotMinimalTuple(TupleTableSlot *slot)
Definition: tuptable.h:492