PostgreSQL Source Code git master
Loading...
Searching...
No Matches
mvdistinct.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * mvdistinct.c
4 * POSTGRES multivariate ndistinct coefficients
5 *
6 * Estimating number of groups in a combination of columns (e.g. for GROUP BY)
7 * is tricky, and the estimation error is often significant.
8
9 * The multivariate ndistinct coefficients address this by storing ndistinct
10 * estimates for combinations of the user-specified columns. So for example
11 * given a statistics object on three columns (a,b,c), this module estimates
12 * and stores n-distinct for (a,b), (a,c), (b,c) and (a,b,c). The per-column
13 * estimates are already available in pg_statistic.
14 *
15 *
16 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
17 * Portions Copyright (c) 1994, Regents of the University of California
18 *
19 * IDENTIFICATION
20 * src/backend/statistics/mvdistinct.c
21 *
22 *-------------------------------------------------------------------------
23 */
24#include "postgres.h"
25
26#include <math.h>
27
31#include "utils/syscache.h"
32#include "utils/typcache.h"
33#include "varatt.h"
34
36 int k, int *combination);
37static double estimate_ndistinct(double totalrows, int numrows, int d, int f1);
38static int n_choose_k(int n, int k);
39static int num_combinations(int n);
40
41/* size of the struct header fields (magic, type, nitems) */
42#define SizeOfHeader (3 * sizeof(uint32))
43
44/* size of a serialized ndistinct item (coefficient, natts, atts) */
45#define SizeOfItem(natts) \
46 (sizeof(double) + sizeof(int) + (natts) * sizeof(AttrNumber))
47
48/* minimal size of a ndistinct item (with two attributes) */
49#define MinSizeOfItem SizeOfItem(2)
50
51/* minimal size of mvndistinct, when all items are minimal */
52#define MinSizeOfItems(nitems) \
53 (SizeOfHeader + (nitems) * MinSizeOfItem)
54
55/* Combination generator API */
56
57/* internal state for generator of k-combinations of n elements */
59{
60 int k; /* size of the combination */
61 int n; /* total number of elements */
62 int current; /* index of the next combination to return */
63 int ncombinations; /* number of combinations (size of array) */
64 int *combinations; /* array of pre-built combinations */
66
67static CombinationGenerator *generator_init(int n, int k);
71
72
73/*
74 * statext_ndistinct_build
75 * Compute ndistinct coefficient for the combination of attributes.
76 *
77 * This computes the ndistinct estimate using the same estimator used
78 * in analyze.c and then computes the coefficient.
79 *
80 * To handle expressions easily, we treat them as system attributes with
81 * negative attnums, and offset everything by number of expressions to
82 * allow using Bitmapsets.
83 */
86{
87 MVNDistinct *result;
88 int k;
89 int itemcnt;
90 int numattrs = data->nattnums;
92
93 result = palloc(offsetof(MVNDistinct, items) +
94 numcombs * sizeof(MVNDistinctItem));
97 result->nitems = numcombs;
98
99 itemcnt = 0;
100 for (k = 2; k <= numattrs; k++)
101 {
102 int *combination;
104
105 /* generate combinations of K out of N elements */
107
109 {
110 MVNDistinctItem *item = &result->items[itemcnt];
111 int j;
112
114 item->nattributes = k;
115
116 /* translate the indexes to attnums */
117 for (j = 0; j < k; j++)
118 {
119 item->attributes[j] = data->attnums[combination[j]];
120
122 }
123
124 item->ndistinct =
126
127 itemcnt++;
129 }
130
132 }
133
134 /* must consume exactly the whole output array */
135 Assert(itemcnt == result->nitems);
136
137 return result;
138}
139
140/*
141 * statext_ndistinct_load
142 * Load the ndistinct value for the indicated pg_statistic_ext tuple
143 */
146{
147 MVNDistinct *result;
148 bool isnull;
149 Datum ndist;
150 HeapTuple htup;
151
154 if (!HeapTupleIsValid(htup))
155 elog(ERROR, "cache lookup failed for statistics object %u", mvoid);
156
159 if (isnull)
160 elog(ERROR,
161 "requested statistics kind \"%c\" is not yet built for statistics object %u",
163
165
166 ReleaseSysCache(htup);
167
168 return result;
169}
170
171/*
172 * statext_ndistinct_serialize
173 * serialize ndistinct to the on-disk bytea format
174 */
175bytea *
177{
178 int i;
179 bytea *output;
180 char *tmp;
181 Size len;
182
183 Assert(ndistinct->magic == STATS_NDISTINCT_MAGIC);
185
186 /*
187 * Base size is size of scalar fields in the struct, plus one base struct
188 * for each item, including number of items for each.
189 */
191
192 /* and also include space for the actual attribute numbers */
193 for (i = 0; i < ndistinct->nitems; i++)
194 {
195 int nmembers;
196
197 nmembers = ndistinct->items[i].nattributes;
198 Assert(nmembers >= 2);
199
200 len += SizeOfItem(nmembers);
201 }
202
203 output = (bytea *) palloc(len);
205
206 tmp = VARDATA(output);
207
208 /* Store the base struct values (magic, type, nitems) */
209 memcpy(tmp, &ndistinct->magic, sizeof(uint32));
210 tmp += sizeof(uint32);
211 memcpy(tmp, &ndistinct->type, sizeof(uint32));
212 tmp += sizeof(uint32);
213 memcpy(tmp, &ndistinct->nitems, sizeof(uint32));
214 tmp += sizeof(uint32);
215
216 /*
217 * store number of attributes and attribute numbers for each entry
218 */
219 for (i = 0; i < ndistinct->nitems; i++)
220 {
221 MVNDistinctItem item = ndistinct->items[i];
222 int nmembers = item.nattributes;
223
224 memcpy(tmp, &item.ndistinct, sizeof(double));
225 tmp += sizeof(double);
226 memcpy(tmp, &nmembers, sizeof(int));
227 tmp += sizeof(int);
228
229 memcpy(tmp, item.attributes, sizeof(AttrNumber) * nmembers);
230 tmp += nmembers * sizeof(AttrNumber);
231
232 /* protect against overflows */
233 Assert(tmp <= ((char *) output + len));
234 }
235
236 /* check we used exactly the expected space */
237 Assert(tmp == ((char *) output + len));
238
239 return output;
240}
241
242/*
243 * statext_ndistinct_deserialize
244 * Read an on-disk bytea format MVNDistinct to in-memory format
245 */
248{
249 int i;
252 MVNDistinct *ndistinct;
253 char *tmp;
254
255 if (data == NULL)
256 return NULL;
257
258 /* we expect at least the basic fields of MVNDistinct struct */
260 elog(ERROR, "invalid MVNDistinct size %zu (expected at least %zu)",
262
263 /* initialize pointer to the data part (skip the varlena header) */
264 tmp = VARDATA_ANY(data);
265
266 /* read the header fields and perform basic sanity checks */
267 memcpy(&ndist.magic, tmp, sizeof(uint32));
268 tmp += sizeof(uint32);
269 memcpy(&ndist.type, tmp, sizeof(uint32));
270 tmp += sizeof(uint32);
271 memcpy(&ndist.nitems, tmp, sizeof(uint32));
272 tmp += sizeof(uint32);
273
274 if (ndist.magic != STATS_NDISTINCT_MAGIC)
275 elog(ERROR, "invalid ndistinct magic %08x (expected %08x)",
278 elog(ERROR, "invalid ndistinct type %d (expected %d)",
280 if (ndist.nitems == 0)
281 elog(ERROR, "invalid zero-length item array in MVNDistinct");
282
283 /* what minimum bytea size do we expect for those parameters */
286 elog(ERROR, "invalid MVNDistinct size %zu (expected at least %zu)",
288
289 /*
290 * Allocate space for the ndistinct items (no space for each item's
291 * attnos: those live in AttrNumber arrays allocated separately)
292 */
293 ndistinct = palloc0(MAXALIGN(offsetof(MVNDistinct, items)) +
294 (ndist.nitems * sizeof(MVNDistinctItem)));
295 ndistinct->magic = ndist.magic;
296 ndistinct->type = ndist.type;
297 ndistinct->nitems = ndist.nitems;
298
299 for (i = 0; i < ndistinct->nitems; i++)
300 {
301 MVNDistinctItem *item = &ndistinct->items[i];
302
303 /* ndistinct value */
304 memcpy(&item->ndistinct, tmp, sizeof(double));
305 tmp += sizeof(double);
306
307 /* number of attributes */
308 memcpy(&item->nattributes, tmp, sizeof(int));
309 tmp += sizeof(int);
310 Assert((item->nattributes >= 2) && (item->nattributes <= STATS_MAX_DIMENSIONS));
311
312 item->attributes
313 = (AttrNumber *) palloc(item->nattributes * sizeof(AttrNumber));
314
315 memcpy(item->attributes, tmp, sizeof(AttrNumber) * item->nattributes);
316 tmp += sizeof(AttrNumber) * item->nattributes;
317
318 /* still within the bytea */
319 Assert(tmp <= ((char *) data + VARSIZE_ANY(data)));
320 }
321
322 /* we should have consumed the whole bytea exactly */
323 Assert(tmp == ((char *) data + VARSIZE_ANY(data)));
324
325 return ndistinct;
326}
327
328/*
329 * Free allocations of a MVNDistinct.
330 */
331void
333{
334 for (int i = 0; i < ndistinct->nitems; i++)
335 pfree(ndistinct->items[i].attributes);
336 pfree(ndistinct);
337}
338
339/*
340 * Validate a set of MVNDistincts against the extended statistics object
341 * definition.
342 *
343 * Every MVNDistinctItem must be checked to ensure that the attnums in the
344 * attributes list correspond to attnums/expressions defined by the extended
345 * statistics object.
346 *
347 * Positive attnums are attributes which must be found in the stxkeys,
348 * while negative attnums correspond to an expression number, no attribute
349 * number can be below (0 - numexprs).
350 */
351bool
353 const int2vector *stxkeys,
354 int numexprs, int elevel)
355{
357
358 /* Scan through each MVNDistinct entry */
359 for (int i = 0; i < ndistinct->nitems; i++)
360 {
361 MVNDistinctItem item = ndistinct->items[i];
362
363 /*
364 * Cross-check each attribute in a MVNDistinct entry with the extended
365 * stats object definition.
366 */
367 for (int j = 0; j < item.nattributes; j++)
368 {
370 bool ok = false;
371
372 if (attnum > 0)
373 {
374 /* attribute number in stxkeys */
375 for (int k = 0; k < stxkeys->dim1; k++)
376 {
377 if (attnum == stxkeys->values[k])
378 {
379 ok = true;
380 break;
381 }
382 }
383 }
384 else if ((attnum < 0) && (attnum >= attnum_expr_lowbound))
385 {
386 /* attribute number for an expression */
387 ok = true;
388 }
389
390 if (!ok)
391 {
392 ereport(elevel,
394 errmsg("could not validate \"%s\" object: invalid attribute number %d found",
395 "pg_ndistinct", attnum)));
396 return false;
397 }
398 }
399 }
400
401 return true;
402}
403
404/*
405 * ndistinct_for_combination
406 * Estimates number of distinct values in a combination of columns.
407 *
408 * This uses the same ndistinct estimator as compute_scalar_stats() in
409 * ANALYZE, i.e.,
410 * n*d / (n - f1 + f1*n/N)
411 *
412 * except that instead of values in a single column we are dealing with
413 * combination of multiple columns.
414 */
415static double
417 int k, int *combination)
418{
419 int i,
420 j;
421 int f1,
422 cnt,
423 d;
424 bool *isnull;
425 Datum *values;
428 int numrows = data->numrows;
429
430 mss = multi_sort_init(k);
431
432 /*
433 * In order to determine the number of distinct elements, create separate
434 * values[]/isnull[] arrays with all the data we have, then sort them
435 * using the specified column combination as dimensions. We could try to
436 * sort in place, but it'd probably be more complex and bug-prone.
437 */
438 items = palloc_array(SortItem, numrows);
439 values = palloc0_array(Datum, numrows * k);
440 isnull = palloc0_array(bool, numrows * k);
441
442 for (i = 0; i < numrows; i++)
443 {
444 items[i].values = &values[i * k];
445 items[i].isnull = &isnull[i * k];
446 }
447
448 /*
449 * For each dimension, set up sort-support and fill in the values from the
450 * sample data.
451 *
452 * We use the column data types' default sort operators and collations;
453 * perhaps at some point it'd be worth using column-specific collations?
454 */
455 for (i = 0; i < k; i++)
456 {
457 Oid typid;
461
462 typid = colstat->attrtypid;
463 collid = colstat->attrcollid;
464
466 if (type->lt_opr == InvalidOid) /* shouldn't happen */
467 elog(ERROR, "cache lookup failed for ordering operator for type %u",
468 typid);
469
470 /* prepare the sort function for this dimension */
472
473 /* accumulate all the data for this dimension into the arrays */
474 for (j = 0; j < numrows; j++)
475 {
476 items[j].values[i] = data->values[combination[i]][j];
477 items[j].isnull[i] = data->nulls[combination[i]][j];
478 }
479 }
480
481 /* We can sort the array now ... */
482 qsort_interruptible(items, numrows, sizeof(SortItem),
484
485 /* ... and count the number of distinct combinations */
486
487 f1 = 0;
488 cnt = 1;
489 d = 1;
490 for (i = 1; i < numrows; i++)
491 {
492 if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0)
493 {
494 if (cnt == 1)
495 f1 += 1;
496
497 d++;
498 cnt = 0;
499 }
500
501 cnt += 1;
502 }
503
504 if (cnt == 1)
505 f1 += 1;
506
507 return estimate_ndistinct(totalrows, numrows, d, f1);
508}
509
510/* The Duj1 estimator (already used in analyze.c). */
511static double
512estimate_ndistinct(double totalrows, int numrows, int d, int f1)
513{
514 double numer,
515 denom,
516 ndistinct;
517
518 numer = (double) numrows * (double) d;
519
520 denom = (double) (numrows - f1) +
521 (double) f1 * (double) numrows / totalrows;
522
523 ndistinct = numer / denom;
524
525 /* Clamp to sane range in case of roundoff error */
526 if (ndistinct < (double) d)
527 ndistinct = (double) d;
528
529 if (ndistinct > totalrows)
530 ndistinct = totalrows;
531
532 return floor(ndistinct + 0.5);
533}
534
/*
 * n_choose_k
 *		computes binomial coefficients using an algorithm that is both
 *		efficient and prevents overflows
 *
 * Requires k > 0 and n >= k (checked by assertion).  The coefficient is
 * built up incrementally, multiplying by one factor and dividing by one
 * factor per step, so that every intermediate value is itself a binomial
 * coefficient times a small integer.
 *
 * The running product is kept in a wider type than the result: the
 * multiplication happens before the division, so the intermediate value
 * can exceed INT_MAX even when the final coefficient fits in an int.
 */
static int
n_choose_k(int n, int k)
{
	long long	r;
	int			d;

	Assert((k > 0) && (n >= k));

	/* use symmetry of the binomial coefficients */
	if (k > n - k)
		k = n - k;

	r = 1;
	for (d = 1; d <= k; ++d)
	{
		/*
		 * Before this step r is C(n0, d - 1), where n0 is the original n;
		 * the product below is C(n0, d) * d, so the division is exact.
		 */
		r = (r * n) / d;
		n--;
	}

	return (int) r;
}
560
/*
 * num_combinations
 *		number of combinations, excluding single-value combinations
 *
 * Of the 2^n subsets of n elements, this leaves out the empty subset and
 * the n single-element subsets, i.e. it returns the number of subsets with
 * at least two members.
 */
static int
num_combinations(int n)
{
	return (1 << n) - (n + 1);
}
570
571/*
572 * generator_init
573 * initialize the generator of combinations
574 *
575 * The generator produces combinations of K elements in the interval (0..N).
576 * We prebuild all the combinations in this method, which is simpler than
577 * generating them on the fly.
578 */
580generator_init(int n, int k)
581{
583
584 Assert((n >= k) && (k > 0));
585
586 /* allocate the generator state as a single chunk of memory */
588
589 state->ncombinations = n_choose_k(n, k);
590
591 /* pre-allocate space for all combinations */
592 state->combinations = palloc_array(int, k * state->ncombinations);
593
594 state->current = 0;
595 state->k = k;
596 state->n = n;
597
598 /* now actually pre-generate all the combinations of K elements */
600
601 /* make sure we got the expected number of combinations */
602 Assert(state->current == state->ncombinations);
603
604 /* reset the number, so we start with the first one */
605 state->current = 0;
606
607 return state;
608}
609
610/*
611 * generator_next
612 * returns the next combination from the prebuilt list
613 *
614 * Returns a combination of K array indexes (0 .. N), as specified to
615 * generator_init), or NULL when there are no more combination.
616 */
617static int *
619{
620 if (state->current == state->ncombinations)
621 return NULL;
622
623 return &state->combinations[state->k * state->current++];
624}
625
626/*
627 * generator_free
628 * free the internal state of the generator
629 *
630 * Releases the generator internal state (pre-built combinations).
631 */
632static void
634{
635 pfree(state->combinations);
636 pfree(state);
637}
638
639/*
640 * generate_combinations_recurse
641 * given a prefix, generate all possible combinations
642 *
643 * Given a prefix (first few elements of the combination), generate following
644 * elements recursively. We generate the combinations in lexicographic order,
645 * which eliminates permutations of the same combination.
646 */
647static void
649 int index, int start, int *current)
650{
651 /* If we haven't filled all the elements, simply recurse. */
652 if (index < state->k)
653 {
654 int i;
655
656 /*
657 * The values have to be in ascending order, so make sure we start
658 * with the value passed by parameter.
659 */
660
661 for (i = start; i < state->n; i++)
662 {
663 current[index] = i;
664 generate_combinations_recurse(state, (index + 1), (i + 1), current);
665 }
666
667 return;
668 }
669 else
670 {
671 /* we got a valid combination, add it to the array */
672 memcpy(&state->combinations[(state->k * state->current)],
673 current, state->k * sizeof(int));
674 state->current++;
675 }
676}
677
678/*
679 * generate_combinations
680 * generate all k-combinations of N elements
681 */
682static void
684{
685 int *current = palloc0_array(int, state->k);
686
687 generate_combinations_recurse(state, 0, 0, current);
688
689 pfree(current);
690}
int16 AttrNumber
Definition attnum.h:21
#define AttributeNumberIsValid(attributeNumber)
Definition attnum.h:34
static Datum values[MAXATTR]
Definition bootstrap.c:155
#define Min(x, y)
Definition c.h:997
#define MAXALIGN(LEN)
Definition c.h:826
#define VARHDRSZ
Definition c.h:711
#define Assert(condition)
Definition c.h:873
uint32_t uint32
Definition c.h:546
size_t Size
Definition c.h:619
Oid collid
int errcode(int sqlerrcode)
Definition elog.c:863
int errmsg(const char *fmt,...)
Definition elog.c:1080
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
int multi_sort_compare(const void *a, const void *b, void *arg)
MultiSortSupport multi_sort_init(int ndims)
void multi_sort_add_dimension(MultiSortSupport mss, int sortdim, Oid oper, Oid collation)
#define palloc_object(type)
Definition fe_memutils.h:74
#define palloc_array(type, count)
Definition fe_memutils.h:76
#define palloc0_array(type, count)
Definition fe_memutils.h:77
#define DatumGetByteaPP(X)
Definition fmgr.h:292
return str start
#define HeapTupleIsValid(tuple)
Definition htup.h:78
#define nitems(x)
Definition indent.h:31
FILE * output
int j
Definition isn.c:78
int i
Definition isn.c:77
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc0(Size size)
Definition mcxt.c:1417
void * palloc(Size size)
Definition mcxt.c:1387
static int n_choose_k(int n, int k)
Definition mvdistinct.c:541
#define SizeOfHeader
Definition mvdistinct.c:42
void statext_ndistinct_free(MVNDistinct *ndistinct)
Definition mvdistinct.c:332
static double estimate_ndistinct(double totalrows, int numrows, int d, int f1)
Definition mvdistinct.c:512
static void generate_combinations_recurse(CombinationGenerator *state, int index, int start, int *current)
Definition mvdistinct.c:648
MVNDistinct * statext_ndistinct_deserialize(bytea *data)
Definition mvdistinct.c:247
static double ndistinct_for_combination(double totalrows, StatsBuildData *data, int k, int *combination)
Definition mvdistinct.c:416
bytea * statext_ndistinct_serialize(MVNDistinct *ndistinct)
Definition mvdistinct.c:176
static void generate_combinations(CombinationGenerator *state)
Definition mvdistinct.c:683
MVNDistinct * statext_ndistinct_load(Oid mvoid, bool inh)
Definition mvdistinct.c:145
static int num_combinations(int n)
Definition mvdistinct.c:566
MVNDistinct * statext_ndistinct_build(double totalrows, StatsBuildData *data)
Definition mvdistinct.c:85
#define SizeOfItem(natts)
Definition mvdistinct.c:45
static void generator_free(CombinationGenerator *state)
Definition mvdistinct.c:633
static CombinationGenerator * generator_init(int n, int k)
Definition mvdistinct.c:580
#define MinSizeOfItems(nitems)
Definition mvdistinct.c:52
bool statext_ndistinct_validate(const MVNDistinct *ndistinct, const int2vector *stxkeys, int numexprs, int elevel)
Definition mvdistinct.c:352
static int * generator_next(CombinationGenerator *state)
Definition mvdistinct.c:618
int16 attnum
const void size_t len
const void * data
void qsort_interruptible(void *base, size_t nel, size_t elsize, qsort_arg_comparator cmp, void *arg)
static Datum BoolGetDatum(bool X)
Definition postgres.h:112
static Datum ObjectIdGetDatum(Oid X)
Definition postgres.h:262
uint64_t Datum
Definition postgres.h:70
#define InvalidOid
unsigned int Oid
static int fb(int x)
int f1[ARRAY_SIZE]
#define STATS_NDISTINCT_MAGIC
Definition statistics.h:22
#define STATS_NDISTINCT_TYPE_BASIC
Definition statistics.h:23
#define STATS_MAX_DIMENSIONS
Definition statistics.h:19
AttrNumber * attributes
Definition statistics.h:30
uint32 nitems
Definition statistics.h:38
uint32 type
Definition statistics.h:37
uint32 magic
Definition statistics.h:36
MVNDistinctItem items[FLEXIBLE_ARRAY_MEMBER]
Definition statistics.h:39
Oid attrtypid
Definition vacuum.h:126
Definition type.h:96
Definition c.h:706
void ReleaseSysCache(HeapTuple tuple)
Definition syscache.c:264
Datum SysCacheGetAttr(int cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull)
Definition syscache.c:595
HeapTuple SearchSysCache2(int cacheId, Datum key1, Datum key2)
Definition syscache.c:230
static ItemArray items
TypeCacheEntry * lookup_type_cache(Oid type_id, int flags)
Definition typcache.c:386
#define TYPECACHE_LT_OPR
Definition typcache.h:139
static Size VARSIZE_ANY(const void *PTR)
Definition varatt.h:460
static Size VARSIZE_ANY_EXHDR(const void *PTR)
Definition varatt.h:472
static char * VARDATA(const void *PTR)
Definition varatt.h:305
static char * VARDATA_ANY(const void *PTR)
Definition varatt.h:486
static void SET_VARSIZE(void *PTR, Size len)
Definition varatt.h:432
const char * type