PostgreSQL Source Code  git master
array_selfuncs.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * array_selfuncs.c
4  * Functions for selectivity estimation of array operators
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/utils/adt/array_selfuncs.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include <math.h>
18 
19 #include "access/htup_details.h"
20 #include "catalog/pg_operator.h"
21 #include "catalog/pg_statistic.h"
22 #include "utils/array.h"
23 #include "utils/fmgrprotos.h"
24 #include "utils/lsyscache.h"
25 #include "utils/selfuncs.h"
26 #include "utils/typcache.h"
27 
28 
29 /* Default selectivity constant for "@>" and "<@" operators */
30 #define DEFAULT_CONTAIN_SEL 0.005
31 
32 /* Default selectivity constant for "&&" operator */
33 #define DEFAULT_OVERLAP_SEL 0.01
34 
35 /* Default selectivity for given operator */
36 #define DEFAULT_SEL(operator) \
37  ((operator) == OID_ARRAY_OVERLAP_OP ? \
38  DEFAULT_OVERLAP_SEL : DEFAULT_CONTAIN_SEL)
39 
40 static Selectivity calc_arraycontsel(VariableStatData *vardata, Datum constval,
41  Oid elemtype, Oid operator);
43  TypeCacheEntry *typentry,
44  Datum *mcelem, int nmcelem,
45  float4 *numbers, int nnumbers,
46  float4 *hist, int nhist,
47  Oid operator);
48 static Selectivity mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
49  float4 *numbers, int nnumbers,
50  Datum *array_data, int nitems,
51  Oid operator, TypeCacheEntry *typentry);
52 static Selectivity mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
53  float4 *numbers, int nnumbers,
54  Datum *array_data, int nitems,
55  float4 *hist, int nhist,
56  Oid operator, TypeCacheEntry *typentry);
57 static float *calc_hist(const float4 *hist, int nhist, int n);
58 static float *calc_distr(const float *p, int n, int m, float rest);
59 static int floor_log2(uint32 n);
60 static bool find_next_mcelem(Datum *mcelem, int nmcelem, Datum value,
61  int *index, TypeCacheEntry *typentry);
62 static int element_compare(const void *key1, const void *key2, void *arg);
63 static int float_compare_desc(const void *key1, const void *key2);
64 
65 
66 /*
67  * scalararraysel_containment
68  * Estimate selectivity of ScalarArrayOpExpr via array containment.
69  *
70  * If we have const =/<> ANY/ALL (array_var) then we can estimate the
71  * selectivity as though this were an array containment operator,
72  * array_var op ARRAY[const].
73  *
74  * scalararraysel() has already verified that the ScalarArrayOpExpr's operator
75  * is the array element type's default equality or inequality operator, and
76  * has aggressively simplified both inputs to constants.
77  *
78  * Returns selectivity (0..1), or -1 if we fail to estimate selectivity.
79  */
82  Node *leftop, Node *rightop,
83  Oid elemtype, bool isEquality, bool useOr,
84  int varRelid)
85 {
86  Selectivity selec;
87  VariableStatData vardata;
88  Datum constval;
89  TypeCacheEntry *typentry;
90  FmgrInfo *cmpfunc;
91 
92  /*
93  * rightop must be a variable, else punt.
94  */
95  examine_variable(root, rightop, varRelid, &vardata);
96  if (!vardata.rel)
97  {
98  ReleaseVariableStats(vardata);
99  return -1.0;
100  }
101 
102  /*
103  * leftop must be a constant, else punt.
104  */
105  if (!IsA(leftop, Const))
106  {
107  ReleaseVariableStats(vardata);
108  return -1.0;
109  }
110  if (((Const *) leftop)->constisnull)
111  {
112  /* qual can't succeed if null on left */
113  ReleaseVariableStats(vardata);
114  return (Selectivity) 0.0;
115  }
116  constval = ((Const *) leftop)->constvalue;
117 
118  /* Get element type's default comparison function */
119  typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
120  if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
121  {
122  ReleaseVariableStats(vardata);
123  return -1.0;
124  }
125  cmpfunc = &typentry->cmp_proc_finfo;
126 
127  /*
128  * If the operator is <>, swap ANY/ALL, then invert the result later.
129  */
130  if (!isEquality)
131  useOr = !useOr;
132 
133  /* Get array element stats for var, if available */
134  if (HeapTupleIsValid(vardata.statsTuple) &&
135  statistic_proc_security_check(&vardata, cmpfunc->fn_oid))
136  {
137  Form_pg_statistic stats;
138  AttStatsSlot sslot;
139  AttStatsSlot hslot;
140 
141  stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
142 
143  /* MCELEM will be an array of same type as element */
144  if (get_attstatsslot(&sslot, vardata.statsTuple,
145  STATISTIC_KIND_MCELEM, InvalidOid,
147  {
148  /* For ALL case, also get histogram of distinct-element counts */
149  if (useOr ||
150  !get_attstatsslot(&hslot, vardata.statsTuple,
151  STATISTIC_KIND_DECHIST, InvalidOid,
153  memset(&hslot, 0, sizeof(hslot));
154 
155  /*
156  * For = ANY, estimate as var @> ARRAY[const].
157  *
158  * For = ALL, estimate as var <@ ARRAY[const].
159  */
160  if (useOr)
162  sslot.nvalues,
163  sslot.numbers,
164  sslot.nnumbers,
165  &constval, 1,
166  OID_ARRAY_CONTAINS_OP,
167  typentry);
168  else
169  selec = mcelem_array_contained_selec(sslot.values,
170  sslot.nvalues,
171  sslot.numbers,
172  sslot.nnumbers,
173  &constval, 1,
174  hslot.numbers,
175  hslot.nnumbers,
176  OID_ARRAY_CONTAINED_OP,
177  typentry);
178 
179  free_attstatsslot(&hslot);
180  free_attstatsslot(&sslot);
181  }
182  else
183  {
184  /* No most-common-elements info, so do without */
185  if (useOr)
186  selec = mcelem_array_contain_overlap_selec(NULL, 0,
187  NULL, 0,
188  &constval, 1,
189  OID_ARRAY_CONTAINS_OP,
190  typentry);
191  else
192  selec = mcelem_array_contained_selec(NULL, 0,
193  NULL, 0,
194  &constval, 1,
195  NULL, 0,
196  OID_ARRAY_CONTAINED_OP,
197  typentry);
198  }
199 
200  /*
201  * MCE stats count only non-null rows, so adjust for null rows.
202  */
203  selec *= (1.0 - stats->stanullfrac);
204  }
205  else
206  {
207  /* No stats at all, so do without */
208  if (useOr)
209  selec = mcelem_array_contain_overlap_selec(NULL, 0,
210  NULL, 0,
211  &constval, 1,
212  OID_ARRAY_CONTAINS_OP,
213  typentry);
214  else
215  selec = mcelem_array_contained_selec(NULL, 0,
216  NULL, 0,
217  &constval, 1,
218  NULL, 0,
219  OID_ARRAY_CONTAINED_OP,
220  typentry);
221  /* we assume no nulls here, so no stanullfrac correction */
222  }
223 
224  ReleaseVariableStats(vardata);
225 
226  /*
227  * If the operator is <>, invert the results.
228  */
229  if (!isEquality)
230  selec = 1.0 - selec;
231 
232  CLAMP_PROBABILITY(selec);
233 
234  return selec;
235 }
236 
237 /*
238  * arraycontsel -- restriction selectivity for array @>, &&, <@ operators
239  */
240 Datum
242 {
244  Oid operator = PG_GETARG_OID(1);
245  List *args = (List *) PG_GETARG_POINTER(2);
246  int varRelid = PG_GETARG_INT32(3);
247  VariableStatData vardata;
248  Node *other;
249  bool varonleft;
250  Selectivity selec;
251  Oid element_typeid;
252 
253  /*
254  * If expression is not (variable op something) or (something op
255  * variable), then punt and return a default estimate.
256  */
257  if (!get_restriction_variable(root, args, varRelid,
258  &vardata, &other, &varonleft))
259  PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
260 
261  /*
262  * Can't do anything useful if the something is not a constant, either.
263  */
264  if (!IsA(other, Const))
265  {
266  ReleaseVariableStats(vardata);
267  PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
268  }
269 
270  /*
271  * The "&&", "@>" and "<@" operators are strict, so we can cope with a
272  * NULL constant right away.
273  */
274  if (((Const *) other)->constisnull)
275  {
276  ReleaseVariableStats(vardata);
277  PG_RETURN_FLOAT8(0.0);
278  }
279 
280  /*
281  * If var is on the right, commute the operator, so that we can assume the
282  * var is on the left in what follows.
283  */
284  if (!varonleft)
285  {
286  if (operator == OID_ARRAY_CONTAINS_OP)
287  operator = OID_ARRAY_CONTAINED_OP;
288  else if (operator == OID_ARRAY_CONTAINED_OP)
289  operator = OID_ARRAY_CONTAINS_OP;
290  }
291 
292  /*
293  * OK, there's a Var and a Const we're dealing with here. We need the
294  * Const to be an array with same element type as column, else we can't do
295  * anything useful. (Such cases will likely fail at runtime, but here
296  * we'd rather just return a default estimate.)
297  */
298  element_typeid = get_base_element_type(((Const *) other)->consttype);
299  if (element_typeid != InvalidOid &&
300  element_typeid == get_base_element_type(vardata.vartype))
301  {
302  selec = calc_arraycontsel(&vardata, ((Const *) other)->constvalue,
303  element_typeid, operator);
304  }
305  else
306  {
307  selec = DEFAULT_SEL(operator);
308  }
309 
310  ReleaseVariableStats(vardata);
311 
312  CLAMP_PROBABILITY(selec);
313 
314  PG_RETURN_FLOAT8((float8) selec);
315 }
316 
317 /*
318  * arraycontjoinsel -- join selectivity for array @>, &&, <@ operators
319  */
320 Datum
322 {
323  /* For the moment this is just a stub */
324  Oid operator = PG_GETARG_OID(1);
325 
326  PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
327 }
328 
329 /*
330  * Calculate selectivity for "arraycolumn @> const", "arraycolumn && const"
331  * or "arraycolumn <@ const" based on the statistics
332  *
333  * This function is mainly responsible for extracting the pg_statistic data
334  * to be used; we then pass the problem on to mcelem_array_selec().
335  */
336 static Selectivity
338  Oid elemtype, Oid operator)
339 {
340  Selectivity selec;
341  TypeCacheEntry *typentry;
342  FmgrInfo *cmpfunc;
343  ArrayType *array;
344 
345  /* Get element type's default comparison function */
346  typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
347  if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
348  return DEFAULT_SEL(operator);
349  cmpfunc = &typentry->cmp_proc_finfo;
350 
351  /*
352  * The caller made sure the const is an array with same element type, so
353  * get it now
354  */
355  array = DatumGetArrayTypeP(constval);
356 
357  if (HeapTupleIsValid(vardata->statsTuple) &&
358  statistic_proc_security_check(vardata, cmpfunc->fn_oid))
359  {
360  Form_pg_statistic stats;
361  AttStatsSlot sslot;
362  AttStatsSlot hslot;
363 
364  stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
365 
366  /* MCELEM will be an array of same type as column */
367  if (get_attstatsslot(&sslot, vardata->statsTuple,
368  STATISTIC_KIND_MCELEM, InvalidOid,
370  {
371  /*
372  * For "array <@ const" case we also need histogram of distinct
373  * element counts.
374  */
375  if (operator != OID_ARRAY_CONTAINED_OP ||
376  !get_attstatsslot(&hslot, vardata->statsTuple,
377  STATISTIC_KIND_DECHIST, InvalidOid,
379  memset(&hslot, 0, sizeof(hslot));
380 
381  /* Use the most-common-elements slot for the array Var. */
382  selec = mcelem_array_selec(array, typentry,
383  sslot.values, sslot.nvalues,
384  sslot.numbers, sslot.nnumbers,
385  hslot.numbers, hslot.nnumbers,
386  operator);
387 
388  free_attstatsslot(&hslot);
389  free_attstatsslot(&sslot);
390  }
391  else
392  {
393  /* No most-common-elements info, so do without */
394  selec = mcelem_array_selec(array, typentry,
395  NULL, 0, NULL, 0, NULL, 0,
396  operator);
397  }
398 
399  /*
400  * MCE stats count only non-null rows, so adjust for null rows.
401  */
402  selec *= (1.0 - stats->stanullfrac);
403  }
404  else
405  {
406  /* No stats at all, so do without */
407  selec = mcelem_array_selec(array, typentry,
408  NULL, 0, NULL, 0, NULL, 0,
409  operator);
410  /* we assume no nulls here, so no stanullfrac correction */
411  }
412 
413  /* If constant was toasted, release the copy we made */
414  if (PointerGetDatum(array) != constval)
415  pfree(array);
416 
417  return selec;
418 }
419 
420 /*
421  * Array selectivity estimation based on most common elements statistics
422  *
423  * This function just deconstructs and sorts the array constant's contents,
424  * and then passes the problem on to mcelem_array_contain_overlap_selec or
425  * mcelem_array_contained_selec depending on the operator.
426  */
427 static Selectivity
429  Datum *mcelem, int nmcelem,
430  float4 *numbers, int nnumbers,
431  float4 *hist, int nhist,
432  Oid operator)
433 {
434  Selectivity selec;
435  int num_elems;
436  Datum *elem_values;
437  bool *elem_nulls;
438  bool null_present;
439  int nonnull_nitems;
440  int i;
441 
442  /*
443  * Prepare constant array data for sorting. Sorting lets us find unique
444  * elements and efficiently merge with the MCELEM array.
445  */
446  deconstruct_array(array,
447  typentry->type_id,
448  typentry->typlen,
449  typentry->typbyval,
450  typentry->typalign,
451  &elem_values, &elem_nulls, &num_elems);
452 
453  /* Collapse out any null elements */
454  nonnull_nitems = 0;
455  null_present = false;
456  for (i = 0; i < num_elems; i++)
457  {
458  if (elem_nulls[i])
459  null_present = true;
460  else
461  elem_values[nonnull_nitems++] = elem_values[i];
462  }
463 
464  /*
465  * Query "column @> '{anything, null}'" matches nothing. For the other
466  * two operators, presence of a null in the constant can be ignored.
467  */
468  if (null_present && operator == OID_ARRAY_CONTAINS_OP)
469  {
470  pfree(elem_values);
471  pfree(elem_nulls);
472  return (Selectivity) 0.0;
473  }
474 
475  /* Sort extracted elements using their default comparison function. */
476  qsort_arg(elem_values, nonnull_nitems, sizeof(Datum),
477  element_compare, typentry);
478 
479  /* Separate cases according to operator */
480  if (operator == OID_ARRAY_CONTAINS_OP || operator == OID_ARRAY_OVERLAP_OP)
481  selec = mcelem_array_contain_overlap_selec(mcelem, nmcelem,
482  numbers, nnumbers,
483  elem_values, nonnull_nitems,
484  operator, typentry);
485  else if (operator == OID_ARRAY_CONTAINED_OP)
486  selec = mcelem_array_contained_selec(mcelem, nmcelem,
487  numbers, nnumbers,
488  elem_values, nonnull_nitems,
489  hist, nhist,
490  operator, typentry);
491  else
492  {
493  elog(ERROR, "arraycontsel called for unrecognized operator %u",
494  operator);
495  selec = 0.0; /* keep compiler quiet */
496  }
497 
498  pfree(elem_values);
499  pfree(elem_nulls);
500  return selec;
501 }
502 
503 /*
504  * Estimate selectivity of "column @> const" and "column && const" based on
505  * most common element statistics. This estimation assumes element
506  * occurrences are independent.
507  *
508  * mcelem (of length nmcelem) and numbers (of length nnumbers) are from
509  * the array column's MCELEM statistics slot, or are NULL/0 if stats are
510  * not available. array_data (of length nitems) is the constant's elements.
511  *
512  * Both the mcelem and array_data arrays are assumed presorted according
513  * to the element type's cmpfunc. Null elements are not present.
514  *
515  * TODO: this estimate probably could be improved by using the distinct
516  * elements count histogram. For example, excepting the special case of
517  * "column @> '{}'", we can multiply the calculated selectivity by the
518  * fraction of nonempty arrays in the column.
519  */
520 static Selectivity
522  float4 *numbers, int nnumbers,
523  Datum *array_data, int nitems,
524  Oid operator, TypeCacheEntry *typentry)
525 {
526  Selectivity selec,
527  elem_selec;
528  int mcelem_index,
529  i;
530  bool use_bsearch;
531  float4 minfreq;
532 
533  /*
534  * There should be three more Numbers than Values, because the last three
535  * cells should hold minimal and maximal frequency among the non-null
536  * elements, and then the frequency of null elements. Ignore the Numbers
537  * if not right.
538  */
539  if (nnumbers != nmcelem + 3)
540  {
541  numbers = NULL;
542  nnumbers = 0;
543  }
544 
545  if (numbers)
546  {
547  /* Grab the lowest observed frequency */
548  minfreq = numbers[nmcelem];
549  }
550  else
551  {
552  /* Without statistics make some default assumptions */
553  minfreq = 2 * (float4) DEFAULT_CONTAIN_SEL;
554  }
555 
556  /* Decide whether it is faster to use binary search or not. */
557  if (nitems * floor_log2((uint32) nmcelem) < nmcelem + nitems)
558  use_bsearch = true;
559  else
560  use_bsearch = false;
561 
562  if (operator == OID_ARRAY_CONTAINS_OP)
563  {
564  /*
565  * Initial selectivity for "column @> const" query is 1.0, and it will
566  * be decreased with each element of constant array.
567  */
568  selec = 1.0;
569  }
570  else
571  {
572  /*
573  * Initial selectivity for "column && const" query is 0.0, and it will
574  * be increased with each element of constant array.
575  */
576  selec = 0.0;
577  }
578 
579  /* Scan mcelem and array in parallel. */
580  mcelem_index = 0;
581  for (i = 0; i < nitems; i++)
582  {
583  bool match = false;
584 
585  /* Ignore any duplicates in the array data. */
586  if (i > 0 &&
587  element_compare(&array_data[i - 1], &array_data[i], typentry) == 0)
588  continue;
589 
590  /* Find the smallest MCELEM >= this array item. */
591  if (use_bsearch)
592  {
593  match = find_next_mcelem(mcelem, nmcelem, array_data[i],
594  &mcelem_index, typentry);
595  }
596  else
597  {
598  while (mcelem_index < nmcelem)
599  {
600  int cmp = element_compare(&mcelem[mcelem_index],
601  &array_data[i],
602  typentry);
603 
604  if (cmp < 0)
605  mcelem_index++;
606  else
607  {
608  if (cmp == 0)
609  match = true; /* mcelem is found */
610  break;
611  }
612  }
613  }
614 
615  if (match && numbers)
616  {
617  /* MCELEM matches the array item; use its frequency. */
618  elem_selec = numbers[mcelem_index];
619  mcelem_index++;
620  }
621  else
622  {
623  /*
624  * The element is not in MCELEM. Punt, but assume that the
625  * selectivity cannot be more than minfreq / 2.
626  */
627  elem_selec = Min(DEFAULT_CONTAIN_SEL, minfreq / 2);
628  }
629 
630  /*
631  * Update overall selectivity using the current element's selectivity
632  * and an assumption of element occurrence independence.
633  */
634  if (operator == OID_ARRAY_CONTAINS_OP)
635  selec *= elem_selec;
636  else
637  selec = selec + elem_selec - selec * elem_selec;
638 
639  /* Clamp intermediate results to stay sane despite roundoff error */
640  CLAMP_PROBABILITY(selec);
641  }
642 
643  return selec;
644 }
645 
646 /*
647  * Estimate selectivity of "column <@ const" based on most common element
648  * statistics.
649  *
650  * mcelem (of length nmcelem) and numbers (of length nnumbers) are from
651  * the array column's MCELEM statistics slot, or are NULL/0 if stats are
652  * not available. array_data (of length nitems) is the constant's elements.
653  * hist (of length nhist) is from the array column's DECHIST statistics slot,
654  * or is NULL/0 if those stats are not available.
655  *
656  * Both the mcelem and array_data arrays are assumed presorted according
657  * to the element type's cmpfunc. Null elements are not present.
658  *
659  * Independent element occurrence would imply a particular distribution of
660  * distinct element counts among matching rows. Real data usually falsifies
661  * that assumption. For example, in a set of 11-element integer arrays having
662  * elements in the range [0..10], element occurrences are typically not
663  * independent. If they were, a sufficiently-large set would include all
664  * distinct element counts 0 through 11. We correct for this using the
665  * histogram of distinct element counts.
666  *
667  * In the "column @> const" and "column && const" cases, we usually have a
668  * "const" with low number of elements (otherwise we have selectivity close
669  * to 0 or 1 respectively). That's why the effect of dependence related
670  * to distinct element count distribution is negligible there. In the
671  * "column <@ const" case, number of elements is usually high (otherwise we
672  * have selectivity close to 0). That's why we should do a correction with
673  * the array distinct element count distribution here.
674  *
675  * Using the histogram of distinct element counts produces a different
676  * distribution law than independent occurrences of elements. This
677  * distribution law can be described as follows:
678  *
679  * P(o1, o2, ..., on) = f1^o1 * (1 - f1)^(1 - o1) * f2^o2 *
680  * (1 - f2)^(1 - o2) * ... * fn^on * (1 - fn)^(1 - on) * hist[m] / ind[m]
681  *
682  * where:
683  * o1, o2, ..., on - occurrences of elements 1, 2, ..., n
684  * (1 - occurrence, 0 - no occurrence) in row
685  * f1, f2, ..., fn - frequencies of elements 1, 2, ..., n
686  * (scalar values in [0..1]) according to collected statistics
687  * m = o1 + o2 + ... + on = total number of distinct elements in row
688  * hist[m] - histogram data for occurrence of m elements.
689  * ind[m] - probability of m occurrences from n events assuming their
690  * probabilities to be equal to frequencies of array elements.
691  *
692  * ind[m] = sum(f1^o1 * (1 - f1)^(1 - o1) * f2^o2 * (1 - f2)^(1 - o2) *
693  * ... * fn^on * (1 - fn)^(1 - on), o1, o2, ..., on) | o1 + o2 + .. on = m
694  */
695 static Selectivity
696 mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
697  float4 *numbers, int nnumbers,
698  Datum *array_data, int nitems,
699  float4 *hist, int nhist,
700  Oid operator, TypeCacheEntry *typentry)
701 {
702  int mcelem_index,
703  i,
704  unique_nitems = 0;
705  float selec,
706  minfreq,
707  nullelem_freq;
708  float *dist,
709  *mcelem_dist,
710  *hist_part;
711  float avg_count,
712  mult,
713  rest;
714  float *elem_selec;
715 
716  /*
717  * There should be three more Numbers than Values in the MCELEM slot,
718  * because the last three cells should hold minimal and maximal frequency
719  * among the non-null elements, and then the frequency of null elements.
720  * Punt if not right, because we can't do much without the element freqs.
721  */
722  if (numbers == NULL || nnumbers != nmcelem + 3)
723  return DEFAULT_CONTAIN_SEL;
724 
725  /* Can't do much without a count histogram, either */
726  if (hist == NULL || nhist < 3)
727  return DEFAULT_CONTAIN_SEL;
728 
729  /*
730  * Grab some of the summary statistics that compute_array_stats() stores:
731  * lowest frequency, frequency of null elements, and average distinct
732  * element count.
733  */
734  minfreq = numbers[nmcelem];
735  nullelem_freq = numbers[nmcelem + 2];
736  avg_count = hist[nhist - 1];
737 
738  /*
739  * "rest" will be the sum of the frequencies of all elements not
740  * represented in MCELEM. The average distinct element count is the sum
741  * of the frequencies of *all* elements. Begin with that; we will proceed
742  * to subtract the MCELEM frequencies.
743  */
744  rest = avg_count;
745 
746  /*
747  * mult is a multiplier representing estimate of probability that each
748  * mcelem that is not present in constant doesn't occur.
749  */
750  mult = 1.0f;
751 
752  /*
753  * elem_selec is array of estimated frequencies for elements in the
754  * constant.
755  */
756  elem_selec = (float *) palloc(sizeof(float) * nitems);
757 
758  /* Scan mcelem and array in parallel. */
759  mcelem_index = 0;
760  for (i = 0; i < nitems; i++)
761  {
762  bool match = false;
763 
764  /* Ignore any duplicates in the array data. */
765  if (i > 0 &&
766  element_compare(&array_data[i - 1], &array_data[i], typentry) == 0)
767  continue;
768 
769  /*
770  * Iterate over MCELEM until we find an entry greater than or equal to
771  * this element of the constant. Update "rest" and "mult" for mcelem
772  * entries skipped over.
773  */
774  while (mcelem_index < nmcelem)
775  {
776  int cmp = element_compare(&mcelem[mcelem_index],
777  &array_data[i],
778  typentry);
779 
780  if (cmp < 0)
781  {
782  mult *= (1.0f - numbers[mcelem_index]);
783  rest -= numbers[mcelem_index];
784  mcelem_index++;
785  }
786  else
787  {
788  if (cmp == 0)
789  match = true; /* mcelem is found */
790  break;
791  }
792  }
793 
794  if (match)
795  {
796  /* MCELEM matches the array item. */
797  elem_selec[unique_nitems] = numbers[mcelem_index];
798  /* "rest" is decremented for all mcelems, matched or not */
799  rest -= numbers[mcelem_index];
800  mcelem_index++;
801  }
802  else
803  {
804  /*
805  * The element is not in MCELEM. Punt, but assume that the
806  * selectivity cannot be more than minfreq / 2.
807  */
808  elem_selec[unique_nitems] = Min(DEFAULT_CONTAIN_SEL,
809  minfreq / 2);
810  }
811 
812  unique_nitems++;
813  }
814 
815  /*
816  * If we handled all constant elements without exhausting the MCELEM
817  * array, finish walking it to complete calculation of "rest" and "mult".
818  */
819  while (mcelem_index < nmcelem)
820  {
821  mult *= (1.0f - numbers[mcelem_index]);
822  rest -= numbers[mcelem_index];
823  mcelem_index++;
824  }
825 
826  /*
827  * The presence of many distinct rare elements materially decreases
828  * selectivity. Use the Poisson distribution to estimate the probability
829  * of a column value having zero occurrences of such elements. See above
830  * for the definition of "rest".
831  */
832  mult *= exp(-rest);
833 
834  /*----------
835  * Using the distinct element count histogram requires
836  * O(unique_nitems * (nmcelem + unique_nitems))
837  * operations. Beyond a certain computational cost threshold, it's
838  * reasonable to sacrifice accuracy for decreased planning time. We limit
839  * the number of operations to EFFORT * nmcelem; since nmcelem is limited
840  * by the column's statistics target, the work done is user-controllable.
841  *
842  * If the number of operations would be too large, we can reduce it
843  * without losing all accuracy by reducing unique_nitems and considering
844  * only the most-common elements of the constant array. To make the
845  * results exactly match what we would have gotten with only those
846  * elements to start with, we'd have to remove any discarded elements'
847  * frequencies from "mult", but since this is only an approximation
848  * anyway, we don't bother with that. Therefore it's sufficient to qsort
849  * elem_selec[] and take the largest elements. (They will no longer match
850  * up with the elements of array_data[], but we don't care.)
851  *----------
852  */
853 #define EFFORT 100
854 
855  if ((nmcelem + unique_nitems) > 0 &&
856  unique_nitems > EFFORT * nmcelem / (nmcelem + unique_nitems))
857  {
858  /*
859  * Use the quadratic formula to solve for largest allowable N. We
860  * have A = 1, B = nmcelem, C = - EFFORT * nmcelem.
861  */
862  double b = (double) nmcelem;
863  int n;
864 
865  n = (int) ((sqrt(b * b + 4 * EFFORT * b) - b) / 2);
866 
867  /* Sort, then take just the first n elements */
868  qsort(elem_selec, unique_nitems, sizeof(float),
870  unique_nitems = n;
871  }
872 
873  /*
874  * Calculate probabilities of each distinct element count for both mcelems
875  * and constant elements. At this point, assume independent element
876  * occurrence.
877  */
878  dist = calc_distr(elem_selec, unique_nitems, unique_nitems, 0.0f);
879  mcelem_dist = calc_distr(numbers, nmcelem, unique_nitems, rest);
880 
881  /* ignore hist[nhist-1], which is the average not a histogram member */
882  hist_part = calc_hist(hist, nhist - 1, unique_nitems);
883 
884  selec = 0.0f;
885  for (i = 0; i <= unique_nitems; i++)
886  {
887  /*
888  * mult * dist[i] / mcelem_dist[i] gives us probability of qual
889  * matching from assumption of independent element occurrence with the
890  * condition that distinct element count = i.
891  */
892  if (mcelem_dist[i] > 0)
893  selec += hist_part[i] * mult * dist[i] / mcelem_dist[i];
894  }
895 
896  pfree(dist);
897  pfree(mcelem_dist);
898  pfree(hist_part);
899  pfree(elem_selec);
900 
901  /* Take into account occurrence of NULL element. */
902  selec *= (1.0f - nullelem_freq);
903 
904  CLAMP_PROBABILITY(selec);
905 
906  return selec;
907 }
908 
909 /*
910  * Calculate the first n distinct element count probabilities from a
911  * histogram of distinct element counts.
912  *
913  * Returns a palloc'd array of n+1 entries, with array[k] being the
914  * probability of element count k, k in [0..n].
915  *
916  * We assume that a histogram box with bounds a and b gives 1 / ((b - a + 1) *
917  * (nhist - 1)) probability to each value in (a,b) and an additional half of
918  * that to a and b themselves.
919  */
920 static float *
921 calc_hist(const float4 *hist, int nhist, int n)
922 {
923  float *hist_part;
924  int k,
925  i = 0;
926  float prev_interval = 0,
927  next_interval;
928  float frac;
929 
930  hist_part = (float *) palloc((n + 1) * sizeof(float));
931 
932  /*
933  * frac is a probability contribution for each interval between histogram
934  * values. We have nhist - 1 intervals, so contribution of each one will
935  * be 1 / (nhist - 1).
936  */
937  frac = 1.0f / ((float) (nhist - 1));
938 
939  for (k = 0; k <= n; k++)
940  {
941  int count = 0;
942 
943  /*
944  * Count the histogram boundaries equal to k. (Although the histogram
945  * should theoretically contain only exact integers, entries are
946  * floats so there could be roundoff error in large values. Treat any
947  * fractional value as equal to the next larger k.)
948  */
949  while (i < nhist && hist[i] <= k)
950  {
951  count++;
952  i++;
953  }
954 
955  if (count > 0)
956  {
957  /* k is an exact bound for at least one histogram box. */
958  float val;
959 
960  /* Find length between current histogram value and the next one */
961  if (i < nhist)
962  next_interval = hist[i] - hist[i - 1];
963  else
964  next_interval = 0;
965 
966  /*
967  * count - 1 histogram boxes contain k exclusively. They
968  * contribute a total of (count - 1) * frac probability. Also
969  * factor in the partial histogram boxes on either side.
970  */
971  val = (float) (count - 1);
972  if (next_interval > 0)
973  val += 0.5f / next_interval;
974  if (prev_interval > 0)
975  val += 0.5f / prev_interval;
976  hist_part[k] = frac * val;
977 
978  prev_interval = next_interval;
979  }
980  else
981  {
982  /* k does not appear as an exact histogram bound. */
983  if (prev_interval > 0)
984  hist_part[k] = frac / prev_interval;
985  else
986  hist_part[k] = 0.0f;
987  }
988  }
989 
990  return hist_part;
991 }
992 
993 /*
994  * Consider n independent events with probabilities p[]. This function
995  * calculates probabilities of exact k of events occurrence for k in [0..m].
996  * Returns a palloc'd array of size m+1.
997  *
998  * "rest" is the sum of the probabilities of all low-probability events not
999  * included in p.
1000  *
1001  * Imagine matrix M of size (n + 1) x (m + 1). Element M[i,j] denotes the
1002  * probability that exactly j of first i events occur. Obviously M[0,0] = 1.
1003  * For any constant j, each increment of i increases the probability iff the
1004  * event occurs. So, by the law of total probability:
1005  * M[i,j] = M[i - 1, j] * (1 - p[i]) + M[i - 1, j - 1] * p[i]
1006  * for i > 0, j > 0.
1007  * M[i,0] = M[i - 1, 0] * (1 - p[i]) for i > 0.
1008  */
1009 static float *
1010 calc_distr(const float *p, int n, int m, float rest)
1011 {
1012  float *row,
1013  *prev_row,
1014  *tmp;
1015  int i,
1016  j;
1017 
1018  /*
1019  * Since we return only the last row of the matrix and need only the
1020  * current and previous row for calculations, allocate two rows.
1021  */
1022  row = (float *) palloc((m + 1) * sizeof(float));
1023  prev_row = (float *) palloc((m + 1) * sizeof(float));
1024 
1025  /* M[0,0] = 1 */
1026  row[0] = 1.0f;
1027  for (i = 1; i <= n; i++)
1028  {
1029  float t = p[i - 1];
1030 
1031  /* Swap rows */
1032  tmp = row;
1033  row = prev_row;
1034  prev_row = tmp;
1035 
1036  /* Calculate next row */
1037  for (j = 0; j <= i && j <= m; j++)
1038  {
1039  float val = 0.0f;
1040 
1041  if (j < i)
1042  val += prev_row[j] * (1.0f - t);
1043  if (j > 0)
1044  val += prev_row[j - 1] * t;
1045  row[j] = val;
1046  }
1047  }
1048 
1049  /*
1050  * The presence of many distinct rare (not in "p") elements materially
1051  * decreases selectivity. Model their collective occurrence with the
1052  * Poisson distribution.
1053  */
1054  if (rest > DEFAULT_CONTAIN_SEL)
1055  {
1056  float t;
1057 
1058  /* Swap rows */
1059  tmp = row;
1060  row = prev_row;
1061  prev_row = tmp;
1062 
1063  for (i = 0; i <= m; i++)
1064  row[i] = 0.0f;
1065 
1066  /* Value of Poisson distribution for 0 occurrences */
1067  t = exp(-rest);
1068 
1069  /*
1070  * Calculate convolution of previously computed distribution and the
1071  * Poisson distribution.
1072  */
1073  for (i = 0; i <= m; i++)
1074  {
1075  for (j = 0; j <= m - i; j++)
1076  row[j + i] += prev_row[j] * t;
1077 
1078  /* Get Poisson distribution value for (i + 1) occurrences */
1079  t *= rest / (float) (i + 1);
1080  }
1081  }
1082 
1083  pfree(prev_row);
1084  return row;
1085 }
1086 
1087 /* Fast function for floor value of 2 based logarithm calculation. */
1088 static int
1090 {
1091  int logval = 0;
1092 
1093  if (n == 0)
1094  return -1;
1095  if (n >= (1 << 16))
1096  {
1097  n >>= 16;
1098  logval += 16;
1099  }
1100  if (n >= (1 << 8))
1101  {
1102  n >>= 8;
1103  logval += 8;
1104  }
1105  if (n >= (1 << 4))
1106  {
1107  n >>= 4;
1108  logval += 4;
1109  }
1110  if (n >= (1 << 2))
1111  {
1112  n >>= 2;
1113  logval += 2;
1114  }
1115  if (n >= (1 << 1))
1116  {
1117  logval += 1;
1118  }
1119  return logval;
1120 }
1121 
1122 /*
1123  * find_next_mcelem binary-searches a most common elements array, starting
1124  * from *index, for the first member >= value. It saves the position of the
1125  * match into *index and returns true if it's an exact match. (Note: we
1126  * assume the mcelem elements are distinct so there can't be more than one
1127  * exact match.)
1128  */
1129 static bool
1130 find_next_mcelem(Datum *mcelem, int nmcelem, Datum value, int *index,
1131  TypeCacheEntry *typentry)
1132 {
1133  int l = *index,
1134  r = nmcelem - 1,
1135  i,
1136  res;
1137 
1138  while (l <= r)
1139  {
1140  i = (l + r) / 2;
1141  res = element_compare(&mcelem[i], &value, typentry);
1142  if (res == 0)
1143  {
1144  *index = i;
1145  return true;
1146  }
1147  else if (res < 0)
1148  l = i + 1;
1149  else
1150  r = i - 1;
1151  }
1152  *index = l;
1153  return false;
1154 }
1155 
1156 /*
1157  * Comparison function for elements.
1158  *
1159  * We use the element type's default btree opclass, and its default collation
1160  * if the type is collation-sensitive.
1161  *
1162  * XXX consider using SortSupport infrastructure
1163  */
1164 static int
1165 element_compare(const void *key1, const void *key2, void *arg)
1166 {
1167  Datum d1 = *((const Datum *) key1);
1168  Datum d2 = *((const Datum *) key2);
1169  TypeCacheEntry *typentry = (TypeCacheEntry *) arg;
1170  FmgrInfo *cmpfunc = &typentry->cmp_proc_finfo;
1171  Datum c;
1172 
1173  c = FunctionCall2Coll(cmpfunc, typentry->typcollation, d1, d2);
1174  return DatumGetInt32(c);
1175 }
1176 
1177 /*
1178  * Comparison function for sorting floats into descending order.
1179  */
1180 static int
1181 float_compare_desc(const void *key1, const void *key2)
1182 {
1183  float d1 = *((const float *) key1);
1184  float d2 = *((const float *) key2);
1185 
1186  if (d1 > d2)
1187  return -1;
1188  else if (d1 < d2)
1189  return 1;
1190  else
1191  return 0;
1192 }
#define DatumGetArrayTypeP(X)
Definition: array.h:261
Selectivity scalararraysel_containment(PlannerInfo *root, Node *leftop, Node *rightop, Oid elemtype, bool isEquality, bool useOr, int varRelid)
static int float_compare_desc(const void *key1, const void *key2)
static Selectivity mcelem_array_contained_selec(Datum *mcelem, int nmcelem, float4 *numbers, int nnumbers, Datum *array_data, int nitems, float4 *hist, int nhist, Oid operator, TypeCacheEntry *typentry)
#define DEFAULT_CONTAIN_SEL
static Selectivity calc_arraycontsel(VariableStatData *vardata, Datum constval, Oid elemtype, Oid operator)
static int floor_log2(uint32 n)
#define EFFORT
static float * calc_hist(const float4 *hist, int nhist, int n)
static int element_compare(const void *key1, const void *key2, void *arg)
static float * calc_distr(const float *p, int n, int m, float rest)
Datum arraycontsel(PG_FUNCTION_ARGS)
static Selectivity mcelem_array_selec(ArrayType *array, TypeCacheEntry *typentry, Datum *mcelem, int nmcelem, float4 *numbers, int nnumbers, float4 *hist, int nhist, Oid operator)
static Selectivity mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem, float4 *numbers, int nnumbers, Datum *array_data, int nitems, Oid operator, TypeCacheEntry *typentry)
Datum arraycontjoinsel(PG_FUNCTION_ARGS)
#define DEFAULT_SEL(operator)
static bool find_next_mcelem(Datum *mcelem, int nmcelem, Datum value, int *index, TypeCacheEntry *typentry)
void deconstruct_array(ArrayType *array, Oid elmtype, int elmlen, bool elmbyval, char elmalign, Datum **elemsp, bool **nullsp, int *nelemsp)
Definition: arrayfuncs.c:3612
unsigned int uint32
Definition: c.h:506
#define Min(x, y)
Definition: c.h:1004
double float8
Definition: c.h:630
float float4
Definition: c.h:629
#define OidIsValid(objectId)
Definition: c.h:775
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:224
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1149
#define PG_GETARG_OID(n)
Definition: fmgr.h:275
#define PG_RETURN_FLOAT8(x)
Definition: fmgr.h:367
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
#define GETSTRUCT(TUP)
Definition: htup_details.h:653
#define nitems(x)
Definition: indent.h:31
long val
Definition: informix.c:670
static struct @155 value
int b
Definition: isn.c:70
int j
Definition: isn.c:74
int i
Definition: isn.c:73
void free_attstatsslot(AttStatsSlot *sslot)
Definition: lsyscache.c:3344
Oid get_base_element_type(Oid typid)
Definition: lsyscache.c:2832
bool get_attstatsslot(AttStatsSlot *sslot, HeapTuple statstuple, int reqkind, Oid reqop, int flags)
Definition: lsyscache.c:3234
#define ATTSTATSSLOT_NUMBERS
Definition: lsyscache.h:43
#define ATTSTATSSLOT_VALUES
Definition: lsyscache.h:42
void pfree(void *pointer)
Definition: mcxt.c:1520
void * palloc(Size size)
Definition: mcxt.c:1316
#define IsA(nodeptr, _type_)
Definition: nodes.h:158
double Selectivity
Definition: nodes.h:250
void * arg
FormData_pg_statistic * Form_pg_statistic
Definition: pg_statistic.h:135
void qsort_arg(void *base, size_t nel, size_t elsize, qsort_arg_comparator cmp, void *arg)
#define qsort(a, b, c, d)
Definition: port.h:449
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:322
uintptr_t Datum
Definition: postgres.h:64
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:202
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
char * c
tree ctl root
Definition: radixtree.h:1884
static int cmp(const chr *x, const chr *y, size_t len)
Definition: regc_locale.c:743
bool get_restriction_variable(PlannerInfo *root, List *args, int varRelid, VariableStatData *vardata, Node **other, bool *varonleft)
Definition: selfuncs.c:4883
void examine_variable(PlannerInfo *root, Node *node, int varRelid, VariableStatData *vardata)
Definition: selfuncs.c:5012
bool statistic_proc_security_check(VariableStatData *vardata, Oid func_oid)
Definition: selfuncs.c:5735
#define ReleaseVariableStats(vardata)
Definition: selfuncs.h:99
#define CLAMP_PROBABILITY(p)
Definition: selfuncs.h:63
Datum * values
Definition: lsyscache.h:53
float4 * numbers
Definition: lsyscache.h:56
int nnumbers
Definition: lsyscache.h:57
Definition: fmgr.h:57
Oid fn_oid
Definition: fmgr.h:59
Definition: pg_list.h:54
Definition: nodes.h:129
FmgrInfo cmp_proc_finfo
Definition: typcache.h:76
char typalign
Definition: typcache.h:41
bool typbyval
Definition: typcache.h:40
int16 typlen
Definition: typcache.h:39
Oid typcollation
Definition: typcache.h:47
HeapTuple statsTuple
Definition: selfuncs.h:89
RelOptInfo * rel
Definition: selfuncs.h:88
Definition: type.h:95
TypeCacheEntry * lookup_type_cache(Oid type_id, int flags)
Definition: typcache.c:346
#define TYPECACHE_CMP_PROC_FINFO
Definition: typcache.h:143