PostgreSQL Source Code  git master
array_selfuncs.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * array_selfuncs.c
4  * Functions for selectivity estimation of array operators
5  *
6  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/utils/adt/array_selfuncs.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include <math.h>
18 
19 #include "access/htup_details.h"
20 #include "catalog/pg_collation.h"
21 #include "catalog/pg_operator.h"
22 #include "catalog/pg_statistic.h"
23 #include "utils/array.h"
24 #include "utils/builtins.h"
25 #include "utils/lsyscache.h"
26 #include "utils/selfuncs.h"
27 #include "utils/typcache.h"
28 
29 
30 /* Default selectivity constant for "@>" and "<@" operators */
31 #define DEFAULT_CONTAIN_SEL 0.005
32 
33 /* Default selectivity constant for "&&" operator */
34 #define DEFAULT_OVERLAP_SEL 0.01
35 
36 /* Default selectivity for given operator */
37 #define DEFAULT_SEL(operator) \
38  ((operator) == OID_ARRAY_OVERLAP_OP ? \
39  DEFAULT_OVERLAP_SEL : DEFAULT_CONTAIN_SEL)
40 
41 static Selectivity calc_arraycontsel(VariableStatData *vardata, Datum constval,
42  Oid elemtype, Oid operator);
44  TypeCacheEntry *typentry,
45  Datum *mcelem, int nmcelem,
46  float4 *numbers, int nnumbers,
47  float4 *hist, int nhist,
48  Oid operator);
49 static Selectivity mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
50  float4 *numbers, int nnumbers,
51  Datum *array_data, int nitems,
52  Oid operator, TypeCacheEntry *typentry);
53 static Selectivity mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
54  float4 *numbers, int nnumbers,
55  Datum *array_data, int nitems,
56  float4 *hist, int nhist,
57  Oid operator, TypeCacheEntry *typentry);
58 static float *calc_hist(const float4 *hist, int nhist, int n);
59 static float *calc_distr(const float *p, int n, int m, float rest);
60 static int floor_log2(uint32 n);
61 static bool find_next_mcelem(Datum *mcelem, int nmcelem, Datum value,
62  int *index, TypeCacheEntry *typentry);
63 static int element_compare(const void *key1, const void *key2, void *arg);
64 static int float_compare_desc(const void *key1, const void *key2);
65 
66 
67 /*
68  * scalararraysel_containment
69  * Estimate selectivity of ScalarArrayOpExpr via array containment.
70  *
71  * If we have const =/<> ANY/ALL (array_var) then we can estimate the
72  * selectivity as though this were an array containment operator,
73  * array_var op ARRAY[const].
74  *
75  * scalararraysel() has already verified that the ScalarArrayOpExpr's operator
76  * is the array element type's default equality or inequality operator, and
77  * has aggressively simplified both inputs to constants.
78  *
79  * Returns selectivity (0..1), or -1 if we fail to estimate selectivity.
80  */
83  Node *leftop, Node *rightop,
84  Oid elemtype, bool isEquality, bool useOr,
85  int varRelid)
86 {
87  Selectivity selec;
88  VariableStatData vardata;
89  Datum constval;
90  TypeCacheEntry *typentry;
91  FmgrInfo *cmpfunc;
92 
93  /*
94  * rightop must be a variable, else punt.
95  */
96  examine_variable(root, rightop, varRelid, &vardata);
97  if (!vardata.rel)
98  {
99  ReleaseVariableStats(vardata);
100  return -1.0;
101  }
102 
103  /*
104  * leftop must be a constant, else punt.
105  */
106  if (!IsA(leftop, Const))
107  {
108  ReleaseVariableStats(vardata);
109  return -1.0;
110  }
111  if (((Const *) leftop)->constisnull)
112  {
113  /* qual can't succeed if null on left */
114  ReleaseVariableStats(vardata);
115  return (Selectivity) 0.0;
116  }
117  constval = ((Const *) leftop)->constvalue;
118 
119  /* Get element type's default comparison function */
120  typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
121  if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
122  {
123  ReleaseVariableStats(vardata);
124  return -1.0;
125  }
126  cmpfunc = &typentry->cmp_proc_finfo;
127 
128  /*
129  * If the operator is <>, swap ANY/ALL, then invert the result later.
130  */
131  if (!isEquality)
132  useOr = !useOr;
133 
134  /* Get array element stats for var, if available */
135  if (HeapTupleIsValid(vardata.statsTuple) &&
136  statistic_proc_security_check(&vardata, cmpfunc->fn_oid))
137  {
138  Form_pg_statistic stats;
139  AttStatsSlot sslot;
140  AttStatsSlot hslot;
141 
142  stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
143 
144  /* MCELEM will be an array of same type as element */
145  if (get_attstatsslot(&sslot, vardata.statsTuple,
146  STATISTIC_KIND_MCELEM, InvalidOid,
148  {
149  /* For ALL case, also get histogram of distinct-element counts */
150  if (useOr ||
151  !get_attstatsslot(&hslot, vardata.statsTuple,
152  STATISTIC_KIND_DECHIST, InvalidOid,
154  memset(&hslot, 0, sizeof(hslot));
155 
156  /*
157  * For = ANY, estimate as var @> ARRAY[const].
158  *
159  * For = ALL, estimate as var <@ ARRAY[const].
160  */
161  if (useOr)
163  sslot.nvalues,
164  sslot.numbers,
165  sslot.nnumbers,
166  &constval, 1,
167  OID_ARRAY_CONTAINS_OP,
168  typentry);
169  else
170  selec = mcelem_array_contained_selec(sslot.values,
171  sslot.nvalues,
172  sslot.numbers,
173  sslot.nnumbers,
174  &constval, 1,
175  hslot.numbers,
176  hslot.nnumbers,
177  OID_ARRAY_CONTAINED_OP,
178  typentry);
179 
180  free_attstatsslot(&hslot);
181  free_attstatsslot(&sslot);
182  }
183  else
184  {
185  /* No most-common-elements info, so do without */
186  if (useOr)
187  selec = mcelem_array_contain_overlap_selec(NULL, 0,
188  NULL, 0,
189  &constval, 1,
190  OID_ARRAY_CONTAINS_OP,
191  typentry);
192  else
193  selec = mcelem_array_contained_selec(NULL, 0,
194  NULL, 0,
195  &constval, 1,
196  NULL, 0,
197  OID_ARRAY_CONTAINED_OP,
198  typentry);
199  }
200 
201  /*
202  * MCE stats count only non-null rows, so adjust for null rows.
203  */
204  selec *= (1.0 - stats->stanullfrac);
205  }
206  else
207  {
208  /* No stats at all, so do without */
209  if (useOr)
210  selec = mcelem_array_contain_overlap_selec(NULL, 0,
211  NULL, 0,
212  &constval, 1,
213  OID_ARRAY_CONTAINS_OP,
214  typentry);
215  else
216  selec = mcelem_array_contained_selec(NULL, 0,
217  NULL, 0,
218  &constval, 1,
219  NULL, 0,
220  OID_ARRAY_CONTAINED_OP,
221  typentry);
222  /* we assume no nulls here, so no stanullfrac correction */
223  }
224 
225  ReleaseVariableStats(vardata);
226 
227  /*
228  * If the operator is <>, invert the results.
229  */
230  if (!isEquality)
231  selec = 1.0 - selec;
232 
233  CLAMP_PROBABILITY(selec);
234 
235  return selec;
236 }
237 
238 /*
239  * arraycontsel -- restriction selectivity for array @>, &&, <@ operators
240  */
241 Datum
243 {
245  Oid operator = PG_GETARG_OID(1);
246  List *args = (List *) PG_GETARG_POINTER(2);
247  int varRelid = PG_GETARG_INT32(3);
248  VariableStatData vardata;
249  Node *other;
250  bool varonleft;
251  Selectivity selec;
252  Oid element_typeid;
253 
254  /*
255  * If expression is not (variable op something) or (something op
256  * variable), then punt and return a default estimate.
257  */
258  if (!get_restriction_variable(root, args, varRelid,
259  &vardata, &other, &varonleft))
260  PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
261 
262  /*
263  * Can't do anything useful if the something is not a constant, either.
264  */
265  if (!IsA(other, Const))
266  {
267  ReleaseVariableStats(vardata);
268  PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
269  }
270 
271  /*
272  * The "&&", "@>" and "<@" operators are strict, so we can cope with a
273  * NULL constant right away.
274  */
275  if (((Const *) other)->constisnull)
276  {
277  ReleaseVariableStats(vardata);
278  PG_RETURN_FLOAT8(0.0);
279  }
280 
281  /*
282  * If var is on the right, commute the operator, so that we can assume the
283  * var is on the left in what follows.
284  */
285  if (!varonleft)
286  {
287  if (operator == OID_ARRAY_CONTAINS_OP)
288  operator = OID_ARRAY_CONTAINED_OP;
289  else if (operator == OID_ARRAY_CONTAINED_OP)
290  operator = OID_ARRAY_CONTAINS_OP;
291  }
292 
293  /*
294  * OK, there's a Var and a Const we're dealing with here. We need the
295  * Const to be an array with same element type as column, else we can't do
296  * anything useful. (Such cases will likely fail at runtime, but here
297  * we'd rather just return a default estimate.)
298  */
299  element_typeid = get_base_element_type(((Const *) other)->consttype);
300  if (element_typeid != InvalidOid &&
301  element_typeid == get_base_element_type(vardata.vartype))
302  {
303  selec = calc_arraycontsel(&vardata, ((Const *) other)->constvalue,
304  element_typeid, operator);
305  }
306  else
307  {
308  selec = DEFAULT_SEL(operator);
309  }
310 
311  ReleaseVariableStats(vardata);
312 
313  CLAMP_PROBABILITY(selec);
314 
315  PG_RETURN_FLOAT8((float8) selec);
316 }
317 
318 /*
319  * arraycontjoinsel -- join selectivity for array @>, &&, <@ operators
320  */
321 Datum
323 {
324  /* For the moment this is just a stub */
325  Oid operator = PG_GETARG_OID(1);
326 
327  PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
328 }
329 
330 /*
331  * Calculate selectivity for "arraycolumn @> const", "arraycolumn && const"
332  * or "arraycolumn <@ const" based on the statistics
333  *
334  * This function is mainly responsible for extracting the pg_statistic data
335  * to be used; we then pass the problem on to mcelem_array_selec().
336  */
337 static Selectivity
339  Oid elemtype, Oid operator)
340 {
341  Selectivity selec;
342  TypeCacheEntry *typentry;
343  FmgrInfo *cmpfunc;
344  ArrayType *array;
345 
346  /* Get element type's default comparison function */
347  typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
348  if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
349  return DEFAULT_SEL(operator);
350  cmpfunc = &typentry->cmp_proc_finfo;
351 
352  /*
353  * The caller made sure the const is an array with same element type, so
354  * get it now
355  */
356  array = DatumGetArrayTypeP(constval);
357 
358  if (HeapTupleIsValid(vardata->statsTuple) &&
359  statistic_proc_security_check(vardata, cmpfunc->fn_oid))
360  {
361  Form_pg_statistic stats;
362  AttStatsSlot sslot;
363  AttStatsSlot hslot;
364 
365  stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
366 
367  /* MCELEM will be an array of same type as column */
368  if (get_attstatsslot(&sslot, vardata->statsTuple,
369  STATISTIC_KIND_MCELEM, InvalidOid,
371  {
372  /*
373  * For "array <@ const" case we also need histogram of distinct
374  * element counts.
375  */
376  if (operator != OID_ARRAY_CONTAINED_OP ||
377  !get_attstatsslot(&hslot, vardata->statsTuple,
378  STATISTIC_KIND_DECHIST, InvalidOid,
380  memset(&hslot, 0, sizeof(hslot));
381 
382  /* Use the most-common-elements slot for the array Var. */
383  selec = mcelem_array_selec(array, typentry,
384  sslot.values, sslot.nvalues,
385  sslot.numbers, sslot.nnumbers,
386  hslot.numbers, hslot.nnumbers,
387  operator);
388 
389  free_attstatsslot(&hslot);
390  free_attstatsslot(&sslot);
391  }
392  else
393  {
394  /* No most-common-elements info, so do without */
395  selec = mcelem_array_selec(array, typentry,
396  NULL, 0, NULL, 0, NULL, 0,
397  operator);
398  }
399 
400  /*
401  * MCE stats count only non-null rows, so adjust for null rows.
402  */
403  selec *= (1.0 - stats->stanullfrac);
404  }
405  else
406  {
407  /* No stats at all, so do without */
408  selec = mcelem_array_selec(array, typentry,
409  NULL, 0, NULL, 0, NULL, 0,
410  operator);
411  /* we assume no nulls here, so no stanullfrac correction */
412  }
413 
414  /* If constant was toasted, release the copy we made */
415  if (PointerGetDatum(array) != constval)
416  pfree(array);
417 
418  return selec;
419 }
420 
421 /*
422  * Array selectivity estimation based on most common elements statistics
423  *
424  * This function just deconstructs and sorts the array constant's contents,
425  * and then passes the problem on to mcelem_array_contain_overlap_selec or
426  * mcelem_array_contained_selec depending on the operator.
427  */
428 static Selectivity
430  Datum *mcelem, int nmcelem,
431  float4 *numbers, int nnumbers,
432  float4 *hist, int nhist,
433  Oid operator)
434 {
435  Selectivity selec;
436  int num_elems;
437  Datum *elem_values;
438  bool *elem_nulls;
439  bool null_present;
440  int nonnull_nitems;
441  int i;
442 
443  /*
444  * Prepare constant array data for sorting. Sorting lets us find unique
445  * elements and efficiently merge with the MCELEM array.
446  */
447  deconstruct_array(array,
448  typentry->type_id,
449  typentry->typlen,
450  typentry->typbyval,
451  typentry->typalign,
452  &elem_values, &elem_nulls, &num_elems);
453 
454  /* Collapse out any null elements */
455  nonnull_nitems = 0;
456  null_present = false;
457  for (i = 0; i < num_elems; i++)
458  {
459  if (elem_nulls[i])
460  null_present = true;
461  else
462  elem_values[nonnull_nitems++] = elem_values[i];
463  }
464 
465  /*
466  * Query "column @> '{anything, null}'" matches nothing. For the other
467  * two operators, presence of a null in the constant can be ignored.
468  */
469  if (null_present && operator == OID_ARRAY_CONTAINS_OP)
470  {
471  pfree(elem_values);
472  pfree(elem_nulls);
473  return (Selectivity) 0.0;
474  }
475 
476  /* Sort extracted elements using their default comparison function. */
477  qsort_arg(elem_values, nonnull_nitems, sizeof(Datum),
478  element_compare, typentry);
479 
480  /* Separate cases according to operator */
481  if (operator == OID_ARRAY_CONTAINS_OP || operator == OID_ARRAY_OVERLAP_OP)
482  selec = mcelem_array_contain_overlap_selec(mcelem, nmcelem,
483  numbers, nnumbers,
484  elem_values, nonnull_nitems,
485  operator, typentry);
486  else if (operator == OID_ARRAY_CONTAINED_OP)
487  selec = mcelem_array_contained_selec(mcelem, nmcelem,
488  numbers, nnumbers,
489  elem_values, nonnull_nitems,
490  hist, nhist,
491  operator, typentry);
492  else
493  {
494  elog(ERROR, "arraycontsel called for unrecognized operator %u",
495  operator);
496  selec = 0.0; /* keep compiler quiet */
497  }
498 
499  pfree(elem_values);
500  pfree(elem_nulls);
501  return selec;
502 }
503 
504 /*
505  * Estimate selectivity of "column @> const" and "column && const" based on
506  * most common element statistics. This estimation assumes element
507  * occurrences are independent.
508  *
509  * mcelem (of length nmcelem) and numbers (of length nnumbers) are from
510  * the array column's MCELEM statistics slot, or are NULL/0 if stats are
511  * not available. array_data (of length nitems) is the constant's elements.
512  *
513  * Both the mcelem and array_data arrays are assumed presorted according
514  * to the element type's cmpfunc. Null elements are not present.
515  *
516  * TODO: this estimate probably could be improved by using the distinct
517  * elements count histogram. For example, excepting the special case of
518  * "column @> '{}'", we can multiply the calculated selectivity by the
519  * fraction of nonempty arrays in the column.
520  */
521 static Selectivity
523  float4 *numbers, int nnumbers,
524  Datum *array_data, int nitems,
525  Oid operator, TypeCacheEntry *typentry)
526 {
527  Selectivity selec,
528  elem_selec;
529  int mcelem_index,
530  i;
531  bool use_bsearch;
532  float4 minfreq;
533 
534  /*
535  * There should be three more Numbers than Values, because the last three
536  * cells should hold minimal and maximal frequency among the non-null
537  * elements, and then the frequency of null elements. Ignore the Numbers
538  * if not right.
539  */
540  if (nnumbers != nmcelem + 3)
541  {
542  numbers = NULL;
543  nnumbers = 0;
544  }
545 
546  if (numbers)
547  {
548  /* Grab the lowest observed frequency */
549  minfreq = numbers[nmcelem];
550  }
551  else
552  {
553  /* Without statistics make some default assumptions */
554  minfreq = 2 * (float4) DEFAULT_CONTAIN_SEL;
555  }
556 
557  /* Decide whether it is faster to use binary search or not. */
558  if (nitems * floor_log2((uint32) nmcelem) < nmcelem + nitems)
559  use_bsearch = true;
560  else
561  use_bsearch = false;
562 
563  if (operator == OID_ARRAY_CONTAINS_OP)
564  {
565  /*
566  * Initial selectivity for "column @> const" query is 1.0, and it will
567  * be decreased with each element of constant array.
568  */
569  selec = 1.0;
570  }
571  else
572  {
573  /*
574  * Initial selectivity for "column && const" query is 0.0, and it will
575  * be increased with each element of constant array.
576  */
577  selec = 0.0;
578  }
579 
580  /* Scan mcelem and array in parallel. */
581  mcelem_index = 0;
582  for (i = 0; i < nitems; i++)
583  {
584  bool match = false;
585 
586  /* Ignore any duplicates in the array data. */
587  if (i > 0 &&
588  element_compare(&array_data[i - 1], &array_data[i], typentry) == 0)
589  continue;
590 
591  /* Find the smallest MCELEM >= this array item. */
592  if (use_bsearch)
593  {
594  match = find_next_mcelem(mcelem, nmcelem, array_data[i],
595  &mcelem_index, typentry);
596  }
597  else
598  {
599  while (mcelem_index < nmcelem)
600  {
601  int cmp = element_compare(&mcelem[mcelem_index],
602  &array_data[i],
603  typentry);
604 
605  if (cmp < 0)
606  mcelem_index++;
607  else
608  {
609  if (cmp == 0)
610  match = true; /* mcelem is found */
611  break;
612  }
613  }
614  }
615 
616  if (match && numbers)
617  {
618  /* MCELEM matches the array item; use its frequency. */
619  elem_selec = numbers[mcelem_index];
620  mcelem_index++;
621  }
622  else
623  {
624  /*
625  * The element is not in MCELEM. Punt, but assume that the
626  * selectivity cannot be more than minfreq / 2.
627  */
628  elem_selec = Min(DEFAULT_CONTAIN_SEL, minfreq / 2);
629  }
630 
631  /*
632  * Update overall selectivity using the current element's selectivity
633  * and an assumption of element occurrence independence.
634  */
635  if (operator == OID_ARRAY_CONTAINS_OP)
636  selec *= elem_selec;
637  else
638  selec = selec + elem_selec - selec * elem_selec;
639 
640  /* Clamp intermediate results to stay sane despite roundoff error */
641  CLAMP_PROBABILITY(selec);
642  }
643 
644  return selec;
645 }
646 
647 /*
648  * Estimate selectivity of "column <@ const" based on most common element
649  * statistics.
650  *
651  * mcelem (of length nmcelem) and numbers (of length nnumbers) are from
652  * the array column's MCELEM statistics slot, or are NULL/0 if stats are
653  * not available. array_data (of length nitems) is the constant's elements.
654  * hist (of length nhist) is from the array column's DECHIST statistics slot,
655  * or is NULL/0 if those stats are not available.
656  *
657  * Both the mcelem and array_data arrays are assumed presorted according
658  * to the element type's cmpfunc. Null elements are not present.
659  *
660  * Independent element occurrence would imply a particular distribution of
661  * distinct element counts among matching rows. Real data usually falsifies
662  * that assumption. For example, in a set of 11-element integer arrays having
663  * elements in the range [0..10], element occurrences are typically not
664  * independent. If they were, a sufficiently-large set would include all
665  * distinct element counts 0 through 11. We correct for this using the
666  * histogram of distinct element counts.
667  *
668  * In the "column @> const" and "column && const" cases, we usually have a
669  * "const" with low number of elements (otherwise we have selectivity close
670  * to 0 or 1 respectively). That's why the effect of dependence related
671  * to distinct element count distribution is negligible there. In the
672  * "column <@ const" case, number of elements is usually high (otherwise we
673  * have selectivity close to 0). That's why we should do a correction with
674  * the array distinct element count distribution here.
675  *
676  * Using the histogram of distinct element counts produces a different
677  * distribution law than independent occurrences of elements. This
678  * distribution law can be described as follows:
679  *
680  * P(o1, o2, ..., on) = f1^o1 * (1 - f1)^(1 - o1) * f2^o2 *
681  * (1 - f2)^(1 - o2) * ... * fn^on * (1 - fn)^(1 - on) * hist[m] / ind[m]
682  *
683  * where:
684  * o1, o2, ..., on - occurrences of elements 1, 2, ..., n
685  * (1 - occurrence, 0 - no occurrence) in row
686  * f1, f2, ..., fn - frequencies of elements 1, 2, ..., n
687  * (scalar values in [0..1]) according to collected statistics
688  * m = o1 + o2 + ... + on = total number of distinct elements in row
689  * hist[m] - histogram data for occurrence of m elements.
690  * ind[m] - probability of m occurrences from n events assuming their
691  * probabilities to be equal to frequencies of array elements.
692  *
693  * ind[m] = sum(f1^o1 * (1 - f1)^(1 - o1) * f2^o2 * (1 - f2)^(1 - o2) *
694  * ... * fn^on * (1 - fn)^(1 - on), o1, o2, ..., on) | o1 + o2 + .. on = m
695  */
696 static Selectivity
697 mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
698  float4 *numbers, int nnumbers,
699  Datum *array_data, int nitems,
700  float4 *hist, int nhist,
701  Oid operator, TypeCacheEntry *typentry)
702 {
703  int mcelem_index,
704  i,
705  unique_nitems = 0;
706  float selec,
707  minfreq,
708  nullelem_freq;
709  float *dist,
710  *mcelem_dist,
711  *hist_part;
712  float avg_count,
713  mult,
714  rest;
715  float *elem_selec;
716 
717  /*
718  * There should be three more Numbers than Values in the MCELEM slot,
719  * because the last three cells should hold minimal and maximal frequency
720  * among the non-null elements, and then the frequency of null elements.
721  * Punt if not right, because we can't do much without the element freqs.
722  */
723  if (numbers == NULL || nnumbers != nmcelem + 3)
724  return DEFAULT_CONTAIN_SEL;
725 
726  /* Can't do much without a count histogram, either */
727  if (hist == NULL || nhist < 3)
728  return DEFAULT_CONTAIN_SEL;
729 
730  /*
731  * Grab some of the summary statistics that compute_array_stats() stores:
732  * lowest frequency, frequency of null elements, and average distinct
733  * element count.
734  */
735  minfreq = numbers[nmcelem];
736  nullelem_freq = numbers[nmcelem + 2];
737  avg_count = hist[nhist - 1];
738 
739  /*
740  * "rest" will be the sum of the frequencies of all elements not
741  * represented in MCELEM. The average distinct element count is the sum
742  * of the frequencies of *all* elements. Begin with that; we will proceed
743  * to subtract the MCELEM frequencies.
744  */
745  rest = avg_count;
746 
747  /*
748  * mult is a multiplier representing estimate of probability that each
749  * mcelem that is not present in constant doesn't occur.
750  */
751  mult = 1.0f;
752 
753  /*
754  * elem_selec is array of estimated frequencies for elements in the
755  * constant.
756  */
757  elem_selec = (float *) palloc(sizeof(float) * nitems);
758 
759  /* Scan mcelem and array in parallel. */
760  mcelem_index = 0;
761  for (i = 0; i < nitems; i++)
762  {
763  bool match = false;
764 
765  /* Ignore any duplicates in the array data. */
766  if (i > 0 &&
767  element_compare(&array_data[i - 1], &array_data[i], typentry) == 0)
768  continue;
769 
770  /*
771  * Iterate over MCELEM until we find an entry greater than or equal to
772  * this element of the constant. Update "rest" and "mult" for mcelem
773  * entries skipped over.
774  */
775  while (mcelem_index < nmcelem)
776  {
777  int cmp = element_compare(&mcelem[mcelem_index],
778  &array_data[i],
779  typentry);
780 
781  if (cmp < 0)
782  {
783  mult *= (1.0f - numbers[mcelem_index]);
784  rest -= numbers[mcelem_index];
785  mcelem_index++;
786  }
787  else
788  {
789  if (cmp == 0)
790  match = true; /* mcelem is found */
791  break;
792  }
793  }
794 
795  if (match)
796  {
797  /* MCELEM matches the array item. */
798  elem_selec[unique_nitems] = numbers[mcelem_index];
799  /* "rest" is decremented for all mcelems, matched or not */
800  rest -= numbers[mcelem_index];
801  mcelem_index++;
802  }
803  else
804  {
805  /*
806  * The element is not in MCELEM. Punt, but assume that the
807  * selectivity cannot be more than minfreq / 2.
808  */
809  elem_selec[unique_nitems] = Min(DEFAULT_CONTAIN_SEL,
810  minfreq / 2);
811  }
812 
813  unique_nitems++;
814  }
815 
816  /*
817  * If we handled all constant elements without exhausting the MCELEM
818  * array, finish walking it to complete calculation of "rest" and "mult".
819  */
820  while (mcelem_index < nmcelem)
821  {
822  mult *= (1.0f - numbers[mcelem_index]);
823  rest -= numbers[mcelem_index];
824  mcelem_index++;
825  }
826 
827  /*
828  * The presence of many distinct rare elements materially decreases
829  * selectivity. Use the Poisson distribution to estimate the probability
830  * of a column value having zero occurrences of such elements. See above
831  * for the definition of "rest".
832  */
833  mult *= exp(-rest);
834 
835  /*----------
836  * Using the distinct element count histogram requires
837  * O(unique_nitems * (nmcelem + unique_nitems))
838  * operations. Beyond a certain computational cost threshold, it's
839  * reasonable to sacrifice accuracy for decreased planning time. We limit
840  * the number of operations to EFFORT * nmcelem; since nmcelem is limited
841  * by the column's statistics target, the work done is user-controllable.
842  *
843  * If the number of operations would be too large, we can reduce it
844  * without losing all accuracy by reducing unique_nitems and considering
845  * only the most-common elements of the constant array. To make the
846  * results exactly match what we would have gotten with only those
847  * elements to start with, we'd have to remove any discarded elements'
848  * frequencies from "mult", but since this is only an approximation
849  * anyway, we don't bother with that. Therefore it's sufficient to qsort
850  * elem_selec[] and take the largest elements. (They will no longer match
851  * up with the elements of array_data[], but we don't care.)
852  *----------
853  */
854 #define EFFORT 100
855 
856  if ((nmcelem + unique_nitems) > 0 &&
857  unique_nitems > EFFORT * nmcelem / (nmcelem + unique_nitems))
858  {
859  /*
860  * Use the quadratic formula to solve for largest allowable N. We
861  * have A = 1, B = nmcelem, C = - EFFORT * nmcelem.
862  */
863  double b = (double) nmcelem;
864  int n;
865 
866  n = (int) ((sqrt(b * b + 4 * EFFORT * b) - b) / 2);
867 
868  /* Sort, then take just the first n elements */
869  qsort(elem_selec, unique_nitems, sizeof(float),
871  unique_nitems = n;
872  }
873 
874  /*
875  * Calculate probabilities of each distinct element count for both mcelems
876  * and constant elements. At this point, assume independent element
877  * occurrence.
878  */
879  dist = calc_distr(elem_selec, unique_nitems, unique_nitems, 0.0f);
880  mcelem_dist = calc_distr(numbers, nmcelem, unique_nitems, rest);
881 
882  /* ignore hist[nhist-1], which is the average not a histogram member */
883  hist_part = calc_hist(hist, nhist - 1, unique_nitems);
884 
885  selec = 0.0f;
886  for (i = 0; i <= unique_nitems; i++)
887  {
888  /*
889  * mult * dist[i] / mcelem_dist[i] gives us probability of qual
890  * matching from assumption of independent element occurrence with the
891  * condition that distinct element count = i.
892  */
893  if (mcelem_dist[i] > 0)
894  selec += hist_part[i] * mult * dist[i] / mcelem_dist[i];
895  }
896 
897  pfree(dist);
898  pfree(mcelem_dist);
899  pfree(hist_part);
900  pfree(elem_selec);
901 
902  /* Take into account occurrence of NULL element. */
903  selec *= (1.0f - nullelem_freq);
904 
905  CLAMP_PROBABILITY(selec);
906 
907  return selec;
908 }
909 
910 /*
911  * Calculate the first n distinct element count probabilities from a
912  * histogram of distinct element counts.
913  *
914  * Returns a palloc'd array of n+1 entries, with array[k] being the
915  * probability of element count k, k in [0..n].
916  *
917  * We assume that a histogram box with bounds a and b gives 1 / ((b - a + 1) *
918  * (nhist - 1)) probability to each value in (a,b) and an additional half of
919  * that to a and b themselves.
920  */
921 static float *
922 calc_hist(const float4 *hist, int nhist, int n)
923 {
924  float *hist_part;
925  int k,
926  i = 0;
927  float prev_interval = 0,
928  next_interval;
929  float frac;
930 
931  hist_part = (float *) palloc((n + 1) * sizeof(float));
932 
933  /*
934  * frac is a probability contribution for each interval between histogram
935  * values. We have nhist - 1 intervals, so contribution of each one will
936  * be 1 / (nhist - 1).
937  */
938  frac = 1.0f / ((float) (nhist - 1));
939 
940  for (k = 0; k <= n; k++)
941  {
942  int count = 0;
943 
944  /*
945  * Count the histogram boundaries equal to k. (Although the histogram
946  * should theoretically contain only exact integers, entries are
947  * floats so there could be roundoff error in large values. Treat any
948  * fractional value as equal to the next larger k.)
949  */
950  while (i < nhist && hist[i] <= k)
951  {
952  count++;
953  i++;
954  }
955 
956  if (count > 0)
957  {
958  /* k is an exact bound for at least one histogram box. */
959  float val;
960 
961  /* Find length between current histogram value and the next one */
962  if (i < nhist)
963  next_interval = hist[i] - hist[i - 1];
964  else
965  next_interval = 0;
966 
967  /*
968  * count - 1 histogram boxes contain k exclusively. They
969  * contribute a total of (count - 1) * frac probability. Also
970  * factor in the partial histogram boxes on either side.
971  */
972  val = (float) (count - 1);
973  if (next_interval > 0)
974  val += 0.5f / next_interval;
975  if (prev_interval > 0)
976  val += 0.5f / prev_interval;
977  hist_part[k] = frac * val;
978 
979  prev_interval = next_interval;
980  }
981  else
982  {
983  /* k does not appear as an exact histogram bound. */
984  if (prev_interval > 0)
985  hist_part[k] = frac / prev_interval;
986  else
987  hist_part[k] = 0.0f;
988  }
989  }
990 
991  return hist_part;
992 }
993 
994 /*
995  * Consider n independent events with probabilities p[]. This function
996  * calculates probabilities of exact k of events occurrence for k in [0..m].
997  * Returns a palloc'd array of size m+1.
998  *
999  * "rest" is the sum of the probabilities of all low-probability events not
1000  * included in p.
1001  *
1002  * Imagine matrix M of size (n + 1) x (m + 1). Element M[i,j] denotes the
1003  * probability that exactly j of first i events occur. Obviously M[0,0] = 1.
1004  * For any constant j, each increment of i increases the probability iff the
1005  * event occurs. So, by the law of total probability:
1006  * M[i,j] = M[i - 1, j] * (1 - p[i]) + M[i - 1, j - 1] * p[i]
1007  * for i > 0, j > 0.
1008  * M[i,0] = M[i - 1, 0] * (1 - p[i]) for i > 0.
1009  */
1010 static float *
1011 calc_distr(const float *p, int n, int m, float rest)
1012 {
1013  float *row,
1014  *prev_row,
1015  *tmp;
1016  int i,
1017  j;
1018 
1019  /*
1020  * Since we return only the last row of the matrix and need only the
1021  * current and previous row for calculations, allocate two rows.
1022  */
1023  row = (float *) palloc((m + 1) * sizeof(float));
1024  prev_row = (float *) palloc((m + 1) * sizeof(float));
1025 
1026  /* M[0,0] = 1 */
1027  row[0] = 1.0f;
1028  for (i = 1; i <= n; i++)
1029  {
1030  float t = p[i - 1];
1031 
1032  /* Swap rows */
1033  tmp = row;
1034  row = prev_row;
1035  prev_row = tmp;
1036 
1037  /* Calculate next row */
1038  for (j = 0; j <= i && j <= m; j++)
1039  {
1040  float val = 0.0f;
1041 
1042  if (j < i)
1043  val += prev_row[j] * (1.0f - t);
1044  if (j > 0)
1045  val += prev_row[j - 1] * t;
1046  row[j] = val;
1047  }
1048  }
1049 
1050  /*
1051  * The presence of many distinct rare (not in "p") elements materially
1052  * decreases selectivity. Model their collective occurrence with the
1053  * Poisson distribution.
1054  */
1055  if (rest > DEFAULT_CONTAIN_SEL)
1056  {
1057  float t;
1058 
1059  /* Swap rows */
1060  tmp = row;
1061  row = prev_row;
1062  prev_row = tmp;
1063 
1064  for (i = 0; i <= m; i++)
1065  row[i] = 0.0f;
1066 
1067  /* Value of Poisson distribution for 0 occurrences */
1068  t = exp(-rest);
1069 
1070  /*
1071  * Calculate convolution of previously computed distribution and the
1072  * Poisson distribution.
1073  */
1074  for (i = 0; i <= m; i++)
1075  {
1076  for (j = 0; j <= m - i; j++)
1077  row[j + i] += prev_row[j] * t;
1078 
1079  /* Get Poisson distribution value for (i + 1) occurrences */
1080  t *= rest / (float) (i + 1);
1081  }
1082  }
1083 
1084  pfree(prev_row);
1085  return row;
1086 }
1087 
1088 /* Fast function for floor value of 2 based logarithm calculation. */
1089 static int
1091 {
1092  int logval = 0;
1093 
1094  if (n == 0)
1095  return -1;
1096  if (n >= (1 << 16))
1097  {
1098  n >>= 16;
1099  logval += 16;
1100  }
1101  if (n >= (1 << 8))
1102  {
1103  n >>= 8;
1104  logval += 8;
1105  }
1106  if (n >= (1 << 4))
1107  {
1108  n >>= 4;
1109  logval += 4;
1110  }
1111  if (n >= (1 << 2))
1112  {
1113  n >>= 2;
1114  logval += 2;
1115  }
1116  if (n >= (1 << 1))
1117  {
1118  logval += 1;
1119  }
1120  return logval;
1121 }
1122 
1123 /*
1124  * find_next_mcelem binary-searches a most common elements array, starting
1125  * from *index, for the first member >= value. It saves the position of the
1126  * match into *index and returns true if it's an exact match. (Note: we
1127  * assume the mcelem elements are distinct so there can't be more than one
1128  * exact match.)
1129  */
1130 static bool
1131 find_next_mcelem(Datum *mcelem, int nmcelem, Datum value, int *index,
1132  TypeCacheEntry *typentry)
1133 {
1134  int l = *index,
1135  r = nmcelem - 1,
1136  i,
1137  res;
1138 
1139  while (l <= r)
1140  {
1141  i = (l + r) / 2;
1142  res = element_compare(&mcelem[i], &value, typentry);
1143  if (res == 0)
1144  {
1145  *index = i;
1146  return true;
1147  }
1148  else if (res < 0)
1149  l = i + 1;
1150  else
1151  r = i - 1;
1152  }
1153  *index = l;
1154  return false;
1155 }
1156 
1157 /*
1158  * Comparison function for elements.
1159  *
1160  * We use the element type's default btree opclass, and its default collation
1161  * if the type is collation-sensitive.
1162  *
1163  * XXX consider using SortSupport infrastructure
1164  */
1165 static int
1166 element_compare(const void *key1, const void *key2, void *arg)
1167 {
1168  Datum d1 = *((const Datum *) key1);
1169  Datum d2 = *((const Datum *) key2);
1170  TypeCacheEntry *typentry = (TypeCacheEntry *) arg;
1171  FmgrInfo *cmpfunc = &typentry->cmp_proc_finfo;
1172  Datum c;
1173 
1174  c = FunctionCall2Coll(cmpfunc, typentry->typcollation, d1, d2);
1175  return DatumGetInt32(c);
1176 }
1177 
1178 /*
1179  * Comparison function for sorting floats into descending order.
1180  */
1181 static int
1182 float_compare_desc(const void *key1, const void *key2)
1183 {
1184  float d1 = *((const float *) key1);
1185  float d2 = *((const float *) key2);
1186 
1187  if (d1 > d2)
1188  return -1;
1189  else if (d1 < d2)
1190  return 1;
1191  else
1192  return 0;
1193 }
#define DatumGetArrayTypeP(X)
Definition: array.h:261
Selectivity scalararraysel_containment(PlannerInfo *root, Node *leftop, Node *rightop, Oid elemtype, bool isEquality, bool useOr, int varRelid)
static int float_compare_desc(const void *key1, const void *key2)
static Selectivity mcelem_array_contained_selec(Datum *mcelem, int nmcelem, float4 *numbers, int nnumbers, Datum *array_data, int nitems, float4 *hist, int nhist, Oid operator, TypeCacheEntry *typentry)
#define DEFAULT_CONTAIN_SEL
static Selectivity calc_arraycontsel(VariableStatData *vardata, Datum constval, Oid elemtype, Oid operator)
static int floor_log2(uint32 n)
#define EFFORT
static float * calc_hist(const float4 *hist, int nhist, int n)
static int element_compare(const void *key1, const void *key2, void *arg)
static float * calc_distr(const float *p, int n, int m, float rest)
Datum arraycontsel(PG_FUNCTION_ARGS)
static Selectivity mcelem_array_selec(ArrayType *array, TypeCacheEntry *typentry, Datum *mcelem, int nmcelem, float4 *numbers, int nnumbers, float4 *hist, int nhist, Oid operator)
static Selectivity mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem, float4 *numbers, int nnumbers, Datum *array_data, int nitems, Oid operator, TypeCacheEntry *typentry)
Datum arraycontjoinsel(PG_FUNCTION_ARGS)
#define DEFAULT_SEL(operator)
static bool find_next_mcelem(Datum *mcelem, int nmcelem, Datum value, int *index, TypeCacheEntry *typentry)
void deconstruct_array(ArrayType *array, Oid elmtype, int elmlen, bool elmbyval, char elmalign, Datum **elemsp, bool **nullsp, int *nelemsp)
Definition: arrayfuncs.c:3613
unsigned int uint32
Definition: c.h:495
#define Min(x, y)
Definition: c.h:993
double float8
Definition: c.h:619
float float4
Definition: c.h:618
#define OidIsValid(objectId)
Definition: c.h:764
#define ERROR
Definition: elog.h:39
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1132
#define PG_GETARG_OID(n)
Definition: fmgr.h:275
#define PG_RETURN_FLOAT8(x)
Definition: fmgr.h:367
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
#define GETSTRUCT(TUP)
Definition: htup_details.h:653
#define nitems(x)
Definition: indent.h:31
long val
Definition: informix.c:664
static struct @148 value
int b
Definition: isn.c:70
int j
Definition: isn.c:74
int i
Definition: isn.c:73
void free_attstatsslot(AttStatsSlot *sslot)
Definition: lsyscache.c:3326
Oid get_base_element_type(Oid typid)
Definition: lsyscache.c:2814
bool get_attstatsslot(AttStatsSlot *sslot, HeapTuple statstuple, int reqkind, Oid reqop, int flags)
Definition: lsyscache.c:3216
#define ATTSTATSSLOT_NUMBERS
Definition: lsyscache.h:43
#define ATTSTATSSLOT_VALUES
Definition: lsyscache.h:42
void pfree(void *pointer)
Definition: mcxt.c:1456
void * palloc(Size size)
Definition: mcxt.c:1226
#define IsA(nodeptr, _type_)
Definition: nodes.h:179
double Selectivity
Definition: nodes.h:261
void * arg
FormData_pg_statistic * Form_pg_statistic
Definition: pg_statistic.h:135
void qsort_arg(void *base, size_t nel, size_t elsize, qsort_arg_comparator cmp, void *arg)
#define qsort(a, b, c, d)
Definition: port.h:445
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:322
uintptr_t Datum
Definition: postgres.h:64
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:202
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
char * c
static int cmp(const chr *x, const chr *y, size_t len)
Definition: regc_locale.c:743
bool get_restriction_variable(PlannerInfo *root, List *args, int varRelid, VariableStatData *vardata, Node **other, bool *varonleft)
Definition: selfuncs.c:4853
void examine_variable(PlannerInfo *root, Node *node, int varRelid, VariableStatData *vardata)
Definition: selfuncs.c:4982
bool statistic_proc_security_check(VariableStatData *vardata, Oid func_oid)
Definition: selfuncs.c:5692
#define ReleaseVariableStats(vardata)
Definition: selfuncs.h:99
#define CLAMP_PROBABILITY(p)
Definition: selfuncs.h:63
Datum * values
Definition: lsyscache.h:53
float4 * numbers
Definition: lsyscache.h:56
int nnumbers
Definition: lsyscache.h:57
Definition: fmgr.h:57
Oid fn_oid
Definition: fmgr.h:59
Definition: pg_list.h:54
Definition: nodes.h:129
FmgrInfo cmp_proc_finfo
Definition: typcache.h:76
char typalign
Definition: typcache.h:41
bool typbyval
Definition: typcache.h:40
int16 typlen
Definition: typcache.h:39
Oid typcollation
Definition: typcache.h:47
HeapTuple statsTuple
Definition: selfuncs.h:89
RelOptInfo * rel
Definition: selfuncs.h:88
Definition: type.h:95
TypeCacheEntry * lookup_type_cache(Oid type_id, int flags)
Definition: typcache.c:344
#define TYPECACHE_CMP_PROC_FINFO
Definition: typcache.h:142