PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
array_selfuncs.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * array_selfuncs.c
4  * Functions for selectivity estimation of array operators
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/utils/adt/array_selfuncs.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include <math.h>
18 
19 #include "access/htup_details.h"
20 #include "catalog/pg_collation.h"
21 #include "catalog/pg_operator.h"
22 #include "catalog/pg_statistic.h"
23 #include "optimizer/clauses.h"
24 #include "utils/array.h"
25 #include "utils/builtins.h"
26 #include "utils/lsyscache.h"
27 #include "utils/selfuncs.h"
28 #include "utils/typcache.h"
29 
30 
31 /* Default selectivity constant for "@>" and "<@" operators */
32 #define DEFAULT_CONTAIN_SEL 0.005
33 
34 /* Default selectivity constant for "&&" operator */
35 #define DEFAULT_OVERLAP_SEL 0.01
36 
37 /* Default selectivity for given operator */
38 #define DEFAULT_SEL(operator) \
39  ((operator) == OID_ARRAY_OVERLAP_OP ? \
40  DEFAULT_OVERLAP_SEL : DEFAULT_CONTAIN_SEL)
41 
42 static Selectivity calc_arraycontsel(VariableStatData *vardata, Datum constval,
43  Oid elemtype, Oid operator);
45  TypeCacheEntry *typentry,
46  Datum *mcelem, int nmcelem,
47  float4 *numbers, int nnumbers,
48  float4 *hist, int nhist,
49  Oid operator, FmgrInfo *cmpfunc);
50 static Selectivity mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
51  float4 *numbers, int nnumbers,
52  Datum *array_data, int nitems,
53  Oid operator, FmgrInfo *cmpfunc);
54 static Selectivity mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
55  float4 *numbers, int nnumbers,
56  Datum *array_data, int nitems,
57  float4 *hist, int nhist,
58  Oid operator, FmgrInfo *cmpfunc);
59 static float *calc_hist(const float4 *hist, int nhist, int n);
60 static float *calc_distr(const float *p, int n, int m, float rest);
61 static int floor_log2(uint32 n);
62 static bool find_next_mcelem(Datum *mcelem, int nmcelem, Datum value,
63  int *index, FmgrInfo *cmpfunc);
64 static int element_compare(const void *key1, const void *key2, void *arg);
65 static int float_compare_desc(const void *key1, const void *key2);
66 
67 
68 /*
69  * scalararraysel_containment
70  * Estimate selectivity of ScalarArrayOpExpr via array containment.
71  *
72  * If we have const =/<> ANY/ALL (array_var) then we can estimate the
73  * selectivity as though this were an array containment operator,
74  * array_var op ARRAY[const].
75  *
76  * scalararraysel() has already verified that the ScalarArrayOpExpr's operator
77  * is the array element type's default equality or inequality operator, and
78  * has aggressively simplified both inputs to constants.
79  *
80  * Returns selectivity (0..1), or -1 if we fail to estimate selectivity.
81  */
84  Node *leftop, Node *rightop,
85  Oid elemtype, bool isEquality, bool useOr,
86  int varRelid)
87 {
88  Selectivity selec;
89  VariableStatData vardata;
90  Datum constval;
91  TypeCacheEntry *typentry;
92  FmgrInfo *cmpfunc;
93 
94  /*
95  * rightop must be a variable, else punt.
96  */
97  examine_variable(root, rightop, varRelid, &vardata);
98  if (!vardata.rel)
99  {
100  ReleaseVariableStats(vardata);
101  return -1.0;
102  }
103 
104  /*
105  * leftop must be a constant, else punt.
106  */
107  if (!IsA(leftop, Const))
108  {
109  ReleaseVariableStats(vardata);
110  return -1.0;
111  }
112  if (((Const *) leftop)->constisnull)
113  {
114  /* qual can't succeed if null on left */
115  ReleaseVariableStats(vardata);
116  return (Selectivity) 0.0;
117  }
118  constval = ((Const *) leftop)->constvalue;
119 
120  /* Get element type's default comparison function */
121  typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
122  if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
123  {
124  ReleaseVariableStats(vardata);
125  return -1.0;
126  }
127  cmpfunc = &typentry->cmp_proc_finfo;
128 
129  /*
130  * If the operator is <>, swap ANY/ALL, then invert the result later.
131  */
132  if (!isEquality)
133  useOr = !useOr;
134 
135  /* Get array element stats for var, if available */
136  if (HeapTupleIsValid(vardata.statsTuple))
137  {
138  Form_pg_statistic stats;
139  Datum *values;
140  int nvalues;
141  float4 *numbers;
142  int nnumbers;
143  float4 *hist;
144  int nhist;
145 
146  stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
147 
148  /* MCELEM will be an array of same type as element */
149  if (get_attstatsslot(vardata.statsTuple,
150  elemtype, vardata.atttypmod,
152  NULL,
153  &values, &nvalues,
154  &numbers, &nnumbers))
155  {
156  /* For ALL case, also get histogram of distinct-element counts */
157  if (useOr ||
158  !get_attstatsslot(vardata.statsTuple,
159  elemtype, vardata.atttypmod,
161  NULL,
162  NULL, NULL,
163  &hist, &nhist))
164  {
165  hist = NULL;
166  nhist = 0;
167  }
168 
169  /*
170  * For = ANY, estimate as var @> ARRAY[const].
171  *
172  * For = ALL, estimate as var <@ ARRAY[const].
173  */
174  if (useOr)
175  selec = mcelem_array_contain_overlap_selec(values, nvalues,
176  numbers, nnumbers,
177  &constval, 1,
179  cmpfunc);
180  else
181  selec = mcelem_array_contained_selec(values, nvalues,
182  numbers, nnumbers,
183  &constval, 1,
184  hist, nhist,
186  cmpfunc);
187 
188  if (hist)
189  free_attstatsslot(elemtype, NULL, 0, hist, nhist);
190  free_attstatsslot(elemtype, values, nvalues, numbers, nnumbers);
191  }
192  else
193  {
194  /* No most-common-elements info, so do without */
195  if (useOr)
197  NULL, 0,
198  &constval, 1,
200  cmpfunc);
201  else
203  NULL, 0,
204  &constval, 1,
205  NULL, 0,
207  cmpfunc);
208  }
209 
210  /*
211  * MCE stats count only non-null rows, so adjust for null rows.
212  */
213  selec *= (1.0 - stats->stanullfrac);
214  }
215  else
216  {
217  /* No stats at all, so do without */
218  if (useOr)
220  NULL, 0,
221  &constval, 1,
223  cmpfunc);
224  else
226  NULL, 0,
227  &constval, 1,
228  NULL, 0,
230  cmpfunc);
231  /* we assume no nulls here, so no stanullfrac correction */
232  }
233 
234  ReleaseVariableStats(vardata);
235 
236  /*
237  * If the operator is <>, invert the results.
238  */
239  if (!isEquality)
240  selec = 1.0 - selec;
241 
242  CLAMP_PROBABILITY(selec);
243 
244  return selec;
245 }
246 
247 /*
248  * arraycontsel -- restriction selectivity for array @>, &&, <@ operators
249  */
250 Datum
252 {
254  Oid operator = PG_GETARG_OID(1);
255  List *args = (List *) PG_GETARG_POINTER(2);
256  int varRelid = PG_GETARG_INT32(3);
257  VariableStatData vardata;
258  Node *other;
259  bool varonleft;
260  Selectivity selec;
261  Oid element_typeid;
262 
263  /*
264  * If expression is not (variable op something) or (something op
265  * variable), then punt and return a default estimate.
266  */
267  if (!get_restriction_variable(root, args, varRelid,
268  &vardata, &other, &varonleft))
269  PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
270 
271  /*
272  * Can't do anything useful if the something is not a constant, either.
273  */
274  if (!IsA(other, Const))
275  {
276  ReleaseVariableStats(vardata);
277  PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
278  }
279 
280  /*
281  * The "&&", "@>" and "<@" operators are strict, so we can cope with a
282  * NULL constant right away.
283  */
284  if (((Const *) other)->constisnull)
285  {
286  ReleaseVariableStats(vardata);
287  PG_RETURN_FLOAT8(0.0);
288  }
289 
290  /*
291  * If var is on the right, commute the operator, so that we can assume the
292  * var is on the left in what follows.
293  */
294  if (!varonleft)
295  {
296  if (operator == OID_ARRAY_CONTAINS_OP)
297  operator = OID_ARRAY_CONTAINED_OP;
298  else if (operator == OID_ARRAY_CONTAINED_OP)
299  operator = OID_ARRAY_CONTAINS_OP;
300  }
301 
302  /*
303  * OK, there's a Var and a Const we're dealing with here. We need the
304  * Const to be an array with same element type as column, else we can't do
305  * anything useful. (Such cases will likely fail at runtime, but here
306  * we'd rather just return a default estimate.)
307  */
308  element_typeid = get_base_element_type(((Const *) other)->consttype);
309  if (element_typeid != InvalidOid &&
310  element_typeid == get_base_element_type(vardata.vartype))
311  {
312  selec = calc_arraycontsel(&vardata, ((Const *) other)->constvalue,
313  element_typeid, operator);
314  }
315  else
316  {
317  selec = DEFAULT_SEL(operator);
318  }
319 
320  ReleaseVariableStats(vardata);
321 
322  CLAMP_PROBABILITY(selec);
323 
324  PG_RETURN_FLOAT8((float8) selec);
325 }
326 
327 /*
328  * arraycontjoinsel -- join selectivity for array @>, &&, <@ operators
329  */
330 Datum
332 {
333  /* For the moment this is just a stub */
334  Oid operator = PG_GETARG_OID(1);
335 
336  PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
337 }
338 
339 /*
340  * Calculate selectivity for "arraycolumn @> const", "arraycolumn && const"
341  * or "arraycolumn <@ const" based on the statistics
342  *
343  * This function is mainly responsible for extracting the pg_statistic data
344  * to be used; we then pass the problem on to mcelem_array_selec().
345  */
346 static Selectivity
348  Oid elemtype, Oid operator)
349 {
350  Selectivity selec;
351  TypeCacheEntry *typentry;
352  FmgrInfo *cmpfunc;
353  ArrayType *array;
354 
355  /* Get element type's default comparison function */
356  typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
357  if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
358  return DEFAULT_SEL(operator);
359  cmpfunc = &typentry->cmp_proc_finfo;
360 
361  /*
362  * The caller made sure the const is an array with same element type, so
363  * get it now
364  */
365  array = DatumGetArrayTypeP(constval);
366 
367  if (HeapTupleIsValid(vardata->statsTuple))
368  {
369  Form_pg_statistic stats;
370  Datum *values;
371  int nvalues;
372  float4 *numbers;
373  int nnumbers;
374  float4 *hist;
375  int nhist;
376 
377  stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
378 
379  /* MCELEM will be an array of same type as column */
380  if (get_attstatsslot(vardata->statsTuple,
381  elemtype, vardata->atttypmod,
383  NULL,
384  &values, &nvalues,
385  &numbers, &nnumbers))
386  {
387  /*
388  * For "array <@ const" case we also need histogram of distinct
389  * element counts.
390  */
391  if (operator != OID_ARRAY_CONTAINED_OP ||
392  !get_attstatsslot(vardata->statsTuple,
393  elemtype, vardata->atttypmod,
395  NULL,
396  NULL, NULL,
397  &hist, &nhist))
398  {
399  hist = NULL;
400  nhist = 0;
401  }
402 
403  /* Use the most-common-elements slot for the array Var. */
404  selec = mcelem_array_selec(array, typentry,
405  values, nvalues,
406  numbers, nnumbers,
407  hist, nhist,
408  operator, cmpfunc);
409 
410  if (hist)
411  free_attstatsslot(elemtype, NULL, 0, hist, nhist);
412  free_attstatsslot(elemtype, values, nvalues, numbers, nnumbers);
413  }
414  else
415  {
416  /* No most-common-elements info, so do without */
417  selec = mcelem_array_selec(array, typentry,
418  NULL, 0, NULL, 0, NULL, 0,
419  operator, cmpfunc);
420  }
421 
422  /*
423  * MCE stats count only non-null rows, so adjust for null rows.
424  */
425  selec *= (1.0 - stats->stanullfrac);
426  }
427  else
428  {
429  /* No stats at all, so do without */
430  selec = mcelem_array_selec(array, typentry,
431  NULL, 0, NULL, 0, NULL, 0,
432  operator, cmpfunc);
433  /* we assume no nulls here, so no stanullfrac correction */
434  }
435 
436  /* If constant was toasted, release the copy we made */
437  if (PointerGetDatum(array) != constval)
438  pfree(array);
439 
440  return selec;
441 }
442 
443 /*
444  * Array selectivity estimation based on most common elements statistics
445  *
446  * This function just deconstructs and sorts the array constant's contents,
447  * and then passes the problem on to mcelem_array_contain_overlap_selec or
448  * mcelem_array_contained_selec depending on the operator.
449  */
450 static Selectivity
452  Datum *mcelem, int nmcelem,
453  float4 *numbers, int nnumbers,
454  float4 *hist, int nhist,
455  Oid operator, FmgrInfo *cmpfunc)
456 {
457  Selectivity selec;
458  int num_elems;
459  Datum *elem_values;
460  bool *elem_nulls;
461  bool null_present;
462  int nonnull_nitems;
463  int i;
464 
465  /*
466  * Prepare constant array data for sorting. Sorting lets us find unique
467  * elements and efficiently merge with the MCELEM array.
468  */
469  deconstruct_array(array,
470  typentry->type_id,
471  typentry->typlen,
472  typentry->typbyval,
473  typentry->typalign,
474  &elem_values, &elem_nulls, &num_elems);
475 
476  /* Collapse out any null elements */
477  nonnull_nitems = 0;
478  null_present = false;
479  for (i = 0; i < num_elems; i++)
480  {
481  if (elem_nulls[i])
482  null_present = true;
483  else
484  elem_values[nonnull_nitems++] = elem_values[i];
485  }
486 
487  /*
488  * Query "column @> '{anything, null}'" matches nothing. For the other
489  * two operators, presence of a null in the constant can be ignored.
490  */
491  if (null_present && operator == OID_ARRAY_CONTAINS_OP)
492  {
493  pfree(elem_values);
494  pfree(elem_nulls);
495  return (Selectivity) 0.0;
496  }
497 
498  /* Sort extracted elements using their default comparison function. */
499  qsort_arg(elem_values, nonnull_nitems, sizeof(Datum),
500  element_compare, cmpfunc);
501 
502  /* Separate cases according to operator */
503  if (operator == OID_ARRAY_CONTAINS_OP || operator == OID_ARRAY_OVERLAP_OP)
504  selec = mcelem_array_contain_overlap_selec(mcelem, nmcelem,
505  numbers, nnumbers,
506  elem_values, nonnull_nitems,
507  operator, cmpfunc);
508  else if (operator == OID_ARRAY_CONTAINED_OP)
509  selec = mcelem_array_contained_selec(mcelem, nmcelem,
510  numbers, nnumbers,
511  elem_values, nonnull_nitems,
512  hist, nhist,
513  operator, cmpfunc);
514  else
515  {
516  elog(ERROR, "arraycontsel called for unrecognized operator %u",
517  operator);
518  selec = 0.0; /* keep compiler quiet */
519  }
520 
521  pfree(elem_values);
522  pfree(elem_nulls);
523  return selec;
524 }
525 
526 /*
527  * Estimate selectivity of "column @> const" and "column && const" based on
528  * most common element statistics. This estimation assumes element
529  * occurrences are independent.
530  *
531  * mcelem (of length nmcelem) and numbers (of length nnumbers) are from
532  * the array column's MCELEM statistics slot, or are NULL/0 if stats are
533  * not available. array_data (of length nitems) is the constant's elements.
534  *
535  * Both the mcelem and array_data arrays are assumed presorted according
536  * to the element type's cmpfunc. Null elements are not present.
537  *
538  * TODO: this estimate probably could be improved by using the distinct
539  * elements count histogram. For example, excepting the special case of
540  * "column @> '{}'", we can multiply the calculated selectivity by the
541  * fraction of nonempty arrays in the column.
542  */
543 static Selectivity
545  float4 *numbers, int nnumbers,
546  Datum *array_data, int nitems,
547  Oid operator, FmgrInfo *cmpfunc)
548 {
549  Selectivity selec,
550  elem_selec;
551  int mcelem_index,
552  i;
553  bool use_bsearch;
554  float4 minfreq;
555 
556  /*
557  * There should be three more Numbers than Values, because the last three
558  * cells should hold minimal and maximal frequency among the non-null
559  * elements, and then the frequency of null elements. Ignore the Numbers
560  * if not right.
561  */
562  if (nnumbers != nmcelem + 3)
563  {
564  numbers = NULL;
565  nnumbers = 0;
566  }
567 
568  if (numbers)
569  {
570  /* Grab the lowest observed frequency */
571  minfreq = numbers[nmcelem];
572  }
573  else
574  {
575  /* Without statistics make some default assumptions */
576  minfreq = 2 * (float4) DEFAULT_CONTAIN_SEL;
577  }
578 
579  /* Decide whether it is faster to use binary search or not. */
580  if (nitems * floor_log2((uint32) nmcelem) < nmcelem + nitems)
581  use_bsearch = true;
582  else
583  use_bsearch = false;
584 
585  if (operator == OID_ARRAY_CONTAINS_OP)
586  {
587  /*
588  * Initial selectivity for "column @> const" query is 1.0, and it will
589  * be decreased with each element of constant array.
590  */
591  selec = 1.0;
592  }
593  else
594  {
595  /*
596  * Initial selectivity for "column && const" query is 0.0, and it will
597  * be increased with each element of constant array.
598  */
599  selec = 0.0;
600  }
601 
602  /* Scan mcelem and array in parallel. */
603  mcelem_index = 0;
604  for (i = 0; i < nitems; i++)
605  {
606  bool match = false;
607 
608  /* Ignore any duplicates in the array data. */
609  if (i > 0 &&
610  element_compare(&array_data[i - 1], &array_data[i], cmpfunc) == 0)
611  continue;
612 
613  /* Find the smallest MCELEM >= this array item. */
614  if (use_bsearch)
615  {
616  match = find_next_mcelem(mcelem, nmcelem, array_data[i],
617  &mcelem_index, cmpfunc);
618  }
619  else
620  {
621  while (mcelem_index < nmcelem)
622  {
623  int cmp = element_compare(&mcelem[mcelem_index],
624  &array_data[i],
625  cmpfunc);
626 
627  if (cmp < 0)
628  mcelem_index++;
629  else
630  {
631  if (cmp == 0)
632  match = true; /* mcelem is found */
633  break;
634  }
635  }
636  }
637 
638  if (match && numbers)
639  {
640  /* MCELEM matches the array item; use its frequency. */
641  elem_selec = numbers[mcelem_index];
642  mcelem_index++;
643  }
644  else
645  {
646  /*
647  * The element is not in MCELEM. Punt, but assume that the
648  * selectivity cannot be more than minfreq / 2.
649  */
650  elem_selec = Min(DEFAULT_CONTAIN_SEL, minfreq / 2);
651  }
652 
653  /*
654  * Update overall selectivity using the current element's selectivity
655  * and an assumption of element occurrence independence.
656  */
657  if (operator == OID_ARRAY_CONTAINS_OP)
658  selec *= elem_selec;
659  else
660  selec = selec + elem_selec - selec * elem_selec;
661 
662  /* Clamp intermediate results to stay sane despite roundoff error */
663  CLAMP_PROBABILITY(selec);
664  }
665 
666  return selec;
667 }
668 
669 /*
670  * Estimate selectivity of "column <@ const" based on most common element
671  * statistics.
672  *
673  * mcelem (of length nmcelem) and numbers (of length nnumbers) are from
674  * the array column's MCELEM statistics slot, or are NULL/0 if stats are
675  * not available. array_data (of length nitems) is the constant's elements.
676  * hist (of length nhist) is from the array column's DECHIST statistics slot,
677  * or is NULL/0 if those stats are not available.
678  *
679  * Both the mcelem and array_data arrays are assumed presorted according
680  * to the element type's cmpfunc. Null elements are not present.
681  *
682  * Independent element occurrence would imply a particular distribution of
683  * distinct element counts among matching rows. Real data usually falsifies
684  * that assumption. For example, in a set of 11-element integer arrays having
685  * elements in the range [0..10], element occurrences are typically not
686  * independent. If they were, a sufficiently-large set would include all
687  * distinct element counts 0 through 11. We correct for this using the
688  * histogram of distinct element counts.
689  *
690  * In the "column @> const" and "column && const" cases, we usually have a
691  * "const" with low number of elements (otherwise we have selectivity close
692  * to 0 or 1 respectively). That's why the effect of dependence related
693  * to distinct element count distribution is negligible there. In the
694  * "column <@ const" case, number of elements is usually high (otherwise we
695  * have selectivity close to 0). That's why we should do a correction with
696  * the array distinct element count distribution here.
697  *
698  * Using the histogram of distinct element counts produces a different
699  * distribution law than independent occurrences of elements. This
700  * distribution law can be described as follows:
701  *
702  * P(o1, o2, ..., on) = f1^o1 * (1 - f1)^(1 - o1) * f2^o2 *
703  * (1 - f2)^(1 - o2) * ... * fn^on * (1 - fn)^(1 - on) * hist[m] / ind[m]
704  *
705  * where:
706  * o1, o2, ..., on - occurrences of elements 1, 2, ..., n
707  * (1 - occurrence, 0 - no occurrence) in row
708  * f1, f2, ..., fn - frequencies of elements 1, 2, ..., n
709  * (scalar values in [0..1]) according to collected statistics
710  * m = o1 + o2 + ... + on = total number of distinct elements in row
711  * hist[m] - histogram data for occurrence of m elements.
712  * ind[m] - probability of m occurrences from n events assuming their
713  * probabilities to be equal to frequencies of array elements.
714  *
715  * ind[m] = sum(f1^o1 * (1 - f1)^(1 - o1) * f2^o2 * (1 - f2)^(1 - o2) *
716  * ... * fn^on * (1 - fn)^(1 - on), o1, o2, ..., on) | o1 + o2 + .. on = m
717  */
718 static Selectivity
719 mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
720  float4 *numbers, int nnumbers,
721  Datum *array_data, int nitems,
722  float4 *hist, int nhist,
723  Oid operator, FmgrInfo *cmpfunc)
724 {
725  int mcelem_index,
726  i,
727  unique_nitems = 0;
728  float selec,
729  minfreq,
730  nullelem_freq;
731  float *dist,
732  *mcelem_dist,
733  *hist_part;
734  float avg_count,
735  mult,
736  rest;
737  float *elem_selec;
738 
739  /*
740  * There should be three more Numbers than Values in the MCELEM slot,
741  * because the last three cells should hold minimal and maximal frequency
742  * among the non-null elements, and then the frequency of null elements.
743  * Punt if not right, because we can't do much without the element freqs.
744  */
745  if (numbers == NULL || nnumbers != nmcelem + 3)
746  return DEFAULT_CONTAIN_SEL;
747 
748  /* Can't do much without a count histogram, either */
749  if (hist == NULL || nhist < 3)
750  return DEFAULT_CONTAIN_SEL;
751 
752  /*
753  * Grab some of the summary statistics that compute_array_stats() stores:
754  * lowest frequency, frequency of null elements, and average distinct
755  * element count.
756  */
757  minfreq = numbers[nmcelem];
758  nullelem_freq = numbers[nmcelem + 2];
759  avg_count = hist[nhist - 1];
760 
761  /*
762  * "rest" will be the sum of the frequencies of all elements not
763  * represented in MCELEM. The average distinct element count is the sum
764  * of the frequencies of *all* elements. Begin with that; we will proceed
765  * to subtract the MCELEM frequencies.
766  */
767  rest = avg_count;
768 
769  /*
770  * mult is a multiplier representing estimate of probability that each
771  * mcelem that is not present in constant doesn't occur.
772  */
773  mult = 1.0f;
774 
775  /*
776  * elem_selec is array of estimated frequencies for elements in the
777  * constant.
778  */
779  elem_selec = (float *) palloc(sizeof(float) * nitems);
780 
781  /* Scan mcelem and array in parallel. */
782  mcelem_index = 0;
783  for (i = 0; i < nitems; i++)
784  {
785  bool match = false;
786 
787  /* Ignore any duplicates in the array data. */
788  if (i > 0 &&
789  element_compare(&array_data[i - 1], &array_data[i], cmpfunc) == 0)
790  continue;
791 
792  /*
793  * Iterate over MCELEM until we find an entry greater than or equal to
794  * this element of the constant. Update "rest" and "mult" for mcelem
795  * entries skipped over.
796  */
797  while (mcelem_index < nmcelem)
798  {
799  int cmp = element_compare(&mcelem[mcelem_index],
800  &array_data[i],
801  cmpfunc);
802 
803  if (cmp < 0)
804  {
805  mult *= (1.0f - numbers[mcelem_index]);
806  rest -= numbers[mcelem_index];
807  mcelem_index++;
808  }
809  else
810  {
811  if (cmp == 0)
812  match = true; /* mcelem is found */
813  break;
814  }
815  }
816 
817  if (match)
818  {
819  /* MCELEM matches the array item. */
820  elem_selec[unique_nitems] = numbers[mcelem_index];
821  /* "rest" is decremented for all mcelems, matched or not */
822  rest -= numbers[mcelem_index];
823  mcelem_index++;
824  }
825  else
826  {
827  /*
828  * The element is not in MCELEM. Punt, but assume that the
829  * selectivity cannot be more than minfreq / 2.
830  */
831  elem_selec[unique_nitems] = Min(DEFAULT_CONTAIN_SEL,
832  minfreq / 2);
833  }
834 
835  unique_nitems++;
836  }
837 
838  /*
839  * If we handled all constant elements without exhausting the MCELEM
840  * array, finish walking it to complete calculation of "rest" and "mult".
841  */
842  while (mcelem_index < nmcelem)
843  {
844  mult *= (1.0f - numbers[mcelem_index]);
845  rest -= numbers[mcelem_index];
846  mcelem_index++;
847  }
848 
849  /*
850  * The presence of many distinct rare elements materially decreases
851  * selectivity. Use the Poisson distribution to estimate the probability
852  * of a column value having zero occurrences of such elements. See above
853  * for the definition of "rest".
854  */
855  mult *= exp(-rest);
856 
857  /*----------
858  * Using the distinct element count histogram requires
859  * O(unique_nitems * (nmcelem + unique_nitems))
860  * operations. Beyond a certain computational cost threshold, it's
861  * reasonable to sacrifice accuracy for decreased planning time. We limit
862  * the number of operations to EFFORT * nmcelem; since nmcelem is limited
863  * by the column's statistics target, the work done is user-controllable.
864  *
865  * If the number of operations would be too large, we can reduce it
866  * without losing all accuracy by reducing unique_nitems and considering
867  * only the most-common elements of the constant array. To make the
868  * results exactly match what we would have gotten with only those
869  * elements to start with, we'd have to remove any discarded elements'
870  * frequencies from "mult", but since this is only an approximation
871  * anyway, we don't bother with that. Therefore it's sufficient to qsort
872  * elem_selec[] and take the largest elements. (They will no longer match
873  * up with the elements of array_data[], but we don't care.)
874  *----------
875  */
876 #define EFFORT 100
877 
878  if ((nmcelem + unique_nitems) > 0 &&
879  unique_nitems > EFFORT * nmcelem / (nmcelem + unique_nitems))
880  {
881  /*
882  * Use the quadratic formula to solve for largest allowable N. We
883  * have A = 1, B = nmcelem, C = - EFFORT * nmcelem.
884  */
885  double b = (double) nmcelem;
886  int n;
887 
888  n = (int) ((sqrt(b * b + 4 * EFFORT * b) - b) / 2);
889 
890  /* Sort, then take just the first n elements */
891  qsort(elem_selec, unique_nitems, sizeof(float),
893  unique_nitems = n;
894  }
895 
896  /*
897  * Calculate probabilities of each distinct element count for both mcelems
898  * and constant elements. At this point, assume independent element
899  * occurrence.
900  */
901  dist = calc_distr(elem_selec, unique_nitems, unique_nitems, 0.0f);
902  mcelem_dist = calc_distr(numbers, nmcelem, unique_nitems, rest);
903 
904  /* ignore hist[nhist-1], which is the average not a histogram member */
905  hist_part = calc_hist(hist, nhist - 1, unique_nitems);
906 
907  selec = 0.0f;
908  for (i = 0; i <= unique_nitems; i++)
909  {
910  /*
911  * mult * dist[i] / mcelem_dist[i] gives us probability of qual
912  * matching from assumption of independent element occurrence with the
913  * condition that distinct element count = i.
914  */
915  if (mcelem_dist[i] > 0)
916  selec += hist_part[i] * mult * dist[i] / mcelem_dist[i];
917  }
918 
919  pfree(dist);
920  pfree(mcelem_dist);
921  pfree(hist_part);
922  pfree(elem_selec);
923 
924  /* Take into account occurrence of NULL element. */
925  selec *= (1.0f - nullelem_freq);
926 
927  CLAMP_PROBABILITY(selec);
928 
929  return selec;
930 }
931 
932 /*
933  * Calculate the first n distinct element count probabilities from a
934  * histogram of distinct element counts.
935  *
936  * Returns a palloc'd array of n+1 entries, with array[k] being the
937  * probability of element count k, k in [0..n].
938  *
939  * We assume that a histogram box with bounds a and b gives 1 / ((b - a + 1) *
940  * (nhist - 1)) probability to each value in (a,b) and an additional half of
941  * that to a and b themselves.
942  */
943 static float *
944 calc_hist(const float4 *hist, int nhist, int n)
945 {
946  float *hist_part;
947  int k,
948  i = 0;
949  float prev_interval = 0,
950  next_interval;
951  float frac;
952 
953  hist_part = (float *) palloc((n + 1) * sizeof(float));
954 
955  /*
956  * frac is a probability contribution for each interval between histogram
957  * values. We have nhist - 1 intervals, so contribution of each one will
958  * be 1 / (nhist - 1).
959  */
960  frac = 1.0f / ((float) (nhist - 1));
961 
962  for (k = 0; k <= n; k++)
963  {
964  int count = 0;
965 
966  /*
967  * Count the histogram boundaries equal to k. (Although the histogram
968  * should theoretically contain only exact integers, entries are
969  * floats so there could be roundoff error in large values. Treat any
970  * fractional value as equal to the next larger k.)
971  */
972  while (i < nhist && hist[i] <= k)
973  {
974  count++;
975  i++;
976  }
977 
978  if (count > 0)
979  {
980  /* k is an exact bound for at least one histogram box. */
981  float val;
982 
983  /* Find length between current histogram value and the next one */
984  if (i < nhist)
985  next_interval = hist[i] - hist[i - 1];
986  else
987  next_interval = 0;
988 
989  /*
990  * count - 1 histogram boxes contain k exclusively. They
991  * contribute a total of (count - 1) * frac probability. Also
992  * factor in the partial histogram boxes on either side.
993  */
994  val = (float) (count - 1);
995  if (next_interval > 0)
996  val += 0.5f / next_interval;
997  if (prev_interval > 0)
998  val += 0.5f / prev_interval;
999  hist_part[k] = frac * val;
1000 
1001  prev_interval = next_interval;
1002  }
1003  else
1004  {
1005  /* k does not appear as an exact histogram bound. */
1006  if (prev_interval > 0)
1007  hist_part[k] = frac / prev_interval;
1008  else
1009  hist_part[k] = 0.0f;
1010  }
1011  }
1012 
1013  return hist_part;
1014 }
1015 
1016 /*
1017  * Consider n independent events with probabilities p[]. This function
1018  * calculates probabilities of exact k of events occurrence for k in [0..m].
1019  * Returns a palloc'd array of size m+1.
1020  *
1021  * "rest" is the sum of the probabilities of all low-probability events not
1022  * included in p.
1023  *
1024  * Imagine matrix M of size (n + 1) x (m + 1). Element M[i,j] denotes the
1025  * probability that exactly j of first i events occur. Obviously M[0,0] = 1.
1026  * For any constant j, each increment of i increases the probability iff the
1027  * event occurs. So, by the law of total probability:
1028  * M[i,j] = M[i - 1, j] * (1 - p[i]) + M[i - 1, j - 1] * p[i]
1029  * for i > 0, j > 0.
1030  * M[i,0] = M[i - 1, 0] * (1 - p[i]) for i > 0.
1031  */
1032 static float *
1033 calc_distr(const float *p, int n, int m, float rest)
1034 {
1035  float *row,
1036  *prev_row,
1037  *tmp;
1038  int i,
1039  j;
1040 
1041  /*
1042  * Since we return only the last row of the matrix and need only the
1043  * current and previous row for calculations, allocate two rows.
1044  */
1045  row = (float *) palloc((m + 1) * sizeof(float));
1046  prev_row = (float *) palloc((m + 1) * sizeof(float));
1047 
1048  /* M[0,0] = 1 */
1049  row[0] = 1.0f;
1050  for (i = 1; i <= n; i++)
1051  {
1052  float t = p[i - 1];
1053 
1054  /* Swap rows */
1055  tmp = row;
1056  row = prev_row;
1057  prev_row = tmp;
1058 
1059  /* Calculate next row */
1060  for (j = 0; j <= i && j <= m; j++)
1061  {
1062  float val = 0.0f;
1063 
1064  if (j < i)
1065  val += prev_row[j] * (1.0f - t);
1066  if (j > 0)
1067  val += prev_row[j - 1] * t;
1068  row[j] = val;
1069  }
1070  }
1071 
1072  /*
1073  * The presence of many distinct rare (not in "p") elements materially
1074  * decreases selectivity. Model their collective occurrence with the
1075  * Poisson distribution.
1076  */
1077  if (rest > DEFAULT_CONTAIN_SEL)
1078  {
1079  float t;
1080 
1081  /* Swap rows */
1082  tmp = row;
1083  row = prev_row;
1084  prev_row = tmp;
1085 
1086  for (i = 0; i <= m; i++)
1087  row[i] = 0.0f;
1088 
1089  /* Value of Poisson distribution for 0 occurrences */
1090  t = exp(-rest);
1091 
1092  /*
1093  * Calculate convolution of previously computed distribution and the
1094  * Poisson distribution.
1095  */
1096  for (i = 0; i <= m; i++)
1097  {
1098  for (j = 0; j <= m - i; j++)
1099  row[j + i] += prev_row[j] * t;
1100 
1101  /* Get Poisson distribution value for (i + 1) occurrences */
1102  t *= rest / (float) (i + 1);
1103  }
1104  }
1105 
1106  pfree(prev_row);
1107  return row;
1108 }
1109 
1110 /* Fast function for floor value of 2 based logarithm calculation. */
1111 static int
1113 {
1114  int logval = 0;
1115 
1116  if (n == 0)
1117  return -1;
1118  if (n >= (1 << 16))
1119  {
1120  n >>= 16;
1121  logval += 16;
1122  }
1123  if (n >= (1 << 8))
1124  {
1125  n >>= 8;
1126  logval += 8;
1127  }
1128  if (n >= (1 << 4))
1129  {
1130  n >>= 4;
1131  logval += 4;
1132  }
1133  if (n >= (1 << 2))
1134  {
1135  n >>= 2;
1136  logval += 2;
1137  }
1138  if (n >= (1 << 1))
1139  {
1140  logval += 1;
1141  }
1142  return logval;
1143 }
1144 
1145 /*
1146  * find_next_mcelem binary-searches a most common elements array, starting
1147  * from *index, for the first member >= value. It saves the position of the
1148  * match into *index and returns true if it's an exact match. (Note: we
1149  * assume the mcelem elements are distinct so there can't be more than one
1150  * exact match.)
1151  */
1152 static bool
1153 find_next_mcelem(Datum *mcelem, int nmcelem, Datum value, int *index,
1154  FmgrInfo *cmpfunc)
1155 {
1156  int l = *index,
1157  r = nmcelem - 1,
1158  i,
1159  res;
1160 
1161  while (l <= r)
1162  {
1163  i = (l + r) / 2;
1164  res = element_compare(&mcelem[i], &value, cmpfunc);
1165  if (res == 0)
1166  {
1167  *index = i;
1168  return true;
1169  }
1170  else if (res < 0)
1171  l = i + 1;
1172  else
1173  r = i - 1;
1174  }
1175  *index = l;
1176  return false;
1177 }
1178 
1179 /*
1180  * Comparison function for elements.
1181  *
1182  * We use the element type's default btree opclass, and the default collation
1183  * if the type is collation-sensitive.
1184  *
1185  * XXX consider using SortSupport infrastructure
1186  */
1187 static int
1188 element_compare(const void *key1, const void *key2, void *arg)
1189 {
1190  Datum d1 = *((const Datum *) key1);
1191  Datum d2 = *((const Datum *) key2);
1192  FmgrInfo *cmpfunc = (FmgrInfo *) arg;
1193  Datum c;
1194 
1195  c = FunctionCall2Coll(cmpfunc, DEFAULT_COLLATION_OID, d1, d2);
1196  return DatumGetInt32(c);
1197 }
1198 
1199 /*
1200  * Comparison function for sorting floats into descending order.
1201  */
1202 static int
1203 float_compare_desc(const void *key1, const void *key2)
1204 {
1205  float d1 = *((const float *) key1);
1206  float d2 = *((const float *) key2);
1207 
1208  if (d1 > d2)
1209  return -1;
1210  else if (d1 < d2)
1211  return 1;
1212  else
1213  return 0;
1214 }
#define PG_GETARG_INT32(n)
Definition: fmgr.h:234
Definition: fmgr.h:56
#define IsA(nodeptr, _type_)
Definition: nodes.h:557
#define GETSTRUCT(TUP)
Definition: htup_details.h:656
static int element_compare(const void *key1, const void *key2, void *arg)
#define DatumGetInt32(X)
Definition: postgres.h:478
HeapTuple statsTuple
Definition: selfuncs.h:71
#define PointerGetDatum(X)
Definition: postgres.h:562
bool get_restriction_variable(PlannerInfo *root, List *args, int varRelid, VariableStatData *vardata, Node **other, bool *varonleft)
Definition: selfuncs.c:4445
#define DEFAULT_CONTAIN_SEL
RelOptInfo * rel
Definition: selfuncs.h:70
#define PG_RETURN_FLOAT8(x)
Definition: fmgr.h:326
#define Min(x, y)
Definition: c.h:806
Definition: nodes.h:506
static float * calc_hist(const float4 *hist, int nhist, int n)
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:241
double Selectivity
Definition: nodes.h:629
bool get_attstatsslot(HeapTuple statstuple, Oid atttype, int32 atttypmod, int reqkind, Oid reqop, Oid *actualop, Datum **values, int *nvalues, float4 **numbers, int *nnumbers)
Definition: lsyscache.c:2854
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1356
unsigned int Oid
Definition: postgres_ext.h:31
int16 typlen
Definition: typcache.h:35
FormData_pg_statistic * Form_pg_statistic
Definition: pg_statistic.h:129
bool typbyval
Definition: typcache.h:36
#define OidIsValid(objectId)
Definition: c.h:538
int32 atttypmod
Definition: selfuncs.h:76
Datum arraycontsel(PG_FUNCTION_ARGS)
Definition: type.h:90
#define CLAMP_PROBABILITY(p)
Definition: selfuncs.h:57
static struct @114 value
FmgrInfo cmp_proc_finfo
Definition: typcache.h:68
#define OID_ARRAY_CONTAINS_OP
Definition: pg_operator.h:1569
void pfree(void *pointer)
Definition: mcxt.c:950
#define ERROR
Definition: elog.h:43
double float8
Definition: c.h:381
#define STATISTIC_KIND_DECHIST
Definition: pg_statistic.h:270
#define OID_ARRAY_OVERLAP_OP
Definition: pg_operator.h:1566
char * c
#define PG_GETARG_OID(n)
Definition: fmgr.h:240
#define DEFAULT_COLLATION_OID
Definition: pg_collation.h:74
unsigned int uint32
Definition: c.h:268
static float * calc_distr(const float *p, int n, int m, float rest)
static Selectivity mcelem_array_selec(ArrayType *array, TypeCacheEntry *typentry, Datum *mcelem, int nmcelem, float4 *numbers, int nnumbers, float4 *hist, int nhist, Oid operator, FmgrInfo *cmpfunc)
void qsort_arg(void *base, size_t nel, size_t elsize, qsort_arg_comparator cmp, void *arg)
Definition: qsort_arg.c:113
#define OID_ARRAY_CONTAINED_OP
Definition: pg_operator.h:1572
float float4
Definition: c.h:380
uintptr_t Datum
Definition: postgres.h:372
TypeCacheEntry * lookup_type_cache(Oid type_id, int flags)
Definition: typcache.c:191
#define InvalidOid
Definition: postgres_ext.h:36
Oid fn_oid
Definition: fmgr.h:59
static bool find_next_mcelem(Datum *mcelem, int nmcelem, Datum value, int *index, FmgrInfo *cmpfunc)
#define HeapTupleIsValid(tuple)
Definition: htup.h:77
void examine_variable(PlannerInfo *root, Node *node, int varRelid, VariableStatData *vardata)
Definition: selfuncs.c:4564
#define EFFORT
#define NULL
Definition: c.h:229
Selectivity scalararraysel_containment(PlannerInfo *root, Node *leftop, Node *rightop, Oid elemtype, bool isEquality, bool useOr, int varRelid)
static int floor_log2(uint32 n)
static Selectivity calc_arraycontsel(VariableStatData *vardata, Datum constval, Oid elemtype, Oid operator)
static Selectivity mcelem_array_contained_selec(Datum *mcelem, int nmcelem, float4 *numbers, int nnumbers, Datum *array_data, int nitems, float4 *hist, int nhist, Oid operator, FmgrInfo *cmpfunc)
void deconstruct_array(ArrayType *array, Oid elmtype, int elmlen, bool elmbyval, char elmalign, Datum **elemsp, bool **nullsp, int *nelemsp)
Definition: arrayfuncs.c:3475
static Datum values[MAXATTR]
Definition: bootstrap.c:162
Oid get_base_element_type(Oid typid)
Definition: lsyscache.c:2525
#define DEFAULT_SEL(operator)
#define ReleaseVariableStats(vardata)
Definition: selfuncs.h:80
char typalign
Definition: typcache.h:37
void * palloc(Size size)
Definition: mcxt.c:849
int i
#define STATISTIC_KIND_MCELEM
Definition: pg_statistic.h:257
void * arg
#define PG_FUNCTION_ARGS
Definition: fmgr.h:158
#define TYPECACHE_CMP_PROC_FINFO
Definition: typcache.h:116
#define elog
Definition: elog.h:219
#define qsort(a, b, c, d)
Definition: port.h:440
Definition: pg_list.h:45
long val
Definition: informix.c:689
void free_attstatsslot(Oid atttype, Datum *values, int nvalues, float4 *numbers, int nnumbers)
Definition: lsyscache.c:2978
Datum arraycontjoinsel(PG_FUNCTION_ARGS)
static int cmp(const chr *x, const chr *y, size_t len)
Definition: regc_locale.c:742
static int float_compare_desc(const void *key1, const void *key2)
static Selectivity mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem, float4 *numbers, int nnumbers, Datum *array_data, int nitems, Oid operator, FmgrInfo *cmpfunc)
#define DatumGetArrayTypeP(X)
Definition: array.h:242