PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
rangetypes_selfuncs.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * rangetypes_selfuncs.c
4  * Functions for selectivity estimation of range operators
5  *
6  * Estimates are based on histograms of lower and upper bounds, and the
7  * fraction of empty ranges.
8  *
9  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
10  * Portions Copyright (c) 1994, Regents of the University of California
11  *
12  *
13  * IDENTIFICATION
14  * src/backend/utils/adt/rangetypes_selfuncs.c
15  *
16  *-------------------------------------------------------------------------
17  */
18 #include "postgres.h"
19 
20 #include "access/htup_details.h"
21 #include "catalog/pg_operator.h"
22 #include "catalog/pg_statistic.h"
23 #include "catalog/pg_type.h"
24 #include "utils/builtins.h"
25 #include "utils/lsyscache.h"
26 #include "utils/rangetypes.h"
27 #include "utils/selfuncs.h"
28 #include "utils/typcache.h"
29 
30 static double calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
31  RangeType *constval, Oid operator);
32 static double default_range_selectivity(Oid operator);
33 static double calc_hist_selectivity(TypeCacheEntry *typcache,
34  VariableStatData *vardata, RangeType *constval,
35  Oid operator);
36 static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache,
37  RangeBound *constbound,
38  RangeBound *hist, int hist_nvalues,
39  bool equal);
40 static int rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value,
41  RangeBound *hist, int hist_length, bool equal);
43  RangeBound *hist1, RangeBound *hist2);
44 static float8 get_len_position(double value, double hist1, double hist2);
45 static float8 get_distance(TypeCacheEntry *typcache, RangeBound *bound1,
46  RangeBound *bound2);
47 static int length_hist_bsearch(Datum *length_hist_values,
48  int length_hist_nvalues, double value, bool equal);
49 static double calc_length_hist_frac(Datum *length_hist_values,
50  int length_hist_nvalues, double length1, double length2, bool equal);
51 static double calc_hist_selectivity_contained(TypeCacheEntry *typcache,
53  RangeBound *hist_lower, int hist_nvalues,
54  Datum *length_hist_values, int length_hist_nvalues);
55 static double calc_hist_selectivity_contains(TypeCacheEntry *typcache,
57  RangeBound *hist_lower, int hist_nvalues,
58  Datum *length_hist_values, int length_hist_nvalues);
59 
60 /*
61  * Returns a default selectivity estimate for given operator, when we don't
62  * have statistics or cannot use them for some reason.
63  */
64 static double
66 {
67  switch (operator)
68  {
70  return 0.01;
71 
74  return 0.005;
75 
78 
79  /*
80  * "range @> elem" is more or less identical to a scalar
81  * inequality "A >= b AND A <= c".
82  */
84 
85  case OID_RANGE_LESS_OP:
89  case OID_RANGE_LEFT_OP:
90  case OID_RANGE_RIGHT_OP:
93  /* these are similar to regular scalar inequalities */
94  return DEFAULT_INEQ_SEL;
95 
96  default:
97  /* all range operators should be handled above, but just in case */
98  return 0.01;
99  }
100 }
101 
102 /*
103  * rangesel -- restriction selectivity for range operators
104  */
105 Datum
107 {
109  Oid operator = PG_GETARG_OID(1);
110  List *args = (List *) PG_GETARG_POINTER(2);
111  int varRelid = PG_GETARG_INT32(3);
112  VariableStatData vardata;
113  Node *other;
114  bool varonleft;
115  Selectivity selec;
116  TypeCacheEntry *typcache = NULL;
117  RangeType *constrange = NULL;
118 
119  /*
120  * If expression is not (variable op something) or (something op
121  * variable), then punt and return a default estimate.
122  */
123  if (!get_restriction_variable(root, args, varRelid,
124  &vardata, &other, &varonleft))
126 
127  /*
128  * Can't do anything useful if the something is not a constant, either.
129  */
130  if (!IsA(other, Const))
131  {
132  ReleaseVariableStats(vardata);
134  }
135 
136  /*
137  * All the range operators are strict, so we can cope with a NULL constant
138  * right away.
139  */
140  if (((Const *) other)->constisnull)
141  {
142  ReleaseVariableStats(vardata);
143  PG_RETURN_FLOAT8(0.0);
144  }
145 
146  /*
147  * If var is on the right, commute the operator, so that we can assume the
148  * var is on the left in what follows.
149  */
150  if (!varonleft)
151  {
152  /* we have other Op var, commute to make var Op other */
153  operator = get_commutator(operator);
154  if (!operator)
155  {
156  /* Use default selectivity (should we raise an error instead?) */
157  ReleaseVariableStats(vardata);
159  }
160  }
161 
162  /*
163  * OK, there's a Var and a Const we're dealing with here. We need the
164  * Const to be of same range type as the column, else we can't do anything
165  * useful. (Such cases will likely fail at runtime, but here we'd rather
166  * just return a default estimate.)
167  *
168  * If the operator is "range @> element", the constant should be of the
169  * element type of the range column. Convert it to a range that includes
170  * only that single point, so that we don't need special handling for that
171  * in what follows.
172  */
173  if (operator == OID_RANGE_CONTAINS_ELEM_OP)
174  {
175  typcache = range_get_typcache(fcinfo, vardata.vartype);
176 
177  if (((Const *) other)->consttype == typcache->rngelemtype->type_id)
178  {
180  upper;
181 
182  lower.inclusive = true;
183  lower.val = ((Const *) other)->constvalue;
184  lower.infinite = false;
185  lower.lower = true;
186  upper.inclusive = true;
187  upper.val = ((Const *) other)->constvalue;
188  upper.infinite = false;
189  upper.lower = false;
190  constrange = range_serialize(typcache, &lower, &upper, false);
191  }
192  }
193  else if (operator == OID_RANGE_ELEM_CONTAINED_OP)
194  {
195  /*
196  * Here, the Var is the elem, not the range. For now we just punt and
197  * return the default estimate. In future we could disassemble the
198  * range constant and apply scalarineqsel ...
199  */
200  }
201  else if (((Const *) other)->consttype == vardata.vartype)
202  {
203  /* Both sides are the same range type */
204  typcache = range_get_typcache(fcinfo, vardata.vartype);
205 
206  constrange = DatumGetRangeType(((Const *) other)->constvalue);
207  }
208 
209  /*
210  * If we got a valid constant on one side of the operator, proceed to
211  * estimate using statistics. Otherwise punt and return a default constant
212  * estimate. Note that calc_rangesel need not handle
213  * OID_RANGE_ELEM_CONTAINED_OP.
214  */
215  if (constrange)
216  selec = calc_rangesel(typcache, &vardata, constrange, operator);
217  else
218  selec = default_range_selectivity(operator);
219 
220  ReleaseVariableStats(vardata);
221 
222  CLAMP_PROBABILITY(selec);
223 
224  PG_RETURN_FLOAT8((float8) selec);
225 }
226 
227 static double
229  RangeType *constval, Oid operator)
230 {
231  double hist_selec;
232  double selec;
233  float4 empty_frac,
234  null_frac;
235 
236  /*
237  * First look up the fraction of NULLs and empty ranges from pg_statistic.
238  */
239  if (HeapTupleIsValid(vardata->statsTuple))
240  {
241  Form_pg_statistic stats;
242  AttStatsSlot sslot;
243 
244  stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
245  null_frac = stats->stanullfrac;
246 
247  /* Try to get fraction of empty ranges */
248  if (get_attstatsslot(&sslot, vardata->statsTuple,
250  InvalidOid,
252  {
253  if (sslot.nnumbers != 1)
254  elog(ERROR, "invalid empty fraction statistic"); /* shouldn't happen */
255  empty_frac = sslot.numbers[0];
256  free_attstatsslot(&sslot);
257  }
258  else
259  {
260  /* No empty fraction statistic. Assume no empty ranges. */
261  empty_frac = 0.0;
262  }
263  }
264  else
265  {
266  /*
267  * No stats are available. Follow through the calculations below
268  * anyway, assuming no NULLs and no empty ranges. This still allows us
269  * to give a better-than-nothing estimate based on whether the
270  * constant is an empty range or not.
271  */
272  null_frac = 0.0;
273  empty_frac = 0.0;
274  }
275 
276  if (RangeIsEmpty(constval))
277  {
278  /*
279  * An empty range matches all ranges, all empty ranges, or nothing,
280  * depending on the operator
281  */
282  switch (operator)
283  {
284  /* these return false if either argument is empty */
288  case OID_RANGE_LEFT_OP:
289  case OID_RANGE_RIGHT_OP:
290  /* nothing is less than an empty range */
291  case OID_RANGE_LESS_OP:
292  selec = 0.0;
293  break;
294 
295  /* only empty ranges can be contained by an empty range */
297  /* only empty ranges are <= an empty range */
299  selec = empty_frac;
300  break;
301 
302  /* everything contains an empty range */
304  /* everything is >= an empty range */
306  selec = 1.0;
307  break;
308 
309  /* all non-empty ranges are > an empty range */
311  selec = 1.0 - empty_frac;
312  break;
313 
314  /* an element cannot be empty */
316  default:
317  elog(ERROR, "unexpected operator %u", operator);
318  selec = 0.0; /* keep compiler quiet */
319  break;
320  }
321  }
322  else
323  {
324  /*
325  * Calculate selectivity using bound histograms. If that fails for
326  * some reason, e.g no histogram in pg_statistic, use the default
327  * constant estimate for the fraction of non-empty values. This is
328  * still somewhat better than just returning the default estimate,
329  * because this still takes into account the fraction of empty and
330  * NULL tuples, if we had statistics for them.
331  */
332  hist_selec = calc_hist_selectivity(typcache, vardata, constval,
333  operator);
334  if (hist_selec < 0.0)
335  hist_selec = default_range_selectivity(operator);
336 
337  /*
338  * Now merge the results for the empty ranges and histogram
339  * calculations, realizing that the histogram covers only the
340  * non-null, non-empty values.
341  */
342  if (operator == OID_RANGE_CONTAINED_OP)
343  {
344  /* empty is contained by anything non-empty */
345  selec = (1.0 - empty_frac) * hist_selec + empty_frac;
346  }
347  else
348  {
349  /* with any other operator, empty Op non-empty matches nothing */
350  selec = (1.0 - empty_frac) * hist_selec;
351  }
352  }
353 
354  /* all range operators are strict */
355  selec *= (1.0 - null_frac);
356 
357  /* result should be in range, but make sure... */
358  CLAMP_PROBABILITY(selec);
359 
360  return selec;
361 }
362 
363 /*
364  * Calculate range operator selectivity using histograms of range bounds.
365  *
366  * This estimate is for the portion of values that are not empty and not
367  * NULL.
368  */
369 static double
371  RangeType *constval, Oid operator)
372 {
373  AttStatsSlot hslot;
374  AttStatsSlot lslot;
375  int nhist;
376  RangeBound *hist_lower;
377  RangeBound *hist_upper;
378  int i;
379  RangeBound const_lower;
380  RangeBound const_upper;
381  bool empty;
382  double hist_selec;
383 
384  /* Can't use the histogram with insecure range support functions */
385  if (!statistic_proc_security_check(vardata,
386  typcache->rng_cmp_proc_finfo.fn_oid))
387  return -1;
388  if (OidIsValid(typcache->rng_subdiff_finfo.fn_oid) &&
390  typcache->rng_subdiff_finfo.fn_oid))
391  return -1;
392 
393  /* Try to get histogram of ranges */
394  if (!(HeapTupleIsValid(vardata->statsTuple) &&
395  get_attstatsslot(&hslot, vardata->statsTuple,
398  return -1.0;
399 
400  /*
401  * Convert histogram of ranges into histograms of its lower and upper
402  * bounds.
403  */
404  nhist = hslot.nvalues;
405  hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
406  hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
407  for (i = 0; i < nhist; i++)
408  {
409  range_deserialize(typcache, DatumGetRangeType(hslot.values[i]),
410  &hist_lower[i], &hist_upper[i], &empty);
411  /* The histogram should not contain any empty ranges */
412  if (empty)
413  elog(ERROR, "bounds histogram contains an empty range");
414  }
415 
416  /* @> and @< also need a histogram of range lengths */
417  if (operator == OID_RANGE_CONTAINS_OP ||
418  operator == OID_RANGE_CONTAINED_OP)
419  {
420  if (!(HeapTupleIsValid(vardata->statsTuple) &&
421  get_attstatsslot(&lslot, vardata->statsTuple,
423  InvalidOid,
425  {
426  free_attstatsslot(&hslot);
427  return -1.0;
428  }
429 
430  /* check that it's a histogram, not just a dummy entry */
431  if (lslot.nvalues < 2)
432  {
433  free_attstatsslot(&lslot);
434  free_attstatsslot(&hslot);
435  return -1.0;
436  }
437  }
438  else
439  memset(&lslot, 0, sizeof(lslot));
440 
441  /* Extract the bounds of the constant value. */
442  range_deserialize(typcache, constval, &const_lower, &const_upper, &empty);
443  Assert(!empty);
444 
445  /*
446  * Calculate selectivity comparing the lower or upper bound of the
447  * constant with the histogram of lower or upper bounds.
448  */
449  switch (operator)
450  {
451  case OID_RANGE_LESS_OP:
452 
453  /*
454  * The regular b-tree comparison operators (<, <=, >, >=) compare
455  * the lower bounds first, and the upper bounds for values with
456  * equal lower bounds. Estimate that by comparing the lower bounds
457  * only. This gives a fairly accurate estimate assuming there
458  * aren't many rows with a lower bound equal to the constant's
459  * lower bound.
460  */
461  hist_selec =
462  calc_hist_selectivity_scalar(typcache, &const_lower,
463  hist_lower, nhist, false);
464  break;
465 
467  hist_selec =
468  calc_hist_selectivity_scalar(typcache, &const_lower,
469  hist_lower, nhist, true);
470  break;
471 
473  hist_selec =
474  1 - calc_hist_selectivity_scalar(typcache, &const_lower,
475  hist_lower, nhist, false);
476  break;
477 
479  hist_selec =
480  1 - calc_hist_selectivity_scalar(typcache, &const_lower,
481  hist_lower, nhist, true);
482  break;
483 
484  case OID_RANGE_LEFT_OP:
485  /* var << const when upper(var) < lower(const) */
486  hist_selec =
487  calc_hist_selectivity_scalar(typcache, &const_lower,
488  hist_upper, nhist, false);
489  break;
490 
491  case OID_RANGE_RIGHT_OP:
492  /* var >> const when lower(var) > upper(const) */
493  hist_selec =
494  1 - calc_hist_selectivity_scalar(typcache, &const_upper,
495  hist_lower, nhist, true);
496  break;
497 
499  /* compare lower bounds */
500  hist_selec =
501  1 - calc_hist_selectivity_scalar(typcache, &const_lower,
502  hist_lower, nhist, false);
503  break;
504 
506  /* compare upper bounds */
507  hist_selec =
508  calc_hist_selectivity_scalar(typcache, &const_upper,
509  hist_upper, nhist, true);
510  break;
511 
514 
515  /*
516  * A && B <=> NOT (A << B OR A >> B).
517  *
518  * Since A << B and A >> B are mutually exclusive events we can
519  * sum their probabilities to find probability of (A << B OR A >>
520  * B).
521  *
522  * "range @> elem" is equivalent to "range && [elem,elem]". The
523  * caller already constructed the singular range from the element
524  * constant, so just treat it the same as &&.
525  */
526  hist_selec =
527  calc_hist_selectivity_scalar(typcache, &const_lower, hist_upper,
528  nhist, false);
529  hist_selec +=
530  (1.0 - calc_hist_selectivity_scalar(typcache, &const_upper, hist_lower,
531  nhist, true));
532  hist_selec = 1.0 - hist_selec;
533  break;
534 
536  hist_selec =
537  calc_hist_selectivity_contains(typcache, &const_lower,
538  &const_upper, hist_lower, nhist,
539  lslot.values, lslot.nvalues);
540  break;
541 
543  if (const_lower.infinite)
544  {
545  /*
546  * Lower bound no longer matters. Just estimate the fraction
547  * with an upper bound <= const upper bound
548  */
549  hist_selec =
550  calc_hist_selectivity_scalar(typcache, &const_upper,
551  hist_upper, nhist, true);
552  }
553  else if (const_upper.infinite)
554  {
555  hist_selec =
556  1.0 - calc_hist_selectivity_scalar(typcache, &const_lower,
557  hist_lower, nhist, false);
558  }
559  else
560  {
561  hist_selec =
562  calc_hist_selectivity_contained(typcache, &const_lower,
563  &const_upper, hist_lower, nhist,
564  lslot.values, lslot.nvalues);
565  }
566  break;
567 
568  default:
569  elog(ERROR, "unknown range operator %u", operator);
570  hist_selec = -1.0; /* keep compiler quiet */
571  break;
572  }
573 
574  free_attstatsslot(&lslot);
575  free_attstatsslot(&hslot);
576 
577  return hist_selec;
578 }
579 
580 
581 /*
582  * Look up the fraction of values less than (or equal, if 'equal' argument
583  * is true) a given const in a histogram of range bounds.
584  */
585 static double
587  RangeBound *hist, int hist_nvalues, bool equal)
588 {
589  Selectivity selec;
590  int index;
591 
592  /*
593  * Find the histogram bin the given constant falls into. Estimate
594  * selectivity as the number of preceding whole bins.
595  */
596  index = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal);
597  selec = (Selectivity) (Max(index, 0)) / (Selectivity) (hist_nvalues - 1);
598 
599  /* Adjust using linear interpolation within the bin */
600  if (index >= 0 && index < hist_nvalues - 1)
601  selec += get_position(typcache, constbound, &hist[index],
602  &hist[index + 1]) / (Selectivity) (hist_nvalues - 1);
603 
604  return selec;
605 }
606 
607 /*
608  * Binary search on an array of range bounds. Returns greatest index of range
609  * bound in array which is less(less or equal) than given range bound. If all
610  * range bounds in array are greater or equal(greater) than given range bound,
611  * return -1. When "equal" flag is set conditions in brackets are used.
612  *
613  * This function is used in scalar operator selectivity estimation. Another
614  * goal of this function is to find a histogram bin where to stop
615  * interpolation of portion of bounds which are less or equal to given bound.
616  */
617 static int
619  int hist_length, bool equal)
620 {
621  int lower = -1,
622  upper = hist_length - 1,
623  cmp,
624  middle;
625 
626  while (lower < upper)
627  {
628  middle = (lower + upper + 1) / 2;
629  cmp = range_cmp_bounds(typcache, &hist[middle], value);
630 
631  if (cmp < 0 || (equal && cmp == 0))
632  lower = middle;
633  else
634  upper = middle - 1;
635  }
636  return lower;
637 }
638 
639 
640 /*
641  * Binary search on length histogram. Returns greatest index of range length in
642  * histogram which is less than (less than or equal) the given length value. If
643  * all lengths in the histogram are greater than (greater than or equal) the
644  * given length, returns -1.
645  */
646 static int
647 length_hist_bsearch(Datum *length_hist_values, int length_hist_nvalues,
648  double value, bool equal)
649 {
650  int lower = -1,
651  upper = length_hist_nvalues - 1,
652  middle;
653 
654  while (lower < upper)
655  {
656  double middleval;
657 
658  middle = (lower + upper + 1) / 2;
659 
660  middleval = DatumGetFloat8(length_hist_values[middle]);
661  if (middleval < value || (equal && middleval <= value))
662  lower = middle;
663  else
664  upper = middle - 1;
665  }
666  return lower;
667 }
668 
669 /*
670  * Get relative position of value in histogram bin in [0,1] range.
671  */
672 static float8
674  RangeBound *hist2)
675 {
676  bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
677  float8 position;
678 
679  if (!hist1->infinite && !hist2->infinite)
680  {
681  float8 bin_width;
682 
683  /*
684  * Both bounds are finite. Assuming the subtype's comparison function
685  * works sanely, the value must be finite, too, because it lies
686  * somewhere between the bounds. If it doesn't, just return something.
687  */
688  if (value->infinite)
689  return 0.5;
690 
691  /* Can't interpolate without subdiff function */
692  if (!has_subdiff)
693  return 0.5;
694 
695  /* Calculate relative position using subdiff function. */
696  bin_width = DatumGetFloat8(FunctionCall2Coll(
697  &typcache->rng_subdiff_finfo,
698  typcache->rng_collation,
699  hist2->val,
700  hist1->val));
701  if (bin_width <= 0.0)
702  return 0.5; /* zero width bin */
703 
705  &typcache->rng_subdiff_finfo,
706  typcache->rng_collation,
707  value->val,
708  hist1->val))
709  / bin_width;
710 
711  /* Relative position must be in [0,1] range */
712  position = Max(position, 0.0);
713  position = Min(position, 1.0);
714  return position;
715  }
716  else if (hist1->infinite && !hist2->infinite)
717  {
718  /*
719  * Lower bin boundary is -infinite, upper is finite. If the value is
720  * -infinite, return 0.0 to indicate it's equal to the lower bound.
721  * Otherwise return 1.0 to indicate it's infinitely far from the lower
722  * bound.
723  */
724  return ((value->infinite && value->lower) ? 0.0 : 1.0);
725  }
726  else if (!hist1->infinite && hist2->infinite)
727  {
728  /* same as above, but in reverse */
729  return ((value->infinite && !value->lower) ? 1.0 : 0.0);
730  }
731  else
732  {
733  /*
734  * If both bin boundaries are infinite, they should be equal to each
735  * other, and the value should also be infinite and equal to both
736  * bounds. (But don't Assert that, to avoid crashing if a user creates
737  * a datatype with a broken comparison function).
738  *
739  * Assume the value to lie in the middle of the infinite bounds.
740  */
741  return 0.5;
742  }
743 }
744 
745 
746 /*
747  * Get relative position of value in a length histogram bin in [0,1] range.
748  */
749 static double
750 get_len_position(double value, double hist1, double hist2)
751 {
752  if (!is_infinite(hist1) && !is_infinite(hist2))
753  {
754  /*
755  * Both bounds are finite. The value should be finite too, because it
756  * lies somewhere between the bounds. If it doesn't, just return
757  * something.
758  */
759  if (is_infinite(value))
760  return 0.5;
761 
762  return 1.0 - (hist2 - value) / (hist2 - hist1);
763  }
764  else if (is_infinite(hist1) && !is_infinite(hist2))
765  {
766  /*
767  * Lower bin boundary is -infinite, upper is finite. Return 1.0 to
768  * indicate the value is infinitely far from the lower bound.
769  */
770  return 1.0;
771  }
772  else if (is_infinite(hist1) && is_infinite(hist2))
773  {
774  /* same as above, but in reverse */
775  return 0.0;
776  }
777  else
778  {
779  /*
780  * If both bin boundaries are infinite, they should be equal to each
781  * other, and the value should also be infinite and equal to both
782  * bounds. (But don't Assert that, to avoid crashing unnecessarily if
783  * the caller messes up)
784  *
785  * Assume the value to lie in the middle of the infinite bounds.
786  */
787  return 0.5;
788  }
789 }
790 
791 /*
792  * Measure distance between two range bounds.
793  */
794 static float8
795 get_distance(TypeCacheEntry *typcache, RangeBound *bound1, RangeBound *bound2)
796 {
797  bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
798 
799  if (!bound1->infinite && !bound2->infinite)
800  {
801  /*
802  * No bounds are infinite, use subdiff function or return default
803  * value of 1.0 if no subdiff is available.
804  */
805  if (has_subdiff)
806  return
808  typcache->rng_collation,
809  bound2->val,
810  bound1->val));
811  else
812  return 1.0;
813  }
814  else if (bound1->infinite && bound2->infinite)
815  {
816  /* Both bounds are infinite */
817  if (bound1->lower == bound2->lower)
818  return 0.0;
819  else
820  return get_float8_infinity();
821  }
822  else
823  {
824  /* One bound is infinite, another is not */
825  return get_float8_infinity();
826  }
827 }
828 
829 /*
830  * Calculate the average of function P(x), in the interval [length1, length2],
831  * where P(x) is the fraction of tuples with length < x (or length <= x if
832  * 'equal' is true).
833  */
834 static double
835 calc_length_hist_frac(Datum *length_hist_values, int length_hist_nvalues,
836  double length1, double length2, bool equal)
837 {
838  double frac;
839  double A,
840  B,
841  PA,
842  PB;
843  double pos;
844  int i;
845  double area;
846 
847  Assert(length2 >= length1);
848 
849  if (length2 < 0.0)
850  return 0.0; /* shouldn't happen, but doesn't hurt to check */
851 
852  /* All lengths in the table are <= infinite. */
853  if (is_infinite(length2) && equal)
854  return 1.0;
855 
856  /*----------
857  * The average of a function between A and B can be calculated by the
858  * formula:
859  *
860  * B
861  * 1 /
862  * ------- | P(x)dx
863  * B - A /
864  * A
865  *
866  * The geometrical interpretation of the integral is the area under the
867  * graph of P(x). P(x) is defined by the length histogram. We calculate
868  * the area in a piecewise fashion, iterating through the length histogram
869  * bins. Each bin is a trapezoid:
870  *
871  * P(x2)
872  * /|
873  * / |
874  * P(x1)/ |
875  * | |
876  * | |
877  * ---+---+--
878  * x1 x2
879  *
880  * where x1 and x2 are the boundaries of the current histogram, and P(x1)
881  * and P(x1) are the cumulative fraction of tuples at the boundaries.
882  *
883  * The area of each trapezoid is 1/2 * (P(x2) + P(x1)) * (x2 - x1)
884  *
885  * The first bin contains the lower bound passed by the caller, so we
886  * use linear interpolation between the previous and next histogram bin
887  * boundary to calculate P(x1). Likewise for the last bin: we use linear
888  * interpolation to calculate P(x2). For the bins in between, x1 and x2
889  * lie on histogram bin boundaries, so P(x1) and P(x2) are simply:
890  * P(x1) = (bin index) / (number of bins)
891  * P(x2) = (bin index + 1 / (number of bins)
892  */
893 
894  /* First bin, the one that contains lower bound */
895  i = length_hist_bsearch(length_hist_values, length_hist_nvalues, length1, equal);
896  if (i >= length_hist_nvalues - 1)
897  return 1.0;
898 
899  if (i < 0)
900  {
901  i = 0;
902  pos = 0.0;
903  }
904  else
905  {
906  /* interpolate length1's position in the bin */
907  pos = get_len_position(length1,
908  DatumGetFloat8(length_hist_values[i]),
909  DatumGetFloat8(length_hist_values[i + 1]));
910  }
911  PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
912  B = length1;
913 
914  /*
915  * In the degenerate case that length1 == length2, simply return
916  * P(length1). This is not merely an optimization: if length1 == length2,
917  * we'd divide by zero later on.
918  */
919  if (length2 == length1)
920  return PB;
921 
922  /*
923  * Loop through all the bins, until we hit the last bin, the one that
924  * contains the upper bound. (if lower and upper bounds are in the same
925  * bin, this falls out immediately)
926  */
927  area = 0.0;
928  for (; i < length_hist_nvalues - 1; i++)
929  {
930  double bin_upper = DatumGetFloat8(length_hist_values[i + 1]);
931 
932  /* check if we've reached the last bin */
933  if (!(bin_upper < length2 || (equal && bin_upper <= length2)))
934  break;
935 
936  /* the upper bound of previous bin is the lower bound of this bin */
937  A = B;
938  PA = PB;
939 
940  B = bin_upper;
941  PB = (double) i / (double) (length_hist_nvalues - 1);
942 
943  /*
944  * Add the area of this trapezoid to the total. The point of the
945  * if-check is to avoid NaN, in the corner case that PA == PB == 0,
946  * and B - A == Inf. The area of a zero-height trapezoid (PA == PB ==
947  * 0) is zero, regardless of the width (B - A).
948  */
949  if (PA > 0 || PB > 0)
950  area += 0.5 * (PB + PA) * (B - A);
951  }
952 
953  /* Last bin */
954  A = B;
955  PA = PB;
956 
957  B = length2; /* last bin ends at the query upper bound */
958  if (i >= length_hist_nvalues - 1)
959  pos = 0.0;
960  else
961  {
962  if (DatumGetFloat8(length_hist_values[i]) == DatumGetFloat8(length_hist_values[i + 1]))
963  pos = 0.0;
964  else
965  pos = get_len_position(length2, DatumGetFloat8(length_hist_values[i]), DatumGetFloat8(length_hist_values[i + 1]));
966  }
967  PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
968 
969  if (PA > 0 || PB > 0)
970  area += 0.5 * (PB + PA) * (B - A);
971 
972  /*
973  * Ok, we have calculated the area, ie. the integral. Divide by width to
974  * get the requested average.
975  *
976  * Avoid NaN arising from infinite / infinite. This happens at least if
977  * length2 is infinite. It's not clear what the correct value would be in
978  * that case, so 0.5 seems as good as any value.
979  */
980  if (is_infinite(area) && is_infinite(length2))
981  frac = 0.5;
982  else
983  frac = area / (length2 - length1);
984 
985  return frac;
986 }
987 
988 /*
989  * Calculate selectivity of "var <@ const" operator, ie. estimate the fraction
990  * of ranges that fall within the constant lower and upper bounds. This uses
991  * the histograms of range lower bounds and range lengths, on the assumption
992  * that the range lengths are independent of the lower bounds.
993  *
994  * The caller has already checked that constant lower and upper bounds are
995  * finite.
996  */
997 static double
1000  RangeBound *hist_lower, int hist_nvalues,
1001  Datum *length_hist_values, int length_hist_nvalues)
1002 {
1003  int i,
1004  upper_index;
1005  float8 prev_dist;
1006  double bin_width;
1007  double upper_bin_width;
1008  double sum_frac;
1009 
1010  /*
1011  * Begin by finding the bin containing the upper bound, in the lower bound
1012  * histogram. Any range with a lower bound > constant upper bound can't
1013  * match, ie. there are no matches in bins greater than upper_index.
1014  */
1015  upper->inclusive = !upper->inclusive;
1016  upper->lower = true;
1017  upper_index = rbound_bsearch(typcache, upper, hist_lower, hist_nvalues,
1018  false);
1019 
1020  /*
1021  * Calculate upper_bin_width, ie. the fraction of the (upper_index,
1022  * upper_index + 1) bin which is greater than upper bound of query range
1023  * using linear interpolation of subdiff function.
1024  */
1025  if (upper_index >= 0 && upper_index < hist_nvalues - 1)
1026  upper_bin_width = get_position(typcache, upper,
1027  &hist_lower[upper_index],
1028  &hist_lower[upper_index + 1]);
1029  else
1030  upper_bin_width = 0.0;
1031 
1032  /*
1033  * In the loop, dist and prev_dist are the distance of the "current" bin's
1034  * lower and upper bounds from the constant upper bound.
1035  *
1036  * bin_width represents the width of the current bin. Normally it is 1.0,
1037  * meaning a full width bin, but can be less in the corner cases: start
1038  * and end of the loop. We start with bin_width = upper_bin_width, because
1039  * we begin at the bin containing the upper bound.
1040  */
1041  prev_dist = 0.0;
1042  bin_width = upper_bin_width;
1043 
1044  sum_frac = 0.0;
1045  for (i = upper_index; i >= 0; i--)
1046  {
1047  double dist;
1048  double length_hist_frac;
1049  bool final_bin = false;
1050 
1051  /*
1052  * dist -- distance from upper bound of query range to lower bound of
1053  * the current bin in the lower bound histogram. Or to the lower bound
1054  * of the constant range, if this is the final bin, containing the
1055  * constant lower bound.
1056  */
1057  if (range_cmp_bounds(typcache, &hist_lower[i], lower) < 0)
1058  {
1059  dist = get_distance(typcache, lower, upper);
1060 
1061  /*
1062  * Subtract from bin_width the portion of this bin that we want to
1063  * ignore.
1064  */
1065  bin_width -= get_position(typcache, lower, &hist_lower[i],
1066  &hist_lower[i + 1]);
1067  if (bin_width < 0.0)
1068  bin_width = 0.0;
1069  final_bin = true;
1070  }
1071  else
1072  dist = get_distance(typcache, &hist_lower[i], upper);
1073 
1074  /*
1075  * Estimate the fraction of tuples in this bin that are narrow enough
1076  * to not exceed the distance to the upper bound of the query range.
1077  */
1078  length_hist_frac = calc_length_hist_frac(length_hist_values,
1079  length_hist_nvalues,
1080  prev_dist, dist, true);
1081 
1082  /*
1083  * Add the fraction of tuples in this bin, with a suitable length, to
1084  * the total.
1085  */
1086  sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1087 
1088  if (final_bin)
1089  break;
1090 
1091  bin_width = 1.0;
1092  prev_dist = dist;
1093  }
1094 
1095  return sum_frac;
1096 }
1097 
1098 /*
1099  * Calculate selectivity of "var @> const" operator, ie. estimate the fraction
1100  * of ranges that contain the constant lower and upper bounds. This uses
1101  * the histograms of range lower bounds and range lengths, on the assumption
1102  * that the range lengths are independent of the lower bounds.
1103  *
1104  * Note, this is "var @> const", ie. estimate the fraction of ranges that
1105  * contain the constant lower and upper bounds.
1106  */
1107 static double
1110  RangeBound *hist_lower, int hist_nvalues,
1111  Datum *length_hist_values, int length_hist_nvalues)
1112 {
1113  int i,
1114  lower_index;
1115  double bin_width,
1116  lower_bin_width;
1117  double sum_frac;
1118  float8 prev_dist;
1119 
1120  /* Find the bin containing the lower bound of query range. */
1121  lower_index = rbound_bsearch(typcache, lower, hist_lower, hist_nvalues,
1122  true);
1123 
1124  /*
1125  * Calculate lower_bin_width, ie. the fraction of the of (lower_index,
1126  * lower_index + 1) bin which is greater than lower bound of query range
1127  * using linear interpolation of subdiff function.
1128  */
1129  if (lower_index >= 0 && lower_index < hist_nvalues - 1)
1130  lower_bin_width = get_position(typcache, lower, &hist_lower[lower_index],
1131  &hist_lower[lower_index + 1]);
1132  else
1133  lower_bin_width = 0.0;
1134 
1135  /*
1136  * Loop through all the lower bound bins, smaller than the query lower
1137  * bound. In the loop, dist and prev_dist are the distance of the
1138  * "current" bin's lower and upper bounds from the constant upper bound.
1139  * We begin from query lower bound, and walk backwards, so the first bin's
1140  * upper bound is the query lower bound, and its distance to the query
1141  * upper bound is the length of the query range.
1142  *
1143  * bin_width represents the width of the current bin. Normally it is 1.0,
1144  * meaning a full width bin, except for the first bin, which is only
1145  * counted up to the constant lower bound.
1146  */
1147  prev_dist = get_distance(typcache, lower, upper);
1148  sum_frac = 0.0;
1149  bin_width = lower_bin_width;
1150  for (i = lower_index; i >= 0; i--)
1151  {
1152  float8 dist;
1153  double length_hist_frac;
1154 
1155  /*
1156  * dist -- distance from upper bound of query range to current value
1157  * of lower bound histogram or lower bound of query range (if we've
1158  * reach it).
1159  */
1160  dist = get_distance(typcache, &hist_lower[i], upper);
1161 
1162  /*
1163  * Get average fraction of length histogram which covers intervals
1164  * longer than (or equal to) distance to upper bound of query range.
1165  */
1166  length_hist_frac =
1167  1.0 - calc_length_hist_frac(length_hist_values,
1168  length_hist_nvalues,
1169  prev_dist, dist, false);
1170 
1171  sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1172 
1173  bin_width = 1.0;
1174  prev_dist = dist;
1175  }
1176 
1177  return sum_frac;
1178 }
#define OID_RANGE_ELEM_CONTAINED_OP
Definition: pg_operator.h:1776
#define PG_GETARG_INT32(n)
Definition: fmgr.h:234
FmgrInfo rng_cmp_proc_finfo
Definition: typcache.h:86
#define IsA(nodeptr, _type_)
Definition: nodes.h:560
#define RangeIsEmpty(r)
Definition: rangetypes.h:54
#define OID_RANGE_RIGHT_OP
Definition: pg_operator.h:1785
Oid get_commutator(Oid opno)
Definition: lsyscache.c:1313
int range_cmp_bounds(TypeCacheEntry *typcache, RangeBound *b1, RangeBound *b2)
Definition: rangetypes.c:1768
#define OID_RANGE_LESS_EQUAL_OP
Definition: pg_operator.h:1758
#define GETSTRUCT(TUP)
Definition: htup_details.h:656
#define OID_RANGE_CONTAINS_OP
Definition: pg_operator.h:1773
#define OID_RANGE_GREATER_EQUAL_OP
Definition: pg_operator.h:1761
TypeCacheEntry * range_get_typcache(FunctionCallInfo fcinfo, Oid rngtypid)
Definition: rangetypes.c:1479
#define ATTSTATSSLOT_VALUES
Definition: lsyscache.h:39
bool equal(const void *a, const void *b)
Definition: equalfuncs.c:2962
HeapTuple statsTuple
Definition: selfuncs.h:71
Datum lower(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:43
static float8 get_len_position(double value, double hist1, double hist2)
#define DEFAULT_INEQ_SEL
Definition: selfuncs.h:37
int nnumbers
Definition: lsyscache.h:53
bool statistic_proc_security_check(VariableStatData *vardata, Oid func_oid)
Definition: selfuncs.c:4934
bool get_restriction_variable(PlannerInfo *root, List *args, int varRelid, VariableStatData *vardata, Node **other, bool *varonleft)
Definition: selfuncs.c:4434
#define OID_RANGE_OVERLAPS_LEFT_OP
Definition: pg_operator.h:1788
#define PG_RETURN_FLOAT8(x)
Definition: fmgr.h:326
#define Min(x, y)
Definition: c.h:806
static double calc_hist_selectivity_contains(TypeCacheEntry *typcache, RangeBound *lower, RangeBound *upper, RangeBound *hist_lower, int hist_nvalues, Datum *length_hist_values, int length_hist_nvalues)
#define DatumGetRangeType(X)
Definition: rangetypes.h:71
static float8 get_position(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist1, RangeBound *hist2)
Datum val
Definition: rangetypes.h:62
static int length_hist_bsearch(Datum *length_hist_values, int length_hist_nvalues, double value, bool equal)
Definition: nodes.h:509
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:241
static double calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata, RangeType *constval, Oid operator)
double Selectivity
Definition: nodes.h:639
Datum upper(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:74
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1047
unsigned int Oid
Definition: postgres_ext.h:31
#define OID_RANGE_CONTAINS_ELEM_OP
Definition: pg_operator.h:1770
#define OID_RANGE_CONTAINED_OP
Definition: pg_operator.h:1779
FormData_pg_statistic * Form_pg_statistic
Definition: pg_statistic.h:129
#define OidIsValid(objectId)
Definition: c.h:538
Definition: type.h:89
#define CLAMP_PROBABILITY(p)
Definition: selfuncs.h:57
FmgrInfo rng_subdiff_finfo
Definition: typcache.h:88
bool inclusive
Definition: rangetypes.h:64
#define ATTSTATSSLOT_NUMBERS
Definition: lsyscache.h:40
static double calc_hist_selectivity_contained(TypeCacheEntry *typcache, RangeBound *lower, RangeBound *upper, RangeBound *hist_lower, int hist_nvalues, Datum *length_hist_values, int length_hist_nvalues)
#define ERROR
Definition: elog.h:43
double float8
Definition: c.h:381
static struct @121 value
#define STATISTIC_KIND_BOUNDS_HISTOGRAM
Definition: pg_statistic.h:292
Datum rangesel(PG_FUNCTION_ARGS)
#define OID_RANGE_LEFT_OP
Definition: pg_operator.h:1782
#define OID_RANGE_LESS_OP
Definition: pg_operator.h:1755
#define PG_GETARG_OID(n)
Definition: fmgr.h:240
float4 * numbers
Definition: lsyscache.h:52
static double default_range_selectivity(Oid operator)
static float8 get_distance(TypeCacheEntry *typcache, RangeBound *bound1, RangeBound *bound2)
RangeType * range_serialize(TypeCacheEntry *typcache, RangeBound *lower, RangeBound *upper, bool empty)
Definition: rangetypes.c:1503
bool lower
Definition: rangetypes.h:65
struct TypeCacheEntry * rngelemtype
Definition: typcache.h:84
double get_float8_infinity(void)
Definition: float.c:121
void range_deserialize(TypeCacheEntry *typcache, RangeType *range, RangeBound *lower, RangeBound *upper, bool *empty)
Definition: rangetypes.c:1632
float float4
Definition: c.h:380
#define STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM
Definition: pg_statistic.h:281
static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache, RangeBound *constbound, RangeBound *hist, int hist_nvalues, bool equal)
static double calc_length_hist_frac(Datum *length_hist_values, int length_hist_nvalues, double length1, double length2, bool equal)
#define DatumGetFloat8(X)
Definition: postgres.h:734
uintptr_t Datum
Definition: postgres.h:372
#define InvalidOid
Definition: postgres_ext.h:36
Oid fn_oid
Definition: fmgr.h:59
#define Max(x, y)
Definition: c.h:800
#define HeapTupleIsValid(tuple)
Definition: htup.h:77
#define NULL
Definition: c.h:229
bool get_attstatsslot(AttStatsSlot *sslot, HeapTuple statstuple, int reqkind, Oid reqop, int flags)
Definition: lsyscache.c:2895
#define Assert(condition)
Definition: c.h:675
Datum * values
Definition: lsyscache.h:49
static double calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata, RangeType *constval, Oid operator)
bool infinite
Definition: rangetypes.h:63
#define OID_RANGE_OVERLAPS_RIGHT_OP
Definition: pg_operator.h:1791
static int rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist, int hist_length, bool equal)
Oid rng_collation
Definition: typcache.h:85
int is_infinite(double val)
Definition: float.c:199
#define ReleaseVariableStats(vardata)
Definition: selfuncs.h:81
void * palloc(Size size)
Definition: mcxt.c:849
#define DEFAULT_RANGE_INEQ_SEL
Definition: selfuncs.h:40
int i
#define OID_RANGE_OVERLAP_OP
Definition: pg_operator.h:1767
#define PG_FUNCTION_ARGS
Definition: fmgr.h:158
#define elog
Definition: elog.h:219
#define OID_RANGE_GREATER_OP
Definition: pg_operator.h:1764
Definition: pg_list.h:45
static int cmp(const chr *x, const chr *y, size_t len)
Definition: regc_locale.c:742
void free_attstatsslot(AttStatsSlot *sslot)
Definition: lsyscache.c:3011