PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
rangetypes_selfuncs.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * rangetypes_selfuncs.c
4  * Functions for selectivity estimation of range operators
5  *
6  * Estimates are based on histograms of lower and upper bounds, and the
7  * fraction of empty ranges.
8  *
9  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
10  * Portions Copyright (c) 1994, Regents of the University of California
11  *
12  *
13  * IDENTIFICATION
14  * src/backend/utils/adt/rangetypes_selfuncs.c
15  *
16  *-------------------------------------------------------------------------
17  */
18 #include "postgres.h"
19 
20 #include "access/htup_details.h"
21 #include "catalog/pg_operator.h"
22 #include "catalog/pg_statistic.h"
23 #include "utils/builtins.h"
24 #include "utils/lsyscache.h"
25 #include "utils/rangetypes.h"
26 #include "utils/selfuncs.h"
27 #include "utils/typcache.h"
28 
29 static double calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
30  RangeType *constval, Oid operator);
31 static double default_range_selectivity(Oid operator);
32 static double calc_hist_selectivity(TypeCacheEntry *typcache,
33  VariableStatData *vardata, RangeType *constval,
34  Oid operator);
35 static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache,
36  RangeBound *constbound,
37  RangeBound *hist, int hist_nvalues,
38  bool equal);
39 static int rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value,
40  RangeBound *hist, int hist_length, bool equal);
42  RangeBound *hist1, RangeBound *hist2);
43 static float8 get_len_position(double value, double hist1, double hist2);
44 static float8 get_distance(TypeCacheEntry *typcache, RangeBound *bound1,
45  RangeBound *bound2);
46 static int length_hist_bsearch(Datum *length_hist_values,
47  int length_hist_nvalues, double value, bool equal);
48 static double calc_length_hist_frac(Datum *length_hist_values,
49  int length_hist_nvalues, double length1, double length2, bool equal);
50 static double calc_hist_selectivity_contained(TypeCacheEntry *typcache,
52  RangeBound *hist_lower, int hist_nvalues,
53  Datum *length_hist_values, int length_hist_nvalues);
54 static double calc_hist_selectivity_contains(TypeCacheEntry *typcache,
56  RangeBound *hist_lower, int hist_nvalues,
57  Datum *length_hist_values, int length_hist_nvalues);
58 
59 /*
60  * Returns a default selectivity estimate for given operator, when we don't
61  * have statistics or cannot use them for some reason.
62  */
63 static double
65 {
66  switch (operator)
67  {
69  return 0.01;
70 
73  return 0.005;
74 
77 
78  /*
79  * "range @> elem" is more or less identical to a scalar
80  * inequality "A >= b AND A <= c".
81  */
83 
84  case OID_RANGE_LESS_OP:
88  case OID_RANGE_LEFT_OP:
89  case OID_RANGE_RIGHT_OP:
92  /* these are similar to regular scalar inequalities */
93  return DEFAULT_INEQ_SEL;
94 
95  default:
96  /* all range operators should be handled above, but just in case */
97  return 0.01;
98  }
99 }
100 
101 /*
102  * rangesel -- restriction selectivity for range operators
103  */
104 Datum
106 {
108  Oid operator = PG_GETARG_OID(1);
109  List *args = (List *) PG_GETARG_POINTER(2);
110  int varRelid = PG_GETARG_INT32(3);
111  VariableStatData vardata;
112  Node *other;
113  bool varonleft;
114  Selectivity selec;
115  TypeCacheEntry *typcache = NULL;
116  RangeType *constrange = NULL;
117 
118  /*
119  * If expression is not (variable op something) or (something op
120  * variable), then punt and return a default estimate.
121  */
122  if (!get_restriction_variable(root, args, varRelid,
123  &vardata, &other, &varonleft))
125 
126  /*
127  * Can't do anything useful if the something is not a constant, either.
128  */
129  if (!IsA(other, Const))
130  {
131  ReleaseVariableStats(vardata);
133  }
134 
135  /*
136  * All the range operators are strict, so we can cope with a NULL constant
137  * right away.
138  */
139  if (((Const *) other)->constisnull)
140  {
141  ReleaseVariableStats(vardata);
142  PG_RETURN_FLOAT8(0.0);
143  }
144 
145  /*
146  * If var is on the right, commute the operator, so that we can assume the
147  * var is on the left in what follows.
148  */
149  if (!varonleft)
150  {
151  /* we have other Op var, commute to make var Op other */
152  operator = get_commutator(operator);
153  if (!operator)
154  {
155  /* Use default selectivity (should we raise an error instead?) */
156  ReleaseVariableStats(vardata);
158  }
159  }
160 
161  /*
162  * OK, there's a Var and a Const we're dealing with here. We need the
163  * Const to be of same range type as the column, else we can't do anything
164  * useful. (Such cases will likely fail at runtime, but here we'd rather
165  * just return a default estimate.)
166  *
167  * If the operator is "range @> element", the constant should be of the
168  * element type of the range column. Convert it to a range that includes
169  * only that single point, so that we don't need special handling for that
170  * in what follows.
171  */
172  if (operator == OID_RANGE_CONTAINS_ELEM_OP)
173  {
174  typcache = range_get_typcache(fcinfo, vardata.vartype);
175 
176  if (((Const *) other)->consttype == typcache->rngelemtype->type_id)
177  {
179  upper;
180 
181  lower.inclusive = true;
182  lower.val = ((Const *) other)->constvalue;
183  lower.infinite = false;
184  lower.lower = true;
185  upper.inclusive = true;
186  upper.val = ((Const *) other)->constvalue;
187  upper.infinite = false;
188  upper.lower = false;
189  constrange = range_serialize(typcache, &lower, &upper, false);
190  }
191  }
192  else if (operator == OID_RANGE_ELEM_CONTAINED_OP)
193  {
194  /*
195  * Here, the Var is the elem, not the range. For now we just punt and
196  * return the default estimate. In future we could disassemble the
197  * range constant and apply scalarineqsel ...
198  */
199  }
200  else if (((Const *) other)->consttype == vardata.vartype)
201  {
202  /* Both sides are the same range type */
203  typcache = range_get_typcache(fcinfo, vardata.vartype);
204 
205  constrange = DatumGetRangeType(((Const *) other)->constvalue);
206  }
207 
208  /*
209  * If we got a valid constant on one side of the operator, proceed to
210  * estimate using statistics. Otherwise punt and return a default constant
211  * estimate. Note that calc_rangesel need not handle
212  * OID_RANGE_ELEM_CONTAINED_OP.
213  */
214  if (constrange)
215  selec = calc_rangesel(typcache, &vardata, constrange, operator);
216  else
217  selec = default_range_selectivity(operator);
218 
219  ReleaseVariableStats(vardata);
220 
221  CLAMP_PROBABILITY(selec);
222 
223  PG_RETURN_FLOAT8((float8) selec);
224 }
225 
226 static double
228  RangeType *constval, Oid operator)
229 {
230  double hist_selec;
231  double selec;
232  float4 empty_frac,
233  null_frac;
234 
235  /*
236  * First look up the fraction of NULLs and empty ranges from pg_statistic.
237  */
238  if (HeapTupleIsValid(vardata->statsTuple))
239  {
240  Form_pg_statistic stats;
241  float4 *numbers;
242  int nnumbers;
243 
244  stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
245  null_frac = stats->stanullfrac;
246 
247  /* Try to get fraction of empty ranges */
248  if (get_attstatsslot(vardata->statsTuple,
249  vardata->atttype, vardata->atttypmod,
251  NULL,
252  NULL, NULL,
253  &numbers, &nnumbers))
254  {
255  if (nnumbers != 1)
256  elog(ERROR, "invalid empty fraction statistic"); /* shouldn't happen */
257  empty_frac = numbers[0];
258  }
259  else
260  {
261  /* No empty fraction statistic. Assume no empty ranges. */
262  empty_frac = 0.0;
263  }
264  }
265  else
266  {
267  /*
268  * No stats are available. Follow through the calculations below
269  * anyway, assuming no NULLs and no empty ranges. This still allows us
270  * to give a better-than-nothing estimate based on whether the
271  * constant is an empty range or not.
272  */
273  null_frac = 0.0;
274  empty_frac = 0.0;
275  }
276 
277  if (RangeIsEmpty(constval))
278  {
279  /*
280  * An empty range matches all ranges, all empty ranges, or nothing,
281  * depending on the operator
282  */
283  switch (operator)
284  {
285  /* these return false if either argument is empty */
289  case OID_RANGE_LEFT_OP:
290  case OID_RANGE_RIGHT_OP:
291  /* nothing is less than an empty range */
292  case OID_RANGE_LESS_OP:
293  selec = 0.0;
294  break;
295 
296  /* only empty ranges can be contained by an empty range */
298  /* only empty ranges are <= an empty range */
300  selec = empty_frac;
301  break;
302 
303  /* everything contains an empty range */
305  /* everything is >= an empty range */
307  selec = 1.0;
308  break;
309 
310  /* all non-empty ranges are > an empty range */
312  selec = 1.0 - empty_frac;
313  break;
314 
315  /* an element cannot be empty */
317  default:
318  elog(ERROR, "unexpected operator %u", operator);
319  selec = 0.0; /* keep compiler quiet */
320  break;
321  }
322  }
323  else
324  {
325  /*
326  * Calculate selectivity using bound histograms. If that fails for
327  * some reason, e.g no histogram in pg_statistic, use the default
328  * constant estimate for the fraction of non-empty values. This is
329  * still somewhat better than just returning the default estimate,
330  * because this still takes into account the fraction of empty and
331  * NULL tuples, if we had statistics for them.
332  */
333  hist_selec = calc_hist_selectivity(typcache, vardata, constval,
334  operator);
335  if (hist_selec < 0.0)
336  hist_selec = default_range_selectivity(operator);
337 
338  /*
339  * Now merge the results for the empty ranges and histogram
340  * calculations, realizing that the histogram covers only the
341  * non-null, non-empty values.
342  */
343  if (operator == OID_RANGE_CONTAINED_OP)
344  {
345  /* empty is contained by anything non-empty */
346  selec = (1.0 - empty_frac) * hist_selec + empty_frac;
347  }
348  else
349  {
350  /* with any other operator, empty Op non-empty matches nothing */
351  selec = (1.0 - empty_frac) * hist_selec;
352  }
353  }
354 
355  /* all range operators are strict */
356  selec *= (1.0 - null_frac);
357 
358  /* result should be in range, but make sure... */
359  CLAMP_PROBABILITY(selec);
360 
361  return selec;
362 }
363 
364 /*
365  * Calculate range operator selectivity using histograms of range bounds.
366  *
367  * This estimate is for the portion of values that are not empty and not
368  * NULL.
369  */
370 static double
372  RangeType *constval, Oid operator)
373 {
374  Datum *hist_values;
375  int nhist;
376  Datum *length_hist_values;
377  int length_nhist;
378  RangeBound *hist_lower;
379  RangeBound *hist_upper;
380  int i;
381  RangeBound const_lower;
382  RangeBound const_upper;
383  bool empty;
384  double hist_selec;
385 
386  /* Try to get histogram of ranges */
387  if (!(HeapTupleIsValid(vardata->statsTuple) &&
388  get_attstatsslot(vardata->statsTuple,
389  vardata->atttype, vardata->atttypmod,
391  NULL,
392  &hist_values, &nhist,
393  NULL, NULL)))
394  return -1.0;
395 
396  /*
397  * Convert histogram of ranges into histograms of its lower and upper
398  * bounds.
399  */
400  hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
401  hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
402  for (i = 0; i < nhist; i++)
403  {
404  range_deserialize(typcache, DatumGetRangeType(hist_values[i]),
405  &hist_lower[i], &hist_upper[i], &empty);
406  /* The histogram should not contain any empty ranges */
407  if (empty)
408  elog(ERROR, "bounds histogram contains an empty range");
409  }
410 
411  /* @> and @< also need a histogram of range lengths */
412  if (operator == OID_RANGE_CONTAINS_OP ||
413  operator == OID_RANGE_CONTAINED_OP)
414  {
415  if (!(HeapTupleIsValid(vardata->statsTuple) &&
416  get_attstatsslot(vardata->statsTuple,
417  vardata->atttype, vardata->atttypmod,
419  InvalidOid,
420  NULL,
421  &length_hist_values, &length_nhist,
422  NULL, NULL)))
423  return -1.0;
424 
425  /* check that it's a histogram, not just a dummy entry */
426  if (length_nhist < 2)
427  return -1.0;
428  }
429 
430  /* Extract the bounds of the constant value. */
431  range_deserialize(typcache, constval, &const_lower, &const_upper, &empty);
432  Assert(!empty);
433 
434  /*
435  * Calculate selectivity comparing the lower or upper bound of the
436  * constant with the histogram of lower or upper bounds.
437  */
438  switch (operator)
439  {
440  case OID_RANGE_LESS_OP:
441 
442  /*
443  * The regular b-tree comparison operators (<, <=, >, >=) compare
444  * the lower bounds first, and the upper bounds for values with
445  * equal lower bounds. Estimate that by comparing the lower bounds
446  * only. This gives a fairly accurate estimate assuming there
447  * aren't many rows with a lower bound equal to the constant's
448  * lower bound.
449  */
450  hist_selec =
451  calc_hist_selectivity_scalar(typcache, &const_lower,
452  hist_lower, nhist, false);
453  break;
454 
456  hist_selec =
457  calc_hist_selectivity_scalar(typcache, &const_lower,
458  hist_lower, nhist, true);
459  break;
460 
462  hist_selec =
463  1 - calc_hist_selectivity_scalar(typcache, &const_lower,
464  hist_lower, nhist, false);
465  break;
466 
468  hist_selec =
469  1 - calc_hist_selectivity_scalar(typcache, &const_lower,
470  hist_lower, nhist, true);
471  break;
472 
473  case OID_RANGE_LEFT_OP:
474  /* var << const when upper(var) < lower(const) */
475  hist_selec =
476  calc_hist_selectivity_scalar(typcache, &const_lower,
477  hist_upper, nhist, false);
478  break;
479 
480  case OID_RANGE_RIGHT_OP:
481  /* var >> const when lower(var) > upper(const) */
482  hist_selec =
483  1 - calc_hist_selectivity_scalar(typcache, &const_upper,
484  hist_lower, nhist, true);
485  break;
486 
488  /* compare lower bounds */
489  hist_selec =
490  1 - calc_hist_selectivity_scalar(typcache, &const_lower,
491  hist_lower, nhist, false);
492  break;
493 
495  /* compare upper bounds */
496  hist_selec =
497  calc_hist_selectivity_scalar(typcache, &const_upper,
498  hist_upper, nhist, true);
499  break;
500 
503 
504  /*
505  * A && B <=> NOT (A << B OR A >> B).
506  *
507  * Since A << B and A >> B are mutually exclusive events we can
508  * sum their probabilities to find probability of (A << B OR A >>
509  * B).
510  *
511  * "range @> elem" is equivalent to "range && [elem,elem]". The
512  * caller already constructed the singular range from the element
513  * constant, so just treat it the same as &&.
514  */
515  hist_selec =
516  calc_hist_selectivity_scalar(typcache, &const_lower, hist_upper,
517  nhist, false);
518  hist_selec +=
519  (1.0 - calc_hist_selectivity_scalar(typcache, &const_upper, hist_lower,
520  nhist, true));
521  hist_selec = 1.0 - hist_selec;
522  break;
523 
525  hist_selec =
526  calc_hist_selectivity_contains(typcache, &const_lower,
527  &const_upper, hist_lower, nhist,
528  length_hist_values, length_nhist);
529  break;
530 
532  if (const_lower.infinite)
533  {
534  /*
535  * Lower bound no longer matters. Just estimate the fraction
536  * with an upper bound <= const upper bound
537  */
538  hist_selec =
539  calc_hist_selectivity_scalar(typcache, &const_upper,
540  hist_upper, nhist, true);
541  }
542  else if (const_upper.infinite)
543  {
544  hist_selec =
545  1.0 - calc_hist_selectivity_scalar(typcache, &const_lower,
546  hist_lower, nhist, false);
547  }
548  else
549  {
550  hist_selec =
551  calc_hist_selectivity_contained(typcache, &const_lower,
552  &const_upper, hist_lower, nhist,
553  length_hist_values, length_nhist);
554  }
555  break;
556 
557  default:
558  elog(ERROR, "unknown range operator %u", operator);
559  hist_selec = -1.0; /* keep compiler quiet */
560  break;
561  }
562 
563  return hist_selec;
564 }
565 
566 
567 /*
568  * Look up the fraction of values less than (or equal, if 'equal' argument
569  * is true) a given const in a histogram of range bounds.
570  */
571 static double
573  RangeBound *hist, int hist_nvalues, bool equal)
574 {
575  Selectivity selec;
576  int index;
577 
578  /*
579  * Find the histogram bin the given constant falls into. Estimate
580  * selectivity as the number of preceding whole bins.
581  */
582  index = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal);
583  selec = (Selectivity) (Max(index, 0)) / (Selectivity) (hist_nvalues - 1);
584 
585  /* Adjust using linear interpolation within the bin */
586  if (index >= 0 && index < hist_nvalues - 1)
587  selec += get_position(typcache, constbound, &hist[index],
588  &hist[index + 1]) / (Selectivity) (hist_nvalues - 1);
589 
590  return selec;
591 }
592 
593 /*
594  * Binary search on an array of range bounds. Returns greatest index of range
595  * bound in array which is less(less or equal) than given range bound. If all
596  * range bounds in array are greater or equal(greater) than given range bound,
597  * return -1. When "equal" flag is set conditions in brackets are used.
598  *
599  * This function is used in scalar operator selectivity estimation. Another
600  * goal of this function is to find a histogram bin where to stop
601  * interpolation of portion of bounds which are less or equal to given bound.
602  */
603 static int
605  int hist_length, bool equal)
606 {
607  int lower = -1,
608  upper = hist_length - 1,
609  cmp,
610  middle;
611 
612  while (lower < upper)
613  {
614  middle = (lower + upper + 1) / 2;
615  cmp = range_cmp_bounds(typcache, &hist[middle], value);
616 
617  if (cmp < 0 || (equal && cmp == 0))
618  lower = middle;
619  else
620  upper = middle - 1;
621  }
622  return lower;
623 }
624 
625 
626 /*
627  * Binary search on length histogram. Returns greatest index of range length in
628  * histogram which is less than (less than or equal) the given length value. If
629  * all lengths in the histogram are greater than (greater than or equal) the
630  * given length, returns -1.
631  */
632 static int
633 length_hist_bsearch(Datum *length_hist_values, int length_hist_nvalues,
634  double value, bool equal)
635 {
636  int lower = -1,
637  upper = length_hist_nvalues - 1,
638  middle;
639 
640  while (lower < upper)
641  {
642  double middleval;
643 
644  middle = (lower + upper + 1) / 2;
645 
646  middleval = DatumGetFloat8(length_hist_values[middle]);
647  if (middleval < value || (equal && middleval <= value))
648  lower = middle;
649  else
650  upper = middle - 1;
651  }
652  return lower;
653 }
654 
655 /*
656  * Get relative position of value in histogram bin in [0,1] range.
657  */
658 static float8
660  RangeBound *hist2)
661 {
662  bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
663  float8 position;
664 
665  if (!hist1->infinite && !hist2->infinite)
666  {
667  float8 bin_width;
668 
669  /*
670  * Both bounds are finite. Assuming the subtype's comparison function
671  * works sanely, the value must be finite, too, because it lies
672  * somewhere between the bounds. If it doesn't, just return something.
673  */
674  if (value->infinite)
675  return 0.5;
676 
677  /* Can't interpolate without subdiff function */
678  if (!has_subdiff)
679  return 0.5;
680 
681  /* Calculate relative position using subdiff function. */
682  bin_width = DatumGetFloat8(FunctionCall2Coll(
683  &typcache->rng_subdiff_finfo,
684  typcache->rng_collation,
685  hist2->val,
686  hist1->val));
687  if (bin_width <= 0.0)
688  return 0.5; /* zero width bin */
689 
691  &typcache->rng_subdiff_finfo,
692  typcache->rng_collation,
693  value->val,
694  hist1->val))
695  / bin_width;
696 
697  /* Relative position must be in [0,1] range */
698  position = Max(position, 0.0);
699  position = Min(position, 1.0);
700  return position;
701  }
702  else if (hist1->infinite && !hist2->infinite)
703  {
704  /*
705  * Lower bin boundary is -infinite, upper is finite. If the value is
706  * -infinite, return 0.0 to indicate it's equal to the lower bound.
707  * Otherwise return 1.0 to indicate it's infinitely far from the lower
708  * bound.
709  */
710  return ((value->infinite && value->lower) ? 0.0 : 1.0);
711  }
712  else if (!hist1->infinite && hist2->infinite)
713  {
714  /* same as above, but in reverse */
715  return ((value->infinite && !value->lower) ? 1.0 : 0.0);
716  }
717  else
718  {
719  /*
720  * If both bin boundaries are infinite, they should be equal to each
721  * other, and the value should also be infinite and equal to both
722  * bounds. (But don't Assert that, to avoid crashing if a user creates
723  * a datatype with a broken comparison function).
724  *
725  * Assume the value to lie in the middle of the infinite bounds.
726  */
727  return 0.5;
728  }
729 }
730 
731 
732 /*
733  * Get relative position of value in a length histogram bin in [0,1] range.
734  */
735 static double
736 get_len_position(double value, double hist1, double hist2)
737 {
738  if (!is_infinite(hist1) && !is_infinite(hist2))
739  {
740  /*
741  * Both bounds are finite. The value should be finite too, because it
742  * lies somewhere between the bounds. If it doesn't, just return
743  * something.
744  */
745  if (is_infinite(value))
746  return 0.5;
747 
748  return 1.0 - (hist2 - value) / (hist2 - hist1);
749  }
750  else if (is_infinite(hist1) && !is_infinite(hist2))
751  {
752  /*
753  * Lower bin boundary is -infinite, upper is finite. Return 1.0 to
754  * indicate the value is infinitely far from the lower bound.
755  */
756  return 1.0;
757  }
758  else if (is_infinite(hist1) && is_infinite(hist2))
759  {
760  /* same as above, but in reverse */
761  return 0.0;
762  }
763  else
764  {
765  /*
766  * If both bin boundaries are infinite, they should be equal to each
767  * other, and the value should also be infinite and equal to both
768  * bounds. (But don't Assert that, to avoid crashing unnecessarily if
769  * the caller messes up)
770  *
771  * Assume the value to lie in the middle of the infinite bounds.
772  */
773  return 0.5;
774  }
775 }
776 
777 /*
778  * Measure distance between two range bounds.
779  */
780 static float8
781 get_distance(TypeCacheEntry *typcache, RangeBound *bound1, RangeBound *bound2)
782 {
783  bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
784 
785  if (!bound1->infinite && !bound2->infinite)
786  {
787  /*
788  * No bounds are infinite, use subdiff function or return default
789  * value of 1.0 if no subdiff is available.
790  */
791  if (has_subdiff)
792  return
794  typcache->rng_collation,
795  bound2->val,
796  bound1->val));
797  else
798  return 1.0;
799  }
800  else if (bound1->infinite && bound2->infinite)
801  {
802  /* Both bounds are infinite */
803  if (bound1->lower == bound2->lower)
804  return 0.0;
805  else
806  return get_float8_infinity();
807  }
808  else
809  {
810  /* One bound is infinite, another is not */
811  return get_float8_infinity();
812  }
813 }
814 
815 /*
816  * Calculate the average of function P(x), in the interval [length1, length2],
817  * where P(x) is the fraction of tuples with length < x (or length <= x if
818  * 'equal' is true).
819  */
820 static double
821 calc_length_hist_frac(Datum *length_hist_values, int length_hist_nvalues,
822  double length1, double length2, bool equal)
823 {
824  double frac;
825  double A,
826  B,
827  PA,
828  PB;
829  double pos;
830  int i;
831  double area;
832 
833  Assert(length2 >= length1);
834 
835  if (length2 < 0.0)
836  return 0.0; /* shouldn't happen, but doesn't hurt to check */
837 
838  /* All lengths in the table are <= infinite. */
839  if (is_infinite(length2) && equal)
840  return 1.0;
841 
842  /*----------
843  * The average of a function between A and B can be calculated by the
844  * formula:
845  *
846  * B
847  * 1 /
848  * ------- | P(x)dx
849  * B - A /
850  * A
851  *
852  * The geometrical interpretation of the integral is the area under the
853  * graph of P(x). P(x) is defined by the length histogram. We calculate
854  * the area in a piecewise fashion, iterating through the length histogram
855  * bins. Each bin is a trapezoid:
856  *
857  * P(x2)
858  * /|
859  * / |
860  * P(x1)/ |
861  * | |
862  * | |
863  * ---+---+--
864  * x1 x2
865  *
866  * where x1 and x2 are the boundaries of the current histogram, and P(x1)
867  * and P(x1) are the cumulative fraction of tuples at the boundaries.
868  *
869  * The area of each trapezoid is 1/2 * (P(x2) + P(x1)) * (x2 - x1)
870  *
871  * The first bin contains the lower bound passed by the caller, so we
872  * use linear interpolation between the previous and next histogram bin
873  * boundary to calculate P(x1). Likewise for the last bin: we use linear
874  * interpolation to calculate P(x2). For the bins in between, x1 and x2
875  * lie on histogram bin boundaries, so P(x1) and P(x2) are simply:
876  * P(x1) = (bin index) / (number of bins)
877  * P(x2) = (bin index + 1 / (number of bins)
878  */
879 
880  /* First bin, the one that contains lower bound */
881  i = length_hist_bsearch(length_hist_values, length_hist_nvalues, length1, equal);
882  if (i >= length_hist_nvalues - 1)
883  return 1.0;
884 
885  if (i < 0)
886  {
887  i = 0;
888  pos = 0.0;
889  }
890  else
891  {
892  /* interpolate length1's position in the bin */
893  pos = get_len_position(length1,
894  DatumGetFloat8(length_hist_values[i]),
895  DatumGetFloat8(length_hist_values[i + 1]));
896  }
897  PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
898  B = length1;
899 
900  /*
901  * In the degenerate case that length1 == length2, simply return
902  * P(length1). This is not merely an optimization: if length1 == length2,
903  * we'd divide by zero later on.
904  */
905  if (length2 == length1)
906  return PB;
907 
908  /*
909  * Loop through all the bins, until we hit the last bin, the one that
910  * contains the upper bound. (if lower and upper bounds are in the same
911  * bin, this falls out immediately)
912  */
913  area = 0.0;
914  for (; i < length_hist_nvalues - 1; i++)
915  {
916  double bin_upper = DatumGetFloat8(length_hist_values[i + 1]);
917 
918  /* check if we've reached the last bin */
919  if (!(bin_upper < length2 || (equal && bin_upper <= length2)))
920  break;
921 
922  /* the upper bound of previous bin is the lower bound of this bin */
923  A = B;
924  PA = PB;
925 
926  B = bin_upper;
927  PB = (double) i / (double) (length_hist_nvalues - 1);
928 
929  /*
930  * Add the area of this trapezoid to the total. The point of the
931  * if-check is to avoid NaN, in the corner case that PA == PB == 0,
932  * and B - A == Inf. The area of a zero-height trapezoid (PA == PB ==
933  * 0) is zero, regardless of the width (B - A).
934  */
935  if (PA > 0 || PB > 0)
936  area += 0.5 * (PB + PA) * (B - A);
937  }
938 
939  /* Last bin */
940  A = B;
941  PA = PB;
942 
943  B = length2; /* last bin ends at the query upper bound */
944  if (i >= length_hist_nvalues - 1)
945  pos = 0.0;
946  else
947  {
948  if (DatumGetFloat8(length_hist_values[i]) == DatumGetFloat8(length_hist_values[i + 1]))
949  pos = 0.0;
950  else
951  pos = get_len_position(length2, DatumGetFloat8(length_hist_values[i]), DatumGetFloat8(length_hist_values[i + 1]));
952  }
953  PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
954 
955  if (PA > 0 || PB > 0)
956  area += 0.5 * (PB + PA) * (B - A);
957 
958  /*
959  * Ok, we have calculated the area, ie. the integral. Divide by width to
960  * get the requested average.
961  *
962  * Avoid NaN arising from infinite / infinite. This happens at least if
963  * length2 is infinite. It's not clear what the correct value would be in
964  * that case, so 0.5 seems as good as any value.
965  */
966  if (is_infinite(area) && is_infinite(length2))
967  frac = 0.5;
968  else
969  frac = area / (length2 - length1);
970 
971  return frac;
972 }
973 
974 /*
975  * Calculate selectivity of "var <@ const" operator, ie. estimate the fraction
976  * of ranges that fall within the constant lower and upper bounds. This uses
977  * the histograms of range lower bounds and range lengths, on the assumption
978  * that the range lengths are independent of the lower bounds.
979  *
980  * The caller has already checked that constant lower and upper bounds are
981  * finite.
982  */
983 static double
986  RangeBound *hist_lower, int hist_nvalues,
987  Datum *length_hist_values, int length_hist_nvalues)
988 {
989  int i,
990  upper_index;
991  float8 prev_dist;
992  double bin_width;
993  double upper_bin_width;
994  double sum_frac;
995 
996  /*
997  * Begin by finding the bin containing the upper bound, in the lower bound
998  * histogram. Any range with a lower bound > constant upper bound can't
999  * match, ie. there are no matches in bins greater than upper_index.
1000  */
1001  upper->inclusive = !upper->inclusive;
1002  upper->lower = true;
1003  upper_index = rbound_bsearch(typcache, upper, hist_lower, hist_nvalues,
1004  false);
1005 
1006  /*
1007  * Calculate upper_bin_width, ie. the fraction of the (upper_index,
1008  * upper_index + 1) bin which is greater than upper bound of query range
1009  * using linear interpolation of subdiff function.
1010  */
1011  if (upper_index >= 0 && upper_index < hist_nvalues - 1)
1012  upper_bin_width = get_position(typcache, upper,
1013  &hist_lower[upper_index],
1014  &hist_lower[upper_index + 1]);
1015  else
1016  upper_bin_width = 0.0;
1017 
1018  /*
1019  * In the loop, dist and prev_dist are the distance of the "current" bin's
1020  * lower and upper bounds from the constant upper bound.
1021  *
1022  * bin_width represents the width of the current bin. Normally it is 1.0,
1023  * meaning a full width bin, but can be less in the corner cases: start
1024  * and end of the loop. We start with bin_width = upper_bin_width, because
1025  * we begin at the bin containing the upper bound.
1026  */
1027  prev_dist = 0.0;
1028  bin_width = upper_bin_width;
1029 
1030  sum_frac = 0.0;
1031  for (i = upper_index; i >= 0; i--)
1032  {
1033  double dist;
1034  double length_hist_frac;
1035  bool final_bin = false;
1036 
1037  /*
1038  * dist -- distance from upper bound of query range to lower bound of
1039  * the current bin in the lower bound histogram. Or to the lower bound
1040  * of the constant range, if this is the final bin, containing the
1041  * constant lower bound.
1042  */
1043  if (range_cmp_bounds(typcache, &hist_lower[i], lower) < 0)
1044  {
1045  dist = get_distance(typcache, lower, upper);
1046 
1047  /*
1048  * Subtract from bin_width the portion of this bin that we want to
1049  * ignore.
1050  */
1051  bin_width -= get_position(typcache, lower, &hist_lower[i],
1052  &hist_lower[i + 1]);
1053  if (bin_width < 0.0)
1054  bin_width = 0.0;
1055  final_bin = true;
1056  }
1057  else
1058  dist = get_distance(typcache, &hist_lower[i], upper);
1059 
1060  /*
1061  * Estimate the fraction of tuples in this bin that are narrow enough
1062  * to not exceed the distance to the upper bound of the query range.
1063  */
1064  length_hist_frac = calc_length_hist_frac(length_hist_values,
1065  length_hist_nvalues,
1066  prev_dist, dist, true);
1067 
1068  /*
1069  * Add the fraction of tuples in this bin, with a suitable length, to
1070  * the total.
1071  */
1072  sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1073 
1074  if (final_bin)
1075  break;
1076 
1077  bin_width = 1.0;
1078  prev_dist = dist;
1079  }
1080 
1081  return sum_frac;
1082 }
1083 
1084 /*
1085  * Calculate selectivity of "var @> const" operator, ie. estimate the fraction
1086  * of ranges that contain the constant lower and upper bounds. This uses
1087  * the histograms of range lower bounds and range lengths, on the assumption
1088  * that the range lengths are independent of the lower bounds.
1089  *
1090  * Note, this is "var @> const", ie. estimate the fraction of ranges that
1091  * contain the constant lower and upper bounds.
1092  */
1093 static double
1096  RangeBound *hist_lower, int hist_nvalues,
1097  Datum *length_hist_values, int length_hist_nvalues)
1098 {
1099  int i,
1100  lower_index;
1101  double bin_width,
1102  lower_bin_width;
1103  double sum_frac;
1104  float8 prev_dist;
1105 
1106  /* Find the bin containing the lower bound of query range. */
1107  lower_index = rbound_bsearch(typcache, lower, hist_lower, hist_nvalues,
1108  true);
1109 
1110  /*
1111  * Calculate lower_bin_width, ie. the fraction of the of (lower_index,
1112  * lower_index + 1) bin which is greater than lower bound of query range
1113  * using linear interpolation of subdiff function.
1114  */
1115  if (lower_index >= 0 && lower_index < hist_nvalues - 1)
1116  lower_bin_width = get_position(typcache, lower, &hist_lower[lower_index],
1117  &hist_lower[lower_index + 1]);
1118  else
1119  lower_bin_width = 0.0;
1120 
1121  /*
1122  * Loop through all the lower bound bins, smaller than the query lower
1123  * bound. In the loop, dist and prev_dist are the distance of the
1124  * "current" bin's lower and upper bounds from the constant upper bound.
1125  * We begin from query lower bound, and walk backwards, so the first bin's
1126  * upper bound is the query lower bound, and its distance to the query
1127  * upper bound is the length of the query range.
1128  *
1129  * bin_width represents the width of the current bin. Normally it is 1.0,
1130  * meaning a full width bin, except for the first bin, which is only
1131  * counted up to the constant lower bound.
1132  */
1133  prev_dist = get_distance(typcache, lower, upper);
1134  sum_frac = 0.0;
1135  bin_width = lower_bin_width;
1136  for (i = lower_index; i >= 0; i--)
1137  {
1138  float8 dist;
1139  double length_hist_frac;
1140 
1141  /*
1142  * dist -- distance from upper bound of query range to current value
1143  * of lower bound histogram or lower bound of query range (if we've
1144  * reach it).
1145  */
1146  dist = get_distance(typcache, &hist_lower[i], upper);
1147 
1148  /*
1149  * Get average fraction of length histogram which covers intervals
1150  * longer than (or equal to) distance to upper bound of query range.
1151  */
1152  length_hist_frac =
1153  1.0 - calc_length_hist_frac(length_hist_values,
1154  length_hist_nvalues,
1155  prev_dist, dist, false);
1156 
1157  sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1158 
1159  bin_width = 1.0;
1160  prev_dist = dist;
1161  }
1162 
1163  return sum_frac;
1164 }
#define OID_RANGE_ELEM_CONTAINED_OP
Definition: pg_operator.h:1776
#define PG_GETARG_INT32(n)
Definition: fmgr.h:226
#define IsA(nodeptr, _type_)
Definition: nodes.h:573
#define RangeIsEmpty(r)
Definition: rangetypes.h:54
#define OID_RANGE_RIGHT_OP
Definition: pg_operator.h:1785
Oid get_commutator(Oid opno)
Definition: lsyscache.c:1281
int range_cmp_bounds(TypeCacheEntry *typcache, RangeBound *b1, RangeBound *b2)
Definition: rangetypes.c:1768
#define OID_RANGE_LESS_EQUAL_OP
Definition: pg_operator.h:1758
#define GETSTRUCT(TUP)
Definition: htup_details.h:656
#define OID_RANGE_CONTAINS_OP
Definition: pg_operator.h:1773
static struct @78 value
#define OID_RANGE_GREATER_EQUAL_OP
Definition: pg_operator.h:1761
TypeCacheEntry * range_get_typcache(FunctionCallInfo fcinfo, Oid rngtypid)
Definition: rangetypes.c:1479
bool equal(const void *a, const void *b)
Definition: equalfuncs.c:2946
HeapTuple statsTuple
Definition: selfuncs.h:71
Datum lower(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:43
static float8 get_len_position(double value, double hist1, double hist2)
#define DEFAULT_INEQ_SEL
Definition: selfuncs.h:37
bool get_restriction_variable(PlannerInfo *root, List *args, int varRelid, VariableStatData *vardata, Node **other, bool *varonleft)
Definition: selfuncs.c:4441
#define OID_RANGE_OVERLAPS_LEFT_OP
Definition: pg_operator.h:1788
#define PG_RETURN_FLOAT8(x)
Definition: fmgr.h:318
#define Min(x, y)
Definition: c.h:806
static double calc_hist_selectivity_contains(TypeCacheEntry *typcache, RangeBound *lower, RangeBound *upper, RangeBound *hist_lower, int hist_nvalues, Datum *length_hist_values, int length_hist_nvalues)
#define DatumGetRangeType(X)
Definition: rangetypes.h:71
static float8 get_position(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist1, RangeBound *hist2)
Datum val
Definition: rangetypes.h:62
static int length_hist_bsearch(Datum *length_hist_values, int length_hist_nvalues, double value, bool equal)
Definition: nodes.h:522
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:233
static double calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata, RangeType *constval, Oid operator)
double Selectivity
Definition: nodes.h:645
bool get_attstatsslot(HeapTuple statstuple, Oid atttype, int32 atttypmod, int reqkind, Oid reqop, Oid *actualop, Datum **values, int *nvalues, float4 **numbers, int *nnumbers)
Definition: lsyscache.c:2854
Datum upper(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:74
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1356
unsigned int Oid
Definition: postgres_ext.h:31
#define OID_RANGE_CONTAINS_ELEM_OP
Definition: pg_operator.h:1770
#define OID_RANGE_CONTAINED_OP
Definition: pg_operator.h:1779
FormData_pg_statistic * Form_pg_statistic
Definition: pg_statistic.h:129
#define OidIsValid(objectId)
Definition: c.h:538
int32 atttypmod
Definition: selfuncs.h:76
Definition: type.h:90
#define CLAMP_PROBABILITY(p)
Definition: selfuncs.h:57
FmgrInfo rng_subdiff_finfo
Definition: typcache.h:88
bool inclusive
Definition: rangetypes.h:64
static double calc_hist_selectivity_contained(TypeCacheEntry *typcache, RangeBound *lower, RangeBound *upper, RangeBound *hist_lower, int hist_nvalues, Datum *length_hist_values, int length_hist_nvalues)
#define ERROR
Definition: elog.h:43
double float8
Definition: c.h:381
#define STATISTIC_KIND_BOUNDS_HISTOGRAM
Definition: pg_statistic.h:292
Datum rangesel(PG_FUNCTION_ARGS)
#define OID_RANGE_LEFT_OP
Definition: pg_operator.h:1782
#define OID_RANGE_LESS_OP
Definition: pg_operator.h:1755
#define PG_GETARG_OID(n)
Definition: fmgr.h:232
static double default_range_selectivity(Oid operator)
static float8 get_distance(TypeCacheEntry *typcache, RangeBound *bound1, RangeBound *bound2)
RangeType * range_serialize(TypeCacheEntry *typcache, RangeBound *lower, RangeBound *upper, bool empty)
Definition: rangetypes.c:1503
bool lower
Definition: rangetypes.h:65
struct TypeCacheEntry * rngelemtype
Definition: typcache.h:84
double get_float8_infinity(void)
Definition: float.c:121
void range_deserialize(TypeCacheEntry *typcache, RangeType *range, RangeBound *lower, RangeBound *upper, bool *empty)
Definition: rangetypes.c:1632
float float4
Definition: c.h:380
#define STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM
Definition: pg_statistic.h:281
static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache, RangeBound *constbound, RangeBound *hist, int hist_nvalues, bool equal)
static double calc_length_hist_frac(Datum *length_hist_values, int length_hist_nvalues, double length1, double length2, bool equal)
#define DatumGetFloat8(X)
Definition: postgres.h:734
uintptr_t Datum
Definition: postgres.h:372
#define InvalidOid
Definition: postgres_ext.h:36
Oid fn_oid
Definition: fmgr.h:56
#define Max(x, y)
Definition: c.h:800
#define HeapTupleIsValid(tuple)
Definition: htup.h:77
#define NULL
Definition: c.h:229
#define Assert(condition)
Definition: c.h:675
static double calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata, RangeType *constval, Oid operator)
bool infinite
Definition: rangetypes.h:63
#define OID_RANGE_OVERLAPS_RIGHT_OP
Definition: pg_operator.h:1791
static int rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist, int hist_length, bool equal)
Oid rng_collation
Definition: typcache.h:85
int is_infinite(double val)
Definition: float.c:199
#define ReleaseVariableStats(vardata)
Definition: selfuncs.h:80
void * palloc(Size size)
Definition: mcxt.c:849
#define DEFAULT_RANGE_INEQ_SEL
Definition: selfuncs.h:40
int i
#define OID_RANGE_OVERLAP_OP
Definition: pg_operator.h:1767
#define PG_FUNCTION_ARGS
Definition: fmgr.h:150
#define elog
Definition: elog.h:219
#define OID_RANGE_GREATER_OP
Definition: pg_operator.h:1764
Definition: pg_list.h:45
static int cmp(const chr *x, const chr *y, size_t len)
Definition: regc_locale.c:742