PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
rangetypes_typanalyze.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * ragetypes_typanalyze.c
4  * Functions for gathering statistics from range columns
5  *
6  * For a range type column, histograms of lower and upper bounds, and
7  * the fraction of NULL and empty ranges are collected.
8  *
9  * Both histograms have the same length, and they are combined into a
10  * single array of ranges. This has the same shape as the histogram that
11  * std_typanalyze would collect, but the values are different. Each range
12  * in the array is a valid range, even though the lower and upper bounds
13  * come from different tuples. In theory, the standard scalar selectivity
14  * functions could be used with the combined histogram.
15  *
16  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
17  * Portions Copyright (c) 1994, Regents of the University of California
18  *
19  *
20  * IDENTIFICATION
21  * src/backend/utils/adt/rangetypes_typanalyze.c
22  *
23  *-------------------------------------------------------------------------
24  */
25 #include "postgres.h"
26 
27 #include "catalog/pg_operator.h"
28 #include "commands/vacuum.h"
29 #include "utils/builtins.h"
30 #include "utils/lsyscache.h"
31 #include "utils/rangetypes.h"
32 
33 static int float8_qsort_cmp(const void *a1, const void *a2);
34 static int range_bound_qsort_cmp(const void *a1, const void *a2, void *arg);
35 static void compute_range_stats(VacAttrStats *stats,
36  AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows);
37 
38 /*
39  * range_typanalyze -- typanalyze function for range columns
40  */
41 Datum
43 {
45  TypeCacheEntry *typcache;
46  Form_pg_attribute attr = stats->attr;
47 
48  /* Get information about range type; note column might be a domain */
49  typcache = range_get_typcache(fcinfo, getBaseType(stats->attrtypid));
50 
51  if (attr->attstattarget < 0)
52  attr->attstattarget = default_statistics_target;
53 
55  stats->extra_data = typcache;
56  /* same as in std_typanalyze */
57  stats->minrows = 300 * attr->attstattarget;
58 
59  PG_RETURN_BOOL(true);
60 }
61 
62 /*
63  * Comparison function for sorting float8s, used for range lengths.
64  */
65 static int
66 float8_qsort_cmp(const void *a1, const void *a2)
67 {
68  const float8 *f1 = (const float8 *) a1;
69  const float8 *f2 = (const float8 *) a2;
70 
71  if (*f1 < *f2)
72  return -1;
73  else if (*f1 == *f2)
74  return 0;
75  else
76  return 1;
77 }
78 
79 /*
80  * Comparison function for sorting RangeBounds.
81  */
82 static int
83 range_bound_qsort_cmp(const void *a1, const void *a2, void *arg)
84 {
85  RangeBound *b1 = (RangeBound *) a1;
86  RangeBound *b2 = (RangeBound *) a2;
87  TypeCacheEntry *typcache = (TypeCacheEntry *) arg;
88 
89  return range_cmp_bounds(typcache, b1, b2);
90 }
91 
92 /*
93  * compute_range_stats() -- compute statistics for a range column
94  */
95 static void
97  int samplerows, double totalrows)
98 {
99  TypeCacheEntry *typcache = (TypeCacheEntry *) stats->extra_data;
100  bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
101  int null_cnt = 0;
102  int non_null_cnt = 0;
103  int non_empty_cnt = 0;
104  int empty_cnt = 0;
105  int range_no;
106  int slot_idx;
107  int num_bins = stats->attr->attstattarget;
108  int num_hist;
109  float8 *lengths;
110  RangeBound *lowers,
111  *uppers;
112  double total_width = 0;
113 
114  /* Allocate memory to hold range bounds and lengths of the sample ranges. */
115  lowers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
116  uppers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
117  lengths = (float8 *) palloc(sizeof(float8) * samplerows);
118 
119  /* Loop over the sample ranges. */
120  for (range_no = 0; range_no < samplerows; range_no++)
121  {
122  Datum value;
123  bool isnull,
124  empty;
125  RangeType *range;
127  upper;
128  float8 length;
129 
131 
132  value = fetchfunc(stats, range_no, &isnull);
133  if (isnull)
134  {
135  /* range is null, just count that */
136  null_cnt++;
137  continue;
138  }
139 
140  /*
141  * XXX: should we ignore wide values, like std_typanalyze does, to
142  * avoid bloating the statistics table?
143  */
144  total_width += VARSIZE_ANY(DatumGetPointer(value));
145 
146  /* Get range and deserialize it for further analysis. */
147  range = DatumGetRangeTypeP(value);
148  range_deserialize(typcache, range, &lower, &upper, &empty);
149 
150  if (!empty)
151  {
152  /* Remember bounds and length for further usage in histograms */
153  lowers[non_empty_cnt] = lower;
154  uppers[non_empty_cnt] = upper;
155 
156  if (lower.infinite || upper.infinite)
157  {
158  /* Length of any kind of an infinite range is infinite */
159  length = get_float8_infinity();
160  }
161  else if (has_subdiff)
162  {
163  /*
164  * For an ordinary range, use subdiff function between upper
165  * and lower bound values.
166  */
168  &typcache->rng_subdiff_finfo,
169  typcache->rng_collation,
170  upper.val, lower.val));
171  }
172  else
173  {
174  /* Use default value of 1.0 if no subdiff is available. */
175  length = 1.0;
176  }
177  lengths[non_empty_cnt] = length;
178 
179  non_empty_cnt++;
180  }
181  else
182  empty_cnt++;
183 
184  non_null_cnt++;
185  }
186 
187  slot_idx = 0;
188 
189  /* We can only compute real stats if we found some non-null values. */
190  if (non_null_cnt > 0)
191  {
192  Datum *bound_hist_values;
193  Datum *length_hist_values;
194  int pos,
195  posfrac,
196  delta,
197  deltafrac,
198  i;
199  MemoryContext old_cxt;
200  float4 *emptyfrac;
201 
202  stats->stats_valid = true;
203  /* Do the simple null-frac and width stats */
204  stats->stanullfrac = (double) null_cnt / (double) samplerows;
205  stats->stawidth = total_width / (double) non_null_cnt;
206 
207  /* Estimate that non-null values are unique */
208  stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
209 
210  /* Must copy the target values into anl_context */
211  old_cxt = MemoryContextSwitchTo(stats->anl_context);
212 
213  /*
214  * Generate a bounds histogram slot entry if there are at least two
215  * values.
216  */
217  if (non_empty_cnt >= 2)
218  {
219  /* Sort bound values */
220  qsort_arg(lowers, non_empty_cnt, sizeof(RangeBound),
221  range_bound_qsort_cmp, typcache);
222  qsort_arg(uppers, non_empty_cnt, sizeof(RangeBound),
223  range_bound_qsort_cmp, typcache);
224 
225  num_hist = non_empty_cnt;
226  if (num_hist > num_bins)
227  num_hist = num_bins + 1;
228 
229  bound_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
230 
231  /*
232  * The object of this loop is to construct ranges from first and
233  * last entries in lowers[] and uppers[] along with evenly-spaced
234  * values in between. So the i'th value is a range of lowers[(i *
235  * (nvals - 1)) / (num_hist - 1)] and uppers[(i * (nvals - 1)) /
236  * (num_hist - 1)]. But computing that subscript directly risks
237  * integer overflow when the stats target is more than a couple
238  * thousand. Instead we add (nvals - 1) / (num_hist - 1) to pos
239  * at each step, tracking the integral and fractional parts of the
240  * sum separately.
241  */
242  delta = (non_empty_cnt - 1) / (num_hist - 1);
243  deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
244  pos = posfrac = 0;
245 
246  for (i = 0; i < num_hist; i++)
247  {
248  bound_hist_values[i] = PointerGetDatum(range_serialize(
249  typcache, &lowers[pos], &uppers[pos], false));
250  pos += delta;
251  posfrac += deltafrac;
252  if (posfrac >= (num_hist - 1))
253  {
254  /* fractional part exceeds 1, carry to integer part */
255  pos++;
256  posfrac -= (num_hist - 1);
257  }
258  }
259 
260  stats->stakind[slot_idx] = STATISTIC_KIND_BOUNDS_HISTOGRAM;
261  stats->stavalues[slot_idx] = bound_hist_values;
262  stats->numvalues[slot_idx] = num_hist;
263  slot_idx++;
264  }
265 
266  /*
267  * Generate a length histogram slot entry if there are at least two
268  * values.
269  */
270  if (non_empty_cnt >= 2)
271  {
272  /*
273  * Ascending sort of range lengths for further filling of
274  * histogram
275  */
276  qsort(lengths, non_empty_cnt, sizeof(float8), float8_qsort_cmp);
277 
278  num_hist = non_empty_cnt;
279  if (num_hist > num_bins)
280  num_hist = num_bins + 1;
281 
282  length_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
283 
284  /*
285  * The object of this loop is to copy the first and last lengths[]
286  * entries along with evenly-spaced values in between. So the i'th
287  * value is lengths[(i * (nvals - 1)) / (num_hist - 1)]. But
288  * computing that subscript directly risks integer overflow when
289  * the stats target is more than a couple thousand. Instead we
290  * add (nvals - 1) / (num_hist - 1) to pos at each step, tracking
291  * the integral and fractional parts of the sum separately.
292  */
293  delta = (non_empty_cnt - 1) / (num_hist - 1);
294  deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
295  pos = posfrac = 0;
296 
297  for (i = 0; i < num_hist; i++)
298  {
299  length_hist_values[i] = Float8GetDatum(lengths[pos]);
300  pos += delta;
301  posfrac += deltafrac;
302  if (posfrac >= (num_hist - 1))
303  {
304  /* fractional part exceeds 1, carry to integer part */
305  pos++;
306  posfrac -= (num_hist - 1);
307  }
308  }
309  }
310  else
311  {
312  /*
313  * Even when we don't create the histogram, store an empty array
314  * to mean "no histogram". We can't just leave stavalues NULL,
315  * because get_attstatsslot() errors if you ask for stavalues, and
316  * it's NULL. We'll still store the empty fraction in stanumbers.
317  */
318  length_hist_values = palloc(0);
319  num_hist = 0;
320  }
321  stats->staop[slot_idx] = Float8LessOperator;
322  stats->stavalues[slot_idx] = length_hist_values;
323  stats->numvalues[slot_idx] = num_hist;
324  stats->statypid[slot_idx] = FLOAT8OID;
325  stats->statyplen[slot_idx] = sizeof(float8);
326 #ifdef USE_FLOAT8_BYVAL
327  stats->statypbyval[slot_idx] = true;
328 #else
329  stats->statypbyval[slot_idx] = false;
330 #endif
331  stats->statypalign[slot_idx] = 'd';
332 
333  /* Store the fraction of empty ranges */
334  emptyfrac = (float4 *) palloc(sizeof(float4));
335  *emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt);
336  stats->stanumbers[slot_idx] = emptyfrac;
337  stats->numnumbers[slot_idx] = 1;
338 
340  slot_idx++;
341 
342  MemoryContextSwitchTo(old_cxt);
343  }
344  else if (null_cnt > 0)
345  {
346  /* We found only nulls; assume the column is entirely null */
347  stats->stats_valid = true;
348  stats->stanullfrac = 1.0;
349  stats->stawidth = 0; /* "unknown" */
350  stats->stadistinct = 0.0; /* "unknown" */
351  }
352 
353  /*
354  * We don't need to bother cleaning up any of our temporary palloc's. The
355  * hashtable should also go away, as it used a child memory context.
356  */
357 }
int length(const List *list)
Definition: list.c:1271
int minrows
Definition: vacuum.h:92
int range_cmp_bounds(TypeCacheEntry *typcache, RangeBound *b1, RangeBound *b2)
Definition: rangetypes.c:1832
TypeCacheEntry * range_get_typcache(FunctionCallInfo fcinfo, Oid rngtypid)
Definition: rangetypes.c:1543
Datum lower(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:43
#define PointerGetDatum(X)
Definition: postgres.h:562
Datum * stavalues[STATISTIC_NUM_SLOTS]
Definition: vacuum.h:108
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:109
Datum val
Definition: rangetypes.h:62
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:241
bool statypbyval[STATISTIC_NUM_SLOTS]
Definition: vacuum.h:118
Datum upper(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:74
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1047
static int range_bound_qsort_cmp(const void *a1, const void *a2, void *arg)
#define OidIsValid(objectId)
Definition: c.h:532
char statypalign[STATISTIC_NUM_SLOTS]
Definition: vacuum.h:119
Form_pg_attribute attr
Definition: vacuum.h:81
Datum Float8GetDatum(float8 X)
Definition: fmgr.c:1815
static int float8_qsort_cmp(const void *a1, const void *a2)
FmgrInfo rng_subdiff_finfo
Definition: typcache.h:92
Oid attrtypid
Definition: vacuum.h:82
double float8
Definition: c.h:375
int32 stawidth
Definition: vacuum.h:101
#define Float8LessOperator
Definition: pg_operator.h:534
static struct @121 value
#define STATISTIC_KIND_BOUNDS_HISTOGRAM
Definition: pg_statistic.h:292
int numnumbers[STATISTIC_NUM_SLOTS]
Definition: vacuum.h:105
FormData_pg_attribute * Form_pg_attribute
Definition: pg_attribute.h:187
Datum range_typanalyze(PG_FUNCTION_ARGS)
static struct cvec * range(struct vars *v, chr a, chr b, int cases)
Definition: regc_locale.c:416
RangeType * range_serialize(TypeCacheEntry *typcache, RangeBound *lower, RangeBound *upper, bool empty)
Definition: rangetypes.c:1567
float4 stanullfrac
Definition: vacuum.h:100
double get_float8_infinity(void)
Definition: float.c:121
void qsort_arg(void *base, size_t nel, size_t elsize, qsort_arg_comparator cmp, void *arg)
Definition: qsort_arg.c:113
void range_deserialize(TypeCacheEntry *typcache, RangeType *range, RangeBound *lower, RangeBound *upper, bool *empty)
Definition: rangetypes.c:1696
Oid staop[STATISTIC_NUM_SLOTS]
Definition: vacuum.h:104
bool stats_valid
Definition: vacuum.h:99
float float4
Definition: c.h:374
#define STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM
Definition: pg_statistic.h:281
#define DatumGetFloat8(X)
Definition: postgres.h:734
#define PG_RETURN_BOOL(x)
Definition: fmgr.h:319
uintptr_t Datum
Definition: postgres.h:372
int16 stakind[STATISTIC_NUM_SLOTS]
Definition: vacuum.h:103
Datum(* AnalyzeAttrFetchFunc)(VacAttrStatsP stats, int rownum, bool *isNull)
Definition: vacuum.h:61
#define VARSIZE_ANY(PTR)
Definition: postgres.h:334
Oid statypid[STATISTIC_NUM_SLOTS]
Definition: vacuum.h:116
Oid fn_oid
Definition: fmgr.h:59
static FormData_pg_attribute a1
Definition: heap.c:144
static void compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows)
float4 * stanumbers[STATISTIC_NUM_SLOTS]
Definition: vacuum.h:106
#define FLOAT8OID
Definition: pg_type.h:419
#define DatumGetRangeTypeP(X)
Definition: rangetypes.h:71
bool infinite
Definition: rangetypes.h:63
Oid rng_collation
Definition: typcache.h:89
MemoryContext anl_context
Definition: vacuum.h:85
#define DatumGetPointer(X)
Definition: postgres.h:555
int numvalues[STATISTIC_NUM_SLOTS]
Definition: vacuum.h:107
void * palloc(Size size)
Definition: mcxt.c:848
int16 statyplen[STATISTIC_NUM_SLOTS]
Definition: vacuum.h:117
int i
AnalyzeAttrComputeStatsFunc compute_stats
Definition: vacuum.h:91
void * arg
#define PG_FUNCTION_ARGS
Definition: fmgr.h:158
void * extra_data
Definition: vacuum.h:93
#define qsort(a, b, c, d)
Definition: port.h:443
void vacuum_delay_point(void)
Definition: vacuum.c:1560
Oid getBaseType(Oid typid)
Definition: lsyscache.c:2271
int default_statistics_target
Definition: analyze.c:77
float4 stadistinct
Definition: vacuum.h:102
static FormData_pg_attribute a2
Definition: heap.c:150