PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
network_selfuncs.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * network_selfuncs.c
4  * Functions for selectivity estimation of inet/cidr operators
5  *
6  * This module provides estimators for the subnet inclusion and overlap
7  * operators. Estimates are based on null fraction, most common values,
8  * and histogram of inet/cidr columns.
9  *
10  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
11  * Portions Copyright (c) 1994, Regents of the University of California
12  *
13  *
14  * IDENTIFICATION
15  * src/backend/utils/adt/network_selfuncs.c
16  *
17  *-------------------------------------------------------------------------
18  */
19 #include "postgres.h"
20 
21 #include <math.h>
22 
23 #include "access/htup_details.h"
24 #include "catalog/pg_operator.h"
25 #include "catalog/pg_statistic.h"
26 #include "utils/builtins.h"
27 #include "utils/inet.h"
28 #include "utils/lsyscache.h"
29 #include "utils/selfuncs.h"
30 
31 
32 /* Default selectivity for the inet overlap operator */
33 #define DEFAULT_OVERLAP_SEL 0.01
34 
35 /* Default selectivity for the various inclusion operators */
36 #define DEFAULT_INCLUSION_SEL 0.005
37 
38 /* Default selectivity for specified operator */
39 #define DEFAULT_SEL(operator) \
40  ((operator) == OID_INET_OVERLAP_OP ? \
41  DEFAULT_OVERLAP_SEL : DEFAULT_INCLUSION_SEL)
42 
43 /* Maximum number of items to consider in join selectivity calculations */
44 #define MAX_CONSIDERED_ELEMS 1024
45 
46 static Selectivity networkjoinsel_inner(Oid operator,
47  VariableStatData *vardata1, VariableStatData *vardata2);
48 static Selectivity networkjoinsel_semi(Oid operator,
49  VariableStatData *vardata1, VariableStatData *vardata2);
50 static Selectivity mcv_population(float4 *mcv_numbers, int mcv_nvalues);
51 static Selectivity inet_hist_value_sel(Datum *values, int nvalues,
52  Datum constvalue, int opr_codenum);
53 static Selectivity inet_mcv_join_sel(Datum *mcv1_values,
54  float4 *mcv1_numbers, int mcv1_nvalues, Datum *mcv2_values,
55  float4 *mcv2_numbers, int mcv2_nvalues, Oid operator);
56 static Selectivity inet_mcv_hist_sel(Datum *mcv_values, float4 *mcv_numbers,
57  int mcv_nvalues, Datum *hist_values, int hist_nvalues,
58  int opr_codenum);
60  int hist1_nvalues,
61  Datum *hist2_values, int hist2_nvalues,
62  int opr_codenum);
63 static Selectivity inet_semi_join_sel(Datum lhs_value,
64  bool mcv_exists, Datum *mcv_values, int mcv_nvalues,
65  bool hist_exists, Datum *hist_values, int hist_nvalues,
66  double hist_weight,
67  FmgrInfo *proc, int opr_codenum);
68 static int inet_opr_codenum(Oid operator);
69 static int inet_inclusion_cmp(inet *left, inet *right, int opr_codenum);
70 static int inet_masklen_inclusion_cmp(inet *left, inet *right,
71  int opr_codenum);
72 static int inet_hist_match_divider(inet *boundary, inet *query,
73  int opr_codenum);
74 
75 /*
76  * Selectivity estimation for the subnet inclusion/overlap operators
77  */
78 Datum
80 {
82  Oid operator = PG_GETARG_OID(1);
83  List *args = (List *) PG_GETARG_POINTER(2);
84  int varRelid = PG_GETARG_INT32(3);
85  VariableStatData vardata;
86  Node *other;
87  bool varonleft;
88  Selectivity selec,
89  mcv_selec,
90  non_mcv_selec;
91  Datum constvalue,
92  *hist_values;
93  int hist_nvalues;
94  Form_pg_statistic stats;
95  double sumcommon,
96  nullfrac;
97  FmgrInfo proc;
98 
99  /*
100  * If expression is not (variable op something) or (something op
101  * variable), then punt and return a default estimate.
102  */
103  if (!get_restriction_variable(root, args, varRelid,
104  &vardata, &other, &varonleft))
105  PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
106 
107  /*
108  * Can't do anything useful if the something is not a constant, either.
109  */
110  if (!IsA(other, Const))
111  {
112  ReleaseVariableStats(vardata);
113  PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
114  }
115 
116  /* All of the operators handled here are strict. */
117  if (((Const *) other)->constisnull)
118  {
119  ReleaseVariableStats(vardata);
120  PG_RETURN_FLOAT8(0.0);
121  }
122  constvalue = ((Const *) other)->constvalue;
123 
124  /* Otherwise, we need stats in order to produce a non-default estimate. */
125  if (!HeapTupleIsValid(vardata.statsTuple))
126  {
127  ReleaseVariableStats(vardata);
128  PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
129  }
130 
131  stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
132  nullfrac = stats->stanullfrac;
133 
134  /*
135  * If we have most-common-values info, add up the fractions of the MCV
136  * entries that satisfy MCV OP CONST. These fractions contribute directly
137  * to the result selectivity. Also add up the total fraction represented
138  * by MCV entries.
139  */
140  fmgr_info(get_opcode(operator), &proc);
141  mcv_selec = mcv_selectivity(&vardata, &proc, constvalue, varonleft,
142  &sumcommon);
143 
144  /*
145  * If we have a histogram, use it to estimate the proportion of the
146  * non-MCV population that satisfies the clause. If we don't, apply the
147  * default selectivity to that population.
148  */
149  if (get_attstatsslot(vardata.statsTuple,
150  vardata.atttype, vardata.atttypmod,
152  NULL,
153  &hist_values, &hist_nvalues,
154  NULL, NULL))
155  {
156  int opr_codenum = inet_opr_codenum(operator);
157 
158  /* Commute if needed, so we can consider histogram to be on the left */
159  if (!varonleft)
160  opr_codenum = -opr_codenum;
161  non_mcv_selec = inet_hist_value_sel(hist_values, hist_nvalues,
162  constvalue, opr_codenum);
163 
164  free_attstatsslot(vardata.atttype, hist_values, hist_nvalues, NULL, 0);
165  }
166  else
167  non_mcv_selec = DEFAULT_SEL(operator);
168 
169  /* Combine selectivities for MCV and non-MCV populations */
170  selec = mcv_selec + (1.0 - nullfrac - sumcommon) * non_mcv_selec;
171 
172  /* Result should be in range, but make sure... */
173  CLAMP_PROBABILITY(selec);
174 
175  ReleaseVariableStats(vardata);
176 
177  PG_RETURN_FLOAT8(selec);
178 }
179 
180 /*
181  * Join selectivity estimation for the subnet inclusion/overlap operators
182  *
183  * This function has the same structure as eqjoinsel() in selfuncs.c.
184  *
185  * Throughout networkjoinsel and its subroutines, we have a performance issue
186  * in that the amount of work to be done is O(N^2) in the length of the MCV
187  * and histogram arrays. To keep the runtime from getting out of hand when
188  * large statistics targets have been set, we arbitrarily limit the number of
189  * values considered to 1024 (MAX_CONSIDERED_ELEMS). For the MCV arrays, this
190  * is easy: just consider at most the first N elements. (Since the MCVs are
191  * sorted by decreasing frequency, this correctly gets us the first N MCVs.)
192  * For the histogram arrays, we decimate; that is consider only every k'th
193  * element, where k is chosen so that no more than MAX_CONSIDERED_ELEMS
194  * elements are considered. This should still give us a good random sample of
195  * the non-MCV population. Decimation is done on-the-fly in the loops that
196  * iterate over the histogram arrays.
197  */
198 Datum
200 {
202  Oid operator = PG_GETARG_OID(1);
203  List *args = (List *) PG_GETARG_POINTER(2);
204 #ifdef NOT_USED
205  JoinType jointype = (JoinType) PG_GETARG_INT16(3);
206 #endif
208  double selec;
209  VariableStatData vardata1;
210  VariableStatData vardata2;
211  bool join_is_reversed;
212 
213  get_join_variables(root, args, sjinfo,
214  &vardata1, &vardata2, &join_is_reversed);
215 
216  switch (sjinfo->jointype)
217  {
218  case JOIN_INNER:
219  case JOIN_LEFT:
220  case JOIN_FULL:
221 
222  /*
223  * Selectivity for left/full join is not exactly the same as inner
224  * join, but we neglect the difference, as eqjoinsel does.
225  */
226  selec = networkjoinsel_inner(operator, &vardata1, &vardata2);
227  break;
228  case JOIN_SEMI:
229  case JOIN_ANTI:
230  /* Here, it's important that we pass the outer var on the left. */
231  if (!join_is_reversed)
232  selec = networkjoinsel_semi(operator, &vardata1, &vardata2);
233  else
234  selec = networkjoinsel_semi(get_commutator(operator),
235  &vardata2, &vardata1);
236  break;
237  default:
238  /* other values not expected here */
239  elog(ERROR, "unrecognized join type: %d",
240  (int) sjinfo->jointype);
241  selec = 0; /* keep compiler quiet */
242  break;
243  }
244 
245  ReleaseVariableStats(vardata1);
246  ReleaseVariableStats(vardata2);
247 
248  CLAMP_PROBABILITY(selec);
249 
250  PG_RETURN_FLOAT8((float8) selec);
251 }
252 
253 /*
254  * Inner join selectivity estimation for subnet inclusion/overlap operators
255  *
256  * Calculates MCV vs MCV, MCV vs histogram and histogram vs histogram
257  * selectivity for join using the subnet inclusion operators. Unlike the
258  * join selectivity function for the equality operator, eqjoinsel_inner(),
259  * one to one matching of the values is not enough. Network inclusion
260  * operators are likely to match many to many, so we must check all pairs.
261  * (Note: it might be possible to exploit understanding of the histogram's
262  * btree ordering to reduce the work needed, but we don't currently try.)
263  * Also, MCV vs histogram selectivity is not neglected as in eqjoinsel_inner().
264  */
265 static Selectivity
267  VariableStatData *vardata1, VariableStatData *vardata2)
268 {
269  Form_pg_statistic stats;
270  double nullfrac1 = 0.0,
271  nullfrac2 = 0.0;
272  Selectivity selec = 0.0,
273  sumcommon1 = 0.0,
274  sumcommon2 = 0.0;
275  bool mcv1_exists = false,
276  mcv2_exists = false,
277  hist1_exists = false,
278  hist2_exists = false;
279  int opr_codenum;
280  int mcv1_nvalues,
281  mcv2_nvalues,
282  mcv1_nnumbers,
283  mcv2_nnumbers,
284  hist1_nvalues,
285  hist2_nvalues,
286  mcv1_length = 0,
287  mcv2_length = 0;
288  Datum *mcv1_values,
289  *mcv2_values,
290  *hist1_values,
291  *hist2_values;
292  float4 *mcv1_numbers,
293  *mcv2_numbers;
294 
295  if (HeapTupleIsValid(vardata1->statsTuple))
296  {
297  stats = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
298  nullfrac1 = stats->stanullfrac;
299 
300  mcv1_exists = get_attstatsslot(vardata1->statsTuple,
301  vardata1->atttype, vardata1->atttypmod,
303  NULL,
304  &mcv1_values, &mcv1_nvalues,
305  &mcv1_numbers, &mcv1_nnumbers);
306  hist1_exists = get_attstatsslot(vardata1->statsTuple,
307  vardata1->atttype, vardata1->atttypmod,
309  NULL,
310  &hist1_values, &hist1_nvalues,
311  NULL, NULL);
312  /* Arbitrarily limit number of MCVs considered */
313  mcv1_length = Min(mcv1_nvalues, MAX_CONSIDERED_ELEMS);
314  if (mcv1_exists)
315  sumcommon1 = mcv_population(mcv1_numbers, mcv1_length);
316  }
317 
318  if (HeapTupleIsValid(vardata2->statsTuple))
319  {
320  stats = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple);
321  nullfrac2 = stats->stanullfrac;
322 
323  mcv2_exists = get_attstatsslot(vardata2->statsTuple,
324  vardata2->atttype, vardata2->atttypmod,
326  NULL,
327  &mcv2_values, &mcv2_nvalues,
328  &mcv2_numbers, &mcv2_nnumbers);
329  hist2_exists = get_attstatsslot(vardata2->statsTuple,
330  vardata2->atttype, vardata2->atttypmod,
332  NULL,
333  &hist2_values, &hist2_nvalues,
334  NULL, NULL);
335  /* Arbitrarily limit number of MCVs considered */
336  mcv2_length = Min(mcv2_nvalues, MAX_CONSIDERED_ELEMS);
337  if (mcv2_exists)
338  sumcommon2 = mcv_population(mcv2_numbers, mcv2_length);
339  }
340 
341  opr_codenum = inet_opr_codenum(operator);
342 
343  /*
344  * Calculate selectivity for MCV vs MCV matches.
345  */
346  if (mcv1_exists && mcv2_exists)
347  selec += inet_mcv_join_sel(mcv1_values, mcv1_numbers, mcv1_length,
348  mcv2_values, mcv2_numbers, mcv2_length,
349  operator);
350 
351  /*
352  * Add in selectivities for MCV vs histogram matches, scaling according to
353  * the fractions of the populations represented by the histograms. Note
354  * that the second case needs to commute the operator.
355  */
356  if (mcv1_exists && hist2_exists)
357  selec += (1.0 - nullfrac2 - sumcommon2) *
358  inet_mcv_hist_sel(mcv1_values, mcv1_numbers, mcv1_length,
359  hist2_values, hist2_nvalues,
360  opr_codenum);
361  if (mcv2_exists && hist1_exists)
362  selec += (1.0 - nullfrac1 - sumcommon1) *
363  inet_mcv_hist_sel(mcv2_values, mcv2_numbers, mcv2_length,
364  hist1_values, hist1_nvalues,
365  -opr_codenum);
366 
367  /*
368  * Add in selectivity for histogram vs histogram matches, again scaling
369  * appropriately.
370  */
371  if (hist1_exists && hist2_exists)
372  selec += (1.0 - nullfrac1 - sumcommon1) *
373  (1.0 - nullfrac2 - sumcommon2) *
374  inet_hist_inclusion_join_sel(hist1_values, hist1_nvalues,
375  hist2_values, hist2_nvalues,
376  opr_codenum);
377 
378  /*
379  * If useful statistics are not available then use the default estimate.
380  * We can apply null fractions if known, though.
381  */
382  if ((!mcv1_exists && !hist1_exists) || (!mcv2_exists && !hist2_exists))
383  selec = (1.0 - nullfrac1) * (1.0 - nullfrac2) * DEFAULT_SEL(operator);
384 
385  /* Release stats. */
386  if (mcv1_exists)
387  free_attstatsslot(vardata1->atttype, mcv1_values, mcv1_nvalues,
388  mcv1_numbers, mcv1_nnumbers);
389  if (mcv2_exists)
390  free_attstatsslot(vardata2->atttype, mcv2_values, mcv2_nvalues,
391  mcv2_numbers, mcv2_nnumbers);
392  if (hist1_exists)
393  free_attstatsslot(vardata1->atttype, hist1_values, hist1_nvalues,
394  NULL, 0);
395  if (hist2_exists)
396  free_attstatsslot(vardata2->atttype, hist2_values, hist2_nvalues,
397  NULL, 0);
398 
399  return selec;
400 }
401 
402 /*
403  * Semi join selectivity estimation for subnet inclusion/overlap operators
404  *
405  * Calculates MCV vs MCV, MCV vs histogram, histogram vs MCV, and histogram vs
406  * histogram selectivity for semi/anti join cases.
407  */
408 static Selectivity
410  VariableStatData *vardata1, VariableStatData *vardata2)
411 {
412  Form_pg_statistic stats;
413  Selectivity selec = 0.0,
414  sumcommon1 = 0.0,
415  sumcommon2 = 0.0;
416  double nullfrac1 = 0.0,
417  nullfrac2 = 0.0,
418  hist2_weight = 0.0;
419  bool mcv1_exists = false,
420  mcv2_exists = false,
421  hist1_exists = false,
422  hist2_exists = false;
423  int opr_codenum;
424  FmgrInfo proc;
425  int i,
426  mcv1_nvalues,
427  mcv2_nvalues,
428  mcv1_nnumbers,
429  mcv2_nnumbers,
430  hist1_nvalues,
431  hist2_nvalues,
432  mcv1_length = 0,
433  mcv2_length = 0;
434  Datum *mcv1_values,
435  *mcv2_values,
436  *hist1_values,
437  *hist2_values;
438  float4 *mcv1_numbers,
439  *mcv2_numbers;
440 
441  if (HeapTupleIsValid(vardata1->statsTuple))
442  {
443  stats = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
444  nullfrac1 = stats->stanullfrac;
445 
446  mcv1_exists = get_attstatsslot(vardata1->statsTuple,
447  vardata1->atttype, vardata1->atttypmod,
449  NULL,
450  &mcv1_values, &mcv1_nvalues,
451  &mcv1_numbers, &mcv1_nnumbers);
452  hist1_exists = get_attstatsslot(vardata1->statsTuple,
453  vardata1->atttype, vardata1->atttypmod,
455  NULL,
456  &hist1_values, &hist1_nvalues,
457  NULL, NULL);
458  /* Arbitrarily limit number of MCVs considered */
459  mcv1_length = Min(mcv1_nvalues, MAX_CONSIDERED_ELEMS);
460  if (mcv1_exists)
461  sumcommon1 = mcv_population(mcv1_numbers, mcv1_length);
462  }
463 
464  if (HeapTupleIsValid(vardata2->statsTuple))
465  {
466  stats = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple);
467  nullfrac2 = stats->stanullfrac;
468 
469  mcv2_exists = get_attstatsslot(vardata2->statsTuple,
470  vardata2->atttype, vardata2->atttypmod,
472  NULL,
473  &mcv2_values, &mcv2_nvalues,
474  &mcv2_numbers, &mcv2_nnumbers);
475  hist2_exists = get_attstatsslot(vardata2->statsTuple,
476  vardata2->atttype, vardata2->atttypmod,
478  NULL,
479  &hist2_values, &hist2_nvalues,
480  NULL, NULL);
481  /* Arbitrarily limit number of MCVs considered */
482  mcv2_length = Min(mcv2_nvalues, MAX_CONSIDERED_ELEMS);
483  if (mcv2_exists)
484  sumcommon2 = mcv_population(mcv2_numbers, mcv2_length);
485  }
486 
487  opr_codenum = inet_opr_codenum(operator);
488  fmgr_info(get_opcode(operator), &proc);
489 
490  /* Estimate number of input rows represented by RHS histogram. */
491  if (hist2_exists && vardata2->rel)
492  hist2_weight = (1.0 - nullfrac2 - sumcommon2) * vardata2->rel->rows;
493 
494  /*
495  * Consider each element of the LHS MCV list, matching it to whatever RHS
496  * stats we have. Scale according to the known frequency of the MCV.
497  */
498  if (mcv1_exists && (mcv2_exists || hist2_exists))
499  {
500  for (i = 0; i < mcv1_length; i++)
501  {
502  selec += mcv1_numbers[i] *
503  inet_semi_join_sel(mcv1_values[i],
504  mcv2_exists, mcv2_values, mcv2_length,
505  hist2_exists, hist2_values, hist2_nvalues,
506  hist2_weight,
507  &proc, opr_codenum);
508  }
509  }
510 
511  /*
512  * Consider each element of the LHS histogram, except for the first and
513  * last elements, which we exclude on the grounds that they're outliers
514  * and thus not very representative. Scale on the assumption that each
515  * such histogram element represents an equal share of the LHS histogram
516  * population (which is a bit bogus, because the members of its bucket may
517  * not all act the same with respect to the join clause, but it's hard to
518  * do better).
519  *
520  * If there are too many histogram elements, decimate to limit runtime.
521  */
522  if (hist1_exists && hist1_nvalues > 2 && (mcv2_exists || hist2_exists))
523  {
524  double hist_selec_sum = 0.0;
525  int k,
526  n;
527 
528  k = (hist1_nvalues - 3) / MAX_CONSIDERED_ELEMS + 1;
529 
530  n = 0;
531  for (i = 1; i < hist1_nvalues - 1; i += k)
532  {
533  hist_selec_sum +=
534  inet_semi_join_sel(hist1_values[i],
535  mcv2_exists, mcv2_values, mcv2_length,
536  hist2_exists, hist2_values, hist2_nvalues,
537  hist2_weight,
538  &proc, opr_codenum);
539  n++;
540  }
541 
542  selec += (1.0 - nullfrac1 - sumcommon1) * hist_selec_sum / n;
543  }
544 
545  /*
546  * If useful statistics are not available then use the default estimate.
547  * We can apply null fractions if known, though.
548  */
549  if ((!mcv1_exists && !hist1_exists) || (!mcv2_exists && !hist2_exists))
550  selec = (1.0 - nullfrac1) * (1.0 - nullfrac2) * DEFAULT_SEL(operator);
551 
552  /* Release stats. */
553  if (mcv1_exists)
554  free_attstatsslot(vardata1->atttype, mcv1_values, mcv1_nvalues,
555  mcv1_numbers, mcv1_nnumbers);
556  if (mcv2_exists)
557  free_attstatsslot(vardata2->atttype, mcv2_values, mcv2_nvalues,
558  mcv2_numbers, mcv2_nnumbers);
559  if (hist1_exists)
560  free_attstatsslot(vardata1->atttype, hist1_values, hist1_nvalues,
561  NULL, 0);
562  if (hist2_exists)
563  free_attstatsslot(vardata2->atttype, hist2_values, hist2_nvalues,
564  NULL, 0);
565 
566  return selec;
567 }
568 
569 /*
570  * Compute the fraction of a relation's population that is represented
571  * by the MCV list.
572  */
573 static Selectivity
574 mcv_population(float4 *mcv_numbers, int mcv_nvalues)
575 {
576  Selectivity sumcommon = 0.0;
577  int i;
578 
579  for (i = 0; i < mcv_nvalues; i++)
580  {
581  sumcommon += mcv_numbers[i];
582  }
583 
584  return sumcommon;
585 }
586 
587 /*
588  * Inet histogram vs single value selectivity estimation
589  *
590  * Estimate the fraction of the histogram population that satisfies
591  * "value OPR CONST". (The result needs to be scaled to reflect the
592  * proportion of the total population represented by the histogram.)
593  *
594  * The histogram is originally for the inet btree comparison operators.
595  * Only the common bits of the network part and the length of the network part
596  * (masklen) are interesting for the subnet inclusion operators. Fortunately,
597  * btree comparison treats the network part as the major sort key. Even so,
598  * the length of the network part would not really be significant in the
599  * histogram. This would lead to big mistakes for data sets with uneven
600  * masklen distribution. To reduce this problem, comparisons with the left
601  * and the right sides of the buckets are used together.
602  *
603  * Histogram bucket matches are calculated in two forms. If the constant
604  * matches both bucket endpoints the bucket is considered as fully matched.
605  * The second form is to match the bucket partially; we recognize this when
606  * the constant matches just one endpoint, or the two endpoints fall on
607  * opposite sides of the constant. (Note that when the constant matches an
608  * interior histogram element, it gets credit for partial matches to the
609  * buckets on both sides, while a match to a histogram endpoint gets credit
610  * for only one partial match. This is desirable.)
611  *
612  * The divider in the partial bucket match is imagined as the distance
613  * between the decisive bits and the common bits of the addresses. It will
614  * be used as a power of two as it is the natural scale for the IP network
615  * inclusion. This partial bucket match divider calculation is an empirical
616  * formula and subject to change with more experiment.
617  *
618  * For a partial match, we try to calculate dividers for both of the
619  * boundaries. If the address family of a boundary value does not match the
620  * constant or comparison of the length of the network parts is not correct
621  * for the operator, the divider for that boundary will not be taken into
622  * account. If both of the dividers are valid, the greater one will be used
623  * to minimize the mistake in buckets that have disparate masklens. This
624  * calculation is unfair when dividers can be calculated for both of the
625  * boundaries but they are far from each other; but it is not a common
626  * situation as the boundaries are expected to share most of their significant
627  * bits of their masklens. The mistake would be greater, if we would use the
628  * minimum instead of the maximum, and we don't know a sensible way to combine
629  * them.
630  *
631  * For partial match in buckets that have different address families on the
632  * left and right sides, only the boundary with the same address family is
633  * taken into consideration. This can cause more mistakes for these buckets
634  * if the masklens of their boundaries are also disparate. But this can only
635  * happen in one bucket, since only two address families exist. It seems a
636  * better option than not considering these buckets at all.
637  */
638 static Selectivity
639 inet_hist_value_sel(Datum *values, int nvalues, Datum constvalue,
640  int opr_codenum)
641 {
642  Selectivity match = 0.0;
643  inet *query,
644  *left,
645  *right;
646  int i,
647  k,
648  n;
649  int left_order,
650  right_order,
651  left_divider,
652  right_divider;
653 
654  /* guard against zero-divide below */
655  if (nvalues <= 1)
656  return 0.0;
657 
658  /* if there are too many histogram elements, decimate to limit runtime */
659  k = (nvalues - 2) / MAX_CONSIDERED_ELEMS + 1;
660 
661  query = DatumGetInetPP(constvalue);
662 
663  /* "left" is the left boundary value of the current bucket ... */
664  left = DatumGetInetPP(values[0]);
665  left_order = inet_inclusion_cmp(left, query, opr_codenum);
666 
667  n = 0;
668  for (i = k; i < nvalues; i += k)
669  {
670  /* ... and "right" is the right boundary value */
671  right = DatumGetInetPP(values[i]);
672  right_order = inet_inclusion_cmp(right, query, opr_codenum);
673 
674  if (left_order == 0 && right_order == 0)
675  {
676  /* The whole bucket matches, since both endpoints do. */
677  match += 1.0;
678  }
679  else if ((left_order <= 0 && right_order >= 0) ||
680  (left_order >= 0 && right_order <= 0))
681  {
682  /* Partial bucket match. */
683  left_divider = inet_hist_match_divider(left, query, opr_codenum);
684  right_divider = inet_hist_match_divider(right, query, opr_codenum);
685 
686  if (left_divider >= 0 || right_divider >= 0)
687  match += 1.0 / pow(2.0, Max(left_divider, right_divider));
688  }
689 
690  /* Shift the variables. */
691  left = right;
692  left_order = right_order;
693 
694  /* Count the number of buckets considered. */
695  n++;
696  }
697 
698  return match / n;
699 }
700 
701 /*
702  * Inet MCV vs MCV join selectivity estimation
703  *
704  * We simply add up the fractions of the populations that satisfy the clause.
705  * The result is exact and does not need to be scaled further.
706  */
707 static Selectivity
708 inet_mcv_join_sel(Datum *mcv1_values, float4 *mcv1_numbers, int mcv1_nvalues,
709  Datum *mcv2_values, float4 *mcv2_numbers, int mcv2_nvalues,
710  Oid operator)
711 {
712  Selectivity selec = 0.0;
713  FmgrInfo proc;
714  int i,
715  j;
716 
717  fmgr_info(get_opcode(operator), &proc);
718 
719  for (i = 0; i < mcv1_nvalues; i++)
720  {
721  for (j = 0; j < mcv2_nvalues; j++)
722  if (DatumGetBool(FunctionCall2(&proc,
723  mcv1_values[i],
724  mcv2_values[j])))
725  selec += mcv1_numbers[i] * mcv2_numbers[j];
726  }
727  return selec;
728 }
729 
730 /*
731  * Inet MCV vs histogram join selectivity estimation
732  *
733  * For each MCV on the lefthand side, estimate the fraction of the righthand's
734  * histogram population that satisfies the join clause, and add those up,
735  * scaling by the MCV's frequency. The result still needs to be scaled
736  * according to the fraction of the righthand's population represented by
737  * the histogram.
738  */
739 static Selectivity
740 inet_mcv_hist_sel(Datum *mcv_values, float4 *mcv_numbers, int mcv_nvalues,
741  Datum *hist_values, int hist_nvalues,
742  int opr_codenum)
743 {
744  Selectivity selec = 0.0;
745  int i;
746 
747  /*
748  * We'll call inet_hist_value_selec with the histogram on the left, so we
749  * must commute the operator.
750  */
751  opr_codenum = -opr_codenum;
752 
753  for (i = 0; i < mcv_nvalues; i++)
754  {
755  selec += mcv_numbers[i] *
756  inet_hist_value_sel(hist_values, hist_nvalues, mcv_values[i],
757  opr_codenum);
758  }
759  return selec;
760 }
761 
762 /*
763  * Inet histogram vs histogram join selectivity estimation
764  *
765  * Here, we take all values listed in the second histogram (except for the
766  * first and last elements, which are excluded on the grounds of possibly
767  * not being very representative) and treat them as a uniform sample of
768  * the non-MCV population for that relation. For each one, we apply
769  * inet_hist_value_selec to see what fraction of the first histogram
770  * it matches.
771  *
772  * We could alternatively do this the other way around using the operator's
773  * commutator. XXX would it be worthwhile to do it both ways and take the
774  * average? That would at least avoid non-commutative estimation results.
775  */
776 static Selectivity
777 inet_hist_inclusion_join_sel(Datum *hist1_values, int hist1_nvalues,
778  Datum *hist2_values, int hist2_nvalues,
779  int opr_codenum)
780 {
781  double match = 0.0;
782  int i,
783  k,
784  n;
785 
786  if (hist2_nvalues <= 2)
787  return 0.0; /* no interior histogram elements */
788 
789  /* if there are too many histogram elements, decimate to limit runtime */
790  k = (hist2_nvalues - 3) / MAX_CONSIDERED_ELEMS + 1;
791 
792  n = 0;
793  for (i = 1; i < hist2_nvalues - 1; i += k)
794  {
795  match += inet_hist_value_sel(hist1_values, hist1_nvalues,
796  hist2_values[i], opr_codenum);
797  n++;
798  }
799 
800  return match / n;
801 }
802 
803 /*
804  * Inet semi join selectivity estimation for one value
805  *
806  * The function calculates the probability that there is at least one row
807  * in the RHS table that satisfies the "lhs_value op column" condition.
808  * It is used in semi join estimation to check a sample from the left hand
809  * side table.
810  *
811  * The MCV and histogram from the right hand side table should be provided as
812  * arguments with the lhs_value from the left hand side table for the join.
813  * hist_weight is the total number of rows represented by the histogram.
814  * For example, if the table has 1000 rows, and 10% of the rows are in the MCV
815  * list, and another 10% are NULLs, hist_weight would be 800.
816  *
817  * First, the lhs_value will be matched to the most common values. If it
818  * matches any of them, 1.0 will be returned, because then there is surely
819  * a match.
820  *
821  * Otherwise, the histogram will be used to estimate the number of rows in
822  * the second table that match the condition. If the estimate is greater
823  * than 1.0, 1.0 will be returned, because it means there is a greater chance
824  * that the lhs_value will match more than one row in the table. If it is
825  * between 0.0 and 1.0, it will be returned as the probability.
826  */
827 static Selectivity
829  bool mcv_exists, Datum *mcv_values, int mcv_nvalues,
830  bool hist_exists, Datum *hist_values, int hist_nvalues,
831  double hist_weight,
832  FmgrInfo *proc, int opr_codenum)
833 {
834  if (mcv_exists)
835  {
836  int i;
837 
838  for (i = 0; i < mcv_nvalues; i++)
839  {
840  if (DatumGetBool(FunctionCall2(proc,
841  lhs_value,
842  mcv_values[i])))
843  return 1.0;
844  }
845  }
846 
847  if (hist_exists && hist_weight > 0)
848  {
849  Selectivity hist_selec;
850 
851  /* Commute operator, since we're passing lhs_value on the right */
852  hist_selec = inet_hist_value_sel(hist_values, hist_nvalues,
853  lhs_value, -opr_codenum);
854 
855  if (hist_selec > 0)
856  return Min(1.0, hist_weight * hist_selec);
857  }
858 
859  return 0.0;
860 }
861 
862 /*
863  * Assign useful code numbers for the subnet inclusion/overlap operators
864  *
865  * Only inet_masklen_inclusion_cmp() and inet_hist_match_divider() depend
866  * on the exact codes assigned here; but many other places in this file
867  * know that they can negate a code to obtain the code for the commutator
868  * operator.
869  */
870 static int
872 {
873  switch (operator)
874  {
875  case OID_INET_SUP_OP:
876  return -2;
877  case OID_INET_SUPEQ_OP:
878  return -1;
879  case OID_INET_OVERLAP_OP:
880  return 0;
881  case OID_INET_SUBEQ_OP:
882  return 1;
883  case OID_INET_SUB_OP:
884  return 2;
885  default:
886  elog(ERROR, "unrecognized operator %u for inet selectivity",
887  operator);
888  }
889  return 0; /* unreached, but keep compiler quiet */
890 }
891 
892 /*
893  * Comparison function for the subnet inclusion/overlap operators
894  *
895  * If the comparison is okay for the specified inclusion operator, the return
896  * value will be 0. Otherwise the return value will be less than or greater
897  * than 0 as appropriate for the operator.
898  *
899  * Comparison is compatible with the basic comparison function for the inet
900  * type. See network_cmp_internal() in network.c for the original. Basic
901  * comparison operators are implemented with the network_cmp_internal()
902  * function. It is possible to implement the subnet inclusion operators with
903  * this function.
904  *
905  * Comparison is first on the common bits of the network part, then on the
906  * length of the network part (masklen) as in the network_cmp_internal()
907  * function. Only the first part is in this function. The second part is
908  * separated to another function for reusability. The difference between the
909  * second part and the original network_cmp_internal() is that the inclusion
910  * operator is considered while comparing the lengths of the network parts.
911  * See the inet_masklen_inclusion_cmp() function below.
912  */
913 static int
914 inet_inclusion_cmp(inet *left, inet *right, int opr_codenum)
915 {
916  if (ip_family(left) == ip_family(right))
917  {
918  int order;
919 
920  order = bitncmp(ip_addr(left), ip_addr(right),
921  Min(ip_bits(left), ip_bits(right)));
922  if (order != 0)
923  return order;
924 
925  return inet_masklen_inclusion_cmp(left, right, opr_codenum);
926  }
927 
928  return ip_family(left) - ip_family(right);
929 }
930 
931 /*
932  * Masklen comparison function for the subnet inclusion/overlap operators
933  *
934  * Compares the lengths of the network parts of the inputs. If the comparison
935  * is okay for the specified inclusion operator, the return value will be 0.
936  * Otherwise the return value will be less than or greater than 0 as
937  * appropriate for the operator.
938  */
939 static int
940 inet_masklen_inclusion_cmp(inet *left, inet *right, int opr_codenum)
941 {
942  int order;
943 
944  order = (int) ip_bits(left) - (int) ip_bits(right);
945 
946  /*
947  * Return 0 if the operator would accept this combination of masklens.
948  * Note that opr_codenum zero (overlaps) will accept all cases.
949  */
950  if ((order > 0 && opr_codenum >= 0) ||
951  (order == 0 && opr_codenum >= -1 && opr_codenum <= 1) ||
952  (order < 0 && opr_codenum <= 0))
953  return 0;
954 
955  /*
956  * Otherwise, return a negative value for sup/supeq (notionally, the RHS
957  * needs to have a larger masklen than it has, which would make it sort
958  * later), or a positive value for sub/subeq (vice versa).
959  */
960  return opr_codenum;
961 }
962 
963 /*
964  * Inet histogram partial match divider calculation
965  *
966  * First the families and the lengths of the network parts are compared using
967  * the subnet inclusion operator. If those are acceptable for the operator,
968  * the divider will be calculated using the masklens and the common bits of
969  * the addresses. -1 will be returned if it cannot be calculated.
970  *
971  * See commentary for inet_hist_value_sel() for some rationale for this.
972  */
973 static int
974 inet_hist_match_divider(inet *boundary, inet *query, int opr_codenum)
975 {
976  if (ip_family(boundary) == ip_family(query) &&
977  inet_masklen_inclusion_cmp(boundary, query, opr_codenum) == 0)
978  {
979  int min_bits,
980  decisive_bits;
981 
982  min_bits = Min(ip_bits(boundary), ip_bits(query));
983 
984  /*
985  * Set decisive_bits to the masklen of the one that should contain the
986  * other according to the operator.
987  */
988  if (opr_codenum < 0)
989  decisive_bits = ip_bits(boundary);
990  else if (opr_codenum > 0)
991  decisive_bits = ip_bits(query);
992  else
993  decisive_bits = min_bits;
994 
995  /*
996  * Now return the number of non-common decisive bits. (This will be
997  * zero if the boundary and query in fact match, else positive.)
998  */
999  if (min_bits > 0)
1000  return decisive_bits - bitncommon(ip_addr(boundary),
1001  ip_addr(query),
1002  min_bits);
1003  return decisive_bits;
1004  }
1005 
1006  return -1;
1007 }
#define PG_GETARG_INT32(n)
Definition: fmgr.h:234
Definition: fmgr.h:56
#define IsA(nodeptr, _type_)
Definition: nodes.h:560
#define OID_INET_SUBEQ_OP
Definition: pg_operator.h:1182
Oid get_commutator(Oid opno)
Definition: lsyscache.c:1313
#define GETSTRUCT(TUP)
Definition: htup_details.h:656
#define ip_bits(inetptr)
Definition: inet.h:74
#define ip_family(inetptr)
Definition: inet.h:71
static int inet_inclusion_cmp(inet *left, inet *right, int opr_codenum)
HeapTuple statsTuple
Definition: selfuncs.h:71
#define STATISTIC_KIND_HISTOGRAM
Definition: pg_statistic.h:222
static Selectivity inet_mcv_join_sel(Datum *mcv1_values, float4 *mcv1_numbers, int mcv1_nvalues, Datum *mcv2_values, float4 *mcv2_numbers, int mcv2_nvalues, Oid operator)
#define DatumGetInetPP(X)
Definition: inet.h:122
bool get_restriction_variable(PlannerInfo *root, List *args, int varRelid, VariableStatData *vardata, Node **other, bool *varonleft)
Definition: selfuncs.c:4446
#define ip_addr(inetptr)
Definition: inet.h:77
RelOptInfo * rel
Definition: selfuncs.h:70
#define PG_RETURN_FLOAT8(x)
Definition: fmgr.h:326
#define Min(x, y)
Definition: c.h:806
#define FunctionCall2(flinfo, arg1, arg2)
Definition: fmgr.h:604
int bitncommon(const unsigned char *l, const unsigned char *r, int n)
Definition: network.c:1004
Definition: nodes.h:509
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:241
double Selectivity
Definition: nodes.h:638
bool get_attstatsslot(HeapTuple statstuple, Oid atttype, int32 atttypmod, int reqkind, Oid reqop, Oid *actualop, Datum **values, int *nvalues, float4 **numbers, int *nnumbers)
Definition: lsyscache.c:2886
unsigned int Oid
Definition: postgres_ext.h:31
FormData_pg_statistic * Form_pg_statistic
Definition: pg_statistic.h:129
static Selectivity inet_mcv_hist_sel(Datum *mcv_values, float4 *mcv_numbers, int mcv_nvalues, Datum *hist_values, int hist_nvalues, int opr_codenum)
int32 atttypmod
Definition: selfuncs.h:76
JoinType
Definition: nodes.h:672
static Selectivity inet_hist_value_sel(Datum *values, int nvalues, Datum constvalue, int opr_codenum)
#define CLAMP_PROBABILITY(p)
Definition: selfuncs.h:57
Datum networkjoinsel(PG_FUNCTION_ARGS)
#define ERROR
Definition: elog.h:43
double float8
Definition: c.h:381
void get_join_variables(PlannerInfo *root, List *args, SpecialJoinInfo *sjinfo, VariableStatData *vardata1, VariableStatData *vardata2, bool *join_is_reversed)
Definition: selfuncs.c:4506
static Selectivity inet_semi_join_sel(Datum lhs_value, bool mcv_exists, Datum *mcv_values, int mcv_nvalues, bool hist_exists, Datum *hist_values, int hist_nvalues, double hist_weight, FmgrInfo *proc, int opr_codenum)
void fmgr_info(Oid functionId, FmgrInfo *finfo)
Definition: fmgr.c:127
static int inet_hist_match_divider(inet *boundary, inet *query, int opr_codenum)
#define PG_GETARG_OID(n)
Definition: fmgr.h:240
#define DatumGetBool(X)
Definition: postgres.h:399
Definition: inet.h:52
static Selectivity networkjoinsel_inner(Oid operator, VariableStatData *vardata1, VariableStatData *vardata2)
#define STATISTIC_KIND_MCV
Definition: pg_statistic.h:204
static int inet_opr_codenum(Oid operator)
float float4
Definition: c.h:380
uintptr_t Datum
Definition: postgres.h:372
#define PG_GETARG_INT16(n)
Definition: fmgr.h:236
double mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc, Datum constval, bool varonleft, double *sumcommonp)
Definition: selfuncs.c:612
static Selectivity mcv_population(float4 *mcv_numbers, int mcv_nvalues)
double rows
Definition: relation.h:527
#define InvalidOid
Definition: postgres_ext.h:36
RegProcedure get_opcode(Oid opno)
Definition: lsyscache.c:1094
#define OID_INET_SUP_OP
Definition: pg_operator.h:1185
#define Max(x, y)
Definition: c.h:800
#define HeapTupleIsValid(tuple)
Definition: htup.h:77
#define NULL
Definition: c.h:229
static int inet_masklen_inclusion_cmp(inet *left, inet *right, int opr_codenum)
#define OID_INET_OVERLAP_OP
Definition: pg_operator.h:1191
JoinType jointype
Definition: relation.h:1920
#define DEFAULT_SEL(operator)
static Selectivity inet_hist_inclusion_join_sel(Datum *hist1_values, int hist1_nvalues, Datum *hist2_values, int hist2_nvalues, int opr_codenum)
static Datum values[MAXATTR]
Definition: bootstrap.c:163
#define ReleaseVariableStats(vardata)
Definition: selfuncs.h:80
#define OID_INET_SUB_OP
Definition: pg_operator.h:1179
int bitncmp(const unsigned char *l, const unsigned char *r, int n)
Definition: network.c:970
int i
static Selectivity networkjoinsel_semi(Oid operator, VariableStatData *vardata1, VariableStatData *vardata2)
#define PG_FUNCTION_ARGS
Definition: fmgr.h:158
#define elog
Definition: elog.h:219
Definition: pg_list.h:45
#define OID_INET_SUPEQ_OP
Definition: pg_operator.h:1188
Datum networksel(PG_FUNCTION_ARGS)
void free_attstatsslot(Oid atttype, Datum *values, int nvalues, float4 *numbers, int nnumbers)
Definition: lsyscache.c:3010
#define MAX_CONSIDERED_ELEMS