PostgreSQL Source Code  git master
network_selfuncs.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * network_selfuncs.c
4  * Functions for selectivity estimation of inet/cidr operators
5  *
6  * This module provides estimators for the subnet inclusion and overlap
7  * operators. Estimates are based on null fraction, most common values,
8  * and histogram of inet/cidr columns.
9  *
10  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
11  * Portions Copyright (c) 1994, Regents of the University of California
12  *
13  *
14  * IDENTIFICATION
15  * src/backend/utils/adt/network_selfuncs.c
16  *
17  *-------------------------------------------------------------------------
18  */
19 #include "postgres.h"
20 
21 #include <math.h>
22 
23 #include "access/htup_details.h"
24 #include "catalog/pg_operator.h"
25 #include "catalog/pg_statistic.h"
26 #include "utils/fmgrprotos.h"
27 #include "utils/inet.h"
28 #include "utils/lsyscache.h"
29 #include "utils/selfuncs.h"
30 
31 
32 /* Default selectivity for the inet overlap operator */
33 #define DEFAULT_OVERLAP_SEL 0.01
34 
35 /* Default selectivity for the various inclusion operators */
36 #define DEFAULT_INCLUSION_SEL 0.005
37 
38 /* Default selectivity for specified operator */
39 #define DEFAULT_SEL(operator) \
40  ((operator) == OID_INET_OVERLAP_OP ? \
41  DEFAULT_OVERLAP_SEL : DEFAULT_INCLUSION_SEL)
42 
43 /* Maximum number of items to consider in join selectivity calculations */
44 #define MAX_CONSIDERED_ELEMS 1024
45 
46 static Selectivity networkjoinsel_inner(Oid operator,
47  VariableStatData *vardata1, VariableStatData *vardata2);
48 static Selectivity networkjoinsel_semi(Oid operator,
49  VariableStatData *vardata1, VariableStatData *vardata2);
50 static Selectivity mcv_population(float4 *mcv_numbers, int mcv_nvalues);
51 static Selectivity inet_hist_value_sel(Datum *values, int nvalues,
52  Datum constvalue, int opr_codenum);
53 static Selectivity inet_mcv_join_sel(Datum *mcv1_values,
54  float4 *mcv1_numbers, int mcv1_nvalues, Datum *mcv2_values,
55  float4 *mcv2_numbers, int mcv2_nvalues, Oid operator);
56 static Selectivity inet_mcv_hist_sel(Datum *mcv_values, float4 *mcv_numbers,
57  int mcv_nvalues, Datum *hist_values, int hist_nvalues,
58  int opr_codenum);
60  int hist1_nvalues,
61  Datum *hist2_values, int hist2_nvalues,
62  int opr_codenum);
63 static Selectivity inet_semi_join_sel(Datum lhs_value,
64  bool mcv_exists, Datum *mcv_values, int mcv_nvalues,
65  bool hist_exists, Datum *hist_values, int hist_nvalues,
66  double hist_weight,
67  FmgrInfo *proc, int opr_codenum);
68 static int inet_opr_codenum(Oid operator);
69 static int inet_inclusion_cmp(inet *left, inet *right, int opr_codenum);
70 static int inet_masklen_inclusion_cmp(inet *left, inet *right,
71  int opr_codenum);
72 static int inet_hist_match_divider(inet *boundary, inet *query,
73  int opr_codenum);
74 
75 /*
76  * Selectivity estimation for the subnet inclusion/overlap operators
77  */
78 Datum
80 {
82  Oid operator = PG_GETARG_OID(1);
83  List *args = (List *) PG_GETARG_POINTER(2);
84  int varRelid = PG_GETARG_INT32(3);
85  VariableStatData vardata;
86  Node *other;
87  bool varonleft;
88  Selectivity selec,
89  mcv_selec,
90  non_mcv_selec;
91  Datum constvalue;
92  Form_pg_statistic stats;
93  AttStatsSlot hslot;
94  double sumcommon,
95  nullfrac;
96  FmgrInfo proc;
97 
98  /*
99  * If expression is not (variable op something) or (something op
100  * variable), then punt and return a default estimate.
101  */
102  if (!get_restriction_variable(root, args, varRelid,
103  &vardata, &other, &varonleft))
104  PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
105 
106  /*
107  * Can't do anything useful if the something is not a constant, either.
108  */
109  if (!IsA(other, Const))
110  {
111  ReleaseVariableStats(vardata);
112  PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
113  }
114 
115  /* All of the operators handled here are strict. */
116  if (((Const *) other)->constisnull)
117  {
118  ReleaseVariableStats(vardata);
119  PG_RETURN_FLOAT8(0.0);
120  }
121  constvalue = ((Const *) other)->constvalue;
122 
123  /* Otherwise, we need stats in order to produce a non-default estimate. */
124  if (!HeapTupleIsValid(vardata.statsTuple))
125  {
126  ReleaseVariableStats(vardata);
127  PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
128  }
129 
130  stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
131  nullfrac = stats->stanullfrac;
132 
133  /*
134  * If we have most-common-values info, add up the fractions of the MCV
135  * entries that satisfy MCV OP CONST. These fractions contribute directly
136  * to the result selectivity. Also add up the total fraction represented
137  * by MCV entries.
138  */
139  fmgr_info(get_opcode(operator), &proc);
140  mcv_selec = mcv_selectivity(&vardata, &proc, InvalidOid,
141  constvalue, varonleft,
142  &sumcommon);
143 
144  /*
145  * If we have a histogram, use it to estimate the proportion of the
146  * non-MCV population that satisfies the clause. If we don't, apply the
147  * default selectivity to that population.
148  */
149  if (get_attstatsslot(&hslot, vardata.statsTuple,
150  STATISTIC_KIND_HISTOGRAM, InvalidOid,
152  {
153  int opr_codenum = inet_opr_codenum(operator);
154 
155  /* Commute if needed, so we can consider histogram to be on the left */
156  if (!varonleft)
157  opr_codenum = -opr_codenum;
158  non_mcv_selec = inet_hist_value_sel(hslot.values, hslot.nvalues,
159  constvalue, opr_codenum);
160 
161  free_attstatsslot(&hslot);
162  }
163  else
164  non_mcv_selec = DEFAULT_SEL(operator);
165 
166  /* Combine selectivities for MCV and non-MCV populations */
167  selec = mcv_selec + (1.0 - nullfrac - sumcommon) * non_mcv_selec;
168 
169  /* Result should be in range, but make sure... */
170  CLAMP_PROBABILITY(selec);
171 
172  ReleaseVariableStats(vardata);
173 
174  PG_RETURN_FLOAT8(selec);
175 }
176 
177 /*
178  * Join selectivity estimation for the subnet inclusion/overlap operators
179  *
180  * This function has the same structure as eqjoinsel() in selfuncs.c.
181  *
182  * Throughout networkjoinsel and its subroutines, we have a performance issue
183  * in that the amount of work to be done is O(N^2) in the length of the MCV
184  * and histogram arrays. To keep the runtime from getting out of hand when
185  * large statistics targets have been set, we arbitrarily limit the number of
186  * values considered to 1024 (MAX_CONSIDERED_ELEMS). For the MCV arrays, this
187  * is easy: just consider at most the first N elements. (Since the MCVs are
188  * sorted by decreasing frequency, this correctly gets us the first N MCVs.)
189  * For the histogram arrays, we decimate; that is consider only every k'th
190  * element, where k is chosen so that no more than MAX_CONSIDERED_ELEMS
191  * elements are considered. This should still give us a good random sample of
192  * the non-MCV population. Decimation is done on-the-fly in the loops that
193  * iterate over the histogram arrays.
194  */
195 Datum
197 {
199  Oid operator = PG_GETARG_OID(1);
200  List *args = (List *) PG_GETARG_POINTER(2);
201 #ifdef NOT_USED
202  JoinType jointype = (JoinType) PG_GETARG_INT16(3);
203 #endif
205  double selec;
206  VariableStatData vardata1;
207  VariableStatData vardata2;
208  bool join_is_reversed;
209 
210  get_join_variables(root, args, sjinfo,
211  &vardata1, &vardata2, &join_is_reversed);
212 
213  switch (sjinfo->jointype)
214  {
215  case JOIN_INNER:
216  case JOIN_LEFT:
217  case JOIN_FULL:
218 
219  /*
220  * Selectivity for left/full join is not exactly the same as inner
221  * join, but we neglect the difference, as eqjoinsel does.
222  */
223  selec = networkjoinsel_inner(operator, &vardata1, &vardata2);
224  break;
225  case JOIN_SEMI:
226  case JOIN_ANTI:
227  /* Here, it's important that we pass the outer var on the left. */
228  if (!join_is_reversed)
229  selec = networkjoinsel_semi(operator, &vardata1, &vardata2);
230  else
231  selec = networkjoinsel_semi(get_commutator(operator),
232  &vardata2, &vardata1);
233  break;
234  default:
235  /* other values not expected here */
236  elog(ERROR, "unrecognized join type: %d",
237  (int) sjinfo->jointype);
238  selec = 0; /* keep compiler quiet */
239  break;
240  }
241 
242  ReleaseVariableStats(vardata1);
243  ReleaseVariableStats(vardata2);
244 
245  CLAMP_PROBABILITY(selec);
246 
247  PG_RETURN_FLOAT8((float8) selec);
248 }
249 
250 /*
251  * Inner join selectivity estimation for subnet inclusion/overlap operators
252  *
253  * Calculates MCV vs MCV, MCV vs histogram and histogram vs histogram
254  * selectivity for join using the subnet inclusion operators. Unlike the
255  * join selectivity function for the equality operator, eqjoinsel_inner(),
256  * one to one matching of the values is not enough. Network inclusion
257  * operators are likely to match many to many, so we must check all pairs.
258  * (Note: it might be possible to exploit understanding of the histogram's
259  * btree ordering to reduce the work needed, but we don't currently try.)
260  * Also, MCV vs histogram selectivity is not neglected as in eqjoinsel_inner().
261  */
262 static Selectivity
264  VariableStatData *vardata1, VariableStatData *vardata2)
265 {
266  Form_pg_statistic stats;
267  double nullfrac1 = 0.0,
268  nullfrac2 = 0.0;
269  Selectivity selec = 0.0,
270  sumcommon1 = 0.0,
271  sumcommon2 = 0.0;
272  bool mcv1_exists = false,
273  mcv2_exists = false,
274  hist1_exists = false,
275  hist2_exists = false;
276  int opr_codenum;
277  int mcv1_length = 0,
278  mcv2_length = 0;
279  AttStatsSlot mcv1_slot;
280  AttStatsSlot mcv2_slot;
281  AttStatsSlot hist1_slot;
282  AttStatsSlot hist2_slot;
283 
284  if (HeapTupleIsValid(vardata1->statsTuple))
285  {
286  stats = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
287  nullfrac1 = stats->stanullfrac;
288 
289  mcv1_exists = get_attstatsslot(&mcv1_slot, vardata1->statsTuple,
290  STATISTIC_KIND_MCV, InvalidOid,
292  hist1_exists = get_attstatsslot(&hist1_slot, vardata1->statsTuple,
293  STATISTIC_KIND_HISTOGRAM, InvalidOid,
295  /* Arbitrarily limit number of MCVs considered */
296  mcv1_length = Min(mcv1_slot.nvalues, MAX_CONSIDERED_ELEMS);
297  if (mcv1_exists)
298  sumcommon1 = mcv_population(mcv1_slot.numbers, mcv1_length);
299  }
300  else
301  {
302  memset(&mcv1_slot, 0, sizeof(mcv1_slot));
303  memset(&hist1_slot, 0, sizeof(hist1_slot));
304  }
305 
306  if (HeapTupleIsValid(vardata2->statsTuple))
307  {
308  stats = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple);
309  nullfrac2 = stats->stanullfrac;
310 
311  mcv2_exists = get_attstatsslot(&mcv2_slot, vardata2->statsTuple,
312  STATISTIC_KIND_MCV, InvalidOid,
314  hist2_exists = get_attstatsslot(&hist2_slot, vardata2->statsTuple,
315  STATISTIC_KIND_HISTOGRAM, InvalidOid,
317  /* Arbitrarily limit number of MCVs considered */
318  mcv2_length = Min(mcv2_slot.nvalues, MAX_CONSIDERED_ELEMS);
319  if (mcv2_exists)
320  sumcommon2 = mcv_population(mcv2_slot.numbers, mcv2_length);
321  }
322  else
323  {
324  memset(&mcv2_slot, 0, sizeof(mcv2_slot));
325  memset(&hist2_slot, 0, sizeof(hist2_slot));
326  }
327 
328  opr_codenum = inet_opr_codenum(operator);
329 
330  /*
331  * Calculate selectivity for MCV vs MCV matches.
332  */
333  if (mcv1_exists && mcv2_exists)
334  selec += inet_mcv_join_sel(mcv1_slot.values, mcv1_slot.numbers,
335  mcv1_length,
336  mcv2_slot.values, mcv2_slot.numbers,
337  mcv2_length,
338  operator);
339 
340  /*
341  * Add in selectivities for MCV vs histogram matches, scaling according to
342  * the fractions of the populations represented by the histograms. Note
343  * that the second case needs to commute the operator.
344  */
345  if (mcv1_exists && hist2_exists)
346  selec += (1.0 - nullfrac2 - sumcommon2) *
347  inet_mcv_hist_sel(mcv1_slot.values, mcv1_slot.numbers, mcv1_length,
348  hist2_slot.values, hist2_slot.nvalues,
349  opr_codenum);
350  if (mcv2_exists && hist1_exists)
351  selec += (1.0 - nullfrac1 - sumcommon1) *
352  inet_mcv_hist_sel(mcv2_slot.values, mcv2_slot.numbers, mcv2_length,
353  hist1_slot.values, hist1_slot.nvalues,
354  -opr_codenum);
355 
356  /*
357  * Add in selectivity for histogram vs histogram matches, again scaling
358  * appropriately.
359  */
360  if (hist1_exists && hist2_exists)
361  selec += (1.0 - nullfrac1 - sumcommon1) *
362  (1.0 - nullfrac2 - sumcommon2) *
363  inet_hist_inclusion_join_sel(hist1_slot.values, hist1_slot.nvalues,
364  hist2_slot.values, hist2_slot.nvalues,
365  opr_codenum);
366 
367  /*
368  * If useful statistics are not available then use the default estimate.
369  * We can apply null fractions if known, though.
370  */
371  if ((!mcv1_exists && !hist1_exists) || (!mcv2_exists && !hist2_exists))
372  selec = (1.0 - nullfrac1) * (1.0 - nullfrac2) * DEFAULT_SEL(operator);
373 
374  /* Release stats. */
375  free_attstatsslot(&mcv1_slot);
376  free_attstatsslot(&mcv2_slot);
377  free_attstatsslot(&hist1_slot);
378  free_attstatsslot(&hist2_slot);
379 
380  return selec;
381 }
382 
383 /*
384  * Semi join selectivity estimation for subnet inclusion/overlap operators
385  *
386  * Calculates MCV vs MCV, MCV vs histogram, histogram vs MCV, and histogram vs
387  * histogram selectivity for semi/anti join cases.
388  */
389 static Selectivity
391  VariableStatData *vardata1, VariableStatData *vardata2)
392 {
393  Form_pg_statistic stats;
394  Selectivity selec = 0.0,
395  sumcommon1 = 0.0,
396  sumcommon2 = 0.0;
397  double nullfrac1 = 0.0,
398  nullfrac2 = 0.0,
399  hist2_weight = 0.0;
400  bool mcv1_exists = false,
401  mcv2_exists = false,
402  hist1_exists = false,
403  hist2_exists = false;
404  int opr_codenum;
405  FmgrInfo proc;
406  int i,
407  mcv1_length = 0,
408  mcv2_length = 0;
409  AttStatsSlot mcv1_slot;
410  AttStatsSlot mcv2_slot;
411  AttStatsSlot hist1_slot;
412  AttStatsSlot hist2_slot;
413 
414  if (HeapTupleIsValid(vardata1->statsTuple))
415  {
416  stats = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
417  nullfrac1 = stats->stanullfrac;
418 
419  mcv1_exists = get_attstatsslot(&mcv1_slot, vardata1->statsTuple,
420  STATISTIC_KIND_MCV, InvalidOid,
422  hist1_exists = get_attstatsslot(&hist1_slot, vardata1->statsTuple,
423  STATISTIC_KIND_HISTOGRAM, InvalidOid,
425  /* Arbitrarily limit number of MCVs considered */
426  mcv1_length = Min(mcv1_slot.nvalues, MAX_CONSIDERED_ELEMS);
427  if (mcv1_exists)
428  sumcommon1 = mcv_population(mcv1_slot.numbers, mcv1_length);
429  }
430  else
431  {
432  memset(&mcv1_slot, 0, sizeof(mcv1_slot));
433  memset(&hist1_slot, 0, sizeof(hist1_slot));
434  }
435 
436  if (HeapTupleIsValid(vardata2->statsTuple))
437  {
438  stats = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple);
439  nullfrac2 = stats->stanullfrac;
440 
441  mcv2_exists = get_attstatsslot(&mcv2_slot, vardata2->statsTuple,
442  STATISTIC_KIND_MCV, InvalidOid,
444  hist2_exists = get_attstatsslot(&hist2_slot, vardata2->statsTuple,
445  STATISTIC_KIND_HISTOGRAM, InvalidOid,
447  /* Arbitrarily limit number of MCVs considered */
448  mcv2_length = Min(mcv2_slot.nvalues, MAX_CONSIDERED_ELEMS);
449  if (mcv2_exists)
450  sumcommon2 = mcv_population(mcv2_slot.numbers, mcv2_length);
451  }
452  else
453  {
454  memset(&mcv2_slot, 0, sizeof(mcv2_slot));
455  memset(&hist2_slot, 0, sizeof(hist2_slot));
456  }
457 
458  opr_codenum = inet_opr_codenum(operator);
459  fmgr_info(get_opcode(operator), &proc);
460 
461  /* Estimate number of input rows represented by RHS histogram. */
462  if (hist2_exists && vardata2->rel)
463  hist2_weight = (1.0 - nullfrac2 - sumcommon2) * vardata2->rel->rows;
464 
465  /*
466  * Consider each element of the LHS MCV list, matching it to whatever RHS
467  * stats we have. Scale according to the known frequency of the MCV.
468  */
469  if (mcv1_exists && (mcv2_exists || hist2_exists))
470  {
471  for (i = 0; i < mcv1_length; i++)
472  {
473  selec += mcv1_slot.numbers[i] *
474  inet_semi_join_sel(mcv1_slot.values[i],
475  mcv2_exists, mcv2_slot.values, mcv2_length,
476  hist2_exists,
477  hist2_slot.values, hist2_slot.nvalues,
478  hist2_weight,
479  &proc, opr_codenum);
480  }
481  }
482 
483  /*
484  * Consider each element of the LHS histogram, except for the first and
485  * last elements, which we exclude on the grounds that they're outliers
486  * and thus not very representative. Scale on the assumption that each
487  * such histogram element represents an equal share of the LHS histogram
488  * population (which is a bit bogus, because the members of its bucket may
489  * not all act the same with respect to the join clause, but it's hard to
490  * do better).
491  *
492  * If there are too many histogram elements, decimate to limit runtime.
493  */
494  if (hist1_exists && hist1_slot.nvalues > 2 && (mcv2_exists || hist2_exists))
495  {
496  double hist_selec_sum = 0.0;
497  int k,
498  n;
499 
500  k = (hist1_slot.nvalues - 3) / MAX_CONSIDERED_ELEMS + 1;
501 
502  n = 0;
503  for (i = 1; i < hist1_slot.nvalues - 1; i += k)
504  {
505  hist_selec_sum +=
506  inet_semi_join_sel(hist1_slot.values[i],
507  mcv2_exists, mcv2_slot.values, mcv2_length,
508  hist2_exists,
509  hist2_slot.values, hist2_slot.nvalues,
510  hist2_weight,
511  &proc, opr_codenum);
512  n++;
513  }
514 
515  selec += (1.0 - nullfrac1 - sumcommon1) * hist_selec_sum / n;
516  }
517 
518  /*
519  * If useful statistics are not available then use the default estimate.
520  * We can apply null fractions if known, though.
521  */
522  if ((!mcv1_exists && !hist1_exists) || (!mcv2_exists && !hist2_exists))
523  selec = (1.0 - nullfrac1) * (1.0 - nullfrac2) * DEFAULT_SEL(operator);
524 
525  /* Release stats. */
526  free_attstatsslot(&mcv1_slot);
527  free_attstatsslot(&mcv2_slot);
528  free_attstatsslot(&hist1_slot);
529  free_attstatsslot(&hist2_slot);
530 
531  return selec;
532 }
533 
534 /*
535  * Compute the fraction of a relation's population that is represented
536  * by the MCV list.
537  */
538 static Selectivity
539 mcv_population(float4 *mcv_numbers, int mcv_nvalues)
540 {
541  Selectivity sumcommon = 0.0;
542  int i;
543 
544  for (i = 0; i < mcv_nvalues; i++)
545  {
546  sumcommon += mcv_numbers[i];
547  }
548 
549  return sumcommon;
550 }
551 
552 /*
553  * Inet histogram vs single value selectivity estimation
554  *
555  * Estimate the fraction of the histogram population that satisfies
556  * "value OPR CONST". (The result needs to be scaled to reflect the
557  * proportion of the total population represented by the histogram.)
558  *
559  * The histogram is originally for the inet btree comparison operators.
560  * Only the common bits of the network part and the length of the network part
561  * (masklen) are interesting for the subnet inclusion operators. Fortunately,
562  * btree comparison treats the network part as the major sort key. Even so,
563  * the length of the network part would not really be significant in the
564  * histogram. This would lead to big mistakes for data sets with uneven
565  * masklen distribution. To reduce this problem, comparisons with the left
566  * and the right sides of the buckets are used together.
567  *
568  * Histogram bucket matches are calculated in two forms. If the constant
569  * matches both bucket endpoints the bucket is considered as fully matched.
570  * The second form is to match the bucket partially; we recognize this when
571  * the constant matches just one endpoint, or the two endpoints fall on
572  * opposite sides of the constant. (Note that when the constant matches an
573  * interior histogram element, it gets credit for partial matches to the
574  * buckets on both sides, while a match to a histogram endpoint gets credit
575  * for only one partial match. This is desirable.)
576  *
577  * The divider in the partial bucket match is imagined as the distance
578  * between the decisive bits and the common bits of the addresses. It will
579  * be used as a power of two as it is the natural scale for the IP network
580  * inclusion. This partial bucket match divider calculation is an empirical
581  * formula and subject to change with more experiment.
582  *
583  * For a partial match, we try to calculate dividers for both of the
584  * boundaries. If the address family of a boundary value does not match the
585  * constant or comparison of the length of the network parts is not correct
586  * for the operator, the divider for that boundary will not be taken into
587  * account. If both of the dividers are valid, the greater one will be used
588  * to minimize the mistake in buckets that have disparate masklens. This
589  * calculation is unfair when dividers can be calculated for both of the
590  * boundaries but they are far from each other; but it is not a common
591  * situation as the boundaries are expected to share most of their significant
592  * bits of their masklens. The mistake would be greater, if we would use the
593  * minimum instead of the maximum, and we don't know a sensible way to combine
594  * them.
595  *
596  * For partial match in buckets that have different address families on the
597  * left and right sides, only the boundary with the same address family is
598  * taken into consideration. This can cause more mistakes for these buckets
599  * if the masklens of their boundaries are also disparate. But this can only
600  * happen in one bucket, since only two address families exist. It seems a
601  * better option than not considering these buckets at all.
602  */
603 static Selectivity
604 inet_hist_value_sel(Datum *values, int nvalues, Datum constvalue,
605  int opr_codenum)
606 {
607  Selectivity match = 0.0;
608  inet *query,
609  *left,
610  *right;
611  int i,
612  k,
613  n;
614  int left_order,
615  right_order,
616  left_divider,
617  right_divider;
618 
619  /* guard against zero-divide below */
620  if (nvalues <= 1)
621  return 0.0;
622 
623  /* if there are too many histogram elements, decimate to limit runtime */
624  k = (nvalues - 2) / MAX_CONSIDERED_ELEMS + 1;
625 
626  query = DatumGetInetPP(constvalue);
627 
628  /* "left" is the left boundary value of the current bucket ... */
629  left = DatumGetInetPP(values[0]);
630  left_order = inet_inclusion_cmp(left, query, opr_codenum);
631 
632  n = 0;
633  for (i = k; i < nvalues; i += k)
634  {
635  /* ... and "right" is the right boundary value */
636  right = DatumGetInetPP(values[i]);
637  right_order = inet_inclusion_cmp(right, query, opr_codenum);
638 
639  if (left_order == 0 && right_order == 0)
640  {
641  /* The whole bucket matches, since both endpoints do. */
642  match += 1.0;
643  }
644  else if ((left_order <= 0 && right_order >= 0) ||
645  (left_order >= 0 && right_order <= 0))
646  {
647  /* Partial bucket match. */
648  left_divider = inet_hist_match_divider(left, query, opr_codenum);
649  right_divider = inet_hist_match_divider(right, query, opr_codenum);
650 
651  if (left_divider >= 0 || right_divider >= 0)
652  match += 1.0 / pow(2.0, Max(left_divider, right_divider));
653  }
654 
655  /* Shift the variables. */
656  left = right;
657  left_order = right_order;
658 
659  /* Count the number of buckets considered. */
660  n++;
661  }
662 
663  return match / n;
664 }
665 
666 /*
667  * Inet MCV vs MCV join selectivity estimation
668  *
669  * We simply add up the fractions of the populations that satisfy the clause.
670  * The result is exact and does not need to be scaled further.
671  */
672 static Selectivity
673 inet_mcv_join_sel(Datum *mcv1_values, float4 *mcv1_numbers, int mcv1_nvalues,
674  Datum *mcv2_values, float4 *mcv2_numbers, int mcv2_nvalues,
675  Oid operator)
676 {
677  Selectivity selec = 0.0;
678  FmgrInfo proc;
679  int i,
680  j;
681 
682  fmgr_info(get_opcode(operator), &proc);
683 
684  for (i = 0; i < mcv1_nvalues; i++)
685  {
686  for (j = 0; j < mcv2_nvalues; j++)
687  if (DatumGetBool(FunctionCall2(&proc,
688  mcv1_values[i],
689  mcv2_values[j])))
690  selec += mcv1_numbers[i] * mcv2_numbers[j];
691  }
692  return selec;
693 }
694 
695 /*
696  * Inet MCV vs histogram join selectivity estimation
697  *
698  * For each MCV on the lefthand side, estimate the fraction of the righthand's
699  * histogram population that satisfies the join clause, and add those up,
700  * scaling by the MCV's frequency. The result still needs to be scaled
701  * according to the fraction of the righthand's population represented by
702  * the histogram.
703  */
704 static Selectivity
705 inet_mcv_hist_sel(Datum *mcv_values, float4 *mcv_numbers, int mcv_nvalues,
706  Datum *hist_values, int hist_nvalues,
707  int opr_codenum)
708 {
709  Selectivity selec = 0.0;
710  int i;
711 
712  /*
713  * We'll call inet_hist_value_selec with the histogram on the left, so we
714  * must commute the operator.
715  */
716  opr_codenum = -opr_codenum;
717 
718  for (i = 0; i < mcv_nvalues; i++)
719  {
720  selec += mcv_numbers[i] *
721  inet_hist_value_sel(hist_values, hist_nvalues, mcv_values[i],
722  opr_codenum);
723  }
724  return selec;
725 }
726 
727 /*
728  * Inet histogram vs histogram join selectivity estimation
729  *
730  * Here, we take all values listed in the second histogram (except for the
731  * first and last elements, which are excluded on the grounds of possibly
732  * not being very representative) and treat them as a uniform sample of
733  * the non-MCV population for that relation. For each one, we apply
734  * inet_hist_value_selec to see what fraction of the first histogram
735  * it matches.
736  *
737  * We could alternatively do this the other way around using the operator's
738  * commutator. XXX would it be worthwhile to do it both ways and take the
739  * average? That would at least avoid non-commutative estimation results.
740  */
741 static Selectivity
742 inet_hist_inclusion_join_sel(Datum *hist1_values, int hist1_nvalues,
743  Datum *hist2_values, int hist2_nvalues,
744  int opr_codenum)
745 {
746  double match = 0.0;
747  int i,
748  k,
749  n;
750 
751  if (hist2_nvalues <= 2)
752  return 0.0; /* no interior histogram elements */
753 
754  /* if there are too many histogram elements, decimate to limit runtime */
755  k = (hist2_nvalues - 3) / MAX_CONSIDERED_ELEMS + 1;
756 
757  n = 0;
758  for (i = 1; i < hist2_nvalues - 1; i += k)
759  {
760  match += inet_hist_value_sel(hist1_values, hist1_nvalues,
761  hist2_values[i], opr_codenum);
762  n++;
763  }
764 
765  return match / n;
766 }
767 
768 /*
769  * Inet semi join selectivity estimation for one value
770  *
771  * The function calculates the probability that there is at least one row
772  * in the RHS table that satisfies the "lhs_value op column" condition.
773  * It is used in semi join estimation to check a sample from the left hand
774  * side table.
775  *
776  * The MCV and histogram from the right hand side table should be provided as
777  * arguments with the lhs_value from the left hand side table for the join.
778  * hist_weight is the total number of rows represented by the histogram.
779  * For example, if the table has 1000 rows, and 10% of the rows are in the MCV
780  * list, and another 10% are NULLs, hist_weight would be 800.
781  *
782  * First, the lhs_value will be matched to the most common values. If it
783  * matches any of them, 1.0 will be returned, because then there is surely
784  * a match.
785  *
786  * Otherwise, the histogram will be used to estimate the number of rows in
787  * the second table that match the condition. If the estimate is greater
788  * than 1.0, 1.0 will be returned, because it means there is a greater chance
789  * that the lhs_value will match more than one row in the table. If it is
790  * between 0.0 and 1.0, it will be returned as the probability.
791  */
792 static Selectivity
794  bool mcv_exists, Datum *mcv_values, int mcv_nvalues,
795  bool hist_exists, Datum *hist_values, int hist_nvalues,
796  double hist_weight,
797  FmgrInfo *proc, int opr_codenum)
798 {
799  if (mcv_exists)
800  {
801  int i;
802 
803  for (i = 0; i < mcv_nvalues; i++)
804  {
805  if (DatumGetBool(FunctionCall2(proc,
806  lhs_value,
807  mcv_values[i])))
808  return 1.0;
809  }
810  }
811 
812  if (hist_exists && hist_weight > 0)
813  {
814  Selectivity hist_selec;
815 
816  /* Commute operator, since we're passing lhs_value on the right */
817  hist_selec = inet_hist_value_sel(hist_values, hist_nvalues,
818  lhs_value, -opr_codenum);
819 
820  if (hist_selec > 0)
821  return Min(1.0, hist_weight * hist_selec);
822  }
823 
824  return 0.0;
825 }
826 
827 /*
828  * Assign useful code numbers for the subnet inclusion/overlap operators
829  *
830  * Only inet_masklen_inclusion_cmp() and inet_hist_match_divider() depend
831  * on the exact codes assigned here; but many other places in this file
832  * know that they can negate a code to obtain the code for the commutator
833  * operator.
834  */
835 static int
837 {
838  switch (operator)
839  {
840  case OID_INET_SUP_OP:
841  return -2;
842  case OID_INET_SUPEQ_OP:
843  return -1;
844  case OID_INET_OVERLAP_OP:
845  return 0;
846  case OID_INET_SUBEQ_OP:
847  return 1;
848  case OID_INET_SUB_OP:
849  return 2;
850  default:
851  elog(ERROR, "unrecognized operator %u for inet selectivity",
852  operator);
853  }
854  return 0; /* unreached, but keep compiler quiet */
855 }
856 
857 /*
858  * Comparison function for the subnet inclusion/overlap operators
859  *
860  * If the comparison is okay for the specified inclusion operator, the return
861  * value will be 0. Otherwise the return value will be less than or greater
862  * than 0 as appropriate for the operator.
863  *
864  * Comparison is compatible with the basic comparison function for the inet
865  * type. See network_cmp_internal() in network.c for the original. Basic
866  * comparison operators are implemented with the network_cmp_internal()
867  * function. It is possible to implement the subnet inclusion operators with
868  * this function.
869  *
870  * Comparison is first on the common bits of the network part, then on the
871  * length of the network part (masklen) as in the network_cmp_internal()
872  * function. Only the first part is in this function. The second part is
873  * separated to another function for reusability. The difference between the
874  * second part and the original network_cmp_internal() is that the inclusion
875  * operator is considered while comparing the lengths of the network parts.
876  * See the inet_masklen_inclusion_cmp() function below.
877  */
878 static int
879 inet_inclusion_cmp(inet *left, inet *right, int opr_codenum)
880 {
881  if (ip_family(left) == ip_family(right))
882  {
883  int order;
884 
885  order = bitncmp(ip_addr(left), ip_addr(right),
886  Min(ip_bits(left), ip_bits(right)));
887  if (order != 0)
888  return order;
889 
890  return inet_masklen_inclusion_cmp(left, right, opr_codenum);
891  }
892 
893  return ip_family(left) - ip_family(right);
894 }
895 
896 /*
897  * Masklen comparison function for the subnet inclusion/overlap operators
898  *
899  * Compares the lengths of the network parts of the inputs. If the comparison
900  * is okay for the specified inclusion operator, the return value will be 0.
901  * Otherwise the return value will be less than or greater than 0 as
902  * appropriate for the operator.
903  */
904 static int
905 inet_masklen_inclusion_cmp(inet *left, inet *right, int opr_codenum)
906 {
907  int order;
908 
909  order = (int) ip_bits(left) - (int) ip_bits(right);
910 
911  /*
912  * Return 0 if the operator would accept this combination of masklens.
913  * Note that opr_codenum zero (overlaps) will accept all cases.
914  */
915  if ((order > 0 && opr_codenum >= 0) ||
916  (order == 0 && opr_codenum >= -1 && opr_codenum <= 1) ||
917  (order < 0 && opr_codenum <= 0))
918  return 0;
919 
920  /*
921  * Otherwise, return a negative value for sup/supeq (notionally, the RHS
922  * needs to have a larger masklen than it has, which would make it sort
923  * later), or a positive value for sub/subeq (vice versa).
924  */
925  return opr_codenum;
926 }
927 
928 /*
929  * Inet histogram partial match divider calculation
930  *
931  * First the families and the lengths of the network parts are compared using
932  * the subnet inclusion operator. If those are acceptable for the operator,
933  * the divider will be calculated using the masklens and the common bits of
934  * the addresses. -1 will be returned if it cannot be calculated.
935  *
936  * See commentary for inet_hist_value_sel() for some rationale for this.
937  */
938 static int
939 inet_hist_match_divider(inet *boundary, inet *query, int opr_codenum)
940 {
941  if (ip_family(boundary) == ip_family(query) &&
942  inet_masklen_inclusion_cmp(boundary, query, opr_codenum) == 0)
943  {
944  int min_bits,
945  decisive_bits;
946 
947  min_bits = Min(ip_bits(boundary), ip_bits(query));
948 
949  /*
950  * Set decisive_bits to the masklen of the one that should contain the
951  * other according to the operator.
952  */
953  if (opr_codenum < 0)
954  decisive_bits = ip_bits(boundary);
955  else if (opr_codenum > 0)
956  decisive_bits = ip_bits(query);
957  else
958  decisive_bits = min_bits;
959 
960  /*
961  * Now return the number of non-common decisive bits. (This will be
962  * zero if the boundary and query in fact match, else positive.)
963  */
964  if (min_bits > 0)
965  return decisive_bits - bitncommon(ip_addr(boundary),
966  ip_addr(query),
967  min_bits);
968  return decisive_bits;
969  }
970 
971  return -1;
972 }
static Datum values[MAXATTR]
Definition: bootstrap.c:152
#define Min(x, y)
Definition: c.h:1004
#define Max(x, y)
Definition: c.h:998
double float8
Definition: c.h:630
float float4
Definition: c.h:629
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:224
void fmgr_info(Oid functionId, FmgrInfo *finfo)
Definition: fmgr.c:127
#define PG_GETARG_OID(n)
Definition: fmgr.h:275
#define PG_RETURN_FLOAT8(x)
Definition: fmgr.h:367
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276
#define FunctionCall2(flinfo, arg1, arg2)
Definition: fmgr.h:662
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
#define PG_GETARG_INT16(n)
Definition: fmgr.h:271
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
#define GETSTRUCT(TUP)
Definition: htup_details.h:653
int j
Definition: isn.c:74
int i
Definition: isn.c:73
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:77
void free_attstatsslot(AttStatsSlot *sslot)
Definition: lsyscache.c:3344
RegProcedure get_opcode(Oid opno)
Definition: lsyscache.c:1285
bool get_attstatsslot(AttStatsSlot *sslot, HeapTuple statstuple, int reqkind, Oid reqop, int flags)
Definition: lsyscache.c:3234
Oid get_commutator(Oid opno)
Definition: lsyscache.c:1509
#define ATTSTATSSLOT_NUMBERS
Definition: lsyscache.h:43
#define ATTSTATSSLOT_VALUES
Definition: lsyscache.h:42
int bitncommon(const unsigned char *l, const unsigned char *r, int n)
Definition: network.c:1603
int bitncmp(const unsigned char *l, const unsigned char *r, int n)
Definition: network.c:1569
static int inet_inclusion_cmp(inet *left, inet *right, int opr_codenum)
static int inet_opr_codenum(Oid operator)
static Selectivity networkjoinsel_semi(Oid operator, VariableStatData *vardata1, VariableStatData *vardata2)
static Selectivity inet_semi_join_sel(Datum lhs_value, bool mcv_exists, Datum *mcv_values, int mcv_nvalues, bool hist_exists, Datum *hist_values, int hist_nvalues, double hist_weight, FmgrInfo *proc, int opr_codenum)
static int inet_masklen_inclusion_cmp(inet *left, inet *right, int opr_codenum)
Datum networksel(PG_FUNCTION_ARGS)
static Selectivity mcv_population(float4 *mcv_numbers, int mcv_nvalues)
static Selectivity inet_mcv_join_sel(Datum *mcv1_values, float4 *mcv1_numbers, int mcv1_nvalues, Datum *mcv2_values, float4 *mcv2_numbers, int mcv2_nvalues, Oid operator)
Datum networkjoinsel(PG_FUNCTION_ARGS)
static Selectivity inet_hist_value_sel(Datum *values, int nvalues, Datum constvalue, int opr_codenum)
static Selectivity inet_mcv_hist_sel(Datum *mcv_values, float4 *mcv_numbers, int mcv_nvalues, Datum *hist_values, int hist_nvalues, int opr_codenum)
#define MAX_CONSIDERED_ELEMS
static int inet_hist_match_divider(inet *boundary, inet *query, int opr_codenum)
static Selectivity networkjoinsel_inner(Oid operator, VariableStatData *vardata1, VariableStatData *vardata2)
#define DEFAULT_SEL(operator)
static Selectivity inet_hist_inclusion_join_sel(Datum *hist1_values, int hist1_nvalues, Datum *hist2_values, int hist2_nvalues, int opr_codenum)
#define IsA(nodeptr, _type_)
Definition: nodes.h:158
double Selectivity
Definition: nodes.h:250
JoinType
Definition: nodes.h:288
@ JOIN_SEMI
Definition: nodes.h:307
@ JOIN_FULL
Definition: nodes.h:295
@ JOIN_INNER
Definition: nodes.h:293
@ JOIN_LEFT
Definition: nodes.h:294
@ JOIN_ANTI
Definition: nodes.h:308
FormData_pg_statistic * Form_pg_statistic
Definition: pg_statistic.h:135
static bool DatumGetBool(Datum X)
Definition: postgres.h:90
uintptr_t Datum
Definition: postgres.h:64
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
tree ctl root
Definition: radixtree.h:1884
bool get_restriction_variable(PlannerInfo *root, List *args, int varRelid, VariableStatData *vardata, Node **other, bool *varonleft)
Definition: selfuncs.c:4883
double mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc, Oid collation, Datum constval, bool varonleft, double *sumcommonp)
Definition: selfuncs.c:732
void get_join_variables(PlannerInfo *root, List *args, SpecialJoinInfo *sjinfo, VariableStatData *vardata1, VariableStatData *vardata2, bool *join_is_reversed)
Definition: selfuncs.c:4943
#define ReleaseVariableStats(vardata)
Definition: selfuncs.h:99
#define CLAMP_PROBABILITY(p)
Definition: selfuncs.h:63
Datum * values
Definition: lsyscache.h:53
float4 * numbers
Definition: lsyscache.h:56
Definition: fmgr.h:57
Definition: pg_list.h:54
Definition: nodes.h:129
Cardinality rows
Definition: pathnodes.h:867
JoinType jointype
Definition: pathnodes.h:2886
HeapTuple statsTuple
Definition: selfuncs.h:89
RelOptInfo * rel
Definition: selfuncs.h:88
Definition: inet.h:53
#define ip_addr(inetptr)
Definition: inet.h:77
#define ip_family(inetptr)
Definition: inet.h:71
static inet * DatumGetInetPP(Datum X)
Definition: inet.h:123
#define ip_bits(inetptr)
Definition: inet.h:74