PostgreSQL Source Code  git master
like_support.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * like_support.c
4  * Planner support functions for LIKE, regex, and related operators.
5  *
6  * These routines handle special optimization of operators that can be
7  * used with index scans even though they are not known to the executor's
8  * indexscan machinery. The key idea is that these operators allow us
9  * to derive approximate indexscan qual clauses, such that any tuples
10  * that pass the operator clause itself must also satisfy the simpler
11  * indexscan condition(s). Then we can use the indexscan machinery
12  * to avoid scanning as much of the table as we'd otherwise have to,
13  * while applying the original operator as a qpqual condition to ensure
14  * we deliver only the tuples we want. (In essence, we're using a regular
15  * index as if it were a lossy index.)
16  *
17  * An example of what we're doing is
18  * textfield LIKE 'abc%def'
19  * from which we can generate the indexscanable conditions
20  * textfield >= 'abc' AND textfield < 'abd'
21  * which allow efficient scanning of an index on textfield.
22  * (In reality, character set and collation issues make the transformation
23  * from LIKE to indexscan limits rather harder than one might think ...
24  * but that's the basic idea.)
25  *
26  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
27  * Portions Copyright (c) 1994, Regents of the University of California
28  *
29  *
30  * IDENTIFICATION
31  * src/backend/utils/adt/like_support.c
32  *
33  *-------------------------------------------------------------------------
34  */
35 #include "postgres.h"
36 
37 #include <math.h>
38 
39 #include "access/htup_details.h"
40 #include "access/stratnum.h"
41 #include "catalog/pg_collation.h"
42 #include "catalog/pg_operator.h"
43 #include "catalog/pg_opfamily.h"
44 #include "catalog/pg_statistic.h"
45 #include "catalog/pg_type.h"
46 #include "mb/pg_wchar.h"
47 #include "miscadmin.h"
48 #include "nodes/makefuncs.h"
49 #include "nodes/nodeFuncs.h"
50 #include "nodes/supportnodes.h"
51 #include "utils/builtins.h"
52 #include "utils/datum.h"
53 #include "utils/lsyscache.h"
54 #include "utils/pg_locale.h"
55 #include "utils/selfuncs.h"
56 #include "utils/varlena.h"
57 
58 
59 typedef enum
60 {
67 
68 typedef enum
69 {
72 
73 static Node *like_regex_support(Node *rawreq, Pattern_Type ptype);
74 static List *match_pattern_prefix(Node *leftop,
75  Node *rightop,
76  Pattern_Type ptype,
77  Oid expr_coll,
78  Oid opfamily,
79  Oid indexcollation);
80 static double patternsel_common(PlannerInfo *root,
81  Oid oprid,
82  Oid opfuncid,
83  List *args,
84  int varRelid,
85  Oid collation,
86  Pattern_Type ptype,
87  bool negate);
89  Pattern_Type ptype,
90  Oid collation,
91  Const **prefix,
92  Selectivity *rest_selec);
94  VariableStatData *vardata,
95  Oid eqopr, Oid ltopr, Oid geopr,
96  Oid collation,
97  Const *prefixcon);
98 static Selectivity like_selectivity(const char *patt, int pattlen,
99  bool case_insensitive);
100 static Selectivity regex_selectivity(const char *patt, int pattlen,
101  bool case_insensitive,
102  int fixed_prefix_len);
103 static int pattern_char_isalpha(char c, bool is_multibyte,
104  pg_locale_t locale, bool locale_is_c);
105 static Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc,
106  Oid collation);
107 static Datum string_to_datum(const char *str, Oid datatype);
108 static Const *string_to_const(const char *str, Oid datatype);
109 static Const *string_to_bytea_const(const char *str, size_t str_len);
110 
111 
112 /*
113  * Planner support functions for LIKE, regex, and related operators
114  */
115 Datum
117 {
118  Node *rawreq = (Node *) PG_GETARG_POINTER(0);
119 
121 }
122 
123 Datum
125 {
126  Node *rawreq = (Node *) PG_GETARG_POINTER(0);
127 
129 }
130 
131 Datum
133 {
134  Node *rawreq = (Node *) PG_GETARG_POINTER(0);
135 
137 }
138 
139 Datum
141 {
142  Node *rawreq = (Node *) PG_GETARG_POINTER(0);
143 
145 }
146 
147 Datum
149 {
150  Node *rawreq = (Node *) PG_GETARG_POINTER(0);
151 
153 }
154 
155 /* Common code for the above */
156 static Node *
158 {
159  Node *ret = NULL;
160 
161  if (IsA(rawreq, SupportRequestSelectivity))
162  {
163  /*
164  * Make a selectivity estimate for a function call, just as we'd do if
165  * the call was via the corresponding operator.
166  */
168  Selectivity s1;
169 
170  if (req->is_join)
171  {
172  /*
173  * For the moment we just punt. If patternjoinsel is ever
174  * improved to do better, this should be made to call it.
175  */
177  }
178  else
179  {
180  /* Share code with operator restriction selectivity functions */
181  s1 = patternsel_common(req->root,
182  InvalidOid,
183  req->funcid,
184  req->args,
185  req->varRelid,
186  req->inputcollid,
187  ptype,
188  false);
189  }
190  req->selectivity = s1;
191  ret = (Node *) req;
192  }
193  else if (IsA(rawreq, SupportRequestIndexCondition))
194  {
195  /* Try to convert operator/function call to index conditions */
197 
198  /*
199  * Currently we have no "reverse" match operators with the pattern on
200  * the left, so we only need consider cases with the indexkey on the
201  * left.
202  */
203  if (req->indexarg != 0)
204  return NULL;
205 
206  if (is_opclause(req->node))
207  {
208  OpExpr *clause = (OpExpr *) req->node;
209 
210  Assert(list_length(clause->args) == 2);
211  ret = (Node *)
212  match_pattern_prefix((Node *) linitial(clause->args),
213  (Node *) lsecond(clause->args),
214  ptype,
215  clause->inputcollid,
216  req->opfamily,
217  req->indexcollation);
218  }
219  else if (is_funcclause(req->node)) /* be paranoid */
220  {
221  FuncExpr *clause = (FuncExpr *) req->node;
222 
223  Assert(list_length(clause->args) == 2);
224  ret = (Node *)
225  match_pattern_prefix((Node *) linitial(clause->args),
226  (Node *) lsecond(clause->args),
227  ptype,
228  clause->inputcollid,
229  req->opfamily,
230  req->indexcollation);
231  }
232  }
233 
234  return ret;
235 }
236 
237 /*
238  * match_pattern_prefix
239  * Try to generate an indexqual for a LIKE or regex operator.
240  */
241 static List *
243  Node *rightop,
244  Pattern_Type ptype,
245  Oid expr_coll,
246  Oid opfamily,
247  Oid indexcollation)
248 {
249  List *result;
250  Const *patt;
251  Const *prefix;
252  Pattern_Prefix_Status pstatus;
253  Oid ldatatype;
254  Oid rdatatype;
255  Oid eqopr;
256  Oid ltopr;
257  Oid geopr;
258  Oid preopr = InvalidOid;
259  bool collation_aware;
260  Expr *expr;
261  FmgrInfo ltproc;
262  Const *greaterstr;
263 
264  /*
265  * Can't do anything with a non-constant or NULL pattern argument.
266  *
267  * Note that since we restrict ourselves to cases with a hard constant on
268  * the RHS, it's a-fortiori a pseudoconstant, and we don't need to worry
269  * about verifying that.
270  */
271  if (!IsA(rightop, Const) ||
272  ((Const *) rightop)->constisnull)
273  return NIL;
274  patt = (Const *) rightop;
275 
276  /*
277  * Not supported if the expression collation is nondeterministic. The
278  * optimized equality or prefix tests use bytewise comparisons, which is
279  * not consistent with nondeterministic collations. The actual
280  * pattern-matching implementation functions will later error out that
281  * pattern-matching is not supported with nondeterministic collations. (We
282  * could also error out here, but by doing it later we get more precise
283  * error messages.) (It should be possible to support at least
284  * Pattern_Prefix_Exact, but no point as long as the actual
285  * pattern-matching implementations don't support it.)
286  *
287  * expr_coll is not set for a non-collation-aware data type such as bytea.
288  */
289  if (expr_coll && !get_collation_isdeterministic(expr_coll))
290  return NIL;
291 
292  /*
293  * Try to extract a fixed prefix from the pattern.
294  */
295  pstatus = pattern_fixed_prefix(patt, ptype, expr_coll,
296  &prefix, NULL);
297 
298  /* fail if no fixed prefix */
299  if (pstatus == Pattern_Prefix_None)
300  return NIL;
301 
302  /*
303  * Identify the operators we want to use, based on the type of the
304  * left-hand argument. Usually these are just the type's regular
305  * comparison operators, but if we are considering one of the semi-legacy
306  * "pattern" opclasses, use the "pattern" operators instead. Those are
307  * not collation-sensitive but always use C collation, as we want. The
308  * selected operators also determine the needed type of the prefix
309  * constant.
310  */
311  ldatatype = exprType(leftop);
312  switch (ldatatype)
313  {
314  case TEXTOID:
315  if (opfamily == TEXT_PATTERN_BTREE_FAM_OID)
316  {
317  eqopr = TextEqualOperator;
318  ltopr = TextPatternLessOperator;
319  geopr = TextPatternGreaterEqualOperator;
320  collation_aware = false;
321  }
322  else if (opfamily == TEXT_SPGIST_FAM_OID)
323  {
324  eqopr = TextEqualOperator;
325  ltopr = TextPatternLessOperator;
326  geopr = TextPatternGreaterEqualOperator;
327  /* This opfamily has direct support for prefixing */
328  preopr = TextPrefixOperator;
329  collation_aware = false;
330  }
331  else
332  {
333  eqopr = TextEqualOperator;
334  ltopr = TextLessOperator;
335  geopr = TextGreaterEqualOperator;
336  collation_aware = true;
337  }
338  rdatatype = TEXTOID;
339  break;
340  case NAMEOID:
341 
342  /*
343  * Note that here, we need the RHS type to be text, so that the
344  * comparison value isn't improperly truncated to NAMEDATALEN.
345  */
346  eqopr = NameEqualTextOperator;
347  ltopr = NameLessTextOperator;
348  geopr = NameGreaterEqualTextOperator;
349  collation_aware = true;
350  rdatatype = TEXTOID;
351  break;
352  case BPCHAROID:
353  if (opfamily == BPCHAR_PATTERN_BTREE_FAM_OID)
354  {
355  eqopr = BpcharEqualOperator;
356  ltopr = BpcharPatternLessOperator;
357  geopr = BpcharPatternGreaterEqualOperator;
358  collation_aware = false;
359  }
360  else
361  {
362  eqopr = BpcharEqualOperator;
363  ltopr = BpcharLessOperator;
364  geopr = BpcharGreaterEqualOperator;
365  collation_aware = true;
366  }
367  rdatatype = BPCHAROID;
368  break;
369  case BYTEAOID:
370  eqopr = ByteaEqualOperator;
371  ltopr = ByteaLessOperator;
372  geopr = ByteaGreaterEqualOperator;
373  collation_aware = false;
374  rdatatype = BYTEAOID;
375  break;
376  default:
377  /* Can't get here unless we're attached to the wrong operator */
378  return NIL;
379  }
380 
381  /*
382  * If necessary, coerce the prefix constant to the right type. The given
383  * prefix constant is either text or bytea type, therefore the only case
384  * where we need to do anything is when converting text to bpchar. Those
385  * two types are binary-compatible, so relabeling the Const node is
386  * sufficient.
387  */
388  if (prefix->consttype != rdatatype)
389  {
390  Assert(prefix->consttype == TEXTOID &&
391  rdatatype == BPCHAROID);
392  prefix->consttype = rdatatype;
393  }
394 
395  /*
396  * If we found an exact-match pattern, generate an "=" indexqual.
397  *
398  * Here and below, check to see whether the desired operator is actually
399  * supported by the index opclass, and fail quietly if not. This allows
400  * us to not be concerned with specific opclasses (except for the legacy
401  * "pattern" cases); any index that correctly implements the operators
402  * will work.
403  */
404  if (pstatus == Pattern_Prefix_Exact)
405  {
406  if (!op_in_opfamily(eqopr, opfamily))
407  return NIL;
408  expr = make_opclause(eqopr, BOOLOID, false,
409  (Expr *) leftop, (Expr *) prefix,
410  InvalidOid, indexcollation);
411  result = list_make1(expr);
412  return result;
413  }
414 
415  /*
416  * Otherwise, we have a nonempty required prefix of the values. Some
417  * opclasses support prefix checks directly, otherwise we'll try to
418  * generate a range constraint.
419  */
420  if (OidIsValid(preopr) && op_in_opfamily(preopr, opfamily))
421  {
422  expr = make_opclause(preopr, BOOLOID, false,
423  (Expr *) leftop, (Expr *) prefix,
424  InvalidOid, indexcollation);
425  result = list_make1(expr);
426  return result;
427  }
428 
429  /*
430  * Since we need a range constraint, it's only going to work reliably if
431  * the index is collation-insensitive or has "C" collation. Note that
432  * here we are looking at the index's collation, not the expression's
433  * collation -- this test is *not* dependent on the LIKE/regex operator's
434  * collation.
435  */
436  if (collation_aware &&
437  !lc_collate_is_c(indexcollation))
438  return NIL;
439 
440  /*
441  * We can always say "x >= prefix".
442  */
443  if (!op_in_opfamily(geopr, opfamily))
444  return NIL;
445  expr = make_opclause(geopr, BOOLOID, false,
446  (Expr *) leftop, (Expr *) prefix,
447  InvalidOid, indexcollation);
448  result = list_make1(expr);
449 
450  /*-------
451  * If we can create a string larger than the prefix, we can say
452  * "x < greaterstr". NB: we rely on make_greater_string() to generate
453  * a guaranteed-greater string, not just a probably-greater string.
454  * In general this is only guaranteed in C locale, so we'd better be
455  * using a C-locale index collation.
456  *-------
457  */
458  if (!op_in_opfamily(ltopr, opfamily))
459  return result;
460  fmgr_info(get_opcode(ltopr), &ltproc);
461  greaterstr = make_greater_string(prefix, &ltproc, indexcollation);
462  if (greaterstr)
463  {
464  expr = make_opclause(ltopr, BOOLOID, false,
465  (Expr *) leftop, (Expr *) greaterstr,
466  InvalidOid, indexcollation);
467  result = lappend(result, expr);
468  }
469 
470  return result;
471 }
472 
473 
474 /*
475  * patternsel_common - generic code for pattern-match restriction selectivity.
476  *
477  * To support using this from either the operator or function paths, caller
478  * may pass either operator OID or underlying function OID; we look up the
479  * latter from the former if needed. (We could just have patternsel() call
480  * get_opcode(), but the work would be wasted if we don't have a need to
481  * compare a fixed prefix to the pg_statistic data.)
482  *
483  * Note that oprid and/or opfuncid should be for the positive-match operator
484  * even when negate is true.
485  */
486 static double
488  Oid oprid,
489  Oid opfuncid,
490  List *args,
491  int varRelid,
492  Oid collation,
493  Pattern_Type ptype,
494  bool negate)
495 {
496  VariableStatData vardata;
497  Node *other;
498  bool varonleft;
499  Datum constval;
500  Oid consttype;
501  Oid vartype;
502  Oid rdatatype;
503  Oid eqopr;
504  Oid ltopr;
505  Oid geopr;
506  Pattern_Prefix_Status pstatus;
507  Const *patt;
508  Const *prefix = NULL;
509  Selectivity rest_selec = 0;
510  double nullfrac = 0.0;
511  double result;
512 
513  /*
514  * Initialize result to the appropriate default estimate depending on
515  * whether it's a match or not-match operator.
516  */
517  if (negate)
518  result = 1.0 - DEFAULT_MATCH_SEL;
519  else
520  result = DEFAULT_MATCH_SEL;
521 
522  /*
523  * If expression is not variable op constant, then punt and return the
524  * default estimate.
525  */
526  if (!get_restriction_variable(root, args, varRelid,
527  &vardata, &other, &varonleft))
528  return result;
529  if (!varonleft || !IsA(other, Const))
530  {
531  ReleaseVariableStats(vardata);
532  return result;
533  }
534 
535  /*
536  * If the constant is NULL, assume operator is strict and return zero, ie,
537  * operator will never return TRUE. (It's zero even for a negator op.)
538  */
539  if (((Const *) other)->constisnull)
540  {
541  ReleaseVariableStats(vardata);
542  return 0.0;
543  }
544  constval = ((Const *) other)->constvalue;
545  consttype = ((Const *) other)->consttype;
546 
547  /*
548  * The right-hand const is type text or bytea for all supported operators.
549  * We do not expect to see binary-compatible types here, since
550  * const-folding should have relabeled the const to exactly match the
551  * operator's declared type.
552  */
553  if (consttype != TEXTOID && consttype != BYTEAOID)
554  {
555  ReleaseVariableStats(vardata);
556  return result;
557  }
558 
559  /*
560  * Similarly, the exposed type of the left-hand side should be one of
561  * those we know. (Do not look at vardata.atttype, which might be
562  * something binary-compatible but different.) We can use it to identify
563  * the comparison operators and the required type of the comparison
564  * constant, much as in match_pattern_prefix().
565  */
566  vartype = vardata.vartype;
567 
568  switch (vartype)
569  {
570  case TEXTOID:
571  eqopr = TextEqualOperator;
572  ltopr = TextLessOperator;
573  geopr = TextGreaterEqualOperator;
574  rdatatype = TEXTOID;
575  break;
576  case NAMEOID:
577 
578  /*
579  * Note that here, we need the RHS type to be text, so that the
580  * comparison value isn't improperly truncated to NAMEDATALEN.
581  */
582  eqopr = NameEqualTextOperator;
583  ltopr = NameLessTextOperator;
584  geopr = NameGreaterEqualTextOperator;
585  rdatatype = TEXTOID;
586  break;
587  case BPCHAROID:
588  eqopr = BpcharEqualOperator;
589  ltopr = BpcharLessOperator;
590  geopr = BpcharGreaterEqualOperator;
591  rdatatype = BPCHAROID;
592  break;
593  case BYTEAOID:
594  eqopr = ByteaEqualOperator;
595  ltopr = ByteaLessOperator;
596  geopr = ByteaGreaterEqualOperator;
597  rdatatype = BYTEAOID;
598  break;
599  default:
600  /* Can't get here unless we're attached to the wrong operator */
601  ReleaseVariableStats(vardata);
602  return result;
603  }
604 
605  /*
606  * Grab the nullfrac for use below.
607  */
608  if (HeapTupleIsValid(vardata.statsTuple))
609  {
610  Form_pg_statistic stats;
611 
612  stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
613  nullfrac = stats->stanullfrac;
614  }
615 
616  /*
617  * Pull out any fixed prefix implied by the pattern, and estimate the
618  * fractional selectivity of the remainder of the pattern. Unlike many
619  * other selectivity estimators, we use the pattern operator's actual
620  * collation for this step. This is not because we expect the collation
621  * to make a big difference in the selectivity estimate (it seldom would),
622  * but because we want to be sure we cache compiled regexps under the
623  * right cache key, so that they can be re-used at runtime.
624  */
625  patt = (Const *) other;
626  pstatus = pattern_fixed_prefix(patt, ptype, collation,
627  &prefix, &rest_selec);
628 
629  /*
630  * If necessary, coerce the prefix constant to the right type. The only
631  * case where we need to do anything is when converting text to bpchar.
632  * Those two types are binary-compatible, so relabeling the Const node is
633  * sufficient.
634  */
635  if (prefix && prefix->consttype != rdatatype)
636  {
637  Assert(prefix->consttype == TEXTOID &&
638  rdatatype == BPCHAROID);
639  prefix->consttype = rdatatype;
640  }
641 
642  if (pstatus == Pattern_Prefix_Exact)
643  {
644  /*
645  * Pattern specifies an exact match, so estimate as for '='
646  */
647  result = var_eq_const(&vardata, eqopr, collation, prefix->constvalue,
648  false, true, false);
649  }
650  else
651  {
652  /*
653  * Not exact-match pattern. If we have a sufficiently large
654  * histogram, estimate selectivity for the histogram part of the
655  * population by counting matches in the histogram. If not, estimate
656  * selectivity of the fixed prefix and remainder of pattern
657  * separately, then combine the two to get an estimate of the
658  * selectivity for the part of the column population represented by
659  * the histogram. (For small histograms, we combine these
660  * approaches.)
661  *
662  * We then add up data for any most-common-values values; these are
663  * not in the histogram population, and we can get exact answers for
664  * them by applying the pattern operator, so there's no reason to
665  * approximate. (If the MCVs cover a significant part of the total
666  * population, this gives us a big leg up in accuracy.)
667  */
668  Selectivity selec;
669  int hist_size;
670  FmgrInfo opproc;
671  double mcv_selec,
672  sumcommon;
673 
674  /* Try to use the histogram entries to get selectivity */
675  if (!OidIsValid(opfuncid))
676  opfuncid = get_opcode(oprid);
677  fmgr_info(opfuncid, &opproc);
678 
679  selec = histogram_selectivity(&vardata, &opproc, collation,
680  constval, true,
681  10, 1, &hist_size);
682 
683  /* If not at least 100 entries, use the heuristic method */
684  if (hist_size < 100)
685  {
686  Selectivity heursel;
688 
689  if (pstatus == Pattern_Prefix_Partial)
690  prefixsel = prefix_selectivity(root, &vardata,
691  eqopr, ltopr, geopr,
692  collation,
693  prefix);
694  else
695  prefixsel = 1.0;
696  heursel = prefixsel * rest_selec;
697 
698  if (selec < 0) /* fewer than 10 histogram entries? */
699  selec = heursel;
700  else
701  {
702  /*
703  * For histogram sizes from 10 to 100, we combine the
704  * histogram and heuristic selectivities, putting increasingly
705  * more trust in the histogram for larger sizes.
706  */
707  double hist_weight = hist_size / 100.0;
708 
709  selec = selec * hist_weight + heursel * (1.0 - hist_weight);
710  }
711  }
712 
713  /* In any case, don't believe extremely small or large estimates. */
714  if (selec < 0.0001)
715  selec = 0.0001;
716  else if (selec > 0.9999)
717  selec = 0.9999;
718 
719  /*
720  * If we have most-common-values info, add up the fractions of the MCV
721  * entries that satisfy MCV OP PATTERN. These fractions contribute
722  * directly to the result selectivity. Also add up the total fraction
723  * represented by MCV entries.
724  */
725  mcv_selec = mcv_selectivity(&vardata, &opproc, collation,
726  constval, true,
727  &sumcommon);
728 
729  /*
730  * Now merge the results from the MCV and histogram calculations,
731  * realizing that the histogram covers only the non-null values that
732  * are not listed in MCV.
733  */
734  selec *= 1.0 - nullfrac - sumcommon;
735  selec += mcv_selec;
736  result = selec;
737  }
738 
739  /* now adjust if we wanted not-match rather than match */
740  if (negate)
741  result = 1.0 - result - nullfrac;
742 
743  /* result should be in range, but make sure... */
744  CLAMP_PROBABILITY(result);
745 
746  if (prefix)
747  {
748  pfree(DatumGetPointer(prefix->constvalue));
749  pfree(prefix);
750  }
751 
752  ReleaseVariableStats(vardata);
753 
754  return result;
755 }
756 
757 /*
758  * Fix impedance mismatch between SQL-callable functions and patternsel_common
759  */
760 static double
762 {
764  Oid operator = PG_GETARG_OID(1);
765  List *args = (List *) PG_GETARG_POINTER(2);
766  int varRelid = PG_GETARG_INT32(3);
767  Oid collation = PG_GET_COLLATION();
768 
769  /*
770  * If this is for a NOT LIKE or similar operator, get the corresponding
771  * positive-match operator and work with that.
772  */
773  if (negate)
774  {
775  operator = get_negator(operator);
776  if (!OidIsValid(operator))
777  elog(ERROR, "patternsel called for operator without a negator");
778  }
779 
780  return patternsel_common(root,
781  operator,
782  InvalidOid,
783  args,
784  varRelid,
785  collation,
786  ptype,
787  negate);
788 }
789 
790 /*
791  * regexeqsel - Selectivity of regular-expression pattern match.
792  */
793 Datum
795 {
797 }
798 
799 /*
800  * icregexeqsel - Selectivity of case-insensitive regex match.
801  */
802 Datum
804 {
806 }
807 
808 /*
809  * likesel - Selectivity of LIKE pattern match.
810  */
811 Datum
813 {
815 }
816 
817 /*
818  * prefixsel - selectivity of prefix operator
819  */
820 Datum
822 {
824 }
825 
826 /*
827  *
828  * iclikesel - Selectivity of ILIKE pattern match.
829  */
830 Datum
832 {
834 }
835 
836 /*
837  * regexnesel - Selectivity of regular-expression pattern non-match.
838  */
839 Datum
841 {
843 }
844 
845 /*
846  * icregexnesel - Selectivity of case-insensitive regex non-match.
847  */
848 Datum
850 {
852 }
853 
854 /*
855  * nlikesel - Selectivity of LIKE pattern non-match.
856  */
857 Datum
859 {
861 }
862 
863 /*
864  * icnlikesel - Selectivity of ILIKE pattern non-match.
865  */
866 Datum
868 {
870 }
871 
872 /*
873  * patternjoinsel - Generic code for pattern-match join selectivity.
874  */
875 static double
877 {
878  /* For the moment we just punt. */
879  return negate ? (1.0 - DEFAULT_MATCH_SEL) : DEFAULT_MATCH_SEL;
880 }
881 
882 /*
883  * regexeqjoinsel - Join selectivity of regular-expression pattern match.
884  */
885 Datum
887 {
889 }
890 
891 /*
892  * icregexeqjoinsel - Join selectivity of case-insensitive regex match.
893  */
894 Datum
896 {
898 }
899 
900 /*
901  * likejoinsel - Join selectivity of LIKE pattern match.
902  */
903 Datum
905 {
907 }
908 
909 /*
910  * prefixjoinsel - Join selectivity of prefix operator
911  */
912 Datum
914 {
916 }
917 
918 /*
919  * iclikejoinsel - Join selectivity of ILIKE pattern match.
920  */
921 Datum
923 {
925 }
926 
927 /*
928  * regexnejoinsel - Join selectivity of regex non-match.
929  */
930 Datum
932 {
934 }
935 
936 /*
937  * icregexnejoinsel - Join selectivity of case-insensitive regex non-match.
938  */
939 Datum
941 {
943 }
944 
945 /*
946  * nlikejoinsel - Join selectivity of LIKE pattern non-match.
947  */
948 Datum
950 {
952 }
953 
954 /*
955  * icnlikejoinsel - Join selectivity of ILIKE pattern non-match.
956  */
957 Datum
959 {
961 }
962 
963 
964 /*-------------------------------------------------------------------------
965  *
966  * Pattern analysis functions
967  *
968  * These routines support analysis of LIKE and regular-expression patterns
969  * by the planner/optimizer. It's important that they agree with the
970  * regular-expression code in backend/regex/ and the LIKE code in
971  * backend/utils/adt/like.c. Also, the computation of the fixed prefix
972  * must be conservative: if we report a string longer than the true fixed
973  * prefix, the query may produce actually wrong answers, rather than just
974  * getting a bad selectivity estimate!
975  *
976  *-------------------------------------------------------------------------
977  */
978 
979 /*
980  * Extract the fixed prefix, if any, for a pattern.
981  *
982  * *prefix is set to a palloc'd prefix string (in the form of a Const node),
983  * or to NULL if no fixed prefix exists for the pattern.
984  * If rest_selec is not NULL, *rest_selec is set to an estimate of the
985  * selectivity of the remainder of the pattern (without any fixed prefix).
986  * The prefix Const has the same type (TEXT or BYTEA) as the input pattern.
987  *
988  * The return value distinguishes no fixed prefix, a partial prefix,
989  * or an exact-match-only pattern.
990  */
991 
993 like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
994  Const **prefix_const, Selectivity *rest_selec)
995 {
996  char *match;
997  char *patt;
998  int pattlen;
999  Oid typeid = patt_const->consttype;
1000  int pos,
1001  match_pos;
1002  bool is_multibyte = (pg_database_encoding_max_length() > 1);
1003  pg_locale_t locale = 0;
1004  bool locale_is_c = false;
1005 
1006  /* the right-hand const is type text or bytea */
1007  Assert(typeid == BYTEAOID || typeid == TEXTOID);
1008 
1009  if (case_insensitive)
1010  {
1011  if (typeid == BYTEAOID)
1012  ereport(ERROR,
1013  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1014  errmsg("case insensitive matching not supported on type bytea")));
1015 
1016  if (!OidIsValid(collation))
1017  {
1018  /*
1019  * This typically means that the parser could not resolve a
1020  * conflict of implicit collations, so report it that way.
1021  */
1022  ereport(ERROR,
1023  (errcode(ERRCODE_INDETERMINATE_COLLATION),
1024  errmsg("could not determine which collation to use for ILIKE"),
1025  errhint("Use the COLLATE clause to set the collation explicitly.")));
1026  }
1027 
1028  /* If case-insensitive, we need locale info */
1029  if (lc_ctype_is_c(collation))
1030  locale_is_c = true;
1031  else
1032  locale = pg_newlocale_from_collation(collation);
1033  }
1034 
1035  if (typeid != BYTEAOID)
1036  {
1037  patt = TextDatumGetCString(patt_const->constvalue);
1038  pattlen = strlen(patt);
1039  }
1040  else
1041  {
1042  bytea *bstr = DatumGetByteaPP(patt_const->constvalue);
1043 
1044  pattlen = VARSIZE_ANY_EXHDR(bstr);
1045  patt = (char *) palloc(pattlen);
1046  memcpy(patt, VARDATA_ANY(bstr), pattlen);
1047  Assert((Pointer) bstr == DatumGetPointer(patt_const->constvalue));
1048  }
1049 
1050  match = palloc(pattlen + 1);
1051  match_pos = 0;
1052  for (pos = 0; pos < pattlen; pos++)
1053  {
1054  /* % and _ are wildcard characters in LIKE */
1055  if (patt[pos] == '%' ||
1056  patt[pos] == '_')
1057  break;
1058 
1059  /* Backslash escapes the next character */
1060  if (patt[pos] == '\\')
1061  {
1062  pos++;
1063  if (pos >= pattlen)
1064  break;
1065  }
1066 
1067  /* Stop if case-varying character (it's sort of a wildcard) */
1068  if (case_insensitive &&
1069  pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c))
1070  break;
1071 
1072  match[match_pos++] = patt[pos];
1073  }
1074 
1075  match[match_pos] = '\0';
1076 
1077  if (typeid != BYTEAOID)
1078  *prefix_const = string_to_const(match, typeid);
1079  else
1080  *prefix_const = string_to_bytea_const(match, match_pos);
1081 
1082  if (rest_selec != NULL)
1083  *rest_selec = like_selectivity(&patt[pos], pattlen - pos,
1084  case_insensitive);
1085 
1086  pfree(patt);
1087  pfree(match);
1088 
1089  /* in LIKE, an empty pattern is an exact match! */
1090  if (pos == pattlen)
1091  return Pattern_Prefix_Exact; /* reached end of pattern, so exact */
1092 
1093  if (match_pos > 0)
1094  return Pattern_Prefix_Partial;
1095 
1096  return Pattern_Prefix_None;
1097 }
1098 
1099 static Pattern_Prefix_Status
1100 regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
1101  Const **prefix_const, Selectivity *rest_selec)
1102 {
1103  Oid typeid = patt_const->consttype;
1104  char *prefix;
1105  bool exact;
1106 
1107  /*
1108  * Should be unnecessary, there are no bytea regex operators defined. As
1109  * such, it should be noted that the rest of this function has *not* been
1110  * made safe for binary (possibly NULL containing) strings.
1111  */
1112  if (typeid == BYTEAOID)
1113  ereport(ERROR,
1114  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1115  errmsg("regular-expression matching not supported on type bytea")));
1116 
1117  /* Use the regexp machinery to extract the prefix, if any */
1118  prefix = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue),
1119  case_insensitive, collation,
1120  &exact);
1121 
1122  if (prefix == NULL)
1123  {
1124  *prefix_const = NULL;
1125 
1126  if (rest_selec != NULL)
1127  {
1128  char *patt = TextDatumGetCString(patt_const->constvalue);
1129 
1130  *rest_selec = regex_selectivity(patt, strlen(patt),
1131  case_insensitive,
1132  0);
1133  pfree(patt);
1134  }
1135 
1136  return Pattern_Prefix_None;
1137  }
1138 
1139  *prefix_const = string_to_const(prefix, typeid);
1140 
1141  if (rest_selec != NULL)
1142  {
1143  if (exact)
1144  {
1145  /* Exact match, so there's no additional selectivity */
1146  *rest_selec = 1.0;
1147  }
1148  else
1149  {
1150  char *patt = TextDatumGetCString(patt_const->constvalue);
1151 
1152  *rest_selec = regex_selectivity(patt, strlen(patt),
1153  case_insensitive,
1154  strlen(prefix));
1155  pfree(patt);
1156  }
1157  }
1158 
1159  pfree(prefix);
1160 
1161  if (exact)
1162  return Pattern_Prefix_Exact; /* pattern specifies exact match */
1163  else
1164  return Pattern_Prefix_Partial;
1165 }
1166 
1167 static Pattern_Prefix_Status
1168 pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation,
1169  Const **prefix, Selectivity *rest_selec)
1170 {
1171  Pattern_Prefix_Status result;
1172 
1173  switch (ptype)
1174  {
1175  case Pattern_Type_Like:
1176  result = like_fixed_prefix(patt, false, collation,
1177  prefix, rest_selec);
1178  break;
1179  case Pattern_Type_Like_IC:
1180  result = like_fixed_prefix(patt, true, collation,
1181  prefix, rest_selec);
1182  break;
1183  case Pattern_Type_Regex:
1184  result = regex_fixed_prefix(patt, false, collation,
1185  prefix, rest_selec);
1186  break;
1187  case Pattern_Type_Regex_IC:
1188  result = regex_fixed_prefix(patt, true, collation,
1189  prefix, rest_selec);
1190  break;
1191  case Pattern_Type_Prefix:
1192  /* Prefix type work is trivial. */
1193  result = Pattern_Prefix_Partial;
1194  *prefix = makeConst(patt->consttype,
1195  patt->consttypmod,
1196  patt->constcollid,
1197  patt->constlen,
1198  datumCopy(patt->constvalue,
1199  patt->constbyval,
1200  patt->constlen),
1201  patt->constisnull,
1202  patt->constbyval);
1203  if (rest_selec != NULL)
1204  *rest_selec = 1.0; /* all */
1205  break;
1206  default:
1207  elog(ERROR, "unrecognized ptype: %d", (int) ptype);
1208  result = Pattern_Prefix_None; /* keep compiler quiet */
1209  break;
1210  }
1211  return result;
1212 }
1213 
1214 /*
1215  * Estimate the selectivity of a fixed prefix for a pattern match.
1216  *
1217  * A fixed prefix "foo" is estimated as the selectivity of the expression
1218  * "variable >= 'foo' AND variable < 'fop'".
1219  *
1220  * The selectivity estimate is with respect to the portion of the column
1221  * population represented by the histogram --- the caller must fold this
1222  * together with info about MCVs and NULLs.
1223  *
1224  * We use the given comparison operators and collation to do the estimation.
1225  * The given variable and Const must be of the associated datatype(s).
1226  *
1227  * XXX Note: we make use of the upper bound to estimate operator selectivity
1228  * even if the locale is such that we cannot rely on the upper-bound string.
1229  * The selectivity only needs to be approximately right anyway, so it seems
1230  * more useful to use the upper-bound code than not.
1231  */
1232 static Selectivity
1234  Oid eqopr, Oid ltopr, Oid geopr,
1235  Oid collation,
1236  Const *prefixcon)
1237 {
1239  FmgrInfo opproc;
1240  Const *greaterstrcon;
1241  Selectivity eq_sel;
1242 
1243  /* Estimate the selectivity of "x >= prefix" */
1244  fmgr_info(get_opcode(geopr), &opproc);
1245 
1246  prefixsel = ineq_histogram_selectivity(root, vardata,
1247  geopr, &opproc, true, true,
1248  collation,
1249  prefixcon->constvalue,
1250  prefixcon->consttype);
1251 
1252  if (prefixsel < 0.0)
1253  {
1254  /* No histogram is present ... return a suitable default estimate */
1255  return DEFAULT_MATCH_SEL;
1256  }
1257 
1258  /*
1259  * If we can create a string larger than the prefix, say "x < greaterstr".
1260  */
1261  fmgr_info(get_opcode(ltopr), &opproc);
1262  greaterstrcon = make_greater_string(prefixcon, &opproc, collation);
1263  if (greaterstrcon)
1264  {
1265  Selectivity topsel;
1266 
1267  topsel = ineq_histogram_selectivity(root, vardata,
1268  ltopr, &opproc, false, false,
1269  collation,
1270  greaterstrcon->constvalue,
1271  greaterstrcon->consttype);
1272 
1273  /* ineq_histogram_selectivity worked before, it shouldn't fail now */
1274  Assert(topsel >= 0.0);
1275 
1276  /*
1277  * Merge the two selectivities in the same way as for a range query
1278  * (see clauselist_selectivity()). Note that we don't need to worry
1279  * about double-exclusion of nulls, since ineq_histogram_selectivity
1280  * doesn't count those anyway.
1281  */
1282  prefixsel = topsel + prefixsel - 1.0;
1283  }
1284 
1285  /*
1286  * If the prefix is long then the two bounding values might be too close
1287  * together for the histogram to distinguish them usefully, resulting in a
1288  * zero estimate (plus or minus roundoff error). To avoid returning a
1289  * ridiculously small estimate, compute the estimated selectivity for
1290  * "variable = 'foo'", and clamp to that. (Obviously, the resultant
1291  * estimate should be at least that.)
1292  *
1293  * We apply this even if we couldn't make a greater string. That case
1294  * suggests that the prefix is near the maximum possible, and thus
1295  * probably off the end of the histogram, and thus we probably got a very
1296  * small estimate from the >= condition; so we still need to clamp.
1297  */
1298  eq_sel = var_eq_const(vardata, eqopr, collation, prefixcon->constvalue,
1299  false, true, false);
1300 
1301  prefixsel = Max(prefixsel, eq_sel);
1302 
1303  return prefixsel;
1304 }
1305 
1306 
1307 /*
1308  * Estimate the selectivity of a pattern of the specified type.
1309  * Note that any fixed prefix of the pattern will have been removed already,
1310  * so actually we may be looking at just a fragment of the pattern.
1311  *
1312  * For now, we use a very simplistic approach: fixed characters reduce the
1313  * selectivity a good deal, character ranges reduce it a little,
1314  * wildcards (such as % for LIKE or .* for regex) increase it.
1315  */
1316 
1317 #define FIXED_CHAR_SEL 0.20 /* about 1/5 */
1318 #define CHAR_RANGE_SEL 0.25
1319 #define ANY_CHAR_SEL 0.9 /* not 1, since it won't match end-of-string */
1320 #define FULL_WILDCARD_SEL 5.0
1321 #define PARTIAL_WILDCARD_SEL 2.0
1322 
1323 static Selectivity
1324 like_selectivity(const char *patt, int pattlen, bool case_insensitive)
1325 {
1326  Selectivity sel = 1.0;
1327  int pos;
1328 
1329  /* Skip any leading wildcard; it's already factored into initial sel */
1330  for (pos = 0; pos < pattlen; pos++)
1331  {
1332  if (patt[pos] != '%' && patt[pos] != '_')
1333  break;
1334  }
1335 
1336  for (; pos < pattlen; pos++)
1337  {
1338  /* % and _ are wildcard characters in LIKE */
1339  if (patt[pos] == '%')
1340  sel *= FULL_WILDCARD_SEL;
1341  else if (patt[pos] == '_')
1342  sel *= ANY_CHAR_SEL;
1343  else if (patt[pos] == '\\')
1344  {
1345  /* Backslash quotes the next character */
1346  pos++;
1347  if (pos >= pattlen)
1348  break;
1349  sel *= FIXED_CHAR_SEL;
1350  }
1351  else
1352  sel *= FIXED_CHAR_SEL;
1353  }
1354  /* Could get sel > 1 if multiple wildcards */
1355  if (sel > 1.0)
1356  sel = 1.0;
1357  return sel;
1358 }
1359 
1360 static Selectivity
1361 regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive)
1362 {
1363  Selectivity sel = 1.0;
1364  int paren_depth = 0;
1365  int paren_pos = 0; /* dummy init to keep compiler quiet */
1366  int pos;
1367 
1368  /* since this function recurses, it could be driven to stack overflow */
1370 
1371  for (pos = 0; pos < pattlen; pos++)
1372  {
1373  if (patt[pos] == '(')
1374  {
1375  if (paren_depth == 0)
1376  paren_pos = pos; /* remember start of parenthesized item */
1377  paren_depth++;
1378  }
1379  else if (patt[pos] == ')' && paren_depth > 0)
1380  {
1381  paren_depth--;
1382  if (paren_depth == 0)
1383  sel *= regex_selectivity_sub(patt + (paren_pos + 1),
1384  pos - (paren_pos + 1),
1385  case_insensitive);
1386  }
1387  else if (patt[pos] == '|' && paren_depth == 0)
1388  {
1389  /*
1390  * If unquoted | is present at paren level 0 in pattern, we have
1391  * multiple alternatives; sum their probabilities.
1392  */
1393  sel += regex_selectivity_sub(patt + (pos + 1),
1394  pattlen - (pos + 1),
1395  case_insensitive);
1396  break; /* rest of pattern is now processed */
1397  }
1398  else if (patt[pos] == '[')
1399  {
1400  bool negclass = false;
1401 
1402  if (patt[++pos] == '^')
1403  {
1404  negclass = true;
1405  pos++;
1406  }
1407  if (patt[pos] == ']') /* ']' at start of class is not special */
1408  pos++;
1409  while (pos < pattlen && patt[pos] != ']')
1410  pos++;
1411  if (paren_depth == 0)
1412  sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
1413  }
1414  else if (patt[pos] == '.')
1415  {
1416  if (paren_depth == 0)
1417  sel *= ANY_CHAR_SEL;
1418  }
1419  else if (patt[pos] == '*' ||
1420  patt[pos] == '?' ||
1421  patt[pos] == '+')
1422  {
1423  /* Ought to be smarter about quantifiers... */
1424  if (paren_depth == 0)
1425  sel *= PARTIAL_WILDCARD_SEL;
1426  }
1427  else if (patt[pos] == '{')
1428  {
1429  while (pos < pattlen && patt[pos] != '}')
1430  pos++;
1431  if (paren_depth == 0)
1432  sel *= PARTIAL_WILDCARD_SEL;
1433  }
1434  else if (patt[pos] == '\\')
1435  {
1436  /* backslash quotes the next character */
1437  pos++;
1438  if (pos >= pattlen)
1439  break;
1440  if (paren_depth == 0)
1441  sel *= FIXED_CHAR_SEL;
1442  }
1443  else
1444  {
1445  if (paren_depth == 0)
1446  sel *= FIXED_CHAR_SEL;
1447  }
1448  }
1449  /* Could get sel > 1 if multiple wildcards */
1450  if (sel > 1.0)
1451  sel = 1.0;
1452  return sel;
1453 }
1454 
1455 static Selectivity
1456 regex_selectivity(const char *patt, int pattlen, bool case_insensitive,
1457  int fixed_prefix_len)
1458 {
1459  Selectivity sel;
1460 
1461  /* If patt doesn't end with $, consider it to have a trailing wildcard */
1462  if (pattlen > 0 && patt[pattlen - 1] == '$' &&
1463  (pattlen == 1 || patt[pattlen - 2] != '\\'))
1464  {
1465  /* has trailing $ */
1466  sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive);
1467  }
1468  else
1469  {
1470  /* no trailing $ */
1471  sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
1472  sel *= FULL_WILDCARD_SEL;
1473  }
1474 
1475  /*
1476  * If there's a fixed prefix, discount its selectivity. We have to be
1477  * careful here since a very long prefix could result in pow's result
1478  * underflowing to zero (in which case "sel" probably has as well).
1479  */
1480  if (fixed_prefix_len > 0)
1481  {
1482  double prefixsel = pow(FIXED_CHAR_SEL, fixed_prefix_len);
1483 
1484  if (prefixsel > 0.0)
1485  sel /= prefixsel;
1486  }
1487 
1488  /* Make sure result stays in range */
1489  CLAMP_PROBABILITY(sel);
1490  return sel;
1491 }
1492 
1493 /*
1494  * Check whether char is a letter (and, hence, subject to case-folding)
1495  *
1496  * In multibyte character sets or with ICU, we can't use isalpha, and it does
1497  * not seem worth trying to convert to wchar_t to use iswalpha or u_isalpha.
1498  * Instead, just assume any non-ASCII char is potentially case-varying, and
1499  * hard-wire knowledge of which ASCII chars are letters.
1500  */
1501 static int
1502 pattern_char_isalpha(char c, bool is_multibyte,
1503  pg_locale_t locale, bool locale_is_c)
1504 {
1505  if (locale_is_c)
1506  return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
1507  else if (is_multibyte && IS_HIGHBIT_SET(c))
1508  return true;
1509  else if (locale && locale->provider == COLLPROVIDER_ICU)
1510  return IS_HIGHBIT_SET(c) ||
1511  (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
1512 #ifdef HAVE_LOCALE_T
1513  else if (locale && locale->provider == COLLPROVIDER_LIBC)
1514  return isalpha_l((unsigned char) c, locale->info.lt);
1515 #endif
1516  else
1517  return isalpha((unsigned char) c);
1518 }
1519 
1520 
/*
 * Character incrementer for bytea: bump the byte at *ptr by one.
 *
 * There are no multibyte characters to consider, so "len" is not used.
 * Returns false, leaving the byte untouched, when it is already at the
 * maximum value 255; otherwise increments it and returns true.
 */
static bool
byte_increment(unsigned char *ptr, int len)
{
	bool		has_room = (*ptr < 255);

	if (has_room)
		*ptr += 1;

	return has_room;
}
1533 
1534 /*
1535  * Try to generate a string greater than the given string or any
1536  * string it is a prefix of. If successful, return a palloc'd string
1537  * in the form of a Const node; else return NULL.
1538  *
1539  * The caller must provide the appropriate "less than" comparison function
1540  * for testing the strings, along with the collation to use.
1541  *
1542  * The key requirement here is that given a prefix string, say "foo",
1543  * we must be able to generate another string "fop" that is greater than
1544  * all strings "foobar" starting with "foo". We can test that we have
1545  * generated a string greater than the prefix string, but in non-C collations
1546  * that is not a bulletproof guarantee that an extension of the string might
1547  * not sort after it; an example is that "foo " is less than "foo!", but it
1548  * is not clear that a "dictionary" sort ordering will consider "foo!" less
1549  * than "foo bar". CAUTION: Therefore, this function should be used only for
1550  * estimation purposes when working in a non-C collation.
1551  *
1552  * To try to catch most cases where an extended string might otherwise sort
1553  * before the result value, we determine which of the strings "Z", "z", "y",
1554  * and "9" is seen as largest by the collation, and append that to the given
1555  * prefix before trying to find a string that compares as larger.
1556  *
1557  * To search for a greater string, we repeatedly "increment" the rightmost
1558  * character, using an encoding-specific character incrementer function.
1559  * When it's no longer possible to increment the last character, we truncate
1560  * off that character and start incrementing the next-to-rightmost.
1561  * For example, if "z" were the last character in the sort order, then we
1562  * could produce "foo" as a string greater than "fonz".
1563  *
1564  * This could be rather slow in the worst case, but in most cases we
1565  * won't have to try more than one or two strings before succeeding.
1566  *
1567  * Note that it's important for the character incrementer not to be too anal
1568  * about producing every possible character code, since in some cases the only
1569  * way to get a larger string is to increment a previous character position.
1570  * So we don't want to spend too much time trying every possible character
1571  * code at the last position. A good rule of thumb is to be sure that we
1572  * don't try more than 256*K values for a K-byte character (and definitely
1573  * not 256^K, which is what an exhaustive search would approach).
1574  */
1575 static Const *
1576 make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
1577 {
1578  Oid datatype = str_const->consttype;
1579  char *workstr;
1580  int len;
1581  Datum cmpstr;
1582  char *cmptxt = NULL;
1583  mbcharacter_incrementer charinc;
1584 
1585  /*
1586  * Get a modifiable copy of the prefix string in C-string format, and set
1587  * up the string we will compare to as a Datum. In C locale this can just
1588  * be the given prefix string, otherwise we need to add a suffix. Type
1589  * BYTEA sorts bytewise so it never needs a suffix either.
1590  */
1591  if (datatype == BYTEAOID)
1592  {
1593  bytea *bstr = DatumGetByteaPP(str_const->constvalue);
1594 
1595  len = VARSIZE_ANY_EXHDR(bstr);
1596  workstr = (char *) palloc(len);
1597  memcpy(workstr, VARDATA_ANY(bstr), len);
1598  Assert((Pointer) bstr == DatumGetPointer(str_const->constvalue));
1599  cmpstr = str_const->constvalue;
1600  }
1601  else
1602  {
1603  if (datatype == NAMEOID)
1605  str_const->constvalue));
1606  else
1607  workstr = TextDatumGetCString(str_const->constvalue);
1608  len = strlen(workstr);
1609  if (lc_collate_is_c(collation) || len == 0)
1610  cmpstr = str_const->constvalue;
1611  else
1612  {
1613  /* If first time through, determine the suffix to use */
1614  static char suffixchar = 0;
1615  static Oid suffixcollation = 0;
1616 
1617  if (!suffixchar || suffixcollation != collation)
1618  {
1619  char *best;
1620 
1621  best = "Z";
1622  if (varstr_cmp(best, 1, "z", 1, collation) < 0)
1623  best = "z";
1624  if (varstr_cmp(best, 1, "y", 1, collation) < 0)
1625  best = "y";
1626  if (varstr_cmp(best, 1, "9", 1, collation) < 0)
1627  best = "9";
1628  suffixchar = *best;
1629  suffixcollation = collation;
1630  }
1631 
1632  /* And build the string to compare to */
1633  if (datatype == NAMEOID)
1634  {
1635  cmptxt = palloc(len + 2);
1636  memcpy(cmptxt, workstr, len);
1637  cmptxt[len] = suffixchar;
1638  cmptxt[len + 1] = '\0';
1639  cmpstr = PointerGetDatum(cmptxt);
1640  }
1641  else
1642  {
1643  cmptxt = palloc(VARHDRSZ + len + 1);
1644  SET_VARSIZE(cmptxt, VARHDRSZ + len + 1);
1645  memcpy(VARDATA(cmptxt), workstr, len);
1646  *(VARDATA(cmptxt) + len) = suffixchar;
1647  cmpstr = PointerGetDatum(cmptxt);
1648  }
1649  }
1650  }
1651 
1652  /* Select appropriate character-incrementer function */
1653  if (datatype == BYTEAOID)
1654  charinc = byte_increment;
1655  else
1657 
1658  /* And search ... */
1659  while (len > 0)
1660  {
1661  int charlen;
1662  unsigned char *lastchar;
1663 
1664  /* Identify the last character --- for bytea, just the last byte */
1665  if (datatype == BYTEAOID)
1666  charlen = 1;
1667  else
1668  charlen = len - pg_mbcliplen(workstr, len, len - 1);
1669  lastchar = (unsigned char *) (workstr + len - charlen);
1670 
1671  /*
1672  * Try to generate a larger string by incrementing the last character
1673  * (for BYTEA, we treat each byte as a character).
1674  *
1675  * Note: the incrementer function is expected to return true if it's
1676  * generated a valid-per-the-encoding new character, otherwise false.
1677  * The contents of the character on false return are unspecified.
1678  */
1679  while (charinc(lastchar, charlen))
1680  {
1681  Const *workstr_const;
1682 
1683  if (datatype == BYTEAOID)
1684  workstr_const = string_to_bytea_const(workstr, len);
1685  else
1686  workstr_const = string_to_const(workstr, datatype);
1687 
1688  if (DatumGetBool(FunctionCall2Coll(ltproc,
1689  collation,
1690  cmpstr,
1691  workstr_const->constvalue)))
1692  {
1693  /* Successfully made a string larger than cmpstr */
1694  if (cmptxt)
1695  pfree(cmptxt);
1696  pfree(workstr);
1697  return workstr_const;
1698  }
1699 
1700  /* No good, release unusable value and try again */
1701  pfree(DatumGetPointer(workstr_const->constvalue));
1702  pfree(workstr_const);
1703  }
1704 
1705  /*
1706  * No luck here, so truncate off the last character and try to
1707  * increment the next one.
1708  */
1709  len -= charlen;
1710  workstr[len] = '\0';
1711  }
1712 
1713  /* Failed... */
1714  if (cmptxt)
1715  pfree(cmptxt);
1716  pfree(workstr);
1717 
1718  return NULL;
1719 }
1720 
1721 /*
1722  * Generate a Datum of the appropriate type from a C string.
1723  * Note that all of the supported types are pass-by-ref, so the
1724  * returned value should be pfree'd if no longer needed.
1725  */
1726 static Datum
1727 string_to_datum(const char *str, Oid datatype)
1728 {
1729  Assert(str != NULL);
1730 
1731  /*
1732  * We cheat a little by assuming that CStringGetTextDatum() will do for
1733  * bpchar and varchar constants too...
1734  */
1735  if (datatype == NAMEOID)
1737  else if (datatype == BYTEAOID)
1739  else
1740  return CStringGetTextDatum(str);
1741 }
1742 
1743 /*
1744  * Generate a Const node of the appropriate type from a C string.
1745  */
1746 static Const *
1747 string_to_const(const char *str, Oid datatype)
1748 {
1749  Datum conval = string_to_datum(str, datatype);
1750  Oid collation;
1751  int constlen;
1752 
1753  /*
1754  * We only need to support a few datatypes here, so hard-wire properties
1755  * instead of incurring the expense of catalog lookups.
1756  */
1757  switch (datatype)
1758  {
1759  case TEXTOID:
1760  case VARCHAROID:
1761  case BPCHAROID:
1762  collation = DEFAULT_COLLATION_OID;
1763  constlen = -1;
1764  break;
1765 
1766  case NAMEOID:
1767  collation = C_COLLATION_OID;
1768  constlen = NAMEDATALEN;
1769  break;
1770 
1771  case BYTEAOID:
1772  collation = InvalidOid;
1773  constlen = -1;
1774  break;
1775 
1776  default:
1777  elog(ERROR, "unexpected datatype in string_to_const: %u",
1778  datatype);
1779  return NULL;
1780  }
1781 
1782  return makeConst(datatype, -1, collation, constlen,
1783  conval, false, false);
1784 }
1785 
1786 /*
1787  * Generate a Const node of bytea type from a binary C string and a length.
1788  */
1789 static Const *
1790 string_to_bytea_const(const char *str, size_t str_len)
1791 {
1792  bytea *bstr = palloc(VARHDRSZ + str_len);
1793  Datum conval;
1794 
1795  memcpy(VARDATA(bstr), str, str_len);
1796  SET_VARSIZE(bstr, VARHDRSZ + str_len);
1797  conval = PointerGetDatum(bstr);
1798 
1799  return makeConst(BYTEAOID, -1, InvalidOid, -1, conval, false, false);
1800 }
#define CStringGetTextDatum(s)
Definition: builtins.h:85
#define TextDatumGetCString(d)
Definition: builtins.h:86
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1094
#define Max(x, y)
Definition: c.h:931
char * Pointer
Definition: c.h:419
#define VARHDRSZ
Definition: c.h:628
#define OidIsValid(objectId)
Definition: c.h:711
Datum datumCopy(Datum value, bool typByVal, int typLen)
Definition: datum.c:132
int errhint(const char *fmt,...)
Definition: elog.c:1153
int errcode(int sqlerrcode)
Definition: elog.c:695
int errmsg(const char *fmt,...)
Definition: elog.c:906
#define ERROR
Definition: elog.h:35
#define ereport(elevel,...)
Definition: elog.h:145
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1134
void fmgr_info(Oid functionId, FmgrInfo *finfo)
Definition: fmgr.c:126
#define PG_GETARG_OID(n)
Definition: fmgr.h:275
#define DatumGetByteaPP(X)
Definition: fmgr.h:291
#define PG_RETURN_FLOAT8(x)
Definition: fmgr.h:367
#define DatumGetTextPP(X)
Definition: fmgr.h:292
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276
#define DirectFunctionCall1(func, arg1)
Definition: fmgr.h:642
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361
#define PG_GET_COLLATION()
Definition: fmgr.h:198
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
#define GETSTRUCT(TUP)
Definition: htup_details.h:649
static char * locale
Definition: initdb.c:129
Assert(fmt[strlen(fmt) - 1] !='\n')
Datum icregexnesel(PG_FUNCTION_ARGS)
Definition: like_support.c:849
Datum regexnesel(PG_FUNCTION_ARGS)
Definition: like_support.c:840
static Node * like_regex_support(Node *rawreq, Pattern_Type ptype)
Definition: like_support.c:157
Datum iclikesel(PG_FUNCTION_ARGS)
Definition: like_support.c:831
Datum texticregexeq_support(PG_FUNCTION_ARGS)
Definition: like_support.c:140
static Selectivity prefix_selectivity(PlannerInfo *root, VariableStatData *vardata, Oid eqopr, Oid ltopr, Oid geopr, Oid collation, Const *prefixcon)
#define FULL_WILDCARD_SEL
Datum iclikejoinsel(PG_FUNCTION_ARGS)
Definition: like_support.c:922
Datum prefixjoinsel(PG_FUNCTION_ARGS)
Definition: like_support.c:913
#define ANY_CHAR_SEL
static double patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
Definition: like_support.c:761
Datum regexeqsel(PG_FUNCTION_ARGS)
Definition: like_support.c:794
static Pattern_Prefix_Status pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation, Const **prefix, Selectivity *rest_selec)
Datum likejoinsel(PG_FUNCTION_ARGS)
Definition: like_support.c:904
static Selectivity like_selectivity(const char *patt, int pattlen, bool case_insensitive)
Datum icregexnejoinsel(PG_FUNCTION_ARGS)
Definition: like_support.c:940
static Pattern_Prefix_Status like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, Const **prefix_const, Selectivity *rest_selec)
Definition: like_support.c:993
static List * match_pattern_prefix(Node *leftop, Node *rightop, Pattern_Type ptype, Oid expr_coll, Oid opfamily, Oid indexcollation)
Definition: like_support.c:242
Datum nlikejoinsel(PG_FUNCTION_ARGS)
Definition: like_support.c:949
static Datum string_to_datum(const char *str, Oid datatype)
Datum icnlikejoinsel(PG_FUNCTION_ARGS)
Definition: like_support.c:958
static Selectivity regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive)
Datum texticlike_support(PG_FUNCTION_ARGS)
Definition: like_support.c:124
Datum nlikesel(PG_FUNCTION_ARGS)
Definition: like_support.c:858
static Const * string_to_const(const char *str, Oid datatype)
#define PARTIAL_WILDCARD_SEL
Datum text_starts_with_support(PG_FUNCTION_ARGS)
Definition: like_support.c:148
#define CHAR_RANGE_SEL
static Const * string_to_bytea_const(const char *str, size_t str_len)
static Pattern_Prefix_Status regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, Const **prefix_const, Selectivity *rest_selec)
Pattern_Type
Definition: like_support.c:60
@ Pattern_Type_Prefix
Definition: like_support.c:65
@ Pattern_Type_Regex_IC
Definition: like_support.c:64
@ Pattern_Type_Like
Definition: like_support.c:61
@ Pattern_Type_Regex
Definition: like_support.c:63
@ Pattern_Type_Like_IC
Definition: like_support.c:62
Pattern_Prefix_Status
Definition: like_support.c:69
@ Pattern_Prefix_Partial
Definition: like_support.c:70
@ Pattern_Prefix_None
Definition: like_support.c:70
@ Pattern_Prefix_Exact
Definition: like_support.c:70
static Const * make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
Datum icregexeqsel(PG_FUNCTION_ARGS)
Definition: like_support.c:803
#define FIXED_CHAR_SEL
Datum textlike_support(PG_FUNCTION_ARGS)
Definition: like_support.c:116
static int pattern_char_isalpha(char c, bool is_multibyte, pg_locale_t locale, bool locale_is_c)
Datum regexnejoinsel(PG_FUNCTION_ARGS)
Definition: like_support.c:931
static bool byte_increment(unsigned char *ptr, int len)
static double patternsel_common(PlannerInfo *root, Oid oprid, Oid opfuncid, List *args, int varRelid, Oid collation, Pattern_Type ptype, bool negate)
Definition: like_support.c:487
static Selectivity regex_selectivity(const char *patt, int pattlen, bool case_insensitive, int fixed_prefix_len)
Datum icnlikesel(PG_FUNCTION_ARGS)
Definition: like_support.c:867
Datum textregexeq_support(PG_FUNCTION_ARGS)
Definition: like_support.c:132
Datum prefixsel(PG_FUNCTION_ARGS)
Definition: like_support.c:821
static double patternjoinsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
Definition: like_support.c:876
Datum likesel(PG_FUNCTION_ARGS)
Definition: like_support.c:812
Datum regexeqjoinsel(PG_FUNCTION_ARGS)
Definition: like_support.c:886
Datum icregexeqjoinsel(PG_FUNCTION_ARGS)
Definition: like_support.c:895
List * lappend(List *list, void *datum)
Definition: list.c:338
RegProcedure get_opcode(Oid opno)
Definition: lsyscache.c:1267
bool get_collation_isdeterministic(Oid colloid)
Definition: lsyscache.c:1080
bool op_in_opfamily(Oid opno, Oid opfamily)
Definition: lsyscache.c:65
Oid get_negator(Oid opno)
Definition: lsyscache.c:1515
Expr * make_opclause(Oid opno, Oid opresulttype, bool opretset, Expr *leftop, Expr *rightop, Oid opcollid, Oid inputcollid)
Definition: makefuncs.c:610
Const * makeConst(Oid consttype, int32 consttypmod, Oid constcollid, int constlen, Datum constvalue, bool constisnull, bool constbyval)
Definition: makefuncs.c:299
mbcharacter_incrementer pg_database_encoding_character_incrementer(void)
Definition: mbutils.c:1472
int pg_mbcliplen(const char *mbstr, int len, int limit)
Definition: mbutils.c:1026
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1495
void pfree(void *pointer)
Definition: mcxt.c:1306
void * palloc(Size size)
Definition: mcxt.c:1199
Datum nameout(PG_FUNCTION_ARGS)
Definition: name.c:71
Datum namein(PG_FUNCTION_ARGS)
Definition: name.c:48
Oid exprType(const Node *expr)
Definition: nodeFuncs.c:43
static bool is_opclause(const void *clause)
Definition: nodeFuncs.h:74
static bool is_funcclause(const void *clause)
Definition: nodeFuncs.h:67
#define IsA(nodeptr, _type_)
Definition: nodes.h:162
double Selectivity
Definition: nodes.h:244
Oid oprid(Operator op)
Definition: parse_oper.c:250
#define NAMEDATALEN
const void size_t len
static int list_length(const List *l)
Definition: pg_list.h:150
#define NIL
Definition: pg_list.h:66
#define list_make1(x1)
Definition: pg_list.h:210
#define linitial(l)
Definition: pg_list.h:176
#define lsecond(l)
Definition: pg_list.h:181
bool lc_collate_is_c(Oid collation)
Definition: pg_locale.c:1299
pg_locale_t pg_newlocale_from_collation(Oid collid)
Definition: pg_locale.c:1481
bool lc_ctype_is_c(Oid collation)
Definition: pg_locale.c:1352
FormData_pg_statistic * Form_pg_statistic
Definition: pg_statistic.h:135
bool(* mbcharacter_incrementer)(unsigned char *mbstr, int len)
Definition: pg_wchar.h:376
void check_stack_depth(void)
Definition: postgres.c:3440
static bool DatumGetBool(Datum X)
Definition: postgres.h:438
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:670
static char * DatumGetCString(Datum X)
Definition: postgres.h:683
uintptr_t Datum
Definition: postgres.h:412
#define VARDATA(PTR)
Definition: postgres.h:316
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:660
#define VARDATA_ANY(PTR)
Definition: postgres.h:362
#define SET_VARSIZE(PTR, len)
Definition: postgres.h:343
static Datum CStringGetDatum(const char *X)
Definition: postgres.h:698
#define VARSIZE_ANY_EXHDR(PTR)
Definition: postgres.h:355
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
char * c
char * s1
char * regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation, bool *exact)
Definition: regexp.c:1939
bool get_restriction_variable(PlannerInfo *root, List *args, int varRelid, VariableStatData *vardata, Node **other, bool *varonleft)
Definition: selfuncs.c:4848
double var_eq_const(VariableStatData *vardata, Oid oproid, Oid collation, Datum constval, bool constisnull, bool varonleft, bool negate)
Definition: selfuncs.c:293
double mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc, Oid collation, Datum constval, bool varonleft, double *sumcommonp)
Definition: selfuncs.c:730
double ineq_histogram_selectivity(PlannerInfo *root, VariableStatData *vardata, Oid opoid, FmgrInfo *opproc, bool isgt, bool iseq, Oid collation, Datum constval, Oid consttype)
Definition: selfuncs.c:1039
double histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc, Oid collation, Datum constval, bool varonleft, int min_hist_size, int n_skip, int *hist_size)
Definition: selfuncs.c:821
#define ReleaseVariableStats(vardata)
Definition: selfuncs.h:99
#define CLAMP_PROBABILITY(p)
Definition: selfuncs.h:63
#define DEFAULT_MATCH_SEL
Definition: selfuncs.h:46
Oid constcollid
Definition: primnodes.h:261
Oid consttype
Definition: primnodes.h:259
int constlen
Definition: primnodes.h:262
Datum constvalue
Definition: primnodes.h:263
bool constisnull
Definition: primnodes.h:264
bool constbyval
Definition: primnodes.h:266
int32 consttypmod
Definition: primnodes.h:260
Definition: fmgr.h:57
List * args
Definition: primnodes.h:606
Oid inputcollid
Definition: primnodes.h:605
Definition: pg_list.h:52
Definition: nodes.h:112
List * args
Definition: primnodes.h:666
Oid inputcollid
Definition: primnodes.h:663
struct PlannerInfo * root
Definition: supportnodes.h:96
HeapTuple statsTuple
Definition: selfuncs.h:89
Definition: c.h:623
int varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
Definition: varlena.c:1518
Datum byteain(PG_FUNCTION_ARGS)
Definition: varlena.c:295
#define isalpha_l
Definition: win32_port.h:426