PostgreSQL Source Code  git master
like_support.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * like_support.c
4  * Planner support functions for LIKE, regex, and related operators.
5  *
6  * These routines handle special optimization of operators that can be
7  * used with index scans even though they are not known to the executor's
8  * indexscan machinery. The key idea is that these operators allow us
9  * to derive approximate indexscan qual clauses, such that any tuples
10  * that pass the operator clause itself must also satisfy the simpler
11  * indexscan condition(s). Then we can use the indexscan machinery
12  * to avoid scanning as much of the table as we'd otherwise have to,
13  * while applying the original operator as a qpqual condition to ensure
14  * we deliver only the tuples we want. (In essence, we're using a regular
15  * index as if it were a lossy index.)
16  *
17  * An example of what we're doing is
18  * textfield LIKE 'abc%def'
19  * from which we can generate the indexscanable conditions
20  * textfield >= 'abc' AND textfield < 'abd'
21  * which allow efficient scanning of an index on textfield.
22  * (In reality, character set and collation issues make the transformation
23  * from LIKE to indexscan limits rather harder than one might think ...
24  * but that's the basic idea.)
25  *
26  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
27  * Portions Copyright (c) 1994, Regents of the University of California
28  *
29  *
30  * IDENTIFICATION
31  * src/backend/utils/adt/like_support.c
32  *
33  *-------------------------------------------------------------------------
34  */
35 #include "postgres.h"
36 
37 #include <math.h>
38 
39 #include "access/htup_details.h"
40 #include "catalog/pg_collation.h"
41 #include "catalog/pg_operator.h"
42 #include "catalog/pg_opfamily.h"
43 #include "catalog/pg_statistic.h"
44 #include "catalog/pg_type.h"
45 #include "mb/pg_wchar.h"
46 #include "miscadmin.h"
47 #include "nodes/makefuncs.h"
48 #include "nodes/nodeFuncs.h"
49 #include "nodes/supportnodes.h"
50 #include "utils/builtins.h"
51 #include "utils/datum.h"
52 #include "utils/lsyscache.h"
53 #include "utils/pg_locale.h"
54 #include "utils/selfuncs.h"
55 #include "utils/varlena.h"
56 
57 
58 typedef enum
59 {
65 } Pattern_Type;
66 
67 typedef enum
68 {
71 
72 static Node *like_regex_support(Node *rawreq, Pattern_Type ptype);
73 static List *match_pattern_prefix(Node *leftop,
74  Node *rightop,
75  Pattern_Type ptype,
76  Oid expr_coll,
77  Oid opfamily,
78  Oid indexcollation);
79 static double patternsel_common(PlannerInfo *root,
80  Oid oprid,
81  Oid opfuncid,
82  List *args,
83  int varRelid,
84  Oid collation,
85  Pattern_Type ptype,
86  bool negate);
88  Pattern_Type ptype,
89  Oid collation,
90  Const **prefix,
91  Selectivity *rest_selec);
93  VariableStatData *vardata,
94  Oid eqopr, Oid ltopr, Oid geopr,
95  Oid collation,
96  Const *prefixcon);
97 static Selectivity like_selectivity(const char *patt, int pattlen,
98  bool case_insensitive);
99 static Selectivity regex_selectivity(const char *patt, int pattlen,
100  bool case_insensitive,
101  int fixed_prefix_len);
102 static int pattern_char_isalpha(char c, bool is_multibyte,
103  pg_locale_t locale, bool locale_is_c);
104 static Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc,
105  Oid collation);
106 static Datum string_to_datum(const char *str, Oid datatype);
107 static Const *string_to_const(const char *str, Oid datatype);
108 static Const *string_to_bytea_const(const char *str, size_t str_len);
109 
110 
111 /*
112  * Planner support functions for LIKE, regex, and related operators
113  */
114 Datum
116 {
117  Node *rawreq = (Node *) PG_GETARG_POINTER(0);
118 
120 }
121 
122 Datum
124 {
125  Node *rawreq = (Node *) PG_GETARG_POINTER(0);
126 
128 }
129 
130 Datum
132 {
133  Node *rawreq = (Node *) PG_GETARG_POINTER(0);
134 
136 }
137 
138 Datum
140 {
141  Node *rawreq = (Node *) PG_GETARG_POINTER(0);
142 
144 }
145 
146 Datum
148 {
149  Node *rawreq = (Node *) PG_GETARG_POINTER(0);
150 
152 }
153 
154 /* Common code for the above */
155 static Node *
157 {
158  Node *ret = NULL;
159 
160  if (IsA(rawreq, SupportRequestSelectivity))
161  {
162  /*
163  * Make a selectivity estimate for a function call, just as we'd do if
164  * the call was via the corresponding operator.
165  */
167  Selectivity s1;
168 
169  if (req->is_join)
170  {
171  /*
172  * For the moment we just punt. If patternjoinsel is ever
173  * improved to do better, this should be made to call it.
174  */
176  }
177  else
178  {
179  /* Share code with operator restriction selectivity functions */
180  s1 = patternsel_common(req->root,
181  InvalidOid,
182  req->funcid,
183  req->args,
184  req->varRelid,
185  req->inputcollid,
186  ptype,
187  false);
188  }
189  req->selectivity = s1;
190  ret = (Node *) req;
191  }
192  else if (IsA(rawreq, SupportRequestIndexCondition))
193  {
194  /* Try to convert operator/function call to index conditions */
196 
197  /*
198  * Currently we have no "reverse" match operators with the pattern on
199  * the left, so we only need consider cases with the indexkey on the
200  * left.
201  */
202  if (req->indexarg != 0)
203  return NULL;
204 
205  if (is_opclause(req->node))
206  {
207  OpExpr *clause = (OpExpr *) req->node;
208 
209  Assert(list_length(clause->args) == 2);
210  ret = (Node *)
211  match_pattern_prefix((Node *) linitial(clause->args),
212  (Node *) lsecond(clause->args),
213  ptype,
214  clause->inputcollid,
215  req->opfamily,
216  req->indexcollation);
217  }
218  else if (is_funcclause(req->node)) /* be paranoid */
219  {
220  FuncExpr *clause = (FuncExpr *) req->node;
221 
222  Assert(list_length(clause->args) == 2);
223  ret = (Node *)
224  match_pattern_prefix((Node *) linitial(clause->args),
225  (Node *) lsecond(clause->args),
226  ptype,
227  clause->inputcollid,
228  req->opfamily,
229  req->indexcollation);
230  }
231  }
232 
233  return ret;
234 }
235 
236 /*
237  * match_pattern_prefix
238  * Try to generate an indexqual for a LIKE or regex operator.
239  */
240 static List *
242  Node *rightop,
243  Pattern_Type ptype,
244  Oid expr_coll,
245  Oid opfamily,
246  Oid indexcollation)
247 {
248  List *result;
249  Const *patt;
250  Const *prefix;
251  Pattern_Prefix_Status pstatus;
252  Oid ldatatype;
253  Oid rdatatype;
254  Oid eqopr;
255  Oid ltopr;
256  Oid geopr;
257  Oid preopr = InvalidOid;
258  bool collation_aware;
259  Expr *expr;
260  FmgrInfo ltproc;
261  Const *greaterstr;
262 
263  /*
264  * Can't do anything with a non-constant or NULL pattern argument.
265  *
266  * Note that since we restrict ourselves to cases with a hard constant on
267  * the RHS, it's a-fortiori a pseudoconstant, and we don't need to worry
268  * about verifying that.
269  */
270  if (!IsA(rightop, Const) ||
271  ((Const *) rightop)->constisnull)
272  return NIL;
273  patt = (Const *) rightop;
274 
275  /*
276  * Not supported if the expression collation is nondeterministic. The
277  * optimized equality or prefix tests use bytewise comparisons, which is
278  * not consistent with nondeterministic collations. The actual
279  * pattern-matching implementation functions will later error out that
280  * pattern-matching is not supported with nondeterministic collations. (We
281  * could also error out here, but by doing it later we get more precise
282  * error messages.) (It should be possible to support at least
283  * Pattern_Prefix_Exact, but no point as long as the actual
284  * pattern-matching implementations don't support it.)
285  *
286  * expr_coll is not set for a non-collation-aware data type such as bytea.
287  */
288  if (expr_coll && !get_collation_isdeterministic(expr_coll))
289  return NIL;
290 
291  /*
292  * Try to extract a fixed prefix from the pattern.
293  */
294  pstatus = pattern_fixed_prefix(patt, ptype, expr_coll,
295  &prefix, NULL);
296 
297  /* fail if no fixed prefix */
298  if (pstatus == Pattern_Prefix_None)
299  return NIL;
300 
301  /*
302  * Identify the operators we want to use, based on the type of the
303  * left-hand argument. Usually these are just the type's regular
304  * comparison operators, but if we are considering one of the semi-legacy
305  * "pattern" opclasses, use the "pattern" operators instead. Those are
306  * not collation-sensitive but always use C collation, as we want. The
307  * selected operators also determine the needed type of the prefix
308  * constant.
309  */
310  ldatatype = exprType(leftop);
311  switch (ldatatype)
312  {
313  case TEXTOID:
314  if (opfamily == TEXT_PATTERN_BTREE_FAM_OID)
315  {
316  eqopr = TextEqualOperator;
317  ltopr = TextPatternLessOperator;
318  geopr = TextPatternGreaterEqualOperator;
319  collation_aware = false;
320  }
321  else if (opfamily == TEXT_SPGIST_FAM_OID)
322  {
323  eqopr = TextEqualOperator;
324  ltopr = TextPatternLessOperator;
325  geopr = TextPatternGreaterEqualOperator;
326  /* This opfamily has direct support for prefixing */
327  preopr = TextPrefixOperator;
328  collation_aware = false;
329  }
330  else
331  {
332  eqopr = TextEqualOperator;
333  ltopr = TextLessOperator;
334  geopr = TextGreaterEqualOperator;
335  collation_aware = true;
336  }
337  rdatatype = TEXTOID;
338  break;
339  case NAMEOID:
340 
341  /*
342  * Note that here, we need the RHS type to be text, so that the
343  * comparison value isn't improperly truncated to NAMEDATALEN.
344  */
345  eqopr = NameEqualTextOperator;
346  ltopr = NameLessTextOperator;
347  geopr = NameGreaterEqualTextOperator;
348  collation_aware = true;
349  rdatatype = TEXTOID;
350  break;
351  case BPCHAROID:
352  if (opfamily == BPCHAR_PATTERN_BTREE_FAM_OID)
353  {
354  eqopr = BpcharEqualOperator;
355  ltopr = BpcharPatternLessOperator;
356  geopr = BpcharPatternGreaterEqualOperator;
357  collation_aware = false;
358  }
359  else
360  {
361  eqopr = BpcharEqualOperator;
362  ltopr = BpcharLessOperator;
363  geopr = BpcharGreaterEqualOperator;
364  collation_aware = true;
365  }
366  rdatatype = BPCHAROID;
367  break;
368  case BYTEAOID:
369  eqopr = ByteaEqualOperator;
370  ltopr = ByteaLessOperator;
371  geopr = ByteaGreaterEqualOperator;
372  collation_aware = false;
373  rdatatype = BYTEAOID;
374  break;
375  default:
376  /* Can't get here unless we're attached to the wrong operator */
377  return NIL;
378  }
379 
380  /*
381  * If necessary, coerce the prefix constant to the right type. The given
382  * prefix constant is either text or bytea type, therefore the only case
383  * where we need to do anything is when converting text to bpchar. Those
384  * two types are binary-compatible, so relabeling the Const node is
385  * sufficient.
386  */
387  if (prefix->consttype != rdatatype)
388  {
389  Assert(prefix->consttype == TEXTOID &&
390  rdatatype == BPCHAROID);
391  prefix->consttype = rdatatype;
392  }
393 
394  /*
395  * If we found an exact-match pattern, generate an "=" indexqual.
396  *
397  * Here and below, check to see whether the desired operator is actually
398  * supported by the index opclass, and fail quietly if not. This allows
399  * us to not be concerned with specific opclasses (except for the legacy
400  * "pattern" cases); any index that correctly implements the operators
401  * will work.
402  */
403  if (pstatus == Pattern_Prefix_Exact)
404  {
405  if (!op_in_opfamily(eqopr, opfamily))
406  return NIL;
407  expr = make_opclause(eqopr, BOOLOID, false,
408  (Expr *) leftop, (Expr *) prefix,
409  InvalidOid, indexcollation);
410  result = list_make1(expr);
411  return result;
412  }
413 
414  /*
415  * Otherwise, we have a nonempty required prefix of the values. Some
416  * opclasses support prefix checks directly, otherwise we'll try to
417  * generate a range constraint.
418  */
419  if (OidIsValid(preopr) && op_in_opfamily(preopr, opfamily))
420  {
421  expr = make_opclause(preopr, BOOLOID, false,
422  (Expr *) leftop, (Expr *) prefix,
423  InvalidOid, indexcollation);
424  result = list_make1(expr);
425  return result;
426  }
427 
428  /*
429  * Since we need a range constraint, it's only going to work reliably if
430  * the index is collation-insensitive or has "C" collation. Note that
431  * here we are looking at the index's collation, not the expression's
432  * collation -- this test is *not* dependent on the LIKE/regex operator's
433  * collation.
434  */
435  if (collation_aware &&
436  !lc_collate_is_c(indexcollation))
437  return NIL;
438 
439  /*
440  * We can always say "x >= prefix".
441  */
442  if (!op_in_opfamily(geopr, opfamily))
443  return NIL;
444  expr = make_opclause(geopr, BOOLOID, false,
445  (Expr *) leftop, (Expr *) prefix,
446  InvalidOid, indexcollation);
447  result = list_make1(expr);
448 
449  /*-------
450  * If we can create a string larger than the prefix, we can say
451  * "x < greaterstr". NB: we rely on make_greater_string() to generate
452  * a guaranteed-greater string, not just a probably-greater string.
453  * In general this is only guaranteed in C locale, so we'd better be
454  * using a C-locale index collation.
455  *-------
456  */
457  if (!op_in_opfamily(ltopr, opfamily))
458  return result;
459  fmgr_info(get_opcode(ltopr), &ltproc);
460  greaterstr = make_greater_string(prefix, &ltproc, indexcollation);
461  if (greaterstr)
462  {
463  expr = make_opclause(ltopr, BOOLOID, false,
464  (Expr *) leftop, (Expr *) greaterstr,
465  InvalidOid, indexcollation);
466  result = lappend(result, expr);
467  }
468 
469  return result;
470 }
471 
472 
473 /*
474  * patternsel_common - generic code for pattern-match restriction selectivity.
475  *
476  * To support using this from either the operator or function paths, caller
477  * may pass either operator OID or underlying function OID; we look up the
478  * latter from the former if needed. (We could just have patternsel() call
479  * get_opcode(), but the work would be wasted if we don't have a need to
480  * compare a fixed prefix to the pg_statistic data.)
481  *
482  * Note that oprid and/or opfuncid should be for the positive-match operator
483  * even when negate is true.
484  */
485 static double
487  Oid oprid,
488  Oid opfuncid,
489  List *args,
490  int varRelid,
491  Oid collation,
492  Pattern_Type ptype,
493  bool negate)
494 {
495  VariableStatData vardata;
496  Node *other;
497  bool varonleft;
498  Datum constval;
499  Oid consttype;
500  Oid vartype;
501  Oid rdatatype;
502  Oid eqopr;
503  Oid ltopr;
504  Oid geopr;
505  Pattern_Prefix_Status pstatus;
506  Const *patt;
507  Const *prefix = NULL;
508  Selectivity rest_selec = 0;
509  double nullfrac = 0.0;
510  double result;
511 
512  /*
513  * Initialize result to the appropriate default estimate depending on
514  * whether it's a match or not-match operator.
515  */
516  if (negate)
517  result = 1.0 - DEFAULT_MATCH_SEL;
518  else
519  result = DEFAULT_MATCH_SEL;
520 
521  /*
522  * If expression is not variable op constant, then punt and return the
523  * default estimate.
524  */
525  if (!get_restriction_variable(root, args, varRelid,
526  &vardata, &other, &varonleft))
527  return result;
528  if (!varonleft || !IsA(other, Const))
529  {
530  ReleaseVariableStats(vardata);
531  return result;
532  }
533 
534  /*
535  * If the constant is NULL, assume operator is strict and return zero, ie,
536  * operator will never return TRUE. (It's zero even for a negator op.)
537  */
538  if (((Const *) other)->constisnull)
539  {
540  ReleaseVariableStats(vardata);
541  return 0.0;
542  }
543  constval = ((Const *) other)->constvalue;
544  consttype = ((Const *) other)->consttype;
545 
546  /*
547  * The right-hand const is type text or bytea for all supported operators.
548  * We do not expect to see binary-compatible types here, since
549  * const-folding should have relabeled the const to exactly match the
550  * operator's declared type.
551  */
552  if (consttype != TEXTOID && consttype != BYTEAOID)
553  {
554  ReleaseVariableStats(vardata);
555  return result;
556  }
557 
558  /*
559  * Similarly, the exposed type of the left-hand side should be one of
560  * those we know. (Do not look at vardata.atttype, which might be
561  * something binary-compatible but different.) We can use it to identify
562  * the comparison operators and the required type of the comparison
563  * constant, much as in match_pattern_prefix().
564  */
565  vartype = vardata.vartype;
566 
567  switch (vartype)
568  {
569  case TEXTOID:
570  eqopr = TextEqualOperator;
571  ltopr = TextLessOperator;
572  geopr = TextGreaterEqualOperator;
573  rdatatype = TEXTOID;
574  break;
575  case NAMEOID:
576 
577  /*
578  * Note that here, we need the RHS type to be text, so that the
579  * comparison value isn't improperly truncated to NAMEDATALEN.
580  */
581  eqopr = NameEqualTextOperator;
582  ltopr = NameLessTextOperator;
583  geopr = NameGreaterEqualTextOperator;
584  rdatatype = TEXTOID;
585  break;
586  case BPCHAROID:
587  eqopr = BpcharEqualOperator;
588  ltopr = BpcharLessOperator;
589  geopr = BpcharGreaterEqualOperator;
590  rdatatype = BPCHAROID;
591  break;
592  case BYTEAOID:
593  eqopr = ByteaEqualOperator;
594  ltopr = ByteaLessOperator;
595  geopr = ByteaGreaterEqualOperator;
596  rdatatype = BYTEAOID;
597  break;
598  default:
599  /* Can't get here unless we're attached to the wrong operator */
600  ReleaseVariableStats(vardata);
601  return result;
602  }
603 
604  /*
605  * Grab the nullfrac for use below.
606  */
607  if (HeapTupleIsValid(vardata.statsTuple))
608  {
609  Form_pg_statistic stats;
610 
611  stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
612  nullfrac = stats->stanullfrac;
613  }
614 
615  /*
616  * Pull out any fixed prefix implied by the pattern, and estimate the
617  * fractional selectivity of the remainder of the pattern. Unlike many
618  * other selectivity estimators, we use the pattern operator's actual
619  * collation for this step. This is not because we expect the collation
620  * to make a big difference in the selectivity estimate (it seldom would),
621  * but because we want to be sure we cache compiled regexps under the
622  * right cache key, so that they can be re-used at runtime.
623  */
624  patt = (Const *) other;
625  pstatus = pattern_fixed_prefix(patt, ptype, collation,
626  &prefix, &rest_selec);
627 
628  /*
629  * If necessary, coerce the prefix constant to the right type. The only
630  * case where we need to do anything is when converting text to bpchar.
631  * Those two types are binary-compatible, so relabeling the Const node is
632  * sufficient.
633  */
634  if (prefix && prefix->consttype != rdatatype)
635  {
636  Assert(prefix->consttype == TEXTOID &&
637  rdatatype == BPCHAROID);
638  prefix->consttype = rdatatype;
639  }
640 
641  if (pstatus == Pattern_Prefix_Exact)
642  {
643  /*
644  * Pattern specifies an exact match, so estimate as for '='
645  */
646  result = var_eq_const(&vardata, eqopr, collation, prefix->constvalue,
647  false, true, false);
648  }
649  else
650  {
651  /*
652  * Not exact-match pattern. If we have a sufficiently large
653  * histogram, estimate selectivity for the histogram part of the
654  * population by counting matches in the histogram. If not, estimate
655  * selectivity of the fixed prefix and remainder of pattern
656  * separately, then combine the two to get an estimate of the
657  * selectivity for the part of the column population represented by
658  * the histogram. (For small histograms, we combine these
659  * approaches.)
660  *
661  * We then add up data for any most-common-values values; these are
662  * not in the histogram population, and we can get exact answers for
663  * them by applying the pattern operator, so there's no reason to
664  * approximate. (If the MCVs cover a significant part of the total
665  * population, this gives us a big leg up in accuracy.)
666  */
667  Selectivity selec;
668  int hist_size;
669  FmgrInfo opproc;
670  double mcv_selec,
671  sumcommon;
672 
673  /* Try to use the histogram entries to get selectivity */
674  if (!OidIsValid(opfuncid))
675  opfuncid = get_opcode(oprid);
676  fmgr_info(opfuncid, &opproc);
677 
678  selec = histogram_selectivity(&vardata, &opproc, collation,
679  constval, true,
680  10, 1, &hist_size);
681 
682  /* If not at least 100 entries, use the heuristic method */
683  if (hist_size < 100)
684  {
685  Selectivity heursel;
687 
688  if (pstatus == Pattern_Prefix_Partial)
689  prefixsel = prefix_selectivity(root, &vardata,
690  eqopr, ltopr, geopr,
691  collation,
692  prefix);
693  else
694  prefixsel = 1.0;
695  heursel = prefixsel * rest_selec;
696 
697  if (selec < 0) /* fewer than 10 histogram entries? */
698  selec = heursel;
699  else
700  {
701  /*
702  * For histogram sizes from 10 to 100, we combine the
703  * histogram and heuristic selectivities, putting increasingly
704  * more trust in the histogram for larger sizes.
705  */
706  double hist_weight = hist_size / 100.0;
707 
708  selec = selec * hist_weight + heursel * (1.0 - hist_weight);
709  }
710  }
711 
712  /* In any case, don't believe extremely small or large estimates. */
713  if (selec < 0.0001)
714  selec = 0.0001;
715  else if (selec > 0.9999)
716  selec = 0.9999;
717 
718  /*
719  * If we have most-common-values info, add up the fractions of the MCV
720  * entries that satisfy MCV OP PATTERN. These fractions contribute
721  * directly to the result selectivity. Also add up the total fraction
722  * represented by MCV entries.
723  */
724  mcv_selec = mcv_selectivity(&vardata, &opproc, collation,
725  constval, true,
726  &sumcommon);
727 
728  /*
729  * Now merge the results from the MCV and histogram calculations,
730  * realizing that the histogram covers only the non-null values that
731  * are not listed in MCV.
732  */
733  selec *= 1.0 - nullfrac - sumcommon;
734  selec += mcv_selec;
735  result = selec;
736  }
737 
738  /* now adjust if we wanted not-match rather than match */
739  if (negate)
740  result = 1.0 - result - nullfrac;
741 
742  /* result should be in range, but make sure... */
743  CLAMP_PROBABILITY(result);
744 
745  if (prefix)
746  {
747  pfree(DatumGetPointer(prefix->constvalue));
748  pfree(prefix);
749  }
750 
751  ReleaseVariableStats(vardata);
752 
753  return result;
754 }
755 
756 /*
757  * Fix impedance mismatch between SQL-callable functions and patternsel_common
758  */
759 static double
761 {
763  Oid operator = PG_GETARG_OID(1);
764  List *args = (List *) PG_GETARG_POINTER(2);
765  int varRelid = PG_GETARG_INT32(3);
766  Oid collation = PG_GET_COLLATION();
767 
768  /*
769  * If this is for a NOT LIKE or similar operator, get the corresponding
770  * positive-match operator and work with that.
771  */
772  if (negate)
773  {
774  operator = get_negator(operator);
775  if (!OidIsValid(operator))
776  elog(ERROR, "patternsel called for operator without a negator");
777  }
778 
779  return patternsel_common(root,
780  operator,
781  InvalidOid,
782  args,
783  varRelid,
784  collation,
785  ptype,
786  negate);
787 }
788 
789 /*
790  * regexeqsel - Selectivity of regular-expression pattern match.
791  */
792 Datum
794 {
796 }
797 
798 /*
799  * icregexeqsel - Selectivity of case-insensitive regex match.
800  */
801 Datum
803 {
805 }
806 
807 /*
808  * likesel - Selectivity of LIKE pattern match.
809  */
810 Datum
812 {
814 }
815 
816 /*
817  * prefixsel - selectivity of prefix operator
818  */
819 Datum
821 {
823 }
824 
825 /*
826  *
827  * iclikesel - Selectivity of ILIKE pattern match.
828  */
829 Datum
831 {
833 }
834 
835 /*
836  * regexnesel - Selectivity of regular-expression pattern non-match.
837  */
838 Datum
840 {
842 }
843 
844 /*
845  * icregexnesel - Selectivity of case-insensitive regex non-match.
846  */
847 Datum
849 {
851 }
852 
853 /*
854  * nlikesel - Selectivity of LIKE pattern non-match.
855  */
856 Datum
858 {
860 }
861 
862 /*
863  * icnlikesel - Selectivity of ILIKE pattern non-match.
864  */
865 Datum
867 {
869 }
870 
871 /*
872  * patternjoinsel - Generic code for pattern-match join selectivity.
873  */
874 static double
876 {
877  /* For the moment we just punt. */
878  return negate ? (1.0 - DEFAULT_MATCH_SEL) : DEFAULT_MATCH_SEL;
879 }
880 
881 /*
882  * regexeqjoinsel - Join selectivity of regular-expression pattern match.
883  */
884 Datum
886 {
888 }
889 
890 /*
891  * icregexeqjoinsel - Join selectivity of case-insensitive regex match.
892  */
893 Datum
895 {
897 }
898 
899 /*
900  * likejoinsel - Join selectivity of LIKE pattern match.
901  */
902 Datum
904 {
906 }
907 
908 /*
909  * prefixjoinsel - Join selectivity of prefix operator
910  */
911 Datum
913 {
915 }
916 
917 /*
918  * iclikejoinsel - Join selectivity of ILIKE pattern match.
919  */
920 Datum
922 {
924 }
925 
926 /*
927  * regexnejoinsel - Join selectivity of regex non-match.
928  */
929 Datum
931 {
933 }
934 
935 /*
936  * icregexnejoinsel - Join selectivity of case-insensitive regex non-match.
937  */
938 Datum
940 {
942 }
943 
944 /*
945  * nlikejoinsel - Join selectivity of LIKE pattern non-match.
946  */
947 Datum
949 {
951 }
952 
953 /*
954  * icnlikejoinsel - Join selectivity of ILIKE pattern non-match.
955  */
956 Datum
958 {
960 }
961 
962 
963 /*-------------------------------------------------------------------------
964  *
965  * Pattern analysis functions
966  *
967  * These routines support analysis of LIKE and regular-expression patterns
968  * by the planner/optimizer. It's important that they agree with the
969  * regular-expression code in backend/regex/ and the LIKE code in
970  * backend/utils/adt/like.c. Also, the computation of the fixed prefix
971  * must be conservative: if we report a string longer than the true fixed
972  * prefix, the query may produce actually wrong answers, rather than just
973  * getting a bad selectivity estimate!
974  *
975  *-------------------------------------------------------------------------
976  */
977 
978 /*
979  * Extract the fixed prefix, if any, for a pattern.
980  *
981  * *prefix is set to a palloc'd prefix string (in the form of a Const node),
982  * or to NULL if no fixed prefix exists for the pattern.
983  * If rest_selec is not NULL, *rest_selec is set to an estimate of the
984  * selectivity of the remainder of the pattern (without any fixed prefix).
985  * The prefix Const has the same type (TEXT or BYTEA) as the input pattern.
986  *
987  * The return value distinguishes no fixed prefix, a partial prefix,
988  * or an exact-match-only pattern.
989  */
990 
992 like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
993  Const **prefix_const, Selectivity *rest_selec)
994 {
995  char *match;
996  char *patt;
997  int pattlen;
998  Oid typeid = patt_const->consttype;
999  int pos,
1000  match_pos;
1001  bool is_multibyte = (pg_database_encoding_max_length() > 1);
1002  pg_locale_t locale = 0;
1003  bool locale_is_c = false;
1004 
1005  /* the right-hand const is type text or bytea */
1006  Assert(typeid == BYTEAOID || typeid == TEXTOID);
1007 
1008  if (case_insensitive)
1009  {
1010  if (typeid == BYTEAOID)
1011  ereport(ERROR,
1012  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1013  errmsg("case insensitive matching not supported on type bytea")));
1014 
1015  if (!OidIsValid(collation))
1016  {
1017  /*
1018  * This typically means that the parser could not resolve a
1019  * conflict of implicit collations, so report it that way.
1020  */
1021  ereport(ERROR,
1022  (errcode(ERRCODE_INDETERMINATE_COLLATION),
1023  errmsg("could not determine which collation to use for ILIKE"),
1024  errhint("Use the COLLATE clause to set the collation explicitly.")));
1025  }
1026 
1027  /* If case-insensitive, we need locale info */
1028  if (lc_ctype_is_c(collation))
1029  locale_is_c = true;
1030  else
1031  locale = pg_newlocale_from_collation(collation);
1032  }
1033 
1034  if (typeid != BYTEAOID)
1035  {
1036  patt = TextDatumGetCString(patt_const->constvalue);
1037  pattlen = strlen(patt);
1038  }
1039  else
1040  {
1041  bytea *bstr = DatumGetByteaPP(patt_const->constvalue);
1042 
1043  pattlen = VARSIZE_ANY_EXHDR(bstr);
1044  patt = (char *) palloc(pattlen);
1045  memcpy(patt, VARDATA_ANY(bstr), pattlen);
1046  Assert((Pointer) bstr == DatumGetPointer(patt_const->constvalue));
1047  }
1048 
1049  match = palloc(pattlen + 1);
1050  match_pos = 0;
1051  for (pos = 0; pos < pattlen; pos++)
1052  {
1053  /* % and _ are wildcard characters in LIKE */
1054  if (patt[pos] == '%' ||
1055  patt[pos] == '_')
1056  break;
1057 
1058  /* Backslash escapes the next character */
1059  if (patt[pos] == '\\')
1060  {
1061  pos++;
1062  if (pos >= pattlen)
1063  break;
1064  }
1065 
1066  /* Stop if case-varying character (it's sort of a wildcard) */
1067  if (case_insensitive &&
1068  pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c))
1069  break;
1070 
1071  match[match_pos++] = patt[pos];
1072  }
1073 
1074  match[match_pos] = '\0';
1075 
1076  if (typeid != BYTEAOID)
1077  *prefix_const = string_to_const(match, typeid);
1078  else
1079  *prefix_const = string_to_bytea_const(match, match_pos);
1080 
1081  if (rest_selec != NULL)
1082  *rest_selec = like_selectivity(&patt[pos], pattlen - pos,
1083  case_insensitive);
1084 
1085  pfree(patt);
1086  pfree(match);
1087 
1088  /* in LIKE, an empty pattern is an exact match! */
1089  if (pos == pattlen)
1090  return Pattern_Prefix_Exact; /* reached end of pattern, so exact */
1091 
1092  if (match_pos > 0)
1093  return Pattern_Prefix_Partial;
1094 
1095  return Pattern_Prefix_None;
1096 }
1097 
1098 static Pattern_Prefix_Status
1099 regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
1100  Const **prefix_const, Selectivity *rest_selec)
1101 {
1102  Oid typeid = patt_const->consttype;
1103  char *prefix;
1104  bool exact;
1105 
1106  /*
1107  * Should be unnecessary, there are no bytea regex operators defined. As
1108  * such, it should be noted that the rest of this function has *not* been
1109  * made safe for binary (possibly NULL containing) strings.
1110  */
1111  if (typeid == BYTEAOID)
1112  ereport(ERROR,
1113  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1114  errmsg("regular-expression matching not supported on type bytea")));
1115 
1116  /* Use the regexp machinery to extract the prefix, if any */
1117  prefix = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue),
1118  case_insensitive, collation,
1119  &exact);
1120 
1121  if (prefix == NULL)
1122  {
1123  *prefix_const = NULL;
1124 
1125  if (rest_selec != NULL)
1126  {
1127  char *patt = TextDatumGetCString(patt_const->constvalue);
1128 
1129  *rest_selec = regex_selectivity(patt, strlen(patt),
1130  case_insensitive,
1131  0);
1132  pfree(patt);
1133  }
1134 
1135  return Pattern_Prefix_None;
1136  }
1137 
1138  *prefix_const = string_to_const(prefix, typeid);
1139 
1140  if (rest_selec != NULL)
1141  {
1142  if (exact)
1143  {
1144  /* Exact match, so there's no additional selectivity */
1145  *rest_selec = 1.0;
1146  }
1147  else
1148  {
1149  char *patt = TextDatumGetCString(patt_const->constvalue);
1150 
1151  *rest_selec = regex_selectivity(patt, strlen(patt),
1152  case_insensitive,
1153  strlen(prefix));
1154  pfree(patt);
1155  }
1156  }
1157 
1158  pfree(prefix);
1159 
1160  if (exact)
1161  return Pattern_Prefix_Exact; /* pattern specifies exact match */
1162  else
1163  return Pattern_Prefix_Partial;
1164 }
1165 
1166 static Pattern_Prefix_Status
1167 pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation,
1168  Const **prefix, Selectivity *rest_selec)
1169 {
1170  Pattern_Prefix_Status result;
1171 
1172  switch (ptype)
1173  {
1174  case Pattern_Type_Like:
1175  result = like_fixed_prefix(patt, false, collation,
1176  prefix, rest_selec);
1177  break;
1178  case Pattern_Type_Like_IC:
1179  result = like_fixed_prefix(patt, true, collation,
1180  prefix, rest_selec);
1181  break;
1182  case Pattern_Type_Regex:
1183  result = regex_fixed_prefix(patt, false, collation,
1184  prefix, rest_selec);
1185  break;
1186  case Pattern_Type_Regex_IC:
1187  result = regex_fixed_prefix(patt, true, collation,
1188  prefix, rest_selec);
1189  break;
1190  case Pattern_Type_Prefix:
1191  /* Prefix type work is trivial. */
1192  result = Pattern_Prefix_Partial;
1193  *prefix = makeConst(patt->consttype,
1194  patt->consttypmod,
1195  patt->constcollid,
1196  patt->constlen,
1197  datumCopy(patt->constvalue,
1198  patt->constbyval,
1199  patt->constlen),
1200  patt->constisnull,
1201  patt->constbyval);
1202  if (rest_selec != NULL)
1203  *rest_selec = 1.0; /* all */
1204  break;
1205  default:
1206  elog(ERROR, "unrecognized ptype: %d", (int) ptype);
1207  result = Pattern_Prefix_None; /* keep compiler quiet */
1208  break;
1209  }
1210  return result;
1211 }
1212 
1213 /*
1214  * Estimate the selectivity of a fixed prefix for a pattern match.
1215  *
1216  * A fixed prefix "foo" is estimated as the selectivity of the expression
1217  * "variable >= 'foo' AND variable < 'fop'".
1218  *
1219  * The selectivity estimate is with respect to the portion of the column
1220  * population represented by the histogram --- the caller must fold this
1221  * together with info about MCVs and NULLs.
1222  *
1223  * We use the given comparison operators and collation to do the estimation.
1224  * The given variable and Const must be of the associated datatype(s).
1225  *
1226  * XXX Note: we make use of the upper bound to estimate operator selectivity
1227  * even if the locale is such that we cannot rely on the upper-bound string.
1228  * The selectivity only needs to be approximately right anyway, so it seems
1229  * more useful to use the upper-bound code than not.
1230  */
1231 static Selectivity
1233  Oid eqopr, Oid ltopr, Oid geopr,
1234  Oid collation,
1235  Const *prefixcon)
1236 {
1238  FmgrInfo opproc;
1239  Const *greaterstrcon;
1240  Selectivity eq_sel;
1241 
1242  /* Estimate the selectivity of "x >= prefix" */
1243  fmgr_info(get_opcode(geopr), &opproc);
1244 
1246  geopr, &opproc, true, true,
1247  collation,
1248  prefixcon->constvalue,
1249  prefixcon->consttype);
1250 
1251  if (prefixsel < 0.0)
1252  {
1253  /* No histogram is present ... return a suitable default estimate */
1254  return DEFAULT_MATCH_SEL;
1255  }
1256 
1257  /*
1258  * If we can create a string larger than the prefix, say "x < greaterstr".
1259  */
1260  fmgr_info(get_opcode(ltopr), &opproc);
1261  greaterstrcon = make_greater_string(prefixcon, &opproc, collation);
1262  if (greaterstrcon)
1263  {
1264  Selectivity topsel;
1265 
1266  topsel = ineq_histogram_selectivity(root, vardata,
1267  ltopr, &opproc, false, false,
1268  collation,
1269  greaterstrcon->constvalue,
1270  greaterstrcon->consttype);
1271 
1272  /* ineq_histogram_selectivity worked before, it shouldn't fail now */
1273  Assert(topsel >= 0.0);
1274 
1275  /*
1276  * Merge the two selectivities in the same way as for a range query
1277  * (see clauselist_selectivity()). Note that we don't need to worry
1278  * about double-exclusion of nulls, since ineq_histogram_selectivity
1279  * doesn't count those anyway.
1280  */
1281  prefixsel = topsel + prefixsel - 1.0;
1282  }
1283 
1284  /*
1285  * If the prefix is long then the two bounding values might be too close
1286  * together for the histogram to distinguish them usefully, resulting in a
1287  * zero estimate (plus or minus roundoff error). To avoid returning a
1288  * ridiculously small estimate, compute the estimated selectivity for
1289  * "variable = 'foo'", and clamp to that. (Obviously, the resultant
1290  * estimate should be at least that.)
1291  *
1292  * We apply this even if we couldn't make a greater string. That case
1293  * suggests that the prefix is near the maximum possible, and thus
1294  * probably off the end of the histogram, and thus we probably got a very
1295  * small estimate from the >= condition; so we still need to clamp.
1296  */
1297  eq_sel = var_eq_const(vardata, eqopr, collation, prefixcon->constvalue,
1298  false, true, false);
1299 
1300  prefixsel = Max(prefixsel, eq_sel);
1301 
1302  return prefixsel;
1303 }
1304 
1305 
1306 /*
1307  * Estimate the selectivity of a pattern of the specified type.
1308  * Note that any fixed prefix of the pattern will have been removed already,
1309  * so actually we may be looking at just a fragment of the pattern.
1310  *
1311  * For now, we use a very simplistic approach: fixed characters reduce the
1312  * selectivity a good deal, character ranges reduce it a little,
1313  * wildcards (such as % for LIKE or .* for regex) increase it.
1314  */
1315 
1316 #define FIXED_CHAR_SEL 0.20 /* about 1/5 */
1317 #define CHAR_RANGE_SEL 0.25
1318 #define ANY_CHAR_SEL 0.9 /* not 1, since it won't match end-of-string */
1319 #define FULL_WILDCARD_SEL 5.0
1320 #define PARTIAL_WILDCARD_SEL 2.0
1321 
1322 static Selectivity
1323 like_selectivity(const char *patt, int pattlen, bool case_insensitive)
1324 {
1325  Selectivity sel = 1.0;
1326  int pos;
1327 
1328  /* Skip any leading wildcard; it's already factored into initial sel */
1329  for (pos = 0; pos < pattlen; pos++)
1330  {
1331  if (patt[pos] != '%' && patt[pos] != '_')
1332  break;
1333  }
1334 
1335  for (; pos < pattlen; pos++)
1336  {
1337  /* % and _ are wildcard characters in LIKE */
1338  if (patt[pos] == '%')
1339  sel *= FULL_WILDCARD_SEL;
1340  else if (patt[pos] == '_')
1341  sel *= ANY_CHAR_SEL;
1342  else if (patt[pos] == '\\')
1343  {
1344  /* Backslash quotes the next character */
1345  pos++;
1346  if (pos >= pattlen)
1347  break;
1348  sel *= FIXED_CHAR_SEL;
1349  }
1350  else
1351  sel *= FIXED_CHAR_SEL;
1352  }
1353  /* Could get sel > 1 if multiple wildcards */
1354  if (sel > 1.0)
1355  sel = 1.0;
1356  return sel;
1357 }
1358 
1359 static Selectivity
1360 regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive)
1361 {
1362  Selectivity sel = 1.0;
1363  int paren_depth = 0;
1364  int paren_pos = 0; /* dummy init to keep compiler quiet */
1365  int pos;
1366 
1367  /* since this function recurses, it could be driven to stack overflow */
1369 
1370  for (pos = 0; pos < pattlen; pos++)
1371  {
1372  if (patt[pos] == '(')
1373  {
1374  if (paren_depth == 0)
1375  paren_pos = pos; /* remember start of parenthesized item */
1376  paren_depth++;
1377  }
1378  else if (patt[pos] == ')' && paren_depth > 0)
1379  {
1380  paren_depth--;
1381  if (paren_depth == 0)
1382  sel *= regex_selectivity_sub(patt + (paren_pos + 1),
1383  pos - (paren_pos + 1),
1384  case_insensitive);
1385  }
1386  else if (patt[pos] == '|' && paren_depth == 0)
1387  {
1388  /*
1389  * If unquoted | is present at paren level 0 in pattern, we have
1390  * multiple alternatives; sum their probabilities.
1391  */
1392  sel += regex_selectivity_sub(patt + (pos + 1),
1393  pattlen - (pos + 1),
1394  case_insensitive);
1395  break; /* rest of pattern is now processed */
1396  }
1397  else if (patt[pos] == '[')
1398  {
1399  bool negclass = false;
1400 
1401  if (patt[++pos] == '^')
1402  {
1403  negclass = true;
1404  pos++;
1405  }
1406  if (patt[pos] == ']') /* ']' at start of class is not special */
1407  pos++;
1408  while (pos < pattlen && patt[pos] != ']')
1409  pos++;
1410  if (paren_depth == 0)
1411  sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
1412  }
1413  else if (patt[pos] == '.')
1414  {
1415  if (paren_depth == 0)
1416  sel *= ANY_CHAR_SEL;
1417  }
1418  else if (patt[pos] == '*' ||
1419  patt[pos] == '?' ||
1420  patt[pos] == '+')
1421  {
1422  /* Ought to be smarter about quantifiers... */
1423  if (paren_depth == 0)
1424  sel *= PARTIAL_WILDCARD_SEL;
1425  }
1426  else if (patt[pos] == '{')
1427  {
1428  while (pos < pattlen && patt[pos] != '}')
1429  pos++;
1430  if (paren_depth == 0)
1431  sel *= PARTIAL_WILDCARD_SEL;
1432  }
1433  else if (patt[pos] == '\\')
1434  {
1435  /* backslash quotes the next character */
1436  pos++;
1437  if (pos >= pattlen)
1438  break;
1439  if (paren_depth == 0)
1440  sel *= FIXED_CHAR_SEL;
1441  }
1442  else
1443  {
1444  if (paren_depth == 0)
1445  sel *= FIXED_CHAR_SEL;
1446  }
1447  }
1448  /* Could get sel > 1 if multiple wildcards */
1449  if (sel > 1.0)
1450  sel = 1.0;
1451  return sel;
1452 }
1453 
1454 static Selectivity
1455 regex_selectivity(const char *patt, int pattlen, bool case_insensitive,
1456  int fixed_prefix_len)
1457 {
1458  Selectivity sel;
1459 
1460  /* If patt doesn't end with $, consider it to have a trailing wildcard */
1461  if (pattlen > 0 && patt[pattlen - 1] == '$' &&
1462  (pattlen == 1 || patt[pattlen - 2] != '\\'))
1463  {
1464  /* has trailing $ */
1465  sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive);
1466  }
1467  else
1468  {
1469  /* no trailing $ */
1470  sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
1471  sel *= FULL_WILDCARD_SEL;
1472  }
1473 
1474  /*
1475  * If there's a fixed prefix, discount its selectivity. We have to be
1476  * careful here since a very long prefix could result in pow's result
1477  * underflowing to zero (in which case "sel" probably has as well).
1478  */
1479  if (fixed_prefix_len > 0)
1480  {
1481  double prefixsel = pow(FIXED_CHAR_SEL, fixed_prefix_len);
1482 
1483  if (prefixsel > 0.0)
1484  sel /= prefixsel;
1485  }
1486 
1487  /* Make sure result stays in range */
1488  CLAMP_PROBABILITY(sel);
1489  return sel;
1490 }
1491 
1492 /*
1493  * Check whether char is a letter (and, hence, subject to case-folding)
1494  *
1495  * In multibyte character sets or with ICU, we can't use isalpha, and it does
1496  * not seem worth trying to convert to wchar_t to use iswalpha or u_isalpha.
1497  * Instead, just assume any non-ASCII char is potentially case-varying, and
1498  * hard-wire knowledge of which ASCII chars are letters.
1499  */
1500 static int
1501 pattern_char_isalpha(char c, bool is_multibyte,
1502  pg_locale_t locale, bool locale_is_c)
1503 {
1504  if (locale_is_c)
1505  return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
1506  else if (is_multibyte && IS_HIGHBIT_SET(c))
1507  return true;
1508  else if (locale && locale->provider == COLLPROVIDER_ICU)
1509  return IS_HIGHBIT_SET(c) ||
1510  (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
1511  else if (locale && locale->provider == COLLPROVIDER_LIBC)
1512  return isalpha_l((unsigned char) c, locale->info.lt);
1513  else
1514  return isalpha((unsigned char) c);
1515 }
1516 
1517 
1518 /*
1519  * For bytea, the increment function need only increment the current byte
1520  * (there are no multibyte characters to worry about).
1521  */
1522 static bool
1523 byte_increment(unsigned char *ptr, int len)
1524 {
1525  if (*ptr >= 255)
1526  return false;
1527  (*ptr)++;
1528  return true;
1529 }
1530 
1531 /*
1532  * Try to generate a string greater than the given string or any
1533  * string it is a prefix of. If successful, return a palloc'd string
1534  * in the form of a Const node; else return NULL.
1535  *
1536  * The caller must provide the appropriate "less than" comparison function
1537  * for testing the strings, along with the collation to use.
1538  *
1539  * The key requirement here is that given a prefix string, say "foo",
1540  * we must be able to generate another string "fop" that is greater than
1541  * all strings "foobar" starting with "foo". We can test that we have
1542  * generated a string greater than the prefix string, but in non-C collations
1543  * that is not a bulletproof guarantee that an extension of the string might
1544  * not sort after it; an example is that "foo " is less than "foo!", but it
1545  * is not clear that a "dictionary" sort ordering will consider "foo!" less
1546  * than "foo bar". CAUTION: Therefore, this function should be used only for
1547  * estimation purposes when working in a non-C collation.
1548  *
1549  * To try to catch most cases where an extended string might otherwise sort
1550  * before the result value, we determine which of the strings "Z", "z", "y",
1551  * and "9" is seen as largest by the collation, and append that to the given
1552  * prefix before trying to find a string that compares as larger.
1553  *
1554  * To search for a greater string, we repeatedly "increment" the rightmost
1555  * character, using an encoding-specific character incrementer function.
1556  * When it's no longer possible to increment the last character, we truncate
1557  * off that character and start incrementing the next-to-rightmost.
1558  * For example, if "z" were the last character in the sort order, then we
1559  * could produce "foo" as a string greater than "fonz".
1560  *
1561  * This could be rather slow in the worst case, but in most cases we
1562  * won't have to try more than one or two strings before succeeding.
1563  *
1564  * Note that it's important for the character incrementer not to be too anal
1565  * about producing every possible character code, since in some cases the only
1566  * way to get a larger string is to increment a previous character position.
1567  * So we don't want to spend too much time trying every possible character
1568  * code at the last position. A good rule of thumb is to be sure that we
1569  * don't try more than 256*K values for a K-byte character (and definitely
1570  * not 256^K, which is what an exhaustive search would approach).
1571  */
1572 static Const *
1573 make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
1574 {
1575  Oid datatype = str_const->consttype;
1576  char *workstr;
1577  int len;
1578  Datum cmpstr;
1579  char *cmptxt = NULL;
1580  mbcharacter_incrementer charinc;
1581 
1582  /*
1583  * Get a modifiable copy of the prefix string in C-string format, and set
1584  * up the string we will compare to as a Datum. In C locale this can just
1585  * be the given prefix string, otherwise we need to add a suffix. Type
1586  * BYTEA sorts bytewise so it never needs a suffix either.
1587  */
1588  if (datatype == BYTEAOID)
1589  {
1590  bytea *bstr = DatumGetByteaPP(str_const->constvalue);
1591 
1592  len = VARSIZE_ANY_EXHDR(bstr);
1593  workstr = (char *) palloc(len);
1594  memcpy(workstr, VARDATA_ANY(bstr), len);
1595  Assert((Pointer) bstr == DatumGetPointer(str_const->constvalue));
1596  cmpstr = str_const->constvalue;
1597  }
1598  else
1599  {
1600  if (datatype == NAMEOID)
1602  str_const->constvalue));
1603  else
1604  workstr = TextDatumGetCString(str_const->constvalue);
1605  len = strlen(workstr);
1606  if (lc_collate_is_c(collation) || len == 0)
1607  cmpstr = str_const->constvalue;
1608  else
1609  {
1610  /* If first time through, determine the suffix to use */
1611  static char suffixchar = 0;
1612  static Oid suffixcollation = 0;
1613 
1614  if (!suffixchar || suffixcollation != collation)
1615  {
1616  char *best;
1617 
1618  best = "Z";
1619  if (varstr_cmp(best, 1, "z", 1, collation) < 0)
1620  best = "z";
1621  if (varstr_cmp(best, 1, "y", 1, collation) < 0)
1622  best = "y";
1623  if (varstr_cmp(best, 1, "9", 1, collation) < 0)
1624  best = "9";
1625  suffixchar = *best;
1626  suffixcollation = collation;
1627  }
1628 
1629  /* And build the string to compare to */
1630  if (datatype == NAMEOID)
1631  {
1632  cmptxt = palloc(len + 2);
1633  memcpy(cmptxt, workstr, len);
1634  cmptxt[len] = suffixchar;
1635  cmptxt[len + 1] = '\0';
1636  cmpstr = PointerGetDatum(cmptxt);
1637  }
1638  else
1639  {
1640  cmptxt = palloc(VARHDRSZ + len + 1);
1641  SET_VARSIZE(cmptxt, VARHDRSZ + len + 1);
1642  memcpy(VARDATA(cmptxt), workstr, len);
1643  *(VARDATA(cmptxt) + len) = suffixchar;
1644  cmpstr = PointerGetDatum(cmptxt);
1645  }
1646  }
1647  }
1648 
1649  /* Select appropriate character-incrementer function */
1650  if (datatype == BYTEAOID)
1651  charinc = byte_increment;
1652  else
1654 
1655  /* And search ... */
1656  while (len > 0)
1657  {
1658  int charlen;
1659  unsigned char *lastchar;
1660 
1661  /* Identify the last character --- for bytea, just the last byte */
1662  if (datatype == BYTEAOID)
1663  charlen = 1;
1664  else
1665  charlen = len - pg_mbcliplen(workstr, len, len - 1);
1666  lastchar = (unsigned char *) (workstr + len - charlen);
1667 
1668  /*
1669  * Try to generate a larger string by incrementing the last character
1670  * (for BYTEA, we treat each byte as a character).
1671  *
1672  * Note: the incrementer function is expected to return true if it's
1673  * generated a valid-per-the-encoding new character, otherwise false.
1674  * The contents of the character on false return are unspecified.
1675  */
1676  while (charinc(lastchar, charlen))
1677  {
1678  Const *workstr_const;
1679 
1680  if (datatype == BYTEAOID)
1681  workstr_const = string_to_bytea_const(workstr, len);
1682  else
1683  workstr_const = string_to_const(workstr, datatype);
1684 
1685  if (DatumGetBool(FunctionCall2Coll(ltproc,
1686  collation,
1687  cmpstr,
1688  workstr_const->constvalue)))
1689  {
1690  /* Successfully made a string larger than cmpstr */
1691  if (cmptxt)
1692  pfree(cmptxt);
1693  pfree(workstr);
1694  return workstr_const;
1695  }
1696 
1697  /* No good, release unusable value and try again */
1698  pfree(DatumGetPointer(workstr_const->constvalue));
1699  pfree(workstr_const);
1700  }
1701 
1702  /*
1703  * No luck here, so truncate off the last character and try to
1704  * increment the next one.
1705  */
1706  len -= charlen;
1707  workstr[len] = '\0';
1708  }
1709 
1710  /* Failed... */
1711  if (cmptxt)
1712  pfree(cmptxt);
1713  pfree(workstr);
1714 
1715  return NULL;
1716 }
1717 
1718 /*
1719  * Generate a Datum of the appropriate type from a C string.
1720  * Note that all of the supported types are pass-by-ref, so the
1721  * returned value should be pfree'd if no longer needed.
1722  */
1723 static Datum
1724 string_to_datum(const char *str, Oid datatype)
1725 {
1726  Assert(str != NULL);
1727 
1728  /*
1729  * We cheat a little by assuming that CStringGetTextDatum() will do for
1730  * bpchar and varchar constants too...
1731  */
1732  if (datatype == NAMEOID)
1734  else if (datatype == BYTEAOID)
1736  else
1737  return CStringGetTextDatum(str);
1738 }
1739 
1740 /*
1741  * Generate a Const node of the appropriate type from a C string.
1742  */
1743 static Const *
1744 string_to_const(const char *str, Oid datatype)
1745 {
1746  Datum conval = string_to_datum(str, datatype);
1747  Oid collation;
1748  int constlen;
1749 
1750  /*
1751  * We only need to support a few datatypes here, so hard-wire properties
1752  * instead of incurring the expense of catalog lookups.
1753  */
1754  switch (datatype)
1755  {
1756  case TEXTOID:
1757  case VARCHAROID:
1758  case BPCHAROID:
1759  collation = DEFAULT_COLLATION_OID;
1760  constlen = -1;
1761  break;
1762 
1763  case NAMEOID:
1764  collation = C_COLLATION_OID;
1765  constlen = NAMEDATALEN;
1766  break;
1767 
1768  case BYTEAOID:
1769  collation = InvalidOid;
1770  constlen = -1;
1771  break;
1772 
1773  default:
1774  elog(ERROR, "unexpected datatype in string_to_const: %u",
1775  datatype);
1776  return NULL;
1777  }
1778 
1779  return makeConst(datatype, -1, collation, constlen,
1780  conval, false, false);
1781 }
1782 
1783 /*
1784  * Generate a Const node of bytea type from a binary C string and a length.
1785  */
1786 static Const *
1787 string_to_bytea_const(const char *str, size_t str_len)
1788 {
1789  bytea *bstr = palloc(VARHDRSZ + str_len);
1790  Datum conval;
1791 
1792  memcpy(VARDATA(bstr), str, str_len);
1793  SET_VARSIZE(bstr, VARHDRSZ + str_len);
1794  conval = PointerGetDatum(bstr);
1795 
1796  return makeConst(BYTEAOID, -1, InvalidOid, -1, conval, false, false);
1797 }
#define CStringGetTextDatum(s)
Definition: builtins.h:97
#define TextDatumGetCString(d)
Definition: builtins.h:98
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1155
#define Max(x, y)
Definition: c.h:998
char * Pointer
Definition: c.h:483
#define VARHDRSZ
Definition: c.h:692
#define Assert(condition)
Definition: c.h:858
#define OidIsValid(objectId)
Definition: c.h:775
Datum datumCopy(Datum value, bool typByVal, int typLen)
Definition: datum.c:132
int errhint(const char *fmt,...)
Definition: elog.c:1319
int errcode(int sqlerrcode)
Definition: elog.c:859
int errmsg(const char *fmt,...)
Definition: elog.c:1072
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:224
#define ereport(elevel,...)
Definition: elog.h:149
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1149
void fmgr_info(Oid functionId, FmgrInfo *finfo)
Definition: fmgr.c:127
#define PG_GETARG_OID(n)
Definition: fmgr.h:275
#define DatumGetByteaPP(X)
Definition: fmgr.h:291
#define PG_RETURN_FLOAT8(x)
Definition: fmgr.h:367
#define DatumGetTextPP(X)
Definition: fmgr.h:292
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276
#define DirectFunctionCall1(func, arg1)
Definition: fmgr.h:642
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361
#define PG_GET_COLLATION()
Definition: fmgr.h:198
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
const char * str
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
#define GETSTRUCT(TUP)
Definition: htup_details.h:653
static char * locale
Definition: initdb.c:140
Datum icregexnesel(PG_FUNCTION_ARGS)
Definition: like_support.c:848
Datum regexnesel(PG_FUNCTION_ARGS)
Definition: like_support.c:839
static Node * like_regex_support(Node *rawreq, Pattern_Type ptype)
Definition: like_support.c:156
Datum iclikesel(PG_FUNCTION_ARGS)
Definition: like_support.c:830
Datum texticregexeq_support(PG_FUNCTION_ARGS)
Definition: like_support.c:139
static Selectivity prefix_selectivity(PlannerInfo *root, VariableStatData *vardata, Oid eqopr, Oid ltopr, Oid geopr, Oid collation, Const *prefixcon)
#define FULL_WILDCARD_SEL
Datum iclikejoinsel(PG_FUNCTION_ARGS)
Definition: like_support.c:921
Datum prefixjoinsel(PG_FUNCTION_ARGS)
Definition: like_support.c:912
#define ANY_CHAR_SEL
static double patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
Definition: like_support.c:760
Datum regexeqsel(PG_FUNCTION_ARGS)
Definition: like_support.c:793
static Pattern_Prefix_Status pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation, Const **prefix, Selectivity *rest_selec)
Datum likejoinsel(PG_FUNCTION_ARGS)
Definition: like_support.c:903
static Selectivity like_selectivity(const char *patt, int pattlen, bool case_insensitive)
Datum icregexnejoinsel(PG_FUNCTION_ARGS)
Definition: like_support.c:939
static Pattern_Prefix_Status like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, Const **prefix_const, Selectivity *rest_selec)
Definition: like_support.c:992
static List * match_pattern_prefix(Node *leftop, Node *rightop, Pattern_Type ptype, Oid expr_coll, Oid opfamily, Oid indexcollation)
Definition: like_support.c:241
Datum nlikejoinsel(PG_FUNCTION_ARGS)
Definition: like_support.c:948
static Datum string_to_datum(const char *str, Oid datatype)
Datum icnlikejoinsel(PG_FUNCTION_ARGS)
Definition: like_support.c:957
static Selectivity regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive)
Datum texticlike_support(PG_FUNCTION_ARGS)
Definition: like_support.c:123
Datum nlikesel(PG_FUNCTION_ARGS)
Definition: like_support.c:857
static Const * string_to_const(const char *str, Oid datatype)
#define PARTIAL_WILDCARD_SEL
Datum text_starts_with_support(PG_FUNCTION_ARGS)
Definition: like_support.c:147
#define CHAR_RANGE_SEL
static Const * string_to_bytea_const(const char *str, size_t str_len)
static Pattern_Prefix_Status regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, Const **prefix_const, Selectivity *rest_selec)
Pattern_Type
Definition: like_support.c:59
@ Pattern_Type_Prefix
Definition: like_support.c:64
@ Pattern_Type_Regex_IC
Definition: like_support.c:63
@ Pattern_Type_Like
Definition: like_support.c:60
@ Pattern_Type_Regex
Definition: like_support.c:62
@ Pattern_Type_Like_IC
Definition: like_support.c:61
Pattern_Prefix_Status
Definition: like_support.c:68
@ Pattern_Prefix_Partial
Definition: like_support.c:69
@ Pattern_Prefix_None
Definition: like_support.c:69
@ Pattern_Prefix_Exact
Definition: like_support.c:69
static Const * make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
Datum icregexeqsel(PG_FUNCTION_ARGS)
Definition: like_support.c:802
#define FIXED_CHAR_SEL
Datum textlike_support(PG_FUNCTION_ARGS)
Definition: like_support.c:115
static int pattern_char_isalpha(char c, bool is_multibyte, pg_locale_t locale, bool locale_is_c)
Datum regexnejoinsel(PG_FUNCTION_ARGS)
Definition: like_support.c:930
static bool byte_increment(unsigned char *ptr, int len)
static double patternsel_common(PlannerInfo *root, Oid oprid, Oid opfuncid, List *args, int varRelid, Oid collation, Pattern_Type ptype, bool negate)
Definition: like_support.c:486
static Selectivity regex_selectivity(const char *patt, int pattlen, bool case_insensitive, int fixed_prefix_len)
Datum icnlikesel(PG_FUNCTION_ARGS)
Definition: like_support.c:866
Datum textregexeq_support(PG_FUNCTION_ARGS)
Definition: like_support.c:131
Datum prefixsel(PG_FUNCTION_ARGS)
Definition: like_support.c:820
static double patternjoinsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
Definition: like_support.c:875
Datum likesel(PG_FUNCTION_ARGS)
Definition: like_support.c:811
Datum regexeqjoinsel(PG_FUNCTION_ARGS)
Definition: like_support.c:885
Datum icregexeqjoinsel(PG_FUNCTION_ARGS)
Definition: like_support.c:894
List * lappend(List *list, void *datum)
Definition: list.c:339
RegProcedure get_opcode(Oid opno)
Definition: lsyscache.c:1285
bool get_collation_isdeterministic(Oid colloid)
Definition: lsyscache.c:1054
bool op_in_opfamily(Oid opno, Oid opfamily)
Definition: lsyscache.c:66
Oid get_negator(Oid opno)
Definition: lsyscache.c:1533
Expr * make_opclause(Oid opno, Oid opresulttype, bool opretset, Expr *leftop, Expr *rightop, Oid opcollid, Oid inputcollid)
Definition: makefuncs.c:628
Const * makeConst(Oid consttype, int32 consttypmod, Oid constcollid, int constlen, Datum constvalue, bool constisnull, bool constbyval)
Definition: makefuncs.c:301
mbcharacter_incrementer pg_database_encoding_character_incrementer(void)
Definition: mbutils.c:1523
int pg_mbcliplen(const char *mbstr, int len, int limit)
Definition: mbutils.c:1083
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1546
void pfree(void *pointer)
Definition: mcxt.c:1520
void * palloc(Size size)
Definition: mcxt.c:1316
Datum nameout(PG_FUNCTION_ARGS)
Definition: name.c:71
Datum namein(PG_FUNCTION_ARGS)
Definition: name.c:48
Oid exprType(const Node *expr)
Definition: nodeFuncs.c:42
static bool is_opclause(const void *clause)
Definition: nodeFuncs.h:74
static bool is_funcclause(const void *clause)
Definition: nodeFuncs.h:67
#define IsA(nodeptr, _type_)
Definition: nodes.h:158
double Selectivity
Definition: nodes.h:250
Oid oprid(Operator op)
Definition: parse_oper.c:238
#define NAMEDATALEN
const void size_t len
static int list_length(const List *l)
Definition: pg_list.h:152
#define NIL
Definition: pg_list.h:68
#define list_make1(x1)
Definition: pg_list.h:212
#define linitial(l)
Definition: pg_list.h:178
#define lsecond(l)
Definition: pg_list.h:183
bool lc_collate_is_c(Oid collation)
Definition: pg_locale.c:1317
pg_locale_t pg_newlocale_from_collation(Oid collid)
Definition: pg_locale.c:1551
bool lc_ctype_is_c(Oid collation)
Definition: pg_locale.c:1384
FormData_pg_statistic * Form_pg_statistic
Definition: pg_statistic.h:135
bool(* mbcharacter_incrementer)(unsigned char *mbstr, int len)
Definition: pg_wchar.h:370
void check_stack_depth(void)
Definition: postgres.c:3531
static bool DatumGetBool(Datum X)
Definition: postgres.h:90
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:322
static char * DatumGetCString(Datum X)
Definition: postgres.h:335
uintptr_t Datum
Definition: postgres.h:64
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:312
static Datum CStringGetDatum(const char *X)
Definition: postgres.h:350
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
char * c
char * s1
tree ctl root
Definition: radixtree.h:1880
char * regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation, bool *exact)
Definition: regexp.c:1953
bool get_restriction_variable(PlannerInfo *root, List *args, int varRelid, VariableStatData *vardata, Node **other, bool *varonleft)
Definition: selfuncs.c:4883
double var_eq_const(VariableStatData *vardata, Oid oproid, Oid collation, Datum constval, bool constisnull, bool varonleft, bool negate)
Definition: selfuncs.c:295
double mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc, Oid collation, Datum constval, bool varonleft, double *sumcommonp)
Definition: selfuncs.c:732
double ineq_histogram_selectivity(PlannerInfo *root, VariableStatData *vardata, Oid opoid, FmgrInfo *opproc, bool isgt, bool iseq, Oid collation, Datum constval, Oid consttype)
Definition: selfuncs.c:1041
double histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc, Oid collation, Datum constval, bool varonleft, int min_hist_size, int n_skip, int *hist_size)
Definition: selfuncs.c:823
#define ReleaseVariableStats(vardata)
Definition: selfuncs.h:99
#define CLAMP_PROBABILITY(p)
Definition: selfuncs.h:63
#define DEFAULT_MATCH_SEL
Definition: selfuncs.h:46
Oid consttype
Definition: primnodes.h:312
Definition: fmgr.h:57
List * args
Definition: primnodes.h:738
Definition: pg_list.h:54
Definition: nodes.h:129
List * args
Definition: primnodes.h:806
struct PlannerInfo * root
Definition: supportnodes.h:96
HeapTuple statsTuple
Definition: selfuncs.h:89
Definition: c.h:687
#define VARDATA(PTR)
Definition: varatt.h:278
#define VARDATA_ANY(PTR)
Definition: varatt.h:324
#define SET_VARSIZE(PTR, len)
Definition: varatt.h:305
#define VARSIZE_ANY_EXHDR(PTR)
Definition: varatt.h:317
int varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
Definition: varlena.c:1539
Datum byteain(PG_FUNCTION_ARGS)
Definition: varlena.c:290
#define isalpha_l
Definition: win32_port.h:431