PostgreSQL Source Code  git master
like_match.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * like_match.c
4  * LIKE pattern matching internal code.
5  *
6  * This file is included by like.c four times, to provide matching code for
7  * (1) single-byte encodings, (2) UTF8, (3) other multi-byte encodings,
8  * and (4) case insensitive matches in single-byte encodings.
9  * (UTF8 is a special case because we can use a much more efficient version
10  * of NextChar than can be used for general multi-byte encodings.)
11  *
12  * Before the inclusion, we need to define the following macros:
13  *
14  * NextChar
15  * MatchText - to name of function wanted
16  * do_like_escape - name of function if wanted - needs CHAREQ and CopyAdvChar
17  * MATCH_LOWER - define for case (4) to specify case folding for 1-byte chars
18  *
19  * Copyright (c) 1996-2024, PostgreSQL Global Development Group
20  *
21  * IDENTIFICATION
22  * src/backend/utils/adt/like_match.c
23  *
24  *-------------------------------------------------------------------------
25  */
26 
27 /*
28  * Originally written by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986.
29  * Rich $alz is now <rsalz@bbn.com>.
30  * Special thanks to Lars Mathiesen <thorinn@diku.dk> for the
31  * LIKE_ABORT code.
32  *
33  * This code was shamelessly stolen from the "pql" code by myself and
34  * slightly modified :)
35  *
36  * All references to the word "star" were replaced by "percent"
37  * All references to the word "wild" were replaced by "like"
38  *
39  * All the nice shell RE matching stuff was replaced by just "_" and "%"
40  *
41  * As I don't have a copy of the SQL standard handy I wasn't sure whether
42  * to leave in the '\' escape character handling.
43  *
44  * Keith Parks. <keith@mtcc.demon.co.uk>
45  *
46  * SQL lets you specify the escape character by saying
47  * LIKE <pattern> ESCAPE <escape character>. We are a small operation
48  * so we force you to use '\'. - ay 7/95
49  *
50  * Now we have the like_escape() function that converts patterns with
51  * any specified escape character (or none at all) to the internal
52  * default escape character, which is still '\'. - tgl 9/2000
53  *
54  * The code is rewritten to avoid requiring null-terminated strings,
55  * which in turn allows us to leave out some memcpy() operations.
56  * This code should be faster and take less memory, but no promises...
57  * - thomas 2000-08-06
58  */
59 
60 
61 /*--------------------
62  * Match text and pattern, return LIKE_TRUE, LIKE_FALSE, or LIKE_ABORT.
63  *
64  * LIKE_TRUE: they match
65  * LIKE_FALSE: they don't match
66  * LIKE_ABORT: not only don't they match, but the text is too short.
67  *
68  * If LIKE_ABORT is returned, then no suffix of the text can match the
69  * pattern either, so an upper-level % scan can stop scanning now.
70  *--------------------
71  */
72 
73 #ifdef MATCH_LOWER
74 #define GETCHAR(t) MATCH_LOWER(t)
75 #else
76 #define GETCHAR(t) (t)
77 #endif
78 
79 static int
80 MatchText(const char *t, int tlen, const char *p, int plen,
81  pg_locale_t locale, bool locale_is_c)
82 {
83  /* Fast path for match-everything pattern */
84  if (plen == 1 && *p == '%')
85  return LIKE_TRUE;
86 
87  /* Since this function recurses, it could be driven to stack overflow */
89 
90  /*
91  * In this loop, we advance by char when matching wildcards (and thus on
92  * recursive entry to this function we are properly char-synced). On other
93  * occasions it is safe to advance by byte, as the text and pattern will
94  * be in lockstep. This allows us to perform all comparisons between the
95  * text and pattern on a byte by byte basis, even for multi-byte
96  * encodings.
97  */
98  while (tlen > 0 && plen > 0)
99  {
100  if (*p == '\\')
101  {
102  /* Next pattern byte must match literally, whatever it is */
103  NextByte(p, plen);
104  /* ... and there had better be one, per SQL standard */
105  if (plen <= 0)
106  ereport(ERROR,
107  (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
108  errmsg("LIKE pattern must not end with escape character")));
109  if (GETCHAR(*p) != GETCHAR(*t))
110  return LIKE_FALSE;
111  }
112  else if (*p == '%')
113  {
114  char firstpat;
115 
116  /*
117  * % processing is essentially a search for a text position at
118  * which the remainder of the text matches the remainder of the
119  * pattern, using a recursive call to check each potential match.
120  *
121  * If there are wildcards immediately following the %, we can skip
122  * over them first, using the idea that any sequence of N _'s and
123  * one or more %'s is equivalent to N _'s and one % (ie, it will
124  * match any sequence of at least N text characters). In this way
125  * we will always run the recursive search loop using a pattern
126  * fragment that begins with a literal character-to-match, thereby
127  * not recursing more than we have to.
128  */
129  NextByte(p, plen);
130 
131  while (plen > 0)
132  {
133  if (*p == '%')
134  NextByte(p, plen);
135  else if (*p == '_')
136  {
137  /* If not enough text left to match the pattern, ABORT */
138  if (tlen <= 0)
139  return LIKE_ABORT;
140  NextChar(t, tlen);
141  NextByte(p, plen);
142  }
143  else
144  break; /* Reached a non-wildcard pattern char */
145  }
146 
147  /*
148  * If we're at end of pattern, match: we have a trailing % which
149  * matches any remaining text string.
150  */
151  if (plen <= 0)
152  return LIKE_TRUE;
153 
154  /*
155  * Otherwise, scan for a text position at which we can match the
156  * rest of the pattern. The first remaining pattern char is known
157  * to be a regular or escaped literal character, so we can compare
158  * the first pattern byte to each text byte to avoid recursing
159  * more than we have to. This fact also guarantees that we don't
160  * have to consider a match to the zero-length substring at the
161  * end of the text.
162  */
163  if (*p == '\\')
164  {
165  if (plen < 2)
166  ereport(ERROR,
167  (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
168  errmsg("LIKE pattern must not end with escape character")));
169  firstpat = GETCHAR(p[1]);
170  }
171  else
172  firstpat = GETCHAR(*p);
173 
174  while (tlen > 0)
175  {
176  if (GETCHAR(*t) == firstpat)
177  {
178  int matched = MatchText(t, tlen, p, plen,
179  locale, locale_is_c);
180 
181  if (matched != LIKE_FALSE)
182  return matched; /* TRUE or ABORT */
183  }
184 
185  NextChar(t, tlen);
186  }
187 
188  /*
189  * End of text with no match, so no point in trying later places
190  * to start matching this pattern.
191  */
192  return LIKE_ABORT;
193  }
194  else if (*p == '_')
195  {
196  /* _ matches any single character, and we know there is one */
197  NextChar(t, tlen);
198  NextByte(p, plen);
199  continue;
200  }
201  else if (GETCHAR(*p) != GETCHAR(*t))
202  {
203  /* non-wildcard pattern char fails to match text char */
204  return LIKE_FALSE;
205  }
206 
207  /*
208  * Pattern and text match, so advance.
209  *
210  * It is safe to use NextByte instead of NextChar here, even for
211  * multi-byte character sets, because we are not following immediately
212  * after a wildcard character. If we are in the middle of a multibyte
213  * character, we must already have matched at least one byte of the
214  * character from both text and pattern; so we cannot get out-of-sync
215  * on character boundaries. And we know that no backend-legal
216  * encoding allows ASCII characters such as '%' to appear as non-first
217  * bytes of characters, so we won't mistakenly detect a new wildcard.
218  */
219  NextByte(t, tlen);
220  NextByte(p, plen);
221  }
222 
223  if (tlen > 0)
224  return LIKE_FALSE; /* end of pattern, but not of text */
225 
226  /*
227  * End of text, but perhaps not of pattern. Match iff the remaining
228  * pattern can match a zero-length string, ie, it's zero or more %'s.
229  */
230  while (plen > 0 && *p == '%')
231  NextByte(p, plen);
232  if (plen <= 0)
233  return LIKE_TRUE;
234 
235  /*
236  * End of text with no match, so no point in trying later places to start
237  * matching this pattern.
238  */
239  return LIKE_ABORT;
240 } /* MatchText() */
241 
242 /*
243  * like_escape() --- given a pattern and an ESCAPE string,
244  * convert the pattern to use Postgres' standard backslash escape convention.
245  */
246 #ifdef do_like_escape
247 
248 static text *
249 do_like_escape(text *pat, text *esc)
250 {
251  text *result;
252  char *p,
253  *e,
254  *r;
255  int plen,
256  elen;
257  bool afterescape;
258 
259  p = VARDATA_ANY(pat);
260  plen = VARSIZE_ANY_EXHDR(pat);
261  e = VARDATA_ANY(esc);
262  elen = VARSIZE_ANY_EXHDR(esc);
263 
264  /*
265  * Worst-case pattern growth is 2x --- unlikely, but it's hardly worth
266  * trying to calculate the size more accurately than that.
267  */
268  result = (text *) palloc(plen * 2 + VARHDRSZ);
269  r = VARDATA(result);
270 
271  if (elen == 0)
272  {
273  /*
274  * No escape character is wanted. Double any backslashes in the
275  * pattern to make them act like ordinary characters.
276  */
277  while (plen > 0)
278  {
279  if (*p == '\\')
280  *r++ = '\\';
281  CopyAdvChar(r, p, plen);
282  }
283  }
284  else
285  {
286  /*
287  * The specified escape must be only a single character.
288  */
289  NextChar(e, elen);
290  if (elen != 0)
291  ereport(ERROR,
292  (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
293  errmsg("invalid escape string"),
294  errhint("Escape string must be empty or one character.")));
295 
296  e = VARDATA_ANY(esc);
297 
298  /*
299  * If specified escape is '\', just copy the pattern as-is.
300  */
301  if (*e == '\\')
302  {
303  memcpy(result, pat, VARSIZE_ANY(pat));
304  return result;
305  }
306 
307  /*
308  * Otherwise, convert occurrences of the specified escape character to
309  * '\', and double occurrences of '\' --- unless they immediately
310  * follow an escape character!
311  */
312  afterescape = false;
313  while (plen > 0)
314  {
315  if (CHAREQ(p, e) && !afterescape)
316  {
317  *r++ = '\\';
318  NextChar(p, plen);
319  afterescape = true;
320  }
321  else if (*p == '\\')
322  {
323  *r++ = '\\';
324  if (!afterescape)
325  *r++ = '\\';
326  NextChar(p, plen);
327  afterescape = false;
328  }
329  else
330  {
331  CopyAdvChar(r, p, plen);
332  afterescape = false;
333  }
334  }
335  }
336 
337  SET_VARSIZE(result, r - ((char *) result));
338 
339  return result;
340 }
341 #endif /* do_like_escape */
342 
343 #ifdef CHAREQ
344 #undef CHAREQ
345 #endif
346 
347 #undef NextChar
348 #undef CopyAdvChar
349 #undef MatchText
350 
351 #ifdef do_like_escape
352 #undef do_like_escape
353 #endif
354 
355 #undef GETCHAR
356 
357 #ifdef MATCH_LOWER
358 #undef MATCH_LOWER
359 
360 #endif
#define VARHDRSZ
Definition: c.h:692
int errhint(const char *fmt,...)
Definition: elog.c:1317
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
static char * locale
Definition: initdb.c:140
#define LIKE_ABORT
Definition: like.c:32
#define CopyAdvChar(dst, src, srclen)
Definition: like.c:124
#define LIKE_TRUE
Definition: like.c:30
#define CHAREQ(p1, p2)
Definition: like.c:122
#define LIKE_FALSE
Definition: like.c:31
#define NextByte(p, plen)
Definition: like.c:103
#define do_like_escape
Definition: like.c:127
#define NextChar(p, plen)
Definition: like.c:140
static int MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale, bool locale_is_c)
Definition: like_match.c:80
#define GETCHAR(t)
Definition: like_match.c:76
void * palloc(Size size)
Definition: mcxt.c:1317
void check_stack_depth(void)
Definition: postgres.c:3540
e
Definition: preproc-init.c:82
Definition: c.h:687
#define VARSIZE_ANY(PTR)
Definition: varatt.h:311
#define VARDATA(PTR)
Definition: varatt.h:278
#define VARDATA_ANY(PTR)
Definition: varatt.h:324
#define SET_VARSIZE(PTR, len)
Definition: varatt.h:305
#define VARSIZE_ANY_EXHDR(PTR)
Definition: varatt.h:317