PostgreSQL Source Code  git master
regprefix.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * regprefix.c
4  * Extract a common prefix, if any, from a compiled regex.
5  *
6  *
7  * Portions Copyright (c) 2012-2024, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1998, 1999 Henry Spencer
9  *
10  * IDENTIFICATION
11  * src/backend/regex/regprefix.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 #include "regex/regguts.h"
17 
18 
19 /*
20  * forward declarations
21  */
22 static int findprefix(struct cnfa *cnfa, struct colormap *cm,
23  chr *string, size_t *slength);
24 
25 
26 /*
27  * pg_regprefix - get common prefix for regular expression
28  *
29  * Returns one of:
30  * REG_NOMATCH: there is no common prefix of strings matching the regex
31  * REG_PREFIX: there is a common prefix of strings matching the regex
32  * REG_EXACT: all strings satisfying the regex must match the same string
33  * or a REG_XXX error code
34  *
35  * In the non-failure cases, *string is set to a palloc'd string containing
36  * the common prefix or exact value, of length *slength (measured in chrs
37  * not bytes!).
38  *
39  * This function does not analyze all complex cases (such as lookaround
40  * constraints) exactly. Therefore it is possible that some strings matching
41  * the reported prefix or exact-match string do not satisfy the regex. But
42  * it should never be the case that a string satisfying the regex does not
43  * match the reported prefix or exact-match string.
44  */
45 int
47  chr **string,
48  size_t *slength)
49 {
50  struct guts *g;
51  struct cnfa *cnfa;
52  int st;
53 
54  /* sanity checks */
55  if (string == NULL || slength == NULL)
56  return REG_INVARG;
57  *string = NULL; /* initialize for failure cases */
58  *slength = 0;
59  if (re == NULL || re->re_magic != REMAGIC)
60  return REG_INVARG;
61  if (re->re_csize != sizeof(chr))
62  return REG_MIXED;
63 
64  /* Initialize locale-dependent support */
66 
67  /* setup */
68  g = (struct guts *) re->re_guts;
69  if (g->info & REG_UIMPOSSIBLE)
70  return REG_NOMATCH;
71 
72  /*
73  * This implementation considers only the search NFA for the topmost regex
74  * tree node. Therefore, constraints such as backrefs are not fully
75  * applied, which is allowed per the function's API spec.
76  */
77  assert(g->tree != NULL);
78  cnfa = &g->tree->cnfa;
79 
80  /* matchall NFAs never have a fixed prefix */
81  if (cnfa->flags & MATCHALL)
82  return REG_NOMATCH;
83 
84  /*
85  * Since a correct NFA should never contain any exit-free loops, it should
86  * not be possible for our traversal to return to a previously visited NFA
87  * state. Hence we need at most nstates chrs in the output string.
88  */
89  *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
90  if (*string == NULL)
91  return REG_ESPACE;
92 
93  /* do it */
94  st = findprefix(cnfa, &g->cmap, *string, slength);
95 
96  assert(*slength <= cnfa->nstates);
97 
98  /* clean up */
99  if (st != REG_PREFIX && st != REG_EXACT)
100  {
101  FREE(*string);
102  *string = NULL;
103  *slength = 0;
104  }
105 
106  return st;
107 }
108 
109 /*
110  * findprefix - extract common prefix from cNFA
111  *
112  * Results are returned into the preallocated chr array string[], with
113  * *slength (which must be preset to zero) incremented for each chr.
114  */
115 static int /* regprefix return code */
117  struct colormap *cm,
118  chr *string,
119  size_t *slength)
120 {
121  int st;
122  int nextst;
123  color thiscolor;
124  chr c;
125  struct carc *ca;
126 
127  /*
128  * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
129  * anchored left. If we have both BOS and BOL, they must go to the same
130  * next state.
131  */
132  st = cnfa->pre;
133  nextst = -1;
134  for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
135  {
136  if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
137  {
138  if (nextst == -1)
139  nextst = ca->to;
140  else if (nextst != ca->to)
141  return REG_NOMATCH;
142  }
143  else
144  return REG_NOMATCH;
145  }
146  if (nextst == -1)
147  return REG_NOMATCH;
148 
149  /*
150  * Scan through successive states, stopping as soon as we find one with
151  * more than one acceptable transition character (either multiple colors
152  * on out-arcs, or a color with more than one member chr).
153  *
154  * We could find a state with multiple out-arcs that are all labeled with
155  * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
156  * In that case we add the chr "c" to the output string but then exit the
157  * loop with nextst == -1. This leaves a little bit on the table: if the
158  * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
159  * to the prefix. But chasing multiple parallel state chains doesn't seem
160  * worth the trouble.
161  */
162  do
163  {
164  st = nextst;
165  nextst = -1;
166  thiscolor = COLORLESS;
167  for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
168  {
169  /* We can ignore BOS/BOL arcs */
170  if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
171  continue;
172 
173  /*
174  * ... but EOS/EOL arcs terminate the search, as do RAINBOW arcs
175  * and LACONs
176  */
177  if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] ||
178  ca->co == RAINBOW || ca->co >= cnfa->ncolors)
179  {
180  thiscolor = COLORLESS;
181  break;
182  }
183  if (thiscolor == COLORLESS)
184  {
185  /* First plain outarc */
186  thiscolor = ca->co;
187  nextst = ca->to;
188  }
189  else if (thiscolor == ca->co)
190  {
191  /* Another plain outarc for same color */
192  nextst = -1;
193  }
194  else
195  {
196  /* More than one plain outarc color terminates the search */
197  thiscolor = COLORLESS;
198  break;
199  }
200  }
201  /* Done if we didn't find exactly one color on plain outarcs */
202  if (thiscolor == COLORLESS)
203  break;
204  /* The color must be a singleton */
205  if (cm->cd[thiscolor].nschrs != 1)
206  break;
207  /* Must not have any high-color-map entries */
208  if (cm->cd[thiscolor].nuchrs != 0)
209  break;
210 
211  /*
212  * Identify the color's sole member chr and add it to the prefix
213  * string. In general the colormap data structure doesn't provide a
214  * way to find color member chrs, except by trying GETCOLOR() on each
215  * possible chr value, which won't do at all. However, for the cases
216  * we care about it should be sufficient to test the "firstchr" value,
217  * that is the first chr ever added to the color. There are cases
218  * where this might no longer be a member of the color (so we do need
219  * to test), but none of them are likely to arise for a character that
220  * is a member of a common prefix. If we do hit such a corner case,
221  * we just fall out without adding anything to the prefix string.
222  */
223  c = cm->cd[thiscolor].firstchr;
224  if (GETCOLOR(cm, c) != thiscolor)
225  break;
226 
227  string[(*slength)++] = c;
228 
229  /* Advance to next state, but only if we have a unique next state */
230  } while (nextst != -1);
231 
232  /*
233  * If we ended at a state that only has EOS/EOL outarcs leading to the
234  * "post" state, then we have an exact-match string. Note this is true
235  * even if the string is of zero length.
236  */
237  nextst = -1;
238  for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
239  {
240  if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
241  {
242  if (nextst == -1)
243  nextst = ca->to;
244  else if (nextst != ca->to)
245  {
246  nextst = -1;
247  break;
248  }
249  }
250  else
251  {
252  nextst = -1;
253  break;
254  }
255  }
256  if (nextst == cnfa->post)
257  return REG_EXACT;
258 
259  /*
260  * Otherwise, if we were unable to identify any prefix characters, say
261  * NOMATCH --- the pattern is anchored left, but doesn't specify any
262  * particular first character.
263  */
264  if (*slength > 0)
265  return REG_PREFIX;
266 
267  return REG_NOMATCH;
268 }
#define FREE(ptr)
Definition: cryptohash.c:37
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:77
char * c
void pg_set_regex_collation(Oid collation)
#define MALLOC(n)
Definition: regcustom.h:52
pg_wchar chr
Definition: regcustom.h:59
#define assert(x)
Definition: regcustom.h:56
#define REG_NOMATCH
Definition: regex.h:138
#define REG_EXACT
Definition: regex.h:162
#define REG_PREFIX
Definition: regex.h:161
#define REG_UIMPOSSIBLE
Definition: regex.h:72
#define REG_INVARG
Definition: regex.h:152
#define REG_MIXED
Definition: regex.h:153
#define REG_ESPACE
Definition: regex.h:149
#define RAINBOW
Definition: regguts.h:159
#define REMAGIC
Definition: regguts.h:101
#define GETCOLOR(cm, c)
Definition: regguts.h:257
short color
Definition: regguts.h:155
#define COLORLESS
Definition: regguts.h:158
#define MATCHALL
Definition: regguts.h:412
int pg_regprefix(regex_t *re, chr **string, size_t *slength)
Definition: regprefix.c:46
static int findprefix(struct cnfa *cnfa, struct colormap *cm, chr *string, size_t *slength)
Definition: regprefix.c:116
Definition: regguts.h:401
int to
Definition: regguts.h:403
color co
Definition: regguts.h:402
Definition: regguts.h:407
int pre
Definition: regguts.h:413
color eos[2]
Definition: regguts.h:416
struct carc ** states
Definition: regguts.h:419
int ncolors
Definition: regguts.h:409
int flags
Definition: regguts.h:410
int post
Definition: regguts.h:414
color bos[2]
Definition: regguts.h:415
int nstates
Definition: regguts.h:408
chr firstchr
Definition: regguts.h:183
int nuchrs
Definition: regguts.h:179
int nschrs
Definition: regguts.h:178
struct colordesc * cd
Definition: regguts.h:236
Definition: regguts.h:529
struct subre * tree
Definition: regguts.h:535
long info
Definition: regguts.h:533
struct colormap cmap
Definition: regguts.h:538
Definition: regex.h:56
char * re_guts
Definition: regex.h:78
int re_csize
Definition: regex.h:74
int re_magic
Definition: regex.h:57
Oid re_collation
Definition: regex.h:76
struct cnfa cnfa
Definition: regguts.h:505