PostgreSQL Source Code  git master
rege_dfa.c
Go to the documentation of this file.
1 /*
2  * DFA routines
3  * This file is #included by regexec.c.
4  *
5  * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6  *
7  * Development of this software was funded, in part, by Cray Research Inc.,
8  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9  * Corporation, none of whom are responsible for the results. The author
10  * thanks all of them.
11  *
12  * Redistribution and use in source and binary forms -- with or without
13  * modification -- are permitted for any purpose, provided that
14  * redistributions in source form retain this entire copyright notice and
15  * indicate the origin and nature of any modifications.
16  *
17  * I'd appreciate being given credit for this package in the documentation
18  * of software which uses it, but that is not a requirement.
19  *
20  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  * src/backend/regex/rege_dfa.c
32  *
33  */
34 
35 /*
36  * longest - longest-preferred matching engine
37  *
38  * On success, returns match endpoint address. Returns NULL on no match.
39  * Internal errors also return NULL, with v->err set.
40  */
41 static chr *
42 longest(struct vars *v,
43  struct dfa *d,
44  chr *start, /* where the match should start */
45  chr *stop, /* match must end at or before here */
46  int *hitstopp) /* record whether hit v->stop, if non-NULL */
47 {
48  chr *cp;
49  chr *realstop = (stop == v->stop) ? stop : stop + 1;
50  color co;
51  struct sset *css;
52  struct sset *ss;
53  chr *post;
54  int i;
55  struct colormap *cm = d->cm;
56 
57  /* prevent "uninitialized variable" warnings */
58  if (hitstopp != NULL)
59  *hitstopp = 0;
60 
61  /* if this is a backref to a known string, just match against that */
62  if (d->backno >= 0)
63  {
64  assert((size_t) d->backno < v->nmatch);
65  if (v->pmatch[d->backno].rm_so >= 0)
66  {
67  cp = dfa_backref(v, d, start, start, stop, false);
68  if (cp == v->stop && stop == v->stop && hitstopp != NULL)
69  *hitstopp = 1;
70  return cp;
71  }
72  }
73 
74  /* fast path for matchall NFAs */
75  if (d->cnfa->flags & MATCHALL)
76  {
77  size_t nchr = stop - start;
78  size_t maxmatchall = d->cnfa->maxmatchall;
79 
80  if (nchr < d->cnfa->minmatchall)
81  return NULL;
82  if (maxmatchall == DUPINF)
83  {
84  if (stop == v->stop && hitstopp != NULL)
85  *hitstopp = 1;
86  }
87  else
88  {
89  if (stop == v->stop && nchr <= maxmatchall + 1 && hitstopp != NULL)
90  *hitstopp = 1;
91  if (nchr > maxmatchall)
92  return start + maxmatchall;
93  }
94  return stop;
95  }
96 
97  /* initialize */
98  css = initialize(v, d, start);
99  if (css == NULL)
100  return NULL;
101  cp = start;
102 
103  /* startup */
104  FDEBUG(("+++ startup +++\n"));
105  if (cp == v->start)
106  {
107  co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
108  FDEBUG(("color %ld\n", (long) co));
109  }
110  else
111  {
112  co = GETCOLOR(cm, *(cp - 1));
113  FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
114  }
115  css = miss(v, d, css, co, cp, start);
116  if (css == NULL)
117  return NULL;
118  css->lastseen = cp;
119 
120  /*
121  * This is the main text-scanning loop. It seems worth having two copies
122  * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
123  * builds, when you're not actively tracing.
124  */
125 #ifdef REG_DEBUG
126  if (v->eflags & REG_FTRACE)
127  {
128  while (cp < realstop)
129  {
130  FDEBUG(("+++ at c%d +++\n", (int) (css - d->ssets)));
131  co = GETCOLOR(cm, *cp);
132  FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
133  ss = css->outs[co];
134  if (ss == NULL)
135  {
136  ss = miss(v, d, css, co, cp + 1, start);
137  if (ss == NULL)
138  break; /* NOTE BREAK OUT */
139  }
140  cp++;
141  ss->lastseen = cp;
142  css = ss;
143  }
144  }
145  else
146 #endif
147  {
148  while (cp < realstop)
149  {
150  co = GETCOLOR(cm, *cp);
151  ss = css->outs[co];
152  if (ss == NULL)
153  {
154  ss = miss(v, d, css, co, cp + 1, start);
155  if (ss == NULL)
156  break; /* NOTE BREAK OUT */
157  }
158  cp++;
159  ss->lastseen = cp;
160  css = ss;
161  }
162  }
163 
164  if (ISERR())
165  return NULL;
166 
167  /* shutdown */
168  FDEBUG(("+++ shutdown at c%d +++\n", (int) (css - d->ssets)));
169  if (cp == v->stop && stop == v->stop)
170  {
171  if (hitstopp != NULL)
172  *hitstopp = 1;
173  co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
174  FDEBUG(("color %ld\n", (long) co));
175  ss = miss(v, d, css, co, cp, start);
176  if (ISERR())
177  return NULL;
178  /* special case: match ended at eol? */
179  if (ss != NULL && (ss->flags & POSTSTATE))
180  return cp;
181  else if (ss != NULL)
182  ss->lastseen = cp; /* to be tidy */
183  }
184 
185  /* find last match, if any */
186  post = d->lastpost;
187  for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
188  if ((ss->flags & POSTSTATE) && post != ss->lastseen &&
189  (post == NULL || post < ss->lastseen))
190  post = ss->lastseen;
191  if (post != NULL) /* found one */
192  return post - 1;
193 
194  return NULL;
195 }
196 
197 /*
198  * shortest - shortest-preferred matching engine
199  *
200  * On success, returns match endpoint address. Returns NULL on no match.
201  * Internal errors also return NULL, with v->err set.
202  */
203 static chr *
204 shortest(struct vars *v,
205  struct dfa *d,
206  chr *start, /* where the match should start */
207  chr *min, /* match must end at or after here */
208  chr *max, /* match must end at or before here */
209  chr **coldp, /* store coldstart pointer here, if non-NULL */
210  int *hitstopp) /* record whether hit v->stop, if non-NULL */
211 {
212  chr *cp;
213  chr *realmin = (min == v->stop) ? min : min + 1;
214  chr *realmax = (max == v->stop) ? max : max + 1;
215  color co;
216  struct sset *css;
217  struct sset *ss;
218  struct colormap *cm = d->cm;
219 
220  /* prevent "uninitialized variable" warnings */
221  if (coldp != NULL)
222  *coldp = NULL;
223  if (hitstopp != NULL)
224  *hitstopp = 0;
225 
226  /* if this is a backref to a known string, just match against that */
227  if (d->backno >= 0)
228  {
229  assert((size_t) d->backno < v->nmatch);
230  if (v->pmatch[d->backno].rm_so >= 0)
231  {
232  cp = dfa_backref(v, d, start, min, max, true);
233  if (cp != NULL && coldp != NULL)
234  *coldp = start;
235  /* there is no case where we should set *hitstopp */
236  return cp;
237  }
238  }
239 
240  /* fast path for matchall NFAs */
241  if (d->cnfa->flags & MATCHALL)
242  {
243  size_t nchr = min - start;
244 
245  if (d->cnfa->maxmatchall != DUPINF &&
246  nchr > d->cnfa->maxmatchall)
247  return NULL;
248  if ((max - start) < d->cnfa->minmatchall)
249  return NULL;
250  if (nchr < d->cnfa->minmatchall)
251  min = start + d->cnfa->minmatchall;
252  if (coldp != NULL)
253  *coldp = start;
254  /* there is no case where we should set *hitstopp */
255  return min;
256  }
257 
258  /* initialize */
259  css = initialize(v, d, start);
260  if (css == NULL)
261  return NULL;
262  cp = start;
263 
264  /* startup */
265  FDEBUG(("--- startup ---\n"));
266  if (cp == v->start)
267  {
268  co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
269  FDEBUG(("color %ld\n", (long) co));
270  }
271  else
272  {
273  co = GETCOLOR(cm, *(cp - 1));
274  FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
275  }
276  css = miss(v, d, css, co, cp, start);
277  if (css == NULL)
278  return NULL;
279  css->lastseen = cp;
280  ss = css;
281 
282  /*
283  * This is the main text-scanning loop. It seems worth having two copies
284  * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
285  * builds, when you're not actively tracing.
286  */
287 #ifdef REG_DEBUG
288  if (v->eflags & REG_FTRACE)
289  {
290  while (cp < realmax)
291  {
292  FDEBUG(("--- at c%d ---\n", (int) (css - d->ssets)));
293  co = GETCOLOR(cm, *cp);
294  FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
295  ss = css->outs[co];
296  if (ss == NULL)
297  {
298  ss = miss(v, d, css, co, cp + 1, start);
299  if (ss == NULL)
300  break; /* NOTE BREAK OUT */
301  }
302  cp++;
303  ss->lastseen = cp;
304  css = ss;
305  if ((ss->flags & POSTSTATE) && cp >= realmin)
306  break; /* NOTE BREAK OUT */
307  }
308  }
309  else
310 #endif
311  {
312  while (cp < realmax)
313  {
314  co = GETCOLOR(cm, *cp);
315  ss = css->outs[co];
316  if (ss == NULL)
317  {
318  ss = miss(v, d, css, co, cp + 1, start);
319  if (ss == NULL)
320  break; /* NOTE BREAK OUT */
321  }
322  cp++;
323  ss->lastseen = cp;
324  css = ss;
325  if ((ss->flags & POSTSTATE) && cp >= realmin)
326  break; /* NOTE BREAK OUT */
327  }
328  }
329 
330  if (ss == NULL)
331  return NULL;
332 
333  if (coldp != NULL) /* report last no-progress state set, if any */
334  *coldp = lastcold(v, d);
335 
336  if ((ss->flags & POSTSTATE) && cp > min)
337  {
338  assert(cp >= realmin);
339  cp--;
340  }
341  else if (cp == v->stop && max == v->stop)
342  {
343  co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
344  FDEBUG(("color %ld\n", (long) co));
345  ss = miss(v, d, css, co, cp, start);
346  /* match might have ended at eol */
347  if ((ss == NULL || !(ss->flags & POSTSTATE)) && hitstopp != NULL)
348  *hitstopp = 1;
349  }
350 
351  if (ss == NULL || !(ss->flags & POSTSTATE))
352  return NULL;
353 
354  return cp;
355 }
356 
357 /*
358  * matchuntil - incremental matching engine
359  *
360  * This is meant for use with a search-style NFA (that is, the pattern is
361  * known to act as though it had a leading .*). We determine whether a
362  * match exists starting at v->start and ending at probe. Multiple calls
363  * require only O(N) time not O(N^2) so long as the probe values are
364  * nondecreasing. *lastcss and *lastcp must be initialized to NULL before
365  * starting a series of calls.
366  *
367  * Returns 1 if a match exists, 0 if not.
368  * Internal errors also return 0, with v->err set.
369  */
370 static int
371 matchuntil(struct vars *v,
372  struct dfa *d,
373  chr *probe, /* we want to know if a match ends here */
374  struct sset **lastcss, /* state storage across calls */
375  chr **lastcp) /* state storage across calls */
376 {
377  chr *cp = *lastcp;
378  color co;
379  struct sset *css = *lastcss;
380  struct sset *ss;
381  struct colormap *cm = d->cm;
382 
383  /* fast path for matchall NFAs */
384  if (d->cnfa->flags & MATCHALL)
385  {
386  size_t nchr = probe - v->start;
387 
388  /*
389  * It might seem that we should check maxmatchall too, but the .* at
390  * the front of the pattern absorbs any extra characters (and it was
391  * tacked on *after* computing minmatchall/maxmatchall). Thus, we
392  * should match if there are at least minmatchall characters.
393  */
394  if (nchr < d->cnfa->minmatchall)
395  return 0;
396  return 1;
397  }
398 
399  /* initialize and startup, or restart, if necessary */
400  if (cp == NULL || cp > probe)
401  {
402  cp = v->start;
403  css = initialize(v, d, cp);
404  if (css == NULL)
405  return 0;
406 
407  FDEBUG((">>> startup >>>\n"));
408  co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
409  FDEBUG(("color %ld\n", (long) co));
410 
411  css = miss(v, d, css, co, cp, v->start);
412  if (css == NULL)
413  return 0;
414  css->lastseen = cp;
415  }
416  else if (css == NULL)
417  {
418  /* we previously found that no match is possible beyond *lastcp */
419  return 0;
420  }
421  ss = css;
422 
423  /*
424  * This is the main text-scanning loop. It seems worth having two copies
425  * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
426  * builds, when you're not actively tracing.
427  */
428 #ifdef REG_DEBUG
429  if (v->eflags & REG_FTRACE)
430  {
431  while (cp < probe)
432  {
433  FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
434  co = GETCOLOR(cm, *cp);
435  FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
436  ss = css->outs[co];
437  if (ss == NULL)
438  {
439  ss = miss(v, d, css, co, cp + 1, v->start);
440  if (ss == NULL)
441  break; /* NOTE BREAK OUT */
442  }
443  cp++;
444  ss->lastseen = cp;
445  css = ss;
446  }
447  }
448  else
449 #endif
450  {
451  while (cp < probe)
452  {
453  co = GETCOLOR(cm, *cp);
454  ss = css->outs[co];
455  if (ss == NULL)
456  {
457  ss = miss(v, d, css, co, cp + 1, v->start);
458  if (ss == NULL)
459  break; /* NOTE BREAK OUT */
460  }
461  cp++;
462  ss->lastseen = cp;
463  css = ss;
464  }
465  }
466 
467  *lastcss = ss;
468  *lastcp = cp;
469 
470  if (ss == NULL)
471  return 0; /* impossible match, or internal error */
472 
473  /* We need to process one more chr, or the EOS symbol, to check match */
474  if (cp < v->stop)
475  {
476  FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
477  co = GETCOLOR(cm, *cp);
478  FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
479  ss = css->outs[co];
480  if (ss == NULL)
481  ss = miss(v, d, css, co, cp + 1, v->start);
482  }
483  else
484  {
485  assert(cp == v->stop);
486  co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
487  FDEBUG(("color %ld\n", (long) co));
488  ss = miss(v, d, css, co, cp, v->start);
489  }
490 
491  if (ss == NULL || !(ss->flags & POSTSTATE))
492  return 0;
493 
494  return 1;
495 }
496 
497 /*
498  * dfa_backref - find best match length for a known backref string
499  *
500  * When the backref's referent is already available, we can deliver an exact
501  * answer with considerably less work than running the backref node's NFA.
502  *
503  * Return match endpoint for longest or shortest valid repeated match,
504  * or NULL if there is no valid match.
505  *
506  * Should be in sync with cbrdissect(), although that has the different task
507  * of checking a match to a predetermined section of the string.
508  */
509 static chr *
511  struct dfa *d,
512  chr *start, /* where the match should start */
513  chr *min, /* match must end at or after here */
514  chr *max, /* match must end at or before here */
515  bool shortest)
516 {
517  int n = d->backno;
518  int backmin = d->backmin;
519  int backmax = d->backmax;
520  size_t numreps;
521  size_t minreps;
522  size_t maxreps;
523  size_t brlen;
524  chr *brstring;
525  chr *p;
526 
527  /* get the backreferenced string (caller should have checked this) */
528  if (v->pmatch[n].rm_so == -1)
529  return NULL;
530  brstring = v->start + v->pmatch[n].rm_so;
531  brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
532 
533  /* special-case zero-length backreference to avoid divide by zero */
534  if (brlen == 0)
535  {
536  /*
537  * matches only a zero-length string, but any number of repetitions
538  * can be considered to be present
539  */
540  if (min == start && backmin <= backmax)
541  return start;
542  return NULL;
543  }
544 
545  /*
546  * convert min and max into numbers of possible repetitions of the backref
547  * string, rounding appropriately
548  */
549  if (min <= start)
550  minreps = 0;
551  else
552  minreps = (min - start - 1) / brlen + 1;
553  maxreps = (max - start) / brlen;
554 
555  /* apply bounds, then see if there is any allowed match length */
556  if (minreps < backmin)
557  minreps = backmin;
558  if (backmax != DUPINF && maxreps > backmax)
559  maxreps = backmax;
560  if (maxreps < minreps)
561  return NULL;
562 
563  /* quick exit if zero-repetitions match is valid and preferred */
564  if (shortest && minreps == 0)
565  return start;
566 
567  /* okay, compare the actual string contents */
568  p = start;
569  numreps = 0;
570  while (numreps < maxreps)
571  {
572  if ((*v->g->compare) (brstring, p, brlen) != 0)
573  break;
574  p += brlen;
575  numreps++;
576  if (shortest && numreps >= minreps)
577  break;
578  }
579 
580  if (numreps >= minreps)
581  return p;
582  return NULL;
583 }
584 
585 /*
586  * lastcold - determine last point at which no progress had been made
587  */
588 static chr * /* endpoint, or NULL */
589 lastcold(struct vars *v,
590  struct dfa *d)
591 {
592  struct sset *ss;
593  chr *nopr;
594  int i;
595 
596  nopr = d->lastnopr;
597  if (nopr == NULL)
598  nopr = v->start;
599  for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
600  if ((ss->flags & NOPROGRESS) && nopr < ss->lastseen)
601  nopr = ss->lastseen;
602  return nopr;
603 }
604 
605 /*
606  * newdfa - set up a fresh DFA
607  *
608  * Returns NULL (and sets v->err) on failure.
609  */
610 static struct dfa *
611 newdfa(struct vars *v,
612  struct cnfa *cnfa,
613  struct colormap *cm,
614  struct smalldfa *sml) /* preallocated space, may be NULL */
615 {
616  struct dfa *d;
617  size_t nss = cnfa->nstates * 2;
618  int wordsper = (cnfa->nstates + UBITS - 1) / UBITS;
619  bool ismalloced = false;
620 
621  assert(cnfa != NULL && cnfa->nstates != 0);
622 
623  if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS)
624  {
625  assert(wordsper == 1);
626  if (sml == NULL)
627  {
628  sml = (struct smalldfa *) MALLOC(sizeof(struct smalldfa));
629  if (sml == NULL)
630  {
631  ERR(REG_ESPACE);
632  return NULL;
633  }
634  ismalloced = true;
635  }
636  d = &sml->dfa;
637  d->ssets = sml->ssets;
638  d->statesarea = sml->statesarea;
639  d->work = &d->statesarea[nss];
640  d->outsarea = sml->outsarea;
641  d->incarea = sml->incarea;
642  d->ismalloced = ismalloced;
643  d->arraysmalloced = false; /* not separately allocated, anyway */
644  }
645  else
646  {
647  d = (struct dfa *) MALLOC(sizeof(struct dfa));
648  if (d == NULL)
649  {
650  ERR(REG_ESPACE);
651  return NULL;
652  }
653  d->ssets = (struct sset *) MALLOC(nss * sizeof(struct sset));
654  d->statesarea = (unsigned *) MALLOC((nss + WORK) * wordsper *
655  sizeof(unsigned));
656  d->work = &d->statesarea[nss * wordsper];
657  d->outsarea = (struct sset **) MALLOC(nss * cnfa->ncolors *
658  sizeof(struct sset *));
659  d->incarea = (struct arcp *) MALLOC(nss * cnfa->ncolors *
660  sizeof(struct arcp));
661  d->ismalloced = true;
662  d->arraysmalloced = true;
663  /* now freedfa() will behave sanely */
664  if (d->ssets == NULL || d->statesarea == NULL ||
665  d->outsarea == NULL || d->incarea == NULL)
666  {
667  freedfa(d);
668  ERR(REG_ESPACE);
669  return NULL;
670  }
671  }
672 
673  d->nssets = (v->eflags & REG_SMALL) ? 7 : nss;
674  d->nssused = 0;
675  d->nstates = cnfa->nstates;
676  d->ncolors = cnfa->ncolors;
677  d->wordsper = wordsper;
678  d->cnfa = cnfa;
679  d->cm = cm;
680  d->lastpost = NULL;
681  d->lastnopr = NULL;
682  d->search = d->ssets;
683  d->backno = -1; /* may be set by caller */
684  d->backmin = d->backmax = 0;
685 
686  /* initialization of sset fields is done as needed */
687 
688  return d;
689 }
690 
691 /*
692  * freedfa - free a DFA
693  */
694 static void
695 freedfa(struct dfa *d)
696 {
697  if (d->arraysmalloced)
698  {
699  if (d->ssets != NULL)
700  FREE(d->ssets);
701  if (d->statesarea != NULL)
702  FREE(d->statesarea);
703  if (d->outsarea != NULL)
704  FREE(d->outsarea);
705  if (d->incarea != NULL)
706  FREE(d->incarea);
707  }
708 
709  if (d->ismalloced)
710  FREE(d);
711 }
712 
713 /*
714  * hash - construct a hash code for a bitvector
715  *
716  * There are probably better ways, but they're more expensive.
717  */
718 static unsigned
719 hash(unsigned *uv,
720  int n)
721 {
722  int i;
723  unsigned h;
724 
725  h = 0;
726  for (i = 0; i < n; i++)
727  h ^= uv[i];
728  return h;
729 }
730 
731 /*
732  * initialize - hand-craft a cache entry for startup, otherwise get ready
733  */
734 static struct sset *
735 initialize(struct vars *v,
736  struct dfa *d,
737  chr *start)
738 {
739  struct sset *ss;
740  int i;
741 
742  /* is previous one still there? */
743  if (d->nssused > 0 && (d->ssets[0].flags & STARTER))
744  ss = &d->ssets[0];
745  else
746  { /* no, must (re)build it */
747  ss = getvacant(v, d, start, start);
748  if (ss == NULL)
749  return NULL;
750  for (i = 0; i < d->wordsper; i++)
751  ss->states[i] = 0;
752  BSET(ss->states, d->cnfa->pre);
753  ss->hash = HASH(ss->states, d->wordsper);
754  assert(d->cnfa->pre != d->cnfa->post);
755  ss->flags = STARTER | LOCKED | NOPROGRESS;
756  /* lastseen dealt with below */
757  }
758 
759  for (i = 0; i < d->nssused; i++)
760  d->ssets[i].lastseen = NULL;
761  ss->lastseen = start; /* maybe untrue, but harmless */
762  d->lastpost = NULL;
763  d->lastnopr = NULL;
764  return ss;
765 }
766 
767 /*
768  * miss - handle a stateset cache miss
769  *
770  * css is the current stateset, co is the color of the current input character,
771  * cp points to the character after that (which is where we may need to test
772  * LACONs). start does not affect matching behavior but is needed for pickss'
773  * heuristics about which stateset cache entry to replace.
774  *
775  * Ordinarily, returns the address of the next stateset (the one that is
776  * valid after consuming the input character). Returns NULL if no valid
777  * NFA states remain, ie we have a certain match failure.
778  * Internal errors also return NULL, with v->err set.
779  */
780 static struct sset *
781 miss(struct vars *v,
782  struct dfa *d,
783  struct sset *css,
784  color co,
785  chr *cp, /* next chr */
786  chr *start) /* where the attempt got started */
787 {
788  struct cnfa *cnfa = d->cnfa;
789  int i;
790  unsigned h;
791  struct carc *ca;
792  struct sset *p;
793  int ispseudocolor;
794  int ispost;
795  int noprogress;
796  int gotstate;
797  int dolacons;
798  int sawlacons;
799 
800  /* for convenience, we can be called even if it might not be a miss */
801  if (css->outs[co] != NULL)
802  {
803  FDEBUG(("hit\n"));
804  return css->outs[co];
805  }
806  FDEBUG(("miss\n"));
807 
808  /*
809  * Checking for operation cancel in the inner text search loop seems
810  * unduly expensive. As a compromise, check during cache misses.
811  */
812  if (CANCEL_REQUESTED(v->re))
813  {
814  ERR(REG_CANCEL);
815  return NULL;
816  }
817 
818  /*
819  * What set of states would we end up in after consuming the co character?
820  * We first consider PLAIN arcs that consume the character, and then look
821  * to see what LACON arcs could be traversed after consuming it.
822  */
823  for (i = 0; i < d->wordsper; i++)
824  d->work[i] = 0; /* build new stateset bitmap in d->work */
825  ispseudocolor = d->cm->cd[co].flags & PSEUDO;
826  ispost = 0;
827  noprogress = 1;
828  gotstate = 0;
829  for (i = 0; i < d->nstates; i++)
830  if (ISBSET(css->states, i))
831  for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
832  if (ca->co == co ||
833  (ca->co == RAINBOW && !ispseudocolor))
834  {
835  BSET(d->work, ca->to);
836  gotstate = 1;
837  if (ca->to == cnfa->post)
838  ispost = 1;
839  if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
840  noprogress = 0;
841  FDEBUG(("%d -> %d\n", i, ca->to));
842  }
843  if (!gotstate)
844  return NULL; /* character cannot reach any new state */
845  dolacons = (cnfa->flags & HASLACONS);
846  sawlacons = 0;
847  /* outer loop handles transitive closure of reachable-by-LACON states */
848  while (dolacons)
849  {
850  dolacons = 0;
851  for (i = 0; i < d->nstates; i++)
852  if (ISBSET(d->work, i))
853  for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
854  {
855  if (ca->co < cnfa->ncolors)
856  continue; /* not a LACON arc */
857  if (ISBSET(d->work, ca->to))
858  continue; /* arc would be a no-op anyway */
859  sawlacons = 1; /* this LACON affects our result */
860  if (!lacon(v, cnfa, cp, ca->co))
861  {
862  if (ISERR())
863  return NULL;
864  continue; /* LACON arc cannot be traversed */
865  }
866  if (ISERR())
867  return NULL;
868  BSET(d->work, ca->to);
869  dolacons = 1;
870  if (ca->to == cnfa->post)
871  ispost = 1;
872  if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
873  noprogress = 0;
874  FDEBUG(("%d :> %d\n", i, ca->to));
875  }
876  }
877  h = HASH(d->work, d->wordsper);
878 
879  /* Is this stateset already in the cache? */
880  for (p = d->ssets, i = d->nssused; i > 0; p++, i--)
881  if (HIT(h, d->work, p, d->wordsper))
882  {
883  FDEBUG(("cached c%d\n", (int) (p - d->ssets)));
884  break; /* NOTE BREAK OUT */
885  }
886  if (i == 0)
887  { /* nope, need a new cache entry */
888  p = getvacant(v, d, cp, start);
889  if (p == NULL)
890  return NULL;
891  assert(p != css);
892  for (i = 0; i < d->wordsper; i++)
893  p->states[i] = d->work[i];
894  p->hash = h;
895  p->flags = (ispost) ? POSTSTATE : 0;
896  if (noprogress)
897  p->flags |= NOPROGRESS;
898  /* lastseen to be dealt with by caller */
899  }
900 
901  /*
902  * Link new stateset to old, unless a LACON affected the result, in which
903  * case we don't create the link. That forces future transitions across
904  * this same arc (same prior stateset and character color) to come through
905  * miss() again, so that we can recheck the LACON(s), which might or might
906  * not pass since context will be different.
907  */
908  if (!sawlacons)
909  {
910  FDEBUG(("c%d[%d]->c%d\n",
911  (int) (css - d->ssets), co, (int) (p - d->ssets)));
912  css->outs[co] = p;
913  css->inchain[co] = p->ins;
914  p->ins.ss = css;
915  p->ins.co = co;
916  }
917  return p;
918 }
919 
920 /*
921  * lacon - lookaround-constraint checker for miss()
922  */
923 static int /* predicate: constraint satisfied? */
924 lacon(struct vars *v,
925  struct cnfa *pcnfa, /* parent cnfa */
926  chr *cp,
927  color co) /* "color" of the lookaround constraint */
928 {
929  int n;
930  struct subre *sub;
931  struct dfa *d;
932  chr *end;
933  int satisfied;
934 
935  /* Since this is recursive, it could be driven to stack overflow */
936  if (STACK_TOO_DEEP(v->re))
937  {
938  ERR(REG_ETOOBIG);
939  return 0;
940  }
941 
942  n = co - pcnfa->ncolors;
943  assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
944  FDEBUG(("=== testing lacon %d\n", n));
945  sub = &v->g->lacons[n];
946  d = getladfa(v, n);
947  if (d == NULL)
948  return 0;
949  if (LATYPE_IS_AHEAD(sub->latype))
950  {
951  /* used to use longest() here, but shortest() could be much cheaper */
952  end = shortest(v, d, cp, cp, v->stop,
953  (chr **) NULL, (int *) NULL);
954  satisfied = LATYPE_IS_POS(sub->latype) ? (end != NULL) : (end == NULL);
955  }
956  else
957  {
958  /*
959  * To avoid doing O(N^2) work when repeatedly testing a lookbehind
960  * constraint in an N-character string, we use matchuntil() which can
961  * cache the DFA state across calls. We only need to restart if the
962  * probe point decreases, which is not common. The NFA we're using is
963  * a search NFA, so it doesn't mind scanning over stuff before the
964  * nominal match.
965  */
966  satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]);
967  if (!LATYPE_IS_POS(sub->latype))
968  satisfied = !satisfied;
969  }
970  FDEBUG(("=== lacon %d satisfied %d\n", n, satisfied));
971  return satisfied;
972 }
973 
974 /*
975  * getvacant - get a vacant state set
976  *
977  * This routine clears out the inarcs and outarcs, but does not otherwise
978  * clear the innards of the state set -- that's up to the caller.
979  */
980 static struct sset *
981 getvacant(struct vars *v,
982  struct dfa *d,
983  chr *cp,
984  chr *start)
985 {
986  int i;
987  struct sset *ss;
988  struct sset *p;
989  struct arcp ap;
990  color co;
991 
992  ss = pickss(v, d, cp, start);
993  if (ss == NULL)
994  return NULL;
995  assert(!(ss->flags & LOCKED));
996 
997  /* clear out its inarcs, including self-referential ones */
998  ap = ss->ins;
999  while ((p = ap.ss) != NULL)
1000  {
1001  co = ap.co;
1002  FDEBUG(("zapping c%d's %ld outarc\n", (int) (p - d->ssets), (long) co));
1003  p->outs[co] = NULL;
1004  ap = p->inchain[co];
1005  p->inchain[co].ss = NULL; /* paranoia */
1006  }
1007  ss->ins.ss = NULL;
1008 
1009  /* take it off the inarc chains of the ssets reached by its outarcs */
1010  for (i = 0; i < d->ncolors; i++)
1011  {
1012  p = ss->outs[i];
1013  assert(p != ss); /* not self-referential */
1014  if (p == NULL)
1015  continue; /* NOTE CONTINUE */
1016  FDEBUG(("del outarc %d from c%d's in chn\n", i, (int) (p - d->ssets)));
1017  if (p->ins.ss == ss && p->ins.co == i)
1018  p->ins = ss->inchain[i];
1019  else
1020  {
1021  struct arcp lastap = {NULL, 0};
1022 
1023  assert(p->ins.ss != NULL);
1024  for (ap = p->ins; ap.ss != NULL &&
1025  !(ap.ss == ss && ap.co == i);
1026  ap = ap.ss->inchain[ap.co])
1027  lastap = ap;
1028  assert(ap.ss != NULL);
1029  lastap.ss->inchain[lastap.co] = ss->inchain[i];
1030  }
1031  ss->outs[i] = NULL;
1032  ss->inchain[i].ss = NULL;
1033  }
1034 
1035  /* if ss was a success state, may need to remember location */
1036  if ((ss->flags & POSTSTATE) && ss->lastseen != d->lastpost &&
1037  (d->lastpost == NULL || d->lastpost < ss->lastseen))
1038  d->lastpost = ss->lastseen;
1039 
1040  /* likewise for a no-progress state */
1041  if ((ss->flags & NOPROGRESS) && ss->lastseen != d->lastnopr &&
1042  (d->lastnopr == NULL || d->lastnopr < ss->lastseen))
1043  d->lastnopr = ss->lastseen;
1044 
1045  return ss;
1046 }
1047 
1048 /*
1049  * pickss - pick the next stateset to be used
1050  */
1051 static struct sset *
1052 pickss(struct vars *v,
1053  struct dfa *d,
1054  chr *cp,
1055  chr *start)
1056 {
1057  int i;
1058  struct sset *ss;
1059  struct sset *end;
1060  chr *ancient;
1061 
1062  /* shortcut for cases where cache isn't full */
1063  if (d->nssused < d->nssets)
1064  {
1065  i = d->nssused;
1066  d->nssused++;
1067  ss = &d->ssets[i];
1068  FDEBUG(("new c%d\n", i));
1069  /* set up innards */
1070  ss->states = &d->statesarea[i * d->wordsper];
1071  ss->flags = 0;
1072  ss->ins.ss = NULL;
1073  ss->ins.co = WHITE; /* give it some value */
1074  ss->outs = &d->outsarea[i * d->ncolors];
1075  ss->inchain = &d->incarea[i * d->ncolors];
1076  for (i = 0; i < d->ncolors; i++)
1077  {
1078  ss->outs[i] = NULL;
1079  ss->inchain[i].ss = NULL;
1080  }
1081  return ss;
1082  }
1083 
1084  /* look for oldest, or old enough anyway */
1085  if (cp - start > d->nssets * 2 / 3) /* oldest 33% are expendable */
1086  ancient = cp - d->nssets * 2 / 3;
1087  else
1088  ancient = start;
1089  for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++)
1090  if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
1091  !(ss->flags & LOCKED))
1092  {
1093  d->search = ss + 1;
1094  FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
1095  return ss;
1096  }
1097  for (ss = d->ssets, end = d->search; ss < end; ss++)
1098  if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
1099  !(ss->flags & LOCKED))
1100  {
1101  d->search = ss + 1;
1102  FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
1103  return ss;
1104  }
1105 
1106  /* nobody's old enough?!? -- something's really wrong */
1107  FDEBUG(("cannot find victim to replace!\n"));
1108  ERR(REG_ASSERT);
1109  return NULL;
1110 }
struct sset * ssets
Definition: regexec.c:70
#define STARTER
Definition: regexec.c:53
#define LATYPE_IS_POS(la)
Definition: regguts.h:103
static int matchuntil(struct vars *v, struct dfa *d, chr *probe, struct sset **lastcss, chr **lastcp)
Definition: rege_dfa.c:371
chr * lastseen
Definition: regexec.c:58
#define REG_NOTBOL
Definition: regex.h:126
struct sset ** outs
Definition: regexec.c:59
#define ERR
Definition: _int.h:161
short backmin
Definition: regexec.c:81
struct arcp * incarea
Definition: regexec.c:74
int maxmatchall
Definition: regguts.h:419
struct subre * lacons
Definition: regguts.h:538
struct colormap * cm
Definition: regexec.c:76
#define CNFA_NOPROGRESS
Definition: regguts.h:413
#define REG_ESPACE
Definition: regex.h:151
struct cnfa * cnfa
Definition: regexec.c:75
#define RAINBOW
Definition: regguts.h:154
regoff_t rm_so
Definition: regex.h:87
Definition: regguts.h:401
short color
Definition: regguts.h:150
static chr * dfa_backref(struct vars *v, struct dfa *d, chr *start, chr *min, chr *max, bool shortest)
Definition: rege_dfa.c:510
int pre
Definition: regguts.h:408
#define WORK
Definition: regexec.c:87
static struct sset * pickss(struct vars *v, struct dfa *d, chr *cp, chr *start)
Definition: rege_dfa.c:1052
#define REG_FTRACE
Definition: regex.h:129
unsigned * work
Definition: regexec.c:72
#define FREE(ptr)
Definition: cryptohash.c:37
struct sset * search
Definition: regexec.c:79
static void freedfa(struct dfa *d)
Definition: rege_dfa.c:695
#define ISERR()
Definition: regcomp.c:273
static struct sset * initialize(struct vars *v, struct dfa *d, chr *start)
Definition: rege_dfa.c:735
char latype
Definition: regguts.h:489
#define FEWCOLORS
Definition: regexec.c:91
struct carc ** states
Definition: regguts.h:414
struct colordesc * cd
Definition: regguts.h:231
#define HIT(h, bv, ss, nw)
Definition: regexec.c:50
chr * start
Definition: regexec.c:114
regoff_t rm_eo
Definition: regex.h:88
#define MALLOC(n)
Definition: regcustom.h:60
Definition: regexec.c:63
struct sset ** lblastcss
Definition: regexec.c:120
static struct sset * getvacant(struct vars *v, struct dfa *d, chr *cp, chr *start)
Definition: rege_dfa.c:981
int nstates
Definition: regguts.h:403
static struct dfa * newdfa(struct vars *v, struct cnfa *cnfa, struct colormap *cm, struct smalldfa *sml)
Definition: rege_dfa.c:611
#define HASH(sign, val, siglen)
Definition: hstore_gist.c:45
pg_wchar chr
Definition: regcustom.h:66
unsigned statesarea[FEWSTATES *2+WORK]
Definition: regexec.c:96
unsigned * states
Definition: regexec.c:47
static chr * lastcold(struct vars *v, struct dfa *d)
Definition: rege_dfa.c:589
bool ismalloced
Definition: regexec.c:83
regex_t * re
Definition: regcomp.c:239
struct arcp * inchain
Definition: regexec.c:60
struct dfa dfa
Definition: regexec.c:94
#define NOPROGRESS
Definition: regexec.c:56
bool arraysmalloced
Definition: regexec.c:84
#define assert(TEST)
Definition: imath.c:73
int ncolors
Definition: regexec.c:68
#define LATYPE_IS_AHEAD(la)
Definition: regguts.h:104
char * stflags
Definition: regguts.h:412
int nstates
Definition: regexec.c:67
int nssets
Definition: regexec.c:65
int nssused
Definition: regexec.c:66
int wordsper
Definition: regexec.c:69
struct guts * g
Definition: regexec.c:109
#define MATCHALL
Definition: regguts.h:407
int eflags
Definition: regexec.c:110
#define UBITS
Definition: regguts.h:125
color bos[2]
Definition: regguts.h:410
struct arcp incarea[FEWSTATES *2 *FEWCOLORS]
Definition: regexec.c:98
chr * lastnopr
Definition: regexec.c:78
#define REG_SMALL
Definition: regex.h:131
#define FDEBUG(arglist)
Definition: regguts.h:116
static struct sset * miss(struct vars *v, struct dfa *d, struct sset *css, color co, chr *cp, chr *start)
Definition: rege_dfa.c:781
static chr * shortest(struct vars *v, struct dfa *d, chr *start, chr *min, chr *max, chr **coldp, int *hitstopp)
Definition: rege_dfa.c:204
Definition: regexec.c:39
int to
Definition: regguts.h:398
struct sset ssets[FEWSTATES *2]
Definition: regexec.c:95
#define DUPINF
Definition: regguts.h:94
#define REG_ASSERT
Definition: regex.h:153
#define ISBSET(uv, sn)
Definition: regguts.h:127
int backno
Definition: regexec.c:80
int flags
Definition: regguts.h:179
regmatch_t * pmatch
Definition: regexec.c:112
int flags
Definition: regguts.h:405
short backmax
Definition: regexec.c:82
struct sset * outsarea[FEWSTATES *2 *FEWCOLORS]
Definition: regexec.c:97
#define POSTSTATE
Definition: regexec.c:54
#define REG_ETOOBIG
Definition: regex.h:157
#define WHITE
Definition: regguts.h:155
int minmatchall
Definition: regguts.h:418
static chr * longest(struct vars *v, struct dfa *d, chr *start, chr *stop, int *hitstopp)
Definition: rege_dfa.c:42
Definition: regguts.h:471
size_t nmatch
Definition: regexec.c:111
struct arcp ins
Definition: regexec.c:57
int post
Definition: regguts.h:409
const chr * stop
Definition: regcomp.c:241
struct vars * v
Definition: regguts.h:227
#define BSET(uv, sn)
Definition: regguts.h:126
struct sset ** outsarea
Definition: regexec.c:73
#define PSEUDO
Definition: regguts.h:181
#define LOCKED
Definition: regexec.c:55
color co
Definition: regguts.h:397
Definition: regexec.c:45
#define CANCEL_REQUESTED(re)
Definition: regguts.h:516
int i
#define COLORLESS
Definition: regguts.h:153
chr * lastpost
Definition: regexec.c:77
chr ** lblastcp
Definition: regexec.c:121
static int lacon(struct vars *v, struct cnfa *pcnfa, chr *cp, color co)
Definition: rege_dfa.c:924
unsigned hash
Definition: regexec.c:48
struct sset * ss
Definition: regexec.c:41
unsigned * statesarea
Definition: regexec.c:71
static struct dfa * getladfa(struct vars *, int)
Definition: regexec.c:374
int flags
Definition: regexec.c:52
Definition: regcomp.c:237
static unsigned hash(unsigned *uv, int n)
Definition: rege_dfa.c:719
#define HASLACONS
Definition: regguts.h:406
#define REG_NOTEOL
Definition: regex.h:127
#define GETCOLOR(cm, c)
Definition: regguts.h:252
size_t max
Definition: regguts.h:229
#define STACK_TOO_DEEP(re)
Definition: regguts.h:519
color eos[2]
Definition: regguts.h:411
int ncolors
Definition: regguts.h:404
color co
Definition: regexec.c:42
#define REG_CANCEL
Definition: regex.h:159
Definition: regguts.h:395