PostgreSQL Source Code  git master
regc_nfa.c
Go to the documentation of this file.
1 /*
2  * NFA utilities.
3  * This file is #included by regcomp.c.
4  *
5  * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6  *
7  * Development of this software was funded, in part, by Cray Research Inc.,
8  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9  * Corporation, none of whom are responsible for the results. The author
10  * thanks all of them.
11  *
12  * Redistribution and use in source and binary forms -- with or without
13  * modification -- are permitted for any purpose, provided that
14  * redistributions in source form retain this entire copyright notice and
15  * indicate the origin and nature of any modifications.
16  *
17  * I'd appreciate being given credit for this package in the documentation
18  * of software which uses it, but that is not a requirement.
19  *
20  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  * src/backend/regex/regc_nfa.c
32  *
33  *
34  * One or two things that technically ought to be in here
35  * are actually in color.c, thanks to some incestuous relationships in
36  * the color chains.
37  */
38 
39 #define NISERR() VISERR(nfa->v)
40 #define NERR(e) VERR(nfa->v, (e))
41 
42 
43 /*
44  * newnfa - set up an NFA
45  */
46 static struct nfa * /* the NFA, or NULL */
47 newnfa(struct vars *v,
48  struct colormap *cm,
49  struct nfa *parent) /* NULL if primary NFA */
50 {
51  struct nfa *nfa;
52 
53  nfa = (struct nfa *) MALLOC(sizeof(struct nfa));
54  if (nfa == NULL)
55  {
56  ERR(REG_ESPACE);
57  return NULL;
58  }
59 
60  /* Make the NFA minimally valid, so freenfa() will behave sanely */
61  nfa->states = NULL;
62  nfa->slast = NULL;
63  nfa->freestates = NULL;
64  nfa->freearcs = NULL;
65  nfa->lastsb = NULL;
66  nfa->lastab = NULL;
67  nfa->lastsbused = 0;
68  nfa->lastabused = 0;
69  nfa->nstates = 0;
70  nfa->cm = cm;
71  nfa->v = v;
72  nfa->bos[0] = nfa->bos[1] = COLORLESS;
73  nfa->eos[0] = nfa->eos[1] = COLORLESS;
74  nfa->flags = 0;
75  nfa->minmatchall = nfa->maxmatchall = -1;
76  nfa->parent = parent; /* Precedes newfstate so parent is valid. */
77 
78  /* Create required infrastructure */
79  nfa->post = newfstate(nfa, '@'); /* number 0 */
80  nfa->pre = newfstate(nfa, '>'); /* number 1 */
81  nfa->init = newstate(nfa); /* may become invalid later */
82  nfa->final = newstate(nfa);
83  if (ISERR())
84  {
85  freenfa(nfa);
86  return NULL;
87  }
88  rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->pre, nfa->init);
89  newarc(nfa, '^', 1, nfa->pre, nfa->init);
90  newarc(nfa, '^', 0, nfa->pre, nfa->init);
91  rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->final, nfa->post);
92  newarc(nfa, '$', 1, nfa->final, nfa->post);
93  newarc(nfa, '$', 0, nfa->final, nfa->post);
94 
95  if (ISERR())
96  {
97  freenfa(nfa);
98  return NULL;
99  }
100  return nfa;
101 }
102 
103 /*
104  * freenfa - free an entire NFA
105  */
106 static void
107 freenfa(struct nfa *nfa)
108 {
109  struct statebatch *sb;
110  struct statebatch *sbnext;
111  struct arcbatch *ab;
112  struct arcbatch *abnext;
113 
114  for (sb = nfa->lastsb; sb != NULL; sb = sbnext)
115  {
116  sbnext = sb->next;
117  nfa->v->spaceused -= STATEBATCHSIZE(sb->nstates);
118  FREE(sb);
119  }
120  nfa->lastsb = NULL;
121  for (ab = nfa->lastab; ab != NULL; ab = abnext)
122  {
123  abnext = ab->next;
124  nfa->v->spaceused -= ARCBATCHSIZE(ab->narcs);
125  FREE(ab);
126  }
127  nfa->lastab = NULL;
128 
129  nfa->nstates = -1;
130  FREE(nfa);
131 }
132 
133 /*
134  * newstate - allocate an NFA state, with zero flag value
135  */
136 static struct state * /* NULL on error */
137 newstate(struct nfa *nfa)
138 {
139  struct state *s;
140 
141  /*
142  * This is a handy place to check for operation cancel during regex
143  * compilation, since no code path will go very long without making a new
144  * state or arc.
145  */
146  if (CANCEL_REQUESTED(nfa->v->re))
147  {
148  NERR(REG_CANCEL);
149  return NULL;
150  }
151 
152  /* first, recycle anything that's on the freelist */
153  if (nfa->freestates != NULL)
154  {
155  s = nfa->freestates;
156  nfa->freestates = s->next;
157  }
158  /* otherwise, is there anything left in the last statebatch? */
159  else if (nfa->lastsb != NULL && nfa->lastsbused < nfa->lastsb->nstates)
160  {
161  s = &nfa->lastsb->s[nfa->lastsbused++];
162  }
163  /* otherwise, need to allocate a new statebatch */
164  else
165  {
166  struct statebatch *newSb;
167  size_t nstates;
168 
169  if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
170  {
171  NERR(REG_ETOOBIG);
172  return NULL;
173  }
174  nstates = (nfa->lastsb != NULL) ? nfa->lastsb->nstates * 2 : FIRSTSBSIZE;
175  if (nstates > MAXSBSIZE)
176  nstates = MAXSBSIZE;
177  newSb = (struct statebatch *) MALLOC(STATEBATCHSIZE(nstates));
178  if (newSb == NULL)
179  {
180  NERR(REG_ESPACE);
181  return NULL;
182  }
183  nfa->v->spaceused += STATEBATCHSIZE(nstates);
184  newSb->nstates = nstates;
185  newSb->next = nfa->lastsb;
186  nfa->lastsb = newSb;
187  nfa->lastsbused = 1;
188  s = &newSb->s[0];
189  }
190 
191  assert(nfa->nstates >= 0);
192  s->no = nfa->nstates++;
193  s->flag = 0;
194  if (nfa->states == NULL)
195  nfa->states = s;
196  s->nins = 0;
197  s->ins = NULL;
198  s->nouts = 0;
199  s->outs = NULL;
200  s->tmp = NULL;
201  s->next = NULL;
202  if (nfa->slast != NULL)
203  {
204  assert(nfa->slast->next == NULL);
205  nfa->slast->next = s;
206  }
207  s->prev = nfa->slast;
208  nfa->slast = s;
209  return s;
210 }
211 
212 /*
213  * newfstate - allocate an NFA state with a specified flag value
214  */
215 static struct state * /* NULL on error */
216 newfstate(struct nfa *nfa, int flag)
217 {
218  struct state *s;
219 
220  s = newstate(nfa);
221  if (s != NULL)
222  s->flag = (char) flag;
223  return s;
224 }
225 
226 /*
227  * dropstate - delete a state's inarcs and outarcs and free it
228  */
229 static void
230 dropstate(struct nfa *nfa,
231  struct state *s)
232 {
233  struct arc *a;
234 
235  while ((a = s->ins) != NULL)
236  freearc(nfa, a);
237  while ((a = s->outs) != NULL)
238  freearc(nfa, a);
239  freestate(nfa, s);
240 }
241 
242 /*
243  * freestate - free a state, which has no in-arcs or out-arcs
244  */
245 static void
246 freestate(struct nfa *nfa,
247  struct state *s)
248 {
249  assert(s != NULL);
250  assert(s->nins == 0 && s->nouts == 0);
251 
252  s->no = FREESTATE;
253  s->flag = 0;
254  if (s->next != NULL)
255  s->next->prev = s->prev;
256  else
257  {
258  assert(s == nfa->slast);
259  nfa->slast = s->prev;
260  }
261  if (s->prev != NULL)
262  s->prev->next = s->next;
263  else
264  {
265  assert(s == nfa->states);
266  nfa->states = s->next;
267  }
268  s->prev = NULL;
269  s->next = nfa->freestates; /* don't delete it, put it on the free list */
270  nfa->freestates = s;
271 }
272 
273 /*
274  * newarc - set up a new arc within an NFA
275  *
276  * This function checks to make sure that no duplicate arcs are created.
277  * In general we never want duplicates.
278  *
279  * However: in principle, a RAINBOW arc is redundant with any plain arc
280  * (unless that arc is for a pseudocolor). But we don't try to recognize
281  * that redundancy, either here or in allied operations such as moveins().
282  * The pseudocolor consideration makes that more costly than it seems worth.
283  */
284 static void
285 newarc(struct nfa *nfa,
286  int t,
287  color co,
288  struct state *from,
289  struct state *to)
290 {
291  struct arc *a;
292 
293  assert(from != NULL && to != NULL);
294 
295  /*
296  * This is a handy place to check for operation cancel during regex
297  * compilation, since no code path will go very long without making a new
298  * state or arc.
299  */
300  if (CANCEL_REQUESTED(nfa->v->re))
301  {
302  NERR(REG_CANCEL);
303  return;
304  }
305 
306  /* check for duplicate arc, using whichever chain is shorter */
307  if (from->nouts <= to->nins)
308  {
309  for (a = from->outs; a != NULL; a = a->outchain)
310  if (a->to == to && a->co == co && a->type == t)
311  return;
312  }
313  else
314  {
315  for (a = to->ins; a != NULL; a = a->inchain)
316  if (a->from == from && a->co == co && a->type == t)
317  return;
318  }
319 
320  /* no dup, so create the arc */
321  createarc(nfa, t, co, from, to);
322 }
323 
324 /*
325  * createarc - create a new arc within an NFA
326  *
327  * This function must *only* be used after verifying that there is no existing
328  * identical arc (same type/color/from/to).
329  */
330 static void
331 createarc(struct nfa *nfa,
332  int t,
333  color co,
334  struct state *from,
335  struct state *to)
336 {
337  struct arc *a;
338 
339  a = allocarc(nfa);
340  if (NISERR())
341  return;
342  assert(a != NULL);
343 
344  a->type = t;
345  a->co = co;
346  a->to = to;
347  a->from = from;
348 
349  /*
350  * Put the new arc on the beginning, not the end, of the chains; it's
351  * simpler here, and freearc() is the same cost either way. See also the
352  * logic in moveins() and its cohorts, as well as fixempties().
353  */
354  a->inchain = to->ins;
355  a->inchainRev = NULL;
356  if (to->ins)
357  to->ins->inchainRev = a;
358  to->ins = a;
359  a->outchain = from->outs;
360  a->outchainRev = NULL;
361  if (from->outs)
362  from->outs->outchainRev = a;
363  from->outs = a;
364 
365  from->nouts++;
366  to->nins++;
367 
368  if (COLORED(a) && nfa->parent == NULL)
369  colorchain(nfa->cm, a);
370 }
371 
372 /*
373  * allocarc - allocate a new arc within an NFA
374  */
375 static struct arc * /* NULL for failure */
376 allocarc(struct nfa *nfa)
377 {
378  struct arc *a;
379 
380  /* first, recycle anything that's on the freelist */
381  if (nfa->freearcs != NULL)
382  {
383  a = nfa->freearcs;
384  nfa->freearcs = a->freechain;
385  }
386  /* otherwise, is there anything left in the last arcbatch? */
387  else if (nfa->lastab != NULL && nfa->lastabused < nfa->lastab->narcs)
388  {
389  a = &nfa->lastab->a[nfa->lastabused++];
390  }
391  /* otherwise, need to allocate a new arcbatch */
392  else
393  {
394  struct arcbatch *newAb;
395  size_t narcs;
396 
397  if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
398  {
399  NERR(REG_ETOOBIG);
400  return NULL;
401  }
402  narcs = (nfa->lastab != NULL) ? nfa->lastab->narcs * 2 : FIRSTABSIZE;
403  if (narcs > MAXABSIZE)
404  narcs = MAXABSIZE;
405  newAb = (struct arcbatch *) MALLOC(ARCBATCHSIZE(narcs));
406  if (newAb == NULL)
407  {
408  NERR(REG_ESPACE);
409  return NULL;
410  }
411  nfa->v->spaceused += ARCBATCHSIZE(narcs);
412  newAb->narcs = narcs;
413  newAb->next = nfa->lastab;
414  nfa->lastab = newAb;
415  nfa->lastabused = 1;
416  a = &newAb->a[0];
417  }
418 
419  return a;
420 }
421 
422 /*
423  * freearc - free an arc
424  */
425 static void
426 freearc(struct nfa *nfa,
427  struct arc *victim)
428 {
429  struct state *from = victim->from;
430  struct state *to = victim->to;
431  struct arc *predecessor;
432 
433  assert(victim->type != 0);
434 
435  /* take it off color chain if necessary */
436  if (COLORED(victim) && nfa->parent == NULL)
437  uncolorchain(nfa->cm, victim);
438 
439  /* take it off source's out-chain */
440  assert(from != NULL);
441  predecessor = victim->outchainRev;
442  if (predecessor == NULL)
443  {
444  assert(from->outs == victim);
445  from->outs = victim->outchain;
446  }
447  else
448  {
449  assert(predecessor->outchain == victim);
450  predecessor->outchain = victim->outchain;
451  }
452  if (victim->outchain != NULL)
453  {
454  assert(victim->outchain->outchainRev == victim);
455  victim->outchain->outchainRev = predecessor;
456  }
457  from->nouts--;
458 
459  /* take it off target's in-chain */
460  assert(to != NULL);
461  predecessor = victim->inchainRev;
462  if (predecessor == NULL)
463  {
464  assert(to->ins == victim);
465  to->ins = victim->inchain;
466  }
467  else
468  {
469  assert(predecessor->inchain == victim);
470  predecessor->inchain = victim->inchain;
471  }
472  if (victim->inchain != NULL)
473  {
474  assert(victim->inchain->inchainRev == victim);
475  victim->inchain->inchainRev = predecessor;
476  }
477  to->nins--;
478 
479  /* clean up and place on NFA's free list */
480  victim->type = 0;
481  victim->from = NULL; /* precautions... */
482  victim->to = NULL;
483  victim->inchain = NULL;
484  victim->inchainRev = NULL;
485  victim->outchain = NULL;
486  victim->outchainRev = NULL;
487  victim->freechain = nfa->freearcs;
488  nfa->freearcs = victim;
489 }
490 
491 /*
492  * changearcsource - flip an arc to have a different from state
493  *
494  * Caller must have verified that there is no pre-existing duplicate arc.
495  */
496 static void
497 changearcsource(struct arc *a, struct state *newfrom)
498 {
499  struct state *oldfrom = a->from;
500  struct arc *predecessor;
501 
502  assert(oldfrom != newfrom);
503 
504  /* take it off old source's out-chain */
505  assert(oldfrom != NULL);
506  predecessor = a->outchainRev;
507  if (predecessor == NULL)
508  {
509  assert(oldfrom->outs == a);
510  oldfrom->outs = a->outchain;
511  }
512  else
513  {
514  assert(predecessor->outchain == a);
515  predecessor->outchain = a->outchain;
516  }
517  if (a->outchain != NULL)
518  {
519  assert(a->outchain->outchainRev == a);
520  a->outchain->outchainRev = predecessor;
521  }
522  oldfrom->nouts--;
523 
524  a->from = newfrom;
525 
526  /* prepend it to new source's out-chain */
527  a->outchain = newfrom->outs;
528  a->outchainRev = NULL;
529  if (newfrom->outs)
530  newfrom->outs->outchainRev = a;
531  newfrom->outs = a;
532  newfrom->nouts++;
533 }
534 
535 /*
536  * changearctarget - flip an arc to have a different to state
537  *
538  * Caller must have verified that there is no pre-existing duplicate arc.
539  */
540 static void
541 changearctarget(struct arc *a, struct state *newto)
542 {
543  struct state *oldto = a->to;
544  struct arc *predecessor;
545 
546  assert(oldto != newto);
547 
548  /* take it off old target's in-chain */
549  assert(oldto != NULL);
550  predecessor = a->inchainRev;
551  if (predecessor == NULL)
552  {
553  assert(oldto->ins == a);
554  oldto->ins = a->inchain;
555  }
556  else
557  {
558  assert(predecessor->inchain == a);
559  predecessor->inchain = a->inchain;
560  }
561  if (a->inchain != NULL)
562  {
563  assert(a->inchain->inchainRev == a);
564  a->inchain->inchainRev = predecessor;
565  }
566  oldto->nins--;
567 
568  a->to = newto;
569 
570  /* prepend it to new target's in-chain */
571  a->inchain = newto->ins;
572  a->inchainRev = NULL;
573  if (newto->ins)
574  newto->ins->inchainRev = a;
575  newto->ins = a;
576  newto->nins++;
577 }
578 
579 /*
580  * hasnonemptyout - Does state have a non-EMPTY out arc?
581  */
582 static int
584 {
585  struct arc *a;
586 
587  for (a = s->outs; a != NULL; a = a->outchain)
588  {
589  if (a->type != EMPTY)
590  return 1;
591  }
592  return 0;
593 }
594 
595 /*
596  * findarc - find arc, if any, from given source with given type and color
597  * If there is more than one such arc, the result is random.
598  */
599 static struct arc *
600 findarc(struct state *s,
601  int type,
602  color co)
603 {
604  struct arc *a;
605 
606  for (a = s->outs; a != NULL; a = a->outchain)
607  if (a->type == type && a->co == co)
608  return a;
609  return NULL;
610 }
611 
612 /*
613  * cparc - allocate a new arc within an NFA, copying details from old one
614  */
615 static void
616 cparc(struct nfa *nfa,
617  struct arc *oa,
618  struct state *from,
619  struct state *to)
620 {
621  newarc(nfa, oa->type, oa->co, from, to);
622 }
623 
624 /*
625  * sortins - sort the in arcs of a state by from/color/type
626  */
627 static void
628 sortins(struct nfa *nfa,
629  struct state *s)
630 {
631  struct arc **sortarray;
632  struct arc *a;
633  int n = s->nins;
634  int i;
635 
636  if (n <= 1)
637  return; /* nothing to do */
638  /* make an array of arc pointers ... */
639  sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *));
640  if (sortarray == NULL)
641  {
642  NERR(REG_ESPACE);
643  return;
644  }
645  i = 0;
646  for (a = s->ins; a != NULL; a = a->inchain)
647  sortarray[i++] = a;
648  assert(i == n);
649  /* ... sort the array */
650  qsort(sortarray, n, sizeof(struct arc *), sortins_cmp);
651  /* ... and rebuild arc list in order */
652  /* it seems worth special-casing first and last items to simplify loop */
653  a = sortarray[0];
654  s->ins = a;
655  a->inchain = sortarray[1];
656  a->inchainRev = NULL;
657  for (i = 1; i < n - 1; i++)
658  {
659  a = sortarray[i];
660  a->inchain = sortarray[i + 1];
661  a->inchainRev = sortarray[i - 1];
662  }
663  a = sortarray[i];
664  a->inchain = NULL;
665  a->inchainRev = sortarray[i - 1];
666  FREE(sortarray);
667 }
668 
669 static int
670 sortins_cmp(const void *a, const void *b)
671 {
672  const struct arc *aa = *((const struct arc *const *) a);
673  const struct arc *bb = *((const struct arc *const *) b);
674 
675  /* we check the fields in the order they are most likely to be different */
676  if (aa->from->no < bb->from->no)
677  return -1;
678  if (aa->from->no > bb->from->no)
679  return 1;
680  if (aa->co < bb->co)
681  return -1;
682  if (aa->co > bb->co)
683  return 1;
684  if (aa->type < bb->type)
685  return -1;
686  if (aa->type > bb->type)
687  return 1;
688  return 0;
689 }
690 
691 /*
692  * sortouts - sort the out arcs of a state by to/color/type
693  */
694 static void
695 sortouts(struct nfa *nfa,
696  struct state *s)
697 {
698  struct arc **sortarray;
699  struct arc *a;
700  int n = s->nouts;
701  int i;
702 
703  if (n <= 1)
704  return; /* nothing to do */
705  /* make an array of arc pointers ... */
706  sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *));
707  if (sortarray == NULL)
708  {
709  NERR(REG_ESPACE);
710  return;
711  }
712  i = 0;
713  for (a = s->outs; a != NULL; a = a->outchain)
714  sortarray[i++] = a;
715  assert(i == n);
716  /* ... sort the array */
717  qsort(sortarray, n, sizeof(struct arc *), sortouts_cmp);
718  /* ... and rebuild arc list in order */
719  /* it seems worth special-casing first and last items to simplify loop */
720  a = sortarray[0];
721  s->outs = a;
722  a->outchain = sortarray[1];
723  a->outchainRev = NULL;
724  for (i = 1; i < n - 1; i++)
725  {
726  a = sortarray[i];
727  a->outchain = sortarray[i + 1];
728  a->outchainRev = sortarray[i - 1];
729  }
730  a = sortarray[i];
731  a->outchain = NULL;
732  a->outchainRev = sortarray[i - 1];
733  FREE(sortarray);
734 }
735 
736 static int
737 sortouts_cmp(const void *a, const void *b)
738 {
739  const struct arc *aa = *((const struct arc *const *) a);
740  const struct arc *bb = *((const struct arc *const *) b);
741 
742  /* we check the fields in the order they are most likely to be different */
743  if (aa->to->no < bb->to->no)
744  return -1;
745  if (aa->to->no > bb->to->no)
746  return 1;
747  if (aa->co < bb->co)
748  return -1;
749  if (aa->co > bb->co)
750  return 1;
751  if (aa->type < bb->type)
752  return -1;
753  if (aa->type > bb->type)
754  return 1;
755  return 0;
756 }
757 
758 /*
759  * Common decision logic about whether to use arc-by-arc operations or
760  * sort/merge. If there's just a few source arcs we cannot recoup the
761  * cost of sorting the destination arc list, no matter how large it is.
762  * Otherwise, limit the number of arc-by-arc comparisons to about 1000
763  * (a somewhat arbitrary choice, but the breakeven point would probably
764  * be machine dependent anyway).
765  */
766 #define BULK_ARC_OP_USE_SORT(nsrcarcs, ndestarcs) \
767  ((nsrcarcs) < 4 ? 0 : ((nsrcarcs) > 32 || (ndestarcs) > 32))
768 
769 /*
770  * moveins - move all in arcs of a state to another state
771  *
772  * You might think this could be done better by just updating the
773  * existing arcs, and you would be right if it weren't for the need
774  * for duplicate suppression, which makes it easier to just make new
775  * ones to exploit the suppression built into newarc.
776  *
777  * However, if we have a whole lot of arcs to deal with, retail duplicate
778  * checks become too slow. In that case we proceed by sorting and merging
779  * the arc lists, and then we can indeed just update the arcs in-place.
780  */
781 static void
782 moveins(struct nfa *nfa,
783  struct state *oldState,
784  struct state *newState)
785 {
786  assert(oldState != newState);
787 
788  if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
789  {
790  /* With not too many arcs, just do them one at a time */
791  struct arc *a;
792 
793  while ((a = oldState->ins) != NULL)
794  {
795  cparc(nfa, a, a->from, newState);
796  freearc(nfa, a);
797  }
798  }
799  else
800  {
801  /*
802  * With many arcs, use a sort-merge approach. Note changearctarget()
803  * will put the arc onto the front of newState's chain, so it does not
804  * break our walk through the sorted part of the chain.
805  */
806  struct arc *oa;
807  struct arc *na;
808 
809  /*
810  * Because we bypass newarc() in this code path, we'd better include a
811  * cancel check.
812  */
813  if (CANCEL_REQUESTED(nfa->v->re))
814  {
815  NERR(REG_CANCEL);
816  return;
817  }
818 
819  sortins(nfa, oldState);
820  sortins(nfa, newState);
821  if (NISERR())
822  return; /* might have failed to sort */
823  oa = oldState->ins;
824  na = newState->ins;
825  while (oa != NULL && na != NULL)
826  {
827  struct arc *a = oa;
828 
829  switch (sortins_cmp(&oa, &na))
830  {
831  case -1:
832  /* newState does not have anything matching oa */
833  oa = oa->inchain;
834 
835  /*
836  * Rather than doing createarc+freearc, we can just unlink
837  * and relink the existing arc struct.
838  */
839  changearctarget(a, newState);
840  break;
841  case 0:
842  /* match, advance in both lists */
843  oa = oa->inchain;
844  na = na->inchain;
845  /* ... and drop duplicate arc from oldState */
846  freearc(nfa, a);
847  break;
848  case +1:
849  /* advance only na; oa might have a match later */
850  na = na->inchain;
851  break;
852  default:
854  }
855  }
856  while (oa != NULL)
857  {
858  /* newState does not have anything matching oa */
859  struct arc *a = oa;
860 
861  oa = oa->inchain;
862  changearctarget(a, newState);
863  }
864  }
865 
866  assert(oldState->nins == 0);
867  assert(oldState->ins == NULL);
868 }
869 
870 /*
871  * copyins - copy in arcs of a state to another state
872  */
873 static void
874 copyins(struct nfa *nfa,
875  struct state *oldState,
876  struct state *newState)
877 {
878  assert(oldState != newState);
879 
880  if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
881  {
882  /* With not too many arcs, just do them one at a time */
883  struct arc *a;
884 
885  for (a = oldState->ins; a != NULL; a = a->inchain)
886  cparc(nfa, a, a->from, newState);
887  }
888  else
889  {
890  /*
891  * With many arcs, use a sort-merge approach. Note that createarc()
892  * will put new arcs onto the front of newState's chain, so it does
893  * not break our walk through the sorted part of the chain.
894  */
895  struct arc *oa;
896  struct arc *na;
897 
898  /*
899  * Because we bypass newarc() in this code path, we'd better include a
900  * cancel check.
901  */
902  if (CANCEL_REQUESTED(nfa->v->re))
903  {
904  NERR(REG_CANCEL);
905  return;
906  }
907 
908  sortins(nfa, oldState);
909  sortins(nfa, newState);
910  if (NISERR())
911  return; /* might have failed to sort */
912  oa = oldState->ins;
913  na = newState->ins;
914  while (oa != NULL && na != NULL)
915  {
916  struct arc *a = oa;
917 
918  switch (sortins_cmp(&oa, &na))
919  {
920  case -1:
921  /* newState does not have anything matching oa */
922  oa = oa->inchain;
923  createarc(nfa, a->type, a->co, a->from, newState);
924  break;
925  case 0:
926  /* match, advance in both lists */
927  oa = oa->inchain;
928  na = na->inchain;
929  break;
930  case +1:
931  /* advance only na; oa might have a match later */
932  na = na->inchain;
933  break;
934  default:
936  }
937  }
938  while (oa != NULL)
939  {
940  /* newState does not have anything matching oa */
941  struct arc *a = oa;
942 
943  oa = oa->inchain;
944  createarc(nfa, a->type, a->co, a->from, newState);
945  }
946  }
947 }
948 
949 /*
950  * mergeins - merge a list of inarcs into a state
951  *
952  * This is much like copyins, but the source arcs are listed in an array,
953  * and are not guaranteed unique. It's okay to clobber the array contents.
954  */
955 static void
956 mergeins(struct nfa *nfa,
957  struct state *s,
958  struct arc **arcarray,
959  int arccount)
960 {
961  struct arc *na;
962  int i;
963  int j;
964 
965  if (arccount <= 0)
966  return;
967 
968  /*
969  * Because we bypass newarc() in this code path, we'd better include a
970  * cancel check.
971  */
972  if (CANCEL_REQUESTED(nfa->v->re))
973  {
974  NERR(REG_CANCEL);
975  return;
976  }
977 
978  /* Sort existing inarcs as well as proposed new ones */
979  sortins(nfa, s);
980  if (NISERR())
981  return; /* might have failed to sort */
982 
983  qsort(arcarray, arccount, sizeof(struct arc *), sortins_cmp);
984 
985  /*
986  * arcarray very likely includes dups, so we must eliminate them. (This
987  * could be folded into the next loop, but it's not worth the trouble.)
988  */
989  j = 0;
990  for (i = 1; i < arccount; i++)
991  {
992  switch (sortins_cmp(&arcarray[j], &arcarray[i]))
993  {
994  case -1:
995  /* non-dup */
996  arcarray[++j] = arcarray[i];
997  break;
998  case 0:
999  /* dup */
1000  break;
1001  default:
1002  /* trouble */
1003  assert(NOTREACHED);
1004  }
1005  }
1006  arccount = j + 1;
1007 
1008  /*
1009  * Now merge into s' inchain. Note that createarc() will put new arcs
1010  * onto the front of s's chain, so it does not break our walk through the
1011  * sorted part of the chain.
1012  */
1013  i = 0;
1014  na = s->ins;
1015  while (i < arccount && na != NULL)
1016  {
1017  struct arc *a = arcarray[i];
1018 
1019  switch (sortins_cmp(&a, &na))
1020  {
1021  case -1:
1022  /* s does not have anything matching a */
1023  createarc(nfa, a->type, a->co, a->from, s);
1024  i++;
1025  break;
1026  case 0:
1027  /* match, advance in both lists */
1028  i++;
1029  na = na->inchain;
1030  break;
1031  case +1:
1032  /* advance only na; array might have a match later */
1033  na = na->inchain;
1034  break;
1035  default:
1036  assert(NOTREACHED);
1037  }
1038  }
1039  while (i < arccount)
1040  {
1041  /* s does not have anything matching a */
1042  struct arc *a = arcarray[i];
1043 
1044  createarc(nfa, a->type, a->co, a->from, s);
1045  i++;
1046  }
1047 }
1048 
1049 /*
1050  * moveouts - move all out arcs of a state to another state
1051  *
1052  * See comments for moveins()
1053  */
1054 static void
1055 moveouts(struct nfa *nfa,
1056  struct state *oldState,
1057  struct state *newState)
1058 {
1059  assert(oldState != newState);
1060 
1061  if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
1062  {
1063  /* With not too many arcs, just do them one at a time */
1064  struct arc *a;
1065 
1066  while ((a = oldState->outs) != NULL)
1067  {
1068  cparc(nfa, a, newState, a->to);
1069  freearc(nfa, a);
1070  }
1071  }
1072  else
1073  {
1074  /*
1075  * With many arcs, use a sort-merge approach. Note changearcsource()
1076  * will put the arc onto the front of newState's chain, so it does not
1077  * break our walk through the sorted part of the chain.
1078  */
1079  struct arc *oa;
1080  struct arc *na;
1081 
1082  /*
1083  * Because we bypass newarc() in this code path, we'd better include a
1084  * cancel check.
1085  */
1086  if (CANCEL_REQUESTED(nfa->v->re))
1087  {
1088  NERR(REG_CANCEL);
1089  return;
1090  }
1091 
1092  sortouts(nfa, oldState);
1093  sortouts(nfa, newState);
1094  if (NISERR())
1095  return; /* might have failed to sort */
1096  oa = oldState->outs;
1097  na = newState->outs;
1098  while (oa != NULL && na != NULL)
1099  {
1100  struct arc *a = oa;
1101 
1102  switch (sortouts_cmp(&oa, &na))
1103  {
1104  case -1:
1105  /* newState does not have anything matching oa */
1106  oa = oa->outchain;
1107 
1108  /*
1109  * Rather than doing createarc+freearc, we can just unlink
1110  * and relink the existing arc struct.
1111  */
1112  changearcsource(a, newState);
1113  break;
1114  case 0:
1115  /* match, advance in both lists */
1116  oa = oa->outchain;
1117  na = na->outchain;
1118  /* ... and drop duplicate arc from oldState */
1119  freearc(nfa, a);
1120  break;
1121  case +1:
1122  /* advance only na; oa might have a match later */
1123  na = na->outchain;
1124  break;
1125  default:
1126  assert(NOTREACHED);
1127  }
1128  }
1129  while (oa != NULL)
1130  {
1131  /* newState does not have anything matching oa */
1132  struct arc *a = oa;
1133 
1134  oa = oa->outchain;
1135  changearcsource(a, newState);
1136  }
1137  }
1138 
1139  assert(oldState->nouts == 0);
1140  assert(oldState->outs == NULL);
1141 }
1142 
1143 /*
1144  * copyouts - copy out arcs of a state to another state
1145  */
1146 static void
1147 copyouts(struct nfa *nfa,
1148  struct state *oldState,
1149  struct state *newState)
1150 {
1151  assert(oldState != newState);
1152 
1153  if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
1154  {
1155  /* With not too many arcs, just do them one at a time */
1156  struct arc *a;
1157 
1158  for (a = oldState->outs; a != NULL; a = a->outchain)
1159  cparc(nfa, a, newState, a->to);
1160  }
1161  else
1162  {
1163  /*
1164  * With many arcs, use a sort-merge approach. Note that createarc()
1165  * will put new arcs onto the front of newState's chain, so it does
1166  * not break our walk through the sorted part of the chain.
1167  */
1168  struct arc *oa;
1169  struct arc *na;
1170 
1171  /*
1172  * Because we bypass newarc() in this code path, we'd better include a
1173  * cancel check.
1174  */
1175  if (CANCEL_REQUESTED(nfa->v->re))
1176  {
1177  NERR(REG_CANCEL);
1178  return;
1179  }
1180 
1181  sortouts(nfa, oldState);
1182  sortouts(nfa, newState);
1183  if (NISERR())
1184  return; /* might have failed to sort */
1185  oa = oldState->outs;
1186  na = newState->outs;
1187  while (oa != NULL && na != NULL)
1188  {
1189  struct arc *a = oa;
1190 
1191  switch (sortouts_cmp(&oa, &na))
1192  {
1193  case -1:
1194  /* newState does not have anything matching oa */
1195  oa = oa->outchain;
1196  createarc(nfa, a->type, a->co, newState, a->to);
1197  break;
1198  case 0:
1199  /* match, advance in both lists */
1200  oa = oa->outchain;
1201  na = na->outchain;
1202  break;
1203  case +1:
1204  /* advance only na; oa might have a match later */
1205  na = na->outchain;
1206  break;
1207  default:
1208  assert(NOTREACHED);
1209  }
1210  }
1211  while (oa != NULL)
1212  {
1213  /* newState does not have anything matching oa */
1214  struct arc *a = oa;
1215 
1216  oa = oa->outchain;
1217  createarc(nfa, a->type, a->co, newState, a->to);
1218  }
1219  }
1220 }
1221 
1222 /*
1223  * cloneouts - copy out arcs of a state to another state pair, modifying type
1224  *
1225  * This is only used to convert PLAIN arcs to AHEAD/BEHIND arcs, which share
1226  * the same interpretation of "co". It wouldn't be sensible with LACONs.
1227  */
1228 static void
1230  struct state *old,
1231  struct state *from,
1232  struct state *to,
1233  int type)
1234 {
1235  struct arc *a;
1236 
1237  assert(old != from);
1238  assert(type == AHEAD || type == BEHIND);
1239 
1240  for (a = old->outs; a != NULL; a = a->outchain)
1241  {
1242  assert(a->type == PLAIN);
1243  newarc(nfa, type, a->co, from, to);
1244  }
1245 }
1246 
1247 /*
1248  * delsub - delete a sub-NFA, updating subre pointers if necessary
1249  *
1250  * This uses a recursive traversal of the sub-NFA, marking already-seen
1251  * states using their tmp pointer.
1252  */
1253 static void
1254 delsub(struct nfa *nfa,
1255  struct state *lp, /* the sub-NFA goes from here... */
1256  struct state *rp) /* ...to here, *not* inclusive */
1257 {
1258  assert(lp != rp);
1259 
1260  rp->tmp = rp; /* mark end */
1261 
1262  deltraverse(nfa, lp, lp);
1263  if (NISERR())
1264  return; /* asserts might not hold after failure */
1265  assert(lp->nouts == 0 && rp->nins == 0); /* did the job */
1266  assert(lp->no != FREESTATE && rp->no != FREESTATE); /* no more */
1267 
1268  rp->tmp = NULL; /* unmark end */
1269  lp->tmp = NULL; /* and begin, marked by deltraverse */
1270 }
1271 
1272 /*
1273  * deltraverse - the recursive heart of delsub
1274  * This routine's basic job is to destroy all out-arcs of the state.
1275  */
1276 static void
1278  struct state *leftend,
1279  struct state *s)
1280 {
1281  struct arc *a;
1282  struct state *to;
1283 
1284  /* Since this is recursive, it could be driven to stack overflow */
1285  if (STACK_TOO_DEEP(nfa->v->re))
1286  {
1287  NERR(REG_ETOOBIG);
1288  return;
1289  }
1290 
1291  if (s->nouts == 0)
1292  return; /* nothing to do */
1293  if (s->tmp != NULL)
1294  return; /* already in progress */
1295 
1296  s->tmp = s; /* mark as in progress */
1297 
1298  while ((a = s->outs) != NULL)
1299  {
1300  to = a->to;
1301  deltraverse(nfa, leftend, to);
1302  if (NISERR())
1303  return; /* asserts might not hold after failure */
1304  assert(to->nouts == 0 || to->tmp != NULL);
1305  freearc(nfa, a);
1306  if (to->nins == 0 && to->tmp == NULL)
1307  {
1308  assert(to->nouts == 0);
1309  freestate(nfa, to);
1310  }
1311  }
1312 
1313  assert(s->no != FREESTATE); /* we're still here */
1314  assert(s == leftend || s->nins != 0); /* and still reachable */
1315  assert(s->nouts == 0); /* but have no outarcs */
1316 
1317  s->tmp = NULL; /* we're done here */
1318 }
1319 
1320 /*
1321  * dupnfa - duplicate sub-NFA
1322  *
1323  * Another recursive traversal, this time using tmp to point to duplicates
1324  * as well as mark already-seen states. (You knew there was a reason why
1325  * it's a state pointer, didn't you? :-))
1326  */
1327 static void
1328 dupnfa(struct nfa *nfa,
1329  struct state *start, /* duplicate of subNFA starting here */
1330  struct state *stop, /* and stopping here */
1331  struct state *from, /* stringing duplicate from here */
1332  struct state *to) /* to here */
1333 {
1334  if (start == stop)
1335  {
1336  newarc(nfa, EMPTY, 0, from, to);
1337  return;
1338  }
1339 
1340  stop->tmp = to;
1341  duptraverse(nfa, start, from);
1342  /* done, except for clearing out the tmp pointers */
1343 
1344  stop->tmp = NULL;
1345  cleartraverse(nfa, start);
1346 }
1347 
1348 /*
1349  * duptraverse - recursive heart of dupnfa
1350  */
1351 static void
1353  struct state *s,
1354  struct state *stmp) /* s's duplicate, or NULL */
1355 {
1356  struct arc *a;
1357 
1358  /* Since this is recursive, it could be driven to stack overflow */
1359  if (STACK_TOO_DEEP(nfa->v->re))
1360  {
1361  NERR(REG_ETOOBIG);
1362  return;
1363  }
1364 
1365  if (s->tmp != NULL)
1366  return; /* already done */
1367 
1368  s->tmp = (stmp == NULL) ? newstate(nfa) : stmp;
1369  if (s->tmp == NULL)
1370  {
1371  assert(NISERR());
1372  return;
1373  }
1374 
1375  for (a = s->outs; a != NULL && !NISERR(); a = a->outchain)
1376  {
1377  duptraverse(nfa, a->to, (struct state *) NULL);
1378  if (NISERR())
1379  break;
1380  assert(a->to->tmp != NULL);
1381  cparc(nfa, a, s->tmp, a->to->tmp);
1382  }
1383 }
1384 
1385 /*
1386  * removeconstraints - remove any constraints in an NFA
1387  *
1388  * Constraint arcs are replaced by empty arcs, essentially treating all
1389  * constraints as automatically satisfied.
1390  */
1391 static void
1393  struct state *start, /* process subNFA starting here */
1394  struct state *stop) /* and stopping here */
1395 {
1396  if (start == stop)
1397  return;
1398 
1399  stop->tmp = stop;
1400  removetraverse(nfa, start);
1401  /* done, except for clearing out the tmp pointers */
1402 
1403  stop->tmp = NULL;
1404  cleartraverse(nfa, start);
1405 }
1406 
1407 /*
1408  * removetraverse - recursive heart of removeconstraints
1409  */
1410 static void
1412  struct state *s)
1413 {
1414  struct arc *a;
1415  struct arc *oa;
1416 
1417  /* Since this is recursive, it could be driven to stack overflow */
1418  if (STACK_TOO_DEEP(nfa->v->re))
1419  {
1420  NERR(REG_ETOOBIG);
1421  return;
1422  }
1423 
1424  if (s->tmp != NULL)
1425  return; /* already done */
1426 
1427  s->tmp = s;
1428  for (a = s->outs; a != NULL && !NISERR(); a = oa)
1429  {
1430  removetraverse(nfa, a->to);
1431  if (NISERR())
1432  break;
1433  oa = a->outchain;
1434  switch (a->type)
1435  {
1436  case PLAIN:
1437  case EMPTY:
1438  /* nothing to do */
1439  break;
1440  case AHEAD:
1441  case BEHIND:
1442  case '^':
1443  case '$':
1444  case LACON:
1445  /* replace it */
1446  newarc(nfa, EMPTY, 0, s, a->to);
1447  freearc(nfa, a);
1448  break;
1449  default:
1450  NERR(REG_ASSERT);
1451  break;
1452  }
1453  }
1454 }
1455 
1456 /*
1457  * cleartraverse - recursive cleanup for algorithms that leave tmp ptrs set
1458  */
1459 static void
1461  struct state *s)
1462 {
1463  struct arc *a;
1464 
1465  /* Since this is recursive, it could be driven to stack overflow */
1466  if (STACK_TOO_DEEP(nfa->v->re))
1467  {
1468  NERR(REG_ETOOBIG);
1469  return;
1470  }
1471 
1472  if (s->tmp == NULL)
1473  return;
1474  s->tmp = NULL;
1475 
1476  for (a = s->outs; a != NULL; a = a->outchain)
1477  cleartraverse(nfa, a->to);
1478 }
1479 
1480 /*
1481  * single_color_transition - does getting from s1 to s2 cross one PLAIN arc?
1482  *
1483  * If traversing from s1 to s2 requires a single PLAIN match (possibly of any
1484  * of a set of colors), return a state whose outarc list contains only PLAIN
1485  * arcs of those color(s). Otherwise return NULL.
1486  *
1487  * This is used before optimizing the NFA, so there may be EMPTY arcs, which
1488  * we should ignore; the possibility of an EMPTY is why the result state could
1489  * be different from s1.
1490  *
1491  * It's worth troubling to handle multiple parallel PLAIN arcs here because a
1492  * bracket construct such as [abc] might yield either one or several parallel
1493  * PLAIN arcs depending on earlier atoms in the expression. We'd rather that
1494  * that implementation detail not create user-visible performance differences.
1495  */
1496 static struct state *
1498 {
1499  struct arc *a;
1500 
1501  /* Ignore leading EMPTY arc, if any */
1502  if (s1->nouts == 1 && s1->outs->type == EMPTY)
1503  s1 = s1->outs->to;
1504  /* Likewise for any trailing EMPTY arc */
1505  if (s2->nins == 1 && s2->ins->type == EMPTY)
1506  s2 = s2->ins->from;
1507  /* Perhaps we could have a single-state loop in between, if so reject */
1508  if (s1 == s2)
1509  return NULL;
1510  /* s1 must have at least one outarc... */
1511  if (s1->outs == NULL)
1512  return NULL;
1513  /* ... and they must all be PLAIN arcs to s2 */
1514  for (a = s1->outs; a != NULL; a = a->outchain)
1515  {
1516  if (a->type != PLAIN || a->to != s2)
1517  return NULL;
1518  }
1519  /* OK, return s1 as the possessor of the relevant outarcs */
1520  return s1;
1521 }
1522 
1523 /*
1524  * specialcolors - fill in special colors for an NFA
1525  */
1526 static void
1528 {
1529  /* false colors for BOS, BOL, EOS, EOL */
1530  if (nfa->parent == NULL)
1531  {
1532  nfa->bos[0] = pseudocolor(nfa->cm);
1533  nfa->bos[1] = pseudocolor(nfa->cm);
1534  nfa->eos[0] = pseudocolor(nfa->cm);
1535  nfa->eos[1] = pseudocolor(nfa->cm);
1536  }
1537  else
1538  {
1539  assert(nfa->parent->bos[0] != COLORLESS);
1540  nfa->bos[0] = nfa->parent->bos[0];
1541  assert(nfa->parent->bos[1] != COLORLESS);
1542  nfa->bos[1] = nfa->parent->bos[1];
1543  assert(nfa->parent->eos[0] != COLORLESS);
1544  nfa->eos[0] = nfa->parent->eos[0];
1545  assert(nfa->parent->eos[1] != COLORLESS);
1546  nfa->eos[1] = nfa->parent->eos[1];
1547  }
1548 }
1549 
1550 /*
1551  * optimize - optimize an NFA
1552  *
1553  * The main goal of this function is not so much "optimization" (though it
1554  * does try to get rid of useless NFA states) as reducing the NFA to a form
1555  * the regex executor can handle. The executor, and indeed the cNFA format
1556  * that is its input, can only handle PLAIN and LACON arcs. The output of
1557  * the regex parser also includes EMPTY (do-nothing) arcs, as well as
1558  * ^, $, AHEAD, and BEHIND constraint arcs, which we must get rid of here.
1559  * We first get rid of EMPTY arcs and then deal with the constraint arcs.
1560  * The hardest part of either job is to get rid of circular loops of the
1561  * target arc type. We would have to do that in any case, though, as such a
1562  * loop would otherwise allow the executor to cycle through the loop endlessly
1563  * without making any progress in the input string.
1564  */
1565 static long /* re_info bits */
1566 optimize(struct nfa *nfa,
1567  FILE *f) /* for debug output; NULL none */
1568 {
1569 #ifdef REG_DEBUG
1570  int verbose = (f != NULL) ? 1 : 0;
1571 
1572  if (verbose)
1573  fprintf(f, "\ninitial cleanup:\n");
1574 #endif
1575  cleanup(nfa); /* may simplify situation */
1576 #ifdef REG_DEBUG
1577  if (verbose)
1578  dumpnfa(nfa, f);
1579  if (verbose)
1580  fprintf(f, "\nempties:\n");
1581 #endif
1582  fixempties(nfa, f); /* get rid of EMPTY arcs */
1583 #ifdef REG_DEBUG
1584  if (verbose)
1585  fprintf(f, "\nconstraints:\n");
1586 #endif
1587  fixconstraintloops(nfa, f); /* get rid of constraint loops */
1588  pullback(nfa, f); /* pull back constraints backward */
1589  pushfwd(nfa, f); /* push fwd constraints forward */
1590 #ifdef REG_DEBUG
1591  if (verbose)
1592  fprintf(f, "\nfinal cleanup:\n");
1593 #endif
1594  cleanup(nfa); /* final tidying */
1595 #ifdef REG_DEBUG
1596  if (verbose)
1597  dumpnfa(nfa, f);
1598 #endif
1599  return analyze(nfa); /* and analysis */
1600 }
1601 
1602 /*
1603  * pullback - pull back constraints backward to eliminate them
1604  */
1605 static void
1606 pullback(struct nfa *nfa,
1607  FILE *f) /* for debug output; NULL none */
1608 {
1609  struct state *s;
1610  struct state *nexts;
1611  struct arc *a;
1612  struct arc *nexta;
1613  struct state *intermediates;
1614  int progress;
1615 
1616  /* find and pull until there are no more */
1617  do
1618  {
1619  progress = 0;
1620  for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
1621  {
1622  nexts = s->next;
1623  intermediates = NULL;
1624  for (a = s->outs; a != NULL && !NISERR(); a = nexta)
1625  {
1626  nexta = a->outchain;
1627  if (a->type == '^' || a->type == BEHIND)
1628  if (pull(nfa, a, &intermediates))
1629  progress = 1;
1630  }
1631  /* clear tmp fields of intermediate states created here */
1632  while (intermediates != NULL)
1633  {
1634  struct state *ns = intermediates->tmp;
1635 
1636  intermediates->tmp = NULL;
1637  intermediates = ns;
1638  }
1639  /* if s is now useless, get rid of it */
1640  if ((s->nins == 0 || s->nouts == 0) && !s->flag)
1641  dropstate(nfa, s);
1642  }
1643  if (progress && f != NULL)
1644  dumpnfa(nfa, f);
1645  } while (progress && !NISERR());
1646  if (NISERR())
1647  return;
1648 
1649  /*
1650  * Any ^ constraints we were able to pull to the start state can now be
1651  * replaced by PLAIN arcs referencing the BOS or BOL colors. There should
1652  * be no other ^ or BEHIND arcs left in the NFA, though we do not check
1653  * that here (compact() will fail if so).
1654  */
1655  for (a = nfa->pre->outs; a != NULL; a = nexta)
1656  {
1657  nexta = a->outchain;
1658  if (a->type == '^')
1659  {
1660  assert(a->co == 0 || a->co == 1);
1661  newarc(nfa, PLAIN, nfa->bos[a->co], a->from, a->to);
1662  freearc(nfa, a);
1663  }
1664  }
1665 }
1666 
1667 /*
1668  * pull - pull a back constraint backward past its source state
1669  *
1670  * Returns 1 if successful (which it always is unless the source is the
1671  * start state or we have an internal error), 0 if nothing happened.
1672  *
1673  * A significant property of this function is that it deletes no pre-existing
1674  * states, and no outarcs of the constraint's from state other than the given
1675  * constraint arc. This makes the loops in pullback() safe, at the cost that
1676  * we may leave useless states behind. Therefore, we leave it to pullback()
1677  * to delete such states.
1678  *
1679  * If the from state has multiple back-constraint outarcs, and/or multiple
1680  * compatible constraint inarcs, we only need to create one new intermediate
1681  * state per combination of predecessor and successor states. *intermediates
1682  * points to a list of such intermediate states for this from state (chained
1683  * through their tmp fields).
1684  */
1685 static int
1686 pull(struct nfa *nfa,
1687  struct arc *con,
1688  struct state **intermediates)
1689 {
1690  struct state *from = con->from;
1691  struct state *to = con->to;
1692  struct arc *a;
1693  struct arc *nexta;
1694  struct state *s;
1695 
1696  assert(from != to); /* should have gotten rid of this earlier */
1697  if (from->flag) /* can't pull back beyond start */
1698  return 0;
1699  if (from->nins == 0)
1700  { /* unreachable */
1701  freearc(nfa, con);
1702  return 1;
1703  }
1704 
1705  /*
1706  * First, clone from state if necessary to avoid other outarcs. This may
1707  * seem wasteful, but it simplifies the logic, and we'll get rid of the
1708  * clone state again at the bottom.
1709  */
1710  if (from->nouts > 1)
1711  {
1712  s = newstate(nfa);
1713  if (NISERR())
1714  return 0;
1715  copyins(nfa, from, s); /* duplicate inarcs */
1716  cparc(nfa, con, s, to); /* move constraint arc */
1717  freearc(nfa, con);
1718  if (NISERR())
1719  return 0;
1720  from = s;
1721  con = from->outs;
1722  }
1723  assert(from->nouts == 1);
1724 
1725  /* propagate the constraint into the from state's inarcs */
1726  for (a = from->ins; a != NULL && !NISERR(); a = nexta)
1727  {
1728  nexta = a->inchain;
1729  switch (combine(nfa, con, a))
1730  {
1731  case INCOMPATIBLE: /* destroy the arc */
1732  freearc(nfa, a);
1733  break;
1734  case SATISFIED: /* no action needed */
1735  break;
1736  case COMPATIBLE: /* swap the two arcs, more or less */
1737  /* need an intermediate state, but might have one already */
1738  for (s = *intermediates; s != NULL; s = s->tmp)
1739  {
1740  assert(s->nins > 0 && s->nouts > 0);
1741  if (s->ins->from == a->from && s->outs->to == to)
1742  break;
1743  }
1744  if (s == NULL)
1745  {
1746  s = newstate(nfa);
1747  if (NISERR())
1748  return 0;
1749  s->tmp = *intermediates;
1750  *intermediates = s;
1751  }
1752  cparc(nfa, con, a->from, s);
1753  cparc(nfa, a, s, to);
1754  freearc(nfa, a);
1755  break;
1756  case REPLACEARC: /* replace arc's color */
1757  newarc(nfa, a->type, con->co, a->from, to);
1758  freearc(nfa, a);
1759  break;
1760  default:
1761  assert(NOTREACHED);
1762  break;
1763  }
1764  }
1765 
1766  /* remaining inarcs, if any, incorporate the constraint */
1767  moveins(nfa, from, to);
1768  freearc(nfa, con);
1769  /* from state is now useless, but we leave it to pullback() to clean up */
1770  return 1;
1771 }
1772 
1773 /*
1774  * pushfwd - push forward constraints forward to eliminate them
1775  */
1776 static void
1777 pushfwd(struct nfa *nfa,
1778  FILE *f) /* for debug output; NULL none */
1779 {
1780  struct state *s;
1781  struct state *nexts;
1782  struct arc *a;
1783  struct arc *nexta;
1784  struct state *intermediates;
1785  int progress;
1786 
1787  /* find and push until there are no more */
1788  do
1789  {
1790  progress = 0;
1791  for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
1792  {
1793  nexts = s->next;
1794  intermediates = NULL;
1795  for (a = s->ins; a != NULL && !NISERR(); a = nexta)
1796  {
1797  nexta = a->inchain;
1798  if (a->type == '$' || a->type == AHEAD)
1799  if (push(nfa, a, &intermediates))
1800  progress = 1;
1801  }
1802  /* clear tmp fields of intermediate states created here */
1803  while (intermediates != NULL)
1804  {
1805  struct state *ns = intermediates->tmp;
1806 
1807  intermediates->tmp = NULL;
1808  intermediates = ns;
1809  }
1810  /* if s is now useless, get rid of it */
1811  if ((s->nins == 0 || s->nouts == 0) && !s->flag)
1812  dropstate(nfa, s);
1813  }
1814  if (progress && f != NULL)
1815  dumpnfa(nfa, f);
1816  } while (progress && !NISERR());
1817  if (NISERR())
1818  return;
1819 
1820  /*
1821  * Any $ constraints we were able to push to the post state can now be
1822  * replaced by PLAIN arcs referencing the EOS or EOL colors. There should
1823  * be no other $ or AHEAD arcs left in the NFA, though we do not check
1824  * that here (compact() will fail if so).
1825  */
1826  for (a = nfa->post->ins; a != NULL; a = nexta)
1827  {
1828  nexta = a->inchain;
1829  if (a->type == '$')
1830  {
1831  assert(a->co == 0 || a->co == 1);
1832  newarc(nfa, PLAIN, nfa->eos[a->co], a->from, a->to);
1833  freearc(nfa, a);
1834  }
1835  }
1836 }
1837 
1838 /*
1839  * push - push a forward constraint forward past its destination state
1840  *
1841  * Returns 1 if successful (which it always is unless the destination is the
1842  * post state or we have an internal error), 0 if nothing happened.
1843  *
1844  * A significant property of this function is that it deletes no pre-existing
1845  * states, and no inarcs of the constraint's to state other than the given
1846  * constraint arc. This makes the loops in pushfwd() safe, at the cost that
1847  * we may leave useless states behind. Therefore, we leave it to pushfwd()
1848  * to delete such states.
1849  *
1850  * If the to state has multiple forward-constraint inarcs, and/or multiple
1851  * compatible constraint outarcs, we only need to create one new intermediate
1852  * state per combination of predecessor and successor states. *intermediates
1853  * points to a list of such intermediate states for this to state (chained
1854  * through their tmp fields).
1855  */
1856 static int
1857 push(struct nfa *nfa,
1858  struct arc *con,
1859  struct state **intermediates)
1860 {
1861  struct state *from = con->from;
1862  struct state *to = con->to;
1863  struct arc *a;
1864  struct arc *nexta;
1865  struct state *s;
1866 
1867  assert(to != from); /* should have gotten rid of this earlier */
1868  if (to->flag) /* can't push forward beyond end */
1869  return 0;
1870  if (to->nouts == 0)
1871  { /* dead end */
1872  freearc(nfa, con);
1873  return 1;
1874  }
1875 
1876  /*
1877  * First, clone to state if necessary to avoid other inarcs. This may
1878  * seem wasteful, but it simplifies the logic, and we'll get rid of the
1879  * clone state again at the bottom.
1880  */
1881  if (to->nins > 1)
1882  {
1883  s = newstate(nfa);
1884  if (NISERR())
1885  return 0;
1886  copyouts(nfa, to, s); /* duplicate outarcs */
1887  cparc(nfa, con, from, s); /* move constraint arc */
1888  freearc(nfa, con);
1889  if (NISERR())
1890  return 0;
1891  to = s;
1892  con = to->ins;
1893  }
1894  assert(to->nins == 1);
1895 
1896  /* propagate the constraint into the to state's outarcs */
1897  for (a = to->outs; a != NULL && !NISERR(); a = nexta)
1898  {
1899  nexta = a->outchain;
1900  switch (combine(nfa, con, a))
1901  {
1902  case INCOMPATIBLE: /* destroy the arc */
1903  freearc(nfa, a);
1904  break;
1905  case SATISFIED: /* no action needed */
1906  break;
1907  case COMPATIBLE: /* swap the two arcs, more or less */
1908  /* need an intermediate state, but might have one already */
1909  for (s = *intermediates; s != NULL; s = s->tmp)
1910  {
1911  assert(s->nins > 0 && s->nouts > 0);
1912  if (s->ins->from == from && s->outs->to == a->to)
1913  break;
1914  }
1915  if (s == NULL)
1916  {
1917  s = newstate(nfa);
1918  if (NISERR())
1919  return 0;
1920  s->tmp = *intermediates;
1921  *intermediates = s;
1922  }
1923  cparc(nfa, con, s, a->to);
1924  cparc(nfa, a, from, s);
1925  freearc(nfa, a);
1926  break;
1927  case REPLACEARC: /* replace arc's color */
1928  newarc(nfa, a->type, con->co, from, a->to);
1929  freearc(nfa, a);
1930  break;
1931  default:
1932  assert(NOTREACHED);
1933  break;
1934  }
1935  }
1936 
1937  /* remaining outarcs, if any, incorporate the constraint */
1938  moveouts(nfa, to, from);
1939  freearc(nfa, con);
1940  /* to state is now useless, but we leave it to pushfwd() to clean up */
1941  return 1;
1942 }
1943 
1944 /*
1945  * combine - constraint lands on an arc, what happens?
1946  *
1947  * #def INCOMPATIBLE 1 // destroys arc
1948  * #def SATISFIED 2 // constraint satisfied
1949  * #def COMPATIBLE 3 // compatible but not satisfied yet
1950  * #def REPLACEARC 4 // replace arc's color with constraint color
1951  */
1952 static int
1953 combine(struct nfa *nfa,
1954  struct arc *con,
1955  struct arc *a)
1956 {
1957 #define CA(ct,at) (((ct)<<CHAR_BIT) | (at))
1958 
1959  switch (CA(con->type, a->type))
1960  {
1961  case CA('^', PLAIN): /* newlines are handled separately */
1962  case CA('$', PLAIN):
1963  return INCOMPATIBLE;
1964  break;
1965  case CA(AHEAD, PLAIN): /* color constraints meet colors */
1966  case CA(BEHIND, PLAIN):
1967  if (con->co == a->co)
1968  return SATISFIED;
1969  if (con->co == RAINBOW)
1970  {
1971  /* con is satisfied unless arc's color is a pseudocolor */
1972  if (!(nfa->cm->cd[a->co].flags & PSEUDO))
1973  return SATISFIED;
1974  }
1975  else if (a->co == RAINBOW)
1976  {
1977  /* con is incompatible if it's for a pseudocolor */
1978  if (nfa->cm->cd[con->co].flags & PSEUDO)
1979  return INCOMPATIBLE;
1980  /* otherwise, constraint constrains arc to be only its color */
1981  return REPLACEARC;
1982  }
1983  return INCOMPATIBLE;
1984  break;
1985  case CA('^', '^'): /* collision, similar constraints */
1986  case CA('$', '$'):
1987  if (con->co == a->co) /* true duplication */
1988  return SATISFIED;
1989  return INCOMPATIBLE;
1990  break;
1991  case CA(AHEAD, AHEAD): /* collision, similar constraints */
1992  case CA(BEHIND, BEHIND):
1993  if (con->co == a->co) /* true duplication */
1994  return SATISFIED;
1995  if (con->co == RAINBOW)
1996  {
1997  /* con is satisfied unless arc's color is a pseudocolor */
1998  if (!(nfa->cm->cd[a->co].flags & PSEUDO))
1999  return SATISFIED;
2000  }
2001  else if (a->co == RAINBOW)
2002  {
2003  /* con is incompatible if it's for a pseudocolor */
2004  if (nfa->cm->cd[con->co].flags & PSEUDO)
2005  return INCOMPATIBLE;
2006  /* otherwise, constraint constrains arc to be only its color */
2007  return REPLACEARC;
2008  }
2009  return INCOMPATIBLE;
2010  break;
2011  case CA('^', BEHIND): /* collision, dissimilar constraints */
2012  case CA(BEHIND, '^'):
2013  case CA('$', AHEAD):
2014  case CA(AHEAD, '$'):
2015  return INCOMPATIBLE;
2016  break;
2017  case CA('^', '$'): /* constraints passing each other */
2018  case CA('^', AHEAD):
2019  case CA(BEHIND, '$'):
2020  case CA(BEHIND, AHEAD):
2021  case CA('$', '^'):
2022  case CA('$', BEHIND):
2023  case CA(AHEAD, '^'):
2024  case CA(AHEAD, BEHIND):
2025  case CA('^', LACON):
2026  case CA(BEHIND, LACON):
2027  case CA('$', LACON):
2028  case CA(AHEAD, LACON):
2029  return COMPATIBLE;
2030  break;
2031  }
2032  assert(NOTREACHED);
2033  return INCOMPATIBLE; /* for benefit of blind compilers */
2034 }
2035 
2036 /*
2037  * fixempties - get rid of EMPTY arcs
2038  */
2039 static void
2041  FILE *f) /* for debug output; NULL none */
2042 {
2043  struct state *s;
2044  struct state *s2;
2045  struct state *nexts;
2046  struct arc *a;
2047  struct arc *nexta;
2048  int totalinarcs;
2049  struct arc **inarcsorig;
2050  struct arc **arcarray;
2051  int arccount;
2052  int prevnins;
2053  int nskip;
2054 
2055  /*
2056  * First, get rid of any states whose sole out-arc is an EMPTY, since
2057  * they're basically just aliases for their successor. The parsing
2058  * algorithm creates enough of these that it's worth special-casing this.
2059  */
2060  for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
2061  {
2062  nexts = s->next;
2063  if (s->flag || s->nouts != 1)
2064  continue;
2065  a = s->outs;
2066  assert(a != NULL && a->outchain == NULL);
2067  if (a->type != EMPTY)
2068  continue;
2069  if (s != a->to)
2070  moveins(nfa, s, a->to);
2071  dropstate(nfa, s);
2072  }
2073 
2074  /*
2075  * Similarly, get rid of any state with a single EMPTY in-arc, by folding
2076  * it into its predecessor.
2077  */
2078  for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
2079  {
2080  nexts = s->next;
2081  /* while we're at it, ensure tmp fields are clear for next step */
2082  assert(s->tmp == NULL);
2083  if (s->flag || s->nins != 1)
2084  continue;
2085  a = s->ins;
2086  assert(a != NULL && a->inchain == NULL);
2087  if (a->type != EMPTY)
2088  continue;
2089  if (s != a->from)
2090  moveouts(nfa, s, a->from);
2091  dropstate(nfa, s);
2092  }
2093 
2094  if (NISERR())
2095  return;
2096 
2097  /*
2098  * For each remaining NFA state, find all other states from which it is
2099  * reachable by a chain of one or more EMPTY arcs. Then generate new arcs
2100  * that eliminate the need for each such chain.
2101  *
2102  * We could replace a chain of EMPTY arcs that leads from a "from" state
2103  * to a "to" state either by pushing non-EMPTY arcs forward (linking
2104  * directly from "from"'s predecessors to "to") or by pulling them back
2105  * (linking directly from "from" to "to"'s successors). We choose to
2106  * always do the former; this choice is somewhat arbitrary, but the
2107  * approach below requires that we uniformly do one or the other.
2108  *
2109  * Suppose we have a chain of N successive EMPTY arcs (where N can easily
2110  * approach the size of the NFA). All of the intermediate states must
2111  * have additional inarcs and outarcs, else they'd have been removed by
2112  * the steps above. Assuming their inarcs are mostly not empties, we will
2113  * add O(N^2) arcs to the NFA, since a non-EMPTY inarc leading to any one
2114  * state in the chain must be duplicated to lead to all its successor
2115  * states as well. So there is no hope of doing less than O(N^2) work;
2116  * however, we should endeavor to keep the big-O cost from being even
2117  * worse than that, which it can easily become without care. In
2118  * particular, suppose we were to copy all S1's inarcs forward to S2, and
2119  * then also to S3, and then later we consider pushing S2's inarcs forward
2120  * to S3. If we include the arcs already copied from S1 in that, we'd be
2121  * doing O(N^3) work. (The duplicate-arc elimination built into newarc()
2122  * and its cohorts would get rid of the extra arcs, but not without cost.)
2123  *
2124  * We can avoid this cost by treating only arcs that existed at the start
2125  * of this phase as candidates to be pushed forward. To identify those,
2126  * we remember the first inarc each state had to start with. We rely on
2127  * the fact that newarc() and friends put new arcs on the front of their
2128  * to-states' inchains, and that this phase never deletes arcs, so that
2129  * the original arcs must be the last arcs in their to-states' inchains.
2130  *
2131  * So the process here is that, for each state in the NFA, we gather up
2132  * all non-EMPTY inarcs of states that can reach the target state via
2133  * EMPTY arcs. We then sort, de-duplicate, and merge these arcs into the
2134  * target state's inchain. (We can safely use sort-merge for this as long
2135  * as we update each state's original-arcs pointer after we add arcs to
2136  * it; the sort step of mergeins probably changed the order of the old
2137  * arcs.)
2138  *
2139  * Another refinement worth making is that, because we only add non-EMPTY
2140  * arcs during this phase, and all added arcs have the same from-state as
2141  * the non-EMPTY arc they were cloned from, we know ahead of time that any
2142  * states having only EMPTY outarcs will be useless for lack of outarcs
2143  * after we drop the EMPTY arcs. (They cannot gain non-EMPTY outarcs if
2144  * they had none to start with.) So we need not bother to update the
2145  * inchains of such states at all.
2146  */
2147 
2148  /* Remember the states' first original inarcs */
2149  /* ... and while at it, count how many old inarcs there are altogether */
2150  inarcsorig = (struct arc **) MALLOC(nfa->nstates * sizeof(struct arc *));
2151  if (inarcsorig == NULL)
2152  {
2153  NERR(REG_ESPACE);
2154  return;
2155  }
2156  totalinarcs = 0;
2157  for (s = nfa->states; s != NULL; s = s->next)
2158  {
2159  inarcsorig[s->no] = s->ins;
2160  totalinarcs += s->nins;
2161  }
2162 
2163  /*
2164  * Create a workspace for accumulating the inarcs to be added to the
2165  * current target state. totalinarcs is probably a considerable
2166  * overestimate of the space needed, but the NFA is unlikely to be large
2167  * enough at this point to make it worth being smarter.
2168  */
2169  arcarray = (struct arc **) MALLOC(totalinarcs * sizeof(struct arc *));
2170  if (arcarray == NULL)
2171  {
2172  NERR(REG_ESPACE);
2173  FREE(inarcsorig);
2174  return;
2175  }
2176 
2177  /* And iterate over the target states */
2178  for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
2179  {
2180  /* Ignore target states without non-EMPTY outarcs, per note above */
2181  if (!s->flag && !hasnonemptyout(s))
2182  continue;
2183 
2184  /* Find predecessor states and accumulate their original inarcs */
2185  arccount = 0;
2186  for (s2 = emptyreachable(nfa, s, s, inarcsorig); s2 != s; s2 = nexts)
2187  {
2188  /* Add s2's original inarcs to arcarray[], but ignore empties */
2189  for (a = inarcsorig[s2->no]; a != NULL; a = a->inchain)
2190  {
2191  if (a->type != EMPTY)
2192  arcarray[arccount++] = a;
2193  }
2194 
2195  /* Reset the tmp fields as we walk back */
2196  nexts = s2->tmp;
2197  s2->tmp = NULL;
2198  }
2199  s->tmp = NULL;
2200  assert(arccount <= totalinarcs);
2201 
2202  /* Remember how many original inarcs this state has */
2203  prevnins = s->nins;
2204 
2205  /* Add non-duplicate inarcs to target state */
2206  mergeins(nfa, s, arcarray, arccount);
2207 
2208  /* Now we must update the state's inarcsorig pointer */
2209  nskip = s->nins - prevnins;
2210  a = s->ins;
2211  while (nskip-- > 0)
2212  a = a->inchain;
2213  inarcsorig[s->no] = a;
2214  }
2215 
2216  FREE(arcarray);
2217  FREE(inarcsorig);
2218 
2219  if (NISERR())
2220  return;
2221 
2222  /*
2223  * Now remove all the EMPTY arcs, since we don't need them anymore.
2224  */
2225  for (s = nfa->states; s != NULL; s = s->next)
2226  {
2227  for (a = s->outs; a != NULL; a = nexta)
2228  {
2229  nexta = a->outchain;
2230  if (a->type == EMPTY)
2231  freearc(nfa, a);
2232  }
2233  }
2234 
2235  /*
2236  * And remove any states that have become useless. (This cleanup is not
2237  * very thorough, and would be even less so if we tried to combine it with
2238  * the previous step; but cleanup() will take care of anything we miss.)
2239  */
2240  for (s = nfa->states; s != NULL; s = nexts)
2241  {
2242  nexts = s->next;
2243  if ((s->nins == 0 || s->nouts == 0) && !s->flag)
2244  dropstate(nfa, s);
2245  }
2246 
2247  if (f != NULL)
2248  dumpnfa(nfa, f);
2249 }
2250 
2251 /*
2252  * emptyreachable - recursively find all states that can reach s by EMPTY arcs
2253  *
2254  * The return value is the last such state found. Its tmp field links back
2255  * to the next-to-last such state, and so on back to s, so that all these
2256  * states can be located without searching the whole NFA.
2257  *
2258  * Since this is only used in fixempties(), we pass in the inarcsorig[] array
2259  * maintained by that function. This lets us skip over all new inarcs, which
2260  * are certainly not EMPTY arcs.
2261  *
2262  * The maximum recursion depth here is equal to the length of the longest
2263  * loop-free chain of EMPTY arcs, which is surely no more than the size of
2264  * the NFA ... but that could still be enough to cause trouble.
2265  */
2266 static struct state *
2268  struct state *s,
2269  struct state *lastfound,
2270  struct arc **inarcsorig)
2271 {
2272  struct arc *a;
2273 
2274  /* Since this is recursive, it could be driven to stack overflow */
2275  if (STACK_TOO_DEEP(nfa->v->re))
2276  {
2277  NERR(REG_ETOOBIG);
2278  return lastfound;
2279  }
2280 
2281  s->tmp = lastfound;
2282  lastfound = s;
2283  for (a = inarcsorig[s->no]; a != NULL; a = a->inchain)
2284  {
2285  if (a->type == EMPTY && a->from->tmp == NULL)
2286  lastfound = emptyreachable(nfa, a->from, lastfound, inarcsorig);
2287  }
2288  return lastfound;
2289 }
2290 
2291 /*
2292  * isconstraintarc - detect whether an arc is of a constraint type
2293  */
2294 static inline int
2296 {
2297  switch (a->type)
2298  {
2299  case '^':
2300  case '$':
2301  case BEHIND:
2302  case AHEAD:
2303  case LACON:
2304  return 1;
2305  }
2306  return 0;
2307 }
2308 
2309 /*
2310  * hasconstraintout - does state have a constraint out arc?
2311  */
2312 static int
2314 {
2315  struct arc *a;
2316 
2317  for (a = s->outs; a != NULL; a = a->outchain)
2318  {
2319  if (isconstraintarc(a))
2320  return 1;
2321  }
2322  return 0;
2323 }
2324 
2325 /*
2326  * fixconstraintloops - get rid of loops containing only constraint arcs
2327  *
2328  * A loop of states that contains only constraint arcs is useless, since
2329  * passing around the loop represents no forward progress. Moreover, it
2330  * would cause infinite looping in pullback/pushfwd, so we need to get rid
2331  * of such loops before doing that.
2332  */
2333 static void
2335  FILE *f) /* for debug output; NULL none */
2336 {
2337  struct state *s;
2338  struct state *nexts;
2339  struct arc *a;
2340  struct arc *nexta;
2341  int hasconstraints;
2342 
2343  /*
2344  * In the trivial case of a state that loops to itself, we can just drop
2345  * the constraint arc altogether. This is worth special-casing because
2346  * such loops are far more common than loops containing multiple states.
2347  * While we're at it, note whether any constraint arcs survive.
2348  */
2349  hasconstraints = 0;
2350  for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
2351  {
2352  nexts = s->next;
2353  /* while we're at it, ensure tmp fields are clear for next step */
2354  assert(s->tmp == NULL);
2355  for (a = s->outs; a != NULL && !NISERR(); a = nexta)
2356  {
2357  nexta = a->outchain;
2358  if (isconstraintarc(a))
2359  {
2360  if (a->to == s)
2361  freearc(nfa, a);
2362  else
2363  hasconstraints = 1;
2364  }
2365  }
2366  /* If we removed all the outarcs, the state is useless. */
2367  if (s->nouts == 0 && !s->flag)
2368  dropstate(nfa, s);
2369  }
2370 
2371  /* Nothing to do if no remaining constraint arcs */
2372  if (NISERR() || !hasconstraints)
2373  return;
2374 
2375  /*
2376  * Starting from each remaining NFA state, search outwards for a
2377  * constraint loop. If we find a loop, break the loop, then start the
2378  * search over. (We could possibly retain some state from the first scan,
2379  * but it would complicate things greatly, and multi-state constraint
2380  * loops are rare enough that it's not worth optimizing the case.)
2381  */
2382 restart:
2383  for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
2384  {
2385  if (findconstraintloop(nfa, s))
2386  goto restart;
2387  }
2388 
2389  if (NISERR())
2390  return;
2391 
2392  /*
2393  * Now remove any states that have become useless. (This cleanup is not
2394  * very thorough, and would be even less so if we tried to combine it with
2395  * the previous step; but cleanup() will take care of anything we miss.)
2396  *
2397  * Because findconstraintloop intentionally doesn't reset all tmp fields,
2398  * we have to clear them after it's done. This is a convenient place to
2399  * do that, too.
2400  */
2401  for (s = nfa->states; s != NULL; s = nexts)
2402  {
2403  nexts = s->next;
2404  s->tmp = NULL;
2405  if ((s->nins == 0 || s->nouts == 0) && !s->flag)
2406  dropstate(nfa, s);
2407  }
2408 
2409  if (f != NULL)
2410  dumpnfa(nfa, f);
2411 }
2412 
2413 /*
2414  * findconstraintloop - recursively find a loop of constraint arcs
2415  *
2416  * If we find a loop, break it by calling breakconstraintloop(), then
2417  * return 1; otherwise return 0.
2418  *
2419  * State tmp fields are guaranteed all NULL on a success return, because
2420  * breakconstraintloop does that. After a failure return, any state that
2421  * is known not to be part of a loop is marked with s->tmp == s; this allows
2422  * us not to have to re-prove that fact on later calls. (This convention is
2423  * workable because we already eliminated single-state loops.)
2424  *
2425  * Note that the found loop doesn't necessarily include the first state we
2426  * are called on. Any loop reachable from that state will do.
2427  *
2428  * The maximum recursion depth here is one more than the length of the longest
2429  * loop-free chain of constraint arcs, which is surely no more than the size
2430  * of the NFA ... but that could still be enough to cause trouble.
2431  */
2432 static int
2433 findconstraintloop(struct nfa *nfa, struct state *s)
2434 {
2435  struct arc *a;
2436 
2437  /* Since this is recursive, it could be driven to stack overflow */
2438  if (STACK_TOO_DEEP(nfa->v->re))
2439  {
2440  NERR(REG_ETOOBIG);
2441  return 1; /* to exit as quickly as possible */
2442  }
2443 
2444  if (s->tmp != NULL)
2445  {
2446  /* Already proven uninteresting? */
2447  if (s->tmp == s)
2448  return 0;
2449  /* Found a loop involving s */
2450  breakconstraintloop(nfa, s);
2451  /* The tmp fields have been cleaned up by breakconstraintloop */
2452  return 1;
2453  }
2454  for (a = s->outs; a != NULL; a = a->outchain)
2455  {
2456  if (isconstraintarc(a))
2457  {
2458  struct state *sto = a->to;
2459 
2460  assert(sto != s);
2461  s->tmp = sto;
2462  if (findconstraintloop(nfa, sto))
2463  return 1;
2464  }
2465  }
2466 
2467  /*
2468  * If we get here, no constraint loop exists leading out from s. Mark it
2469  * with s->tmp == s so we need not rediscover that fact again later.
2470  */
2471  s->tmp = s;
2472  return 0;
2473 }
2474 
2475 /*
2476  * breakconstraintloop - break a loop of constraint arcs
2477  *
2478  * sinitial is any one member state of the loop. Each loop member's tmp
2479  * field links to its successor within the loop. (Note that this function
2480  * will reset all the tmp fields to NULL.)
2481  *
2482  * We can break the loop by, for any one state S1 in the loop, cloning its
2483  * loop successor state S2 (and possibly following states), and then moving
2484  * all S1->S2 constraint arcs to point to the cloned S2. The cloned S2 should
2485  * copy any non-constraint outarcs of S2. Constraint outarcs should be
2486  * dropped if they point back to S1, else they need to be copied as arcs to
2487  * similarly cloned states S3, S4, etc. In general, each cloned state copies
2488  * non-constraint outarcs, drops constraint outarcs that would lead to itself
2489  * or any earlier cloned state, and sends other constraint outarcs to newly
2490  * cloned states. No cloned state will have any inarcs that aren't constraint
2491  * arcs or do not lead from S1 or earlier-cloned states. It's okay to drop
2492  * constraint back-arcs since they would not take us to any state we've not
2493  * already been in; therefore, no new constraint loop is created. In this way
2494  * we generate a modified NFA that can still represent every useful state
2495  * sequence, but not sequences that represent state loops with no consumption
2496  * of input data. Note that the set of cloned states will certainly include
2497  * all of the loop member states other than S1, and it may also include
2498  * non-loop states that are reachable from S2 via constraint arcs. This is
2499  * important because there is no guarantee that findconstraintloop found a
2500  * maximal loop (and searching for one would be NP-hard, so don't try).
2501  * Frequently the "non-loop states" are actually part of a larger loop that
2502  * we didn't notice, and indeed there may be several overlapping loops.
2503  * This technique ensures convergence in such cases, while considering only
2504  * the originally-found loop does not.
2505  *
2506  * If there is only one S1->S2 constraint arc, then that constraint is
2507  * certainly satisfied when we enter any of the clone states. This means that
2508  * in the common case where many of the constraint arcs are identically
2509  * labeled, we can merge together clone states linked by a similarly-labeled
2510  * constraint: if we can get to the first one we can certainly get to the
2511  * second, so there's no need to distinguish. This greatly reduces the number
2512  * of new states needed, so we preferentially break the given loop at a state
2513  * pair where this is true.
2514  *
2515  * Furthermore, it's fairly common to find that a cloned successor state has
2516  * no outarcs, especially if we're a bit aggressive about removing unnecessary
2517  * outarcs. If that happens, then there is simply not any interesting state
2518  * that can be reached through the predecessor's loop arcs, which means we can
2519  * break the loop just by removing those loop arcs, with no new states added.
2520  */
2521 static void
2522 breakconstraintloop(struct nfa *nfa, struct state *sinitial)
2523 {
2524  struct state *s;
2525  struct state *shead;
2526  struct state *stail;
2527  struct state *sclone;
2528  struct state *nexts;
2529  struct arc *refarc;
2530  struct arc *a;
2531  struct arc *nexta;
2532 
2533  /*
2534  * Start by identifying which loop step we want to break at.
2535  * Preferentially this is one with only one constraint arc. (XXX are
2536  * there any other secondary heuristics we want to use here?) Set refarc
2537  * to point to the selected lone constraint arc, if there is one.
2538  */
2539  refarc = NULL;
2540  s = sinitial;
2541  do
2542  {
2543  nexts = s->tmp;
2544  assert(nexts != s); /* should not see any one-element loops */
2545  if (refarc == NULL)
2546  {
2547  int narcs = 0;
2548 
2549  for (a = s->outs; a != NULL; a = a->outchain)
2550  {
2551  if (a->to == nexts && isconstraintarc(a))
2552  {
2553  refarc = a;
2554  narcs++;
2555  }
2556  }
2557  assert(narcs > 0);
2558  if (narcs > 1)
2559  refarc = NULL; /* multiple constraint arcs here, no good */
2560  }
2561  s = nexts;
2562  } while (s != sinitial);
2563 
2564  if (refarc)
2565  {
2566  /* break at the refarc */
2567  shead = refarc->from;
2568  stail = refarc->to;
2569  assert(stail == shead->tmp);
2570  }
2571  else
2572  {
2573  /* for lack of a better idea, break after sinitial */
2574  shead = sinitial;
2575  stail = sinitial->tmp;
2576  }
2577 
2578  /*
2579  * Reset the tmp fields so that we can use them for local storage in
2580  * clonesuccessorstates. (findconstraintloop won't mind, since it's just
2581  * going to abandon its search anyway.)
2582  */
2583  for (s = nfa->states; s != NULL; s = s->next)
2584  s->tmp = NULL;
2585 
2586  /*
2587  * Recursively build clone state(s) as needed.
2588  */
2589  sclone = newstate(nfa);
2590  if (sclone == NULL)
2591  {
2592  assert(NISERR());
2593  return;
2594  }
2595 
2596  clonesuccessorstates(nfa, stail, sclone, shead, refarc,
2597  NULL, NULL, nfa->nstates);
2598 
2599  if (NISERR())
2600  return;
2601 
2602  /*
2603  * It's possible that sclone has no outarcs at all, in which case it's
2604  * useless. (We don't try extremely hard to get rid of useless states
2605  * here, but this is an easy and fairly common case.)
2606  */
2607  if (sclone->nouts == 0)
2608  {
2609  freestate(nfa, sclone);
2610  sclone = NULL;
2611  }
2612 
2613  /*
2614  * Move shead's constraint-loop arcs to point to sclone, or just drop them
2615  * if we discovered we don't need sclone.
2616  */
2617  for (a = shead->outs; a != NULL; a = nexta)
2618  {
2619  nexta = a->outchain;
2620  if (a->to == stail && isconstraintarc(a))
2621  {
2622  if (sclone)
2623  cparc(nfa, a, shead, sclone);
2624  freearc(nfa, a);
2625  if (NISERR())
2626  break;
2627  }
2628  }
2629 }
2630 
2631 /*
2632  * clonesuccessorstates - create a tree of constraint-arc successor states
2633  *
2634  * ssource is the state to be cloned, and sclone is the state to copy its
2635  * outarcs into. sclone's inarcs, if any, should already be set up.
2636  *
2637  * spredecessor is the original predecessor state that we are trying to build
2638  * successors for (it may not be the immediate predecessor of ssource).
2639  * refarc, if not NULL, is the original constraint arc that is known to have
2640  * been traversed out of spredecessor to reach the successor(s).
2641  *
2642  * For each cloned successor state, we transiently create a "donemap" that is
2643  * a boolean array showing which source states we've already visited for this
2644  * clone state. This prevents infinite recursion as well as useless repeat
2645  * visits to the same state subtree (which can add up fast, since typical NFAs
2646  * have multiple redundant arc pathways). Each donemap is a char array
2647  * indexed by state number. The donemaps are all of the same size "nstates",
2648  * which is nfa->nstates as of the start of the recursion. This is enough to
2649  * have entries for all pre-existing states, but *not* entries for clone
2650  * states created during the recursion. That's okay since we have no need to
2651  * mark those.
2652  *
2653  * curdonemap is NULL when recursing to a new sclone state, or sclone's
2654  * donemap when we are recursing without having created a new state (which we
2655  * do when we decide we can merge a successor state into the current clone
2656  * state). outerdonemap is NULL at the top level and otherwise the parent
2657  * clone state's donemap.
2658  *
2659  * The successor states we create and fill here form a strict tree structure,
2660  * with each state having exactly one predecessor, except that the toplevel
2661  * state has no inarcs as yet (breakconstraintloop will add its inarcs from
2662  * spredecessor after we're done). Thus, we can examine sclone's inarcs back
2663  * to the root, plus refarc if any, to identify the set of constraints already
2664  * known valid at the current point. This allows us to avoid generating extra
2665  * successor states.
2666  */
2667 static void
2669  struct state *ssource,
2670  struct state *sclone,
2671  struct state *spredecessor,
2672  struct arc *refarc,
2673  char *curdonemap,
2674  char *outerdonemap,
2675  int nstates)
2676 {
2677  char *donemap;
2678  struct arc *a;
2679 
2680  /* Since this is recursive, it could be driven to stack overflow */
2681  if (STACK_TOO_DEEP(nfa->v->re))
2682  {
2683  NERR(REG_ETOOBIG);
2684  return;
2685  }
2686 
2687  /* If this state hasn't already got a donemap, create one */
2688  donemap = curdonemap;
2689  if (donemap == NULL)
2690  {
2691  donemap = (char *) MALLOC(nstates * sizeof(char));
2692  if (donemap == NULL)
2693  {
2694  NERR(REG_ESPACE);
2695  return;
2696  }
2697 
2698  if (outerdonemap != NULL)
2699  {
2700  /*
2701  * Not at outermost recursion level, so copy the outer level's
2702  * donemap; this ensures that we see states in process of being
2703  * visited at outer levels, or already merged into predecessor
2704  * states, as ones we shouldn't traverse back to.
2705  */
2706  memcpy(donemap, outerdonemap, nstates * sizeof(char));
2707  }
2708  else
2709  {
2710  /* At outermost level, only spredecessor is off-limits */
2711  memset(donemap, 0, nstates * sizeof(char));
2712  assert(spredecessor->no < nstates);
2713  donemap[spredecessor->no] = 1;
2714  }
2715  }
2716 
2717  /* Mark ssource as visited in the donemap */
2718  assert(ssource->no < nstates);
2719  assert(donemap[ssource->no] == 0);
2720  donemap[ssource->no] = 1;
2721 
2722  /*
2723  * We proceed by first cloning all of ssource's outarcs, creating new
2724  * clone states as needed but not doing more with them than that. Then in
2725  * a second pass, recurse to process the child clone states. This allows
2726  * us to have only one child clone state per reachable source state, even
2727  * when there are multiple outarcs leading to the same state. Also, when
2728  * we do visit a child state, its set of inarcs is known exactly, which
2729  * makes it safe to apply the constraint-is-already-checked optimization.
2730  * Also, this ensures that we've merged all the states we can into the
2731  * current clone before we recurse to any children, thus possibly saving
2732  * them from making extra images of those states.
2733  *
2734  * While this function runs, child clone states of the current state are
2735  * marked by setting their tmp fields to point to the original state they
2736  * were cloned from. This makes it possible to detect multiple outarcs
2737  * leading to the same state, and also makes it easy to distinguish clone
2738  * states from original states (which will have tmp == NULL).
2739  */
2740  for (a = ssource->outs; a != NULL && !NISERR(); a = a->outchain)
2741  {
2742  struct state *sto = a->to;
2743 
2744  /*
2745  * We do not consider cloning successor states that have no constraint
2746  * outarcs; just link to them as-is. They cannot be part of a
2747  * constraint loop so there is no need to make copies. In particular,
2748  * this rule keeps us from trying to clone the post state, which would
2749  * be a bad idea.
2750  */
2751  if (isconstraintarc(a) && hasconstraintout(sto))
2752  {
2753  struct state *prevclone;
2754  int canmerge;
2755  struct arc *a2;
2756 
2757  /*
2758  * Back-link constraint arcs must not be followed. Nor is there a
2759  * need to revisit states previously merged into this clone.
2760  */
2761  assert(sto->no < nstates);
2762  if (donemap[sto->no] != 0)
2763  continue;
2764 
2765  /*
2766  * Check whether we already have a child clone state for this
2767  * source state.
2768  */
2769  prevclone = NULL;
2770  for (a2 = sclone->outs; a2 != NULL; a2 = a2->outchain)
2771  {
2772  if (a2->to->tmp == sto)
2773  {
2774  prevclone = a2->to;
2775  break;
2776  }
2777  }
2778 
2779  /*
2780  * If this arc is labeled the same as refarc, or the same as any
2781  * arc we must have traversed to get to sclone, then no additional
2782  * constraints need to be met to get to sto, so we should just
2783  * merge its outarcs into sclone.
2784  */
2785  if (refarc && a->type == refarc->type && a->co == refarc->co)
2786  canmerge = 1;
2787  else
2788  {
2789  struct state *s;
2790 
2791  canmerge = 0;
2792  for (s = sclone; s->ins; s = s->ins->from)
2793  {
2794  if (s->nins == 1 &&
2795  a->type == s->ins->type && a->co == s->ins->co)
2796  {
2797  canmerge = 1;
2798  break;
2799  }
2800  }
2801  }
2802 
2803  if (canmerge)
2804  {
2805  /*
2806  * We can merge into sclone. If we previously made a child
2807  * clone state, drop it; there's no need to visit it. (This
2808  * can happen if ssource has multiple pathways to sto, and we
2809  * only just now found one that is provably a no-op.)
2810  */
2811  if (prevclone)
2812  dropstate(nfa, prevclone); /* kills our outarc, too */
2813 
2814  /* Recurse to merge sto's outarcs into sclone */
2816  sto,
2817  sclone,
2818  spredecessor,
2819  refarc,
2820  donemap,
2821  outerdonemap,
2822  nstates);
2823  /* sto should now be marked as previously visited */
2824  assert(NISERR() || donemap[sto->no] == 1);
2825  }
2826  else if (prevclone)
2827  {
2828  /*
2829  * We already have a clone state for this successor, so just
2830  * make another arc to it.
2831  */
2832  cparc(nfa, a, sclone, prevclone);
2833  }
2834  else
2835  {
2836  /*
2837  * We need to create a new successor clone state.
2838  */
2839  struct state *stoclone;
2840 
2841  stoclone = newstate(nfa);
2842  if (stoclone == NULL)
2843  {
2844  assert(NISERR());
2845  break;
2846  }
2847  /* Mark it as to what it's a clone of */
2848  stoclone->tmp = sto;
2849  /* ... and add the outarc leading to it */
2850  cparc(nfa, a, sclone, stoclone);
2851  }
2852  }
2853  else
2854  {
2855  /*
2856  * Non-constraint outarcs just get copied to sclone, as do outarcs
2857  * leading to states with no constraint outarc.
2858  */
2859  cparc(nfa, a, sclone, sto);
2860  }
2861  }
2862 
2863  /*
2864  * If we are at outer level for this clone state, recurse to all its child
2865  * clone states, clearing their tmp fields as we go. (If we're not
2866  * outermost for sclone, leave this to be done by the outer call level.)
2867  * Note that if we have multiple outarcs leading to the same clone state,
2868  * it will only be recursed-to once.
2869  */
2870  if (curdonemap == NULL)
2871  {
2872  for (a = sclone->outs; a != NULL && !NISERR(); a = a->outchain)
2873  {
2874  struct state *stoclone = a->to;
2875  struct state *sto = stoclone->tmp;
2876 
2877  if (sto != NULL)
2878  {
2879  stoclone->tmp = NULL;
2881  sto,
2882  stoclone,
2883  spredecessor,
2884  refarc,
2885  NULL,
2886  donemap,
2887  nstates);
2888  }
2889  }
2890 
2891  /* Don't forget to free sclone's donemap when done with it */
2892  FREE(donemap);
2893  }
2894 }
2895 
2896 /*
2897  * cleanup - clean up NFA after optimizations
2898  */
2899 static void
2900 cleanup(struct nfa *nfa)
2901 {
2902  struct state *s;
2903  struct state *nexts;
2904  int n;
2905 
2906  if (NISERR())
2907  return;
2908 
2909  /* clear out unreachable or dead-end states */
2910  /* use pre to mark reachable, then post to mark can-reach-post */
2911  markreachable(nfa, nfa->pre, (struct state *) NULL, nfa->pre);
2912  markcanreach(nfa, nfa->post, nfa->pre, nfa->post);
2913  for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
2914  {
2915  nexts = s->next;
2916  if (s->tmp != nfa->post && !s->flag)
2917  dropstate(nfa, s);
2918  }
2919  assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == nfa->post);
2920  cleartraverse(nfa, nfa->pre);
2921  assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == NULL);
2922  /* the nins==0 (final unreachable) case will be caught later */
2923 
2924  /* renumber surviving states */
2925  n = 0;
2926  for (s = nfa->states; s != NULL; s = s->next)
2927  s->no = n++;
2928  nfa->nstates = n;
2929 }
2930 
2931 /*
2932  * markreachable - recursive marking of reachable states
2933  */
2934 static void
2936  struct state *s,
2937  struct state *okay, /* consider only states with this mark */
2938  struct state *mark) /* the value to mark with */
2939 {
2940  struct arc *a;
2941 
2942  /* Since this is recursive, it could be driven to stack overflow */
2943  if (STACK_TOO_DEEP(nfa->v->re))
2944  {
2945  NERR(REG_ETOOBIG);
2946  return;
2947  }
2948 
2949  if (s->tmp != okay)
2950  return;
2951  s->tmp = mark;
2952 
2953  for (a = s->outs; a != NULL; a = a->outchain)
2954  markreachable(nfa, a->to, okay, mark);
2955 }
2956 
2957 /*
2958  * markcanreach - recursive marking of states which can reach here
2959  */
2960 static void
2962  struct state *s,
2963  struct state *okay, /* consider only states with this mark */
2964  struct state *mark) /* the value to mark with */
2965 {
2966  struct arc *a;
2967 
2968  /* Since this is recursive, it could be driven to stack overflow */
2969  if (STACK_TOO_DEEP(nfa->v->re))
2970  {
2971  NERR(REG_ETOOBIG);
2972  return;
2973  }
2974 
2975  if (s->tmp != okay)
2976  return;
2977  s->tmp = mark;
2978 
2979  for (a = s->ins; a != NULL; a = a->inchain)
2980  markcanreach(nfa, a->from, okay, mark);
2981 }
2982 
2983 /*
2984  * analyze - ascertain potentially-useful facts about an optimized NFA
2985  */
2986 static long /* re_info bits to be ORed in */
2987 analyze(struct nfa *nfa)
2988 {
2989  struct arc *a;
2990  struct arc *aa;
2991 
2992  if (NISERR())
2993  return 0;
2994 
2995  /* Detect whether NFA can't match anything */
2996  if (nfa->pre->outs == NULL)
2997  return REG_UIMPOSSIBLE;
2998 
2999  /* Detect whether NFA matches all strings (possibly with length bounds) */
3000  checkmatchall(nfa);
3001 
3002  /* Detect whether NFA can possibly match a zero-length string */
3003  for (a = nfa->pre->outs; a != NULL; a = a->outchain)
3004  for (aa = a->to->outs; aa != NULL; aa = aa->outchain)
3005  if (aa->to == nfa->post)
3006  return REG_UEMPTYMATCH;
3007  return 0;
3008 }
3009 
3010 /*
3011  * checkmatchall - does the NFA represent no more than a string length test?
3012  *
3013  * If so, set nfa->minmatchall and nfa->maxmatchall correctly (they are -1
3014  * to begin with) and set the MATCHALL bit in nfa->flags.
3015  *
3016  * To succeed, we require all arcs to be PLAIN RAINBOW arcs, except for those
3017  * for pseudocolors (i.e., BOS/BOL/EOS/EOL). We must be able to reach the
3018  * post state via RAINBOW arcs, and if there are any loops in the graph, they
3019  * must be loop-to-self arcs, ensuring that each loop iteration consumes
3020  * exactly one character. (Longer loops are problematic because they create
3021  * non-consecutive possible match lengths; we have no good way to represent
3022  * that situation for lengths beyond the DUPINF limit.)
3023  *
3024  * Pseudocolor arcs complicate things a little. We know that they can only
3025  * appear as pre-state outarcs (for BOS/BOL) or post-state inarcs (for
3026  * EOS/EOL). There, they must exactly replicate the parallel RAINBOW arcs,
3027  * e.g. if the pre state has one RAINBOW outarc to state 2, it must have BOS
3028  * and BOL outarcs to state 2, and no others. Missing or extra pseudocolor
3029  * arcs can occur, meaning that the NFA involves some constraint on the
3030  * adjacent characters, which makes it not a matchall NFA.
3031  */
3032 static void
3034 {
3035  bool **haspaths;
3036  struct state *s;
3037  int i;
3038 
3039  /*
3040  * If there are too many states, don't bother trying to detect matchall.
3041  * This limit serves to bound the time and memory we could consume below.
3042  * Note that even if the graph is all-RAINBOW, if there are significantly
3043  * more than DUPINF states then it's likely that there are paths of length
3044  * more than DUPINF, which would force us to fail anyhow. In practice,
3045  * plausible ways of writing a matchall regex with maximum finite path
3046  * length K tend not to have very many more than K states.
3047  */
3048  if (nfa->nstates > DUPINF * 2)
3049  return;
3050 
3051  /*
3052  * First, scan all the states to verify that only RAINBOW arcs appear,
3053  * plus pseudocolor arcs adjacent to the pre and post states. This lets
3054  * us quickly eliminate most cases that aren't matchall NFAs.
3055  */
3056  for (s = nfa->states; s != NULL; s = s->next)
3057  {
3058  struct arc *a;
3059 
3060  for (a = s->outs; a != NULL; a = a->outchain)
3061  {
3062  if (a->type != PLAIN)
3063  return; /* any LACONs make it non-matchall */
3064  if (a->co != RAINBOW)
3065  {
3066  if (nfa->cm->cd[a->co].flags & PSEUDO)
3067  {
3068  /*
3069  * Pseudocolor arc: verify it's in a valid place (this
3070  * seems quite unlikely to fail, but let's be sure).
3071  */
3072  if (s == nfa->pre &&
3073  (a->co == nfa->bos[0] || a->co == nfa->bos[1]))
3074  /* okay BOS/BOL arc */ ;
3075  else if (a->to == nfa->post &&
3076  (a->co == nfa->eos[0] || a->co == nfa->eos[1]))
3077  /* okay EOS/EOL arc */ ;
3078  else
3079  return; /* unexpected pseudocolor arc */
3080  /* We'll check these arcs some more below. */
3081  }
3082  else
3083  return; /* any other color makes it non-matchall */
3084  }
3085  }
3086  /* Also, assert that the tmp fields are available for use. */
3087  assert(s->tmp == NULL);
3088  }
3089 
3090  /*
3091  * The next cheapest check we can make is to verify that the BOS/BOL
3092  * outarcs of the pre state reach the same states as its RAINBOW outarcs.
3093  * If they don't, the NFA expresses some constraints on the character
3094  * before the matched string, making it non-matchall. Likewise, the
3095  * EOS/EOL inarcs of the post state must match its RAINBOW inarcs.
3096  */
3097  if (!check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[0]) ||
3098  !check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[1]) ||
3099  !check_in_colors_match(nfa->post, RAINBOW, nfa->eos[0]) ||
3100  !check_in_colors_match(nfa->post, RAINBOW, nfa->eos[1]))
3101  return;
3102 
3103  /*
3104  * Initialize an array of path-length arrays, in which
3105  * checkmatchall_recurse will return per-state results. This lets us
3106  * memo-ize the recursive search and avoid exponential time consumption.
3107  */
3108  haspaths = (bool **) MALLOC(nfa->nstates * sizeof(bool *));
3109  if (haspaths == NULL)
3110  return; /* fail quietly */
3111  memset(haspaths, 0, nfa->nstates * sizeof(bool *));
3112 
3113  /*
3114  * Recursively search the graph for all-RAINBOW paths to the "post" state,
3115  * starting at the "pre" state, and computing the lengths of the paths.
3116  * (Given the preceding checks, there should be at least one such path.
3117  * However we could get back a false result anyway, in case there are
3118  * multi-state loops, paths exceeding DUPINF+1 length, or non-algorithmic
3119  * failures such as ENOMEM.)
3120  */
3121  if (checkmatchall_recurse(nfa, nfa->pre, haspaths))
3122  {
3123  /* The useful result is the path length array for the pre state */
3124  bool *haspath = haspaths[nfa->pre->no];
3125  int minmatch,
3126  maxmatch,
3127  morematch;
3128 
3129  assert(haspath != NULL);
3130 
3131  /*
3132  * haspath[] now represents the set of possible path lengths; but we
3133  * want to reduce that to a min and max value, because it doesn't seem
3134  * worth complicating regexec.c to deal with nonconsecutive possible
3135  * match lengths. Find min and max of first run of lengths, then
3136  * verify there are no nonconsecutive lengths.
3137  */
3138  for (minmatch = 0; minmatch <= DUPINF + 1; minmatch++)
3139  {
3140  if (haspath[minmatch])
3141  break;
3142  }
3143  assert(minmatch <= DUPINF + 1); /* else checkmatchall_recurse lied */
3144  for (maxmatch = minmatch; maxmatch < DUPINF + 1; maxmatch++)
3145  {
3146  if (!haspath[maxmatch + 1])
3147  break;
3148  }
3149  for (morematch = maxmatch + 1; morematch <= DUPINF + 1; morematch++)
3150  {
3151  if (haspath[morematch])
3152  {
3153  haspath = NULL; /* fail, there are nonconsecutive lengths */
3154  break;
3155  }
3156  }
3157 
3158  if (haspath != NULL)
3159  {
3160  /*
3161  * Success, so record the info. Here we have a fine point: the
3162  * path length from the pre state includes the pre-to-initial
3163  * transition, so it's one more than the actually matched string
3164  * length. (We avoided counting the final-to-post transition
3165  * within checkmatchall_recurse, but not this one.) This is why
3166  * checkmatchall_recurse allows one more level of path length than
3167  * might seem necessary. This decrement also takes care of
3168  * converting checkmatchall_recurse's definition of "infinity" as
3169  * "DUPINF+1" to our normal representation as "DUPINF".
3170  */
3171  assert(minmatch > 0); /* else pre and post states were adjacent */
3172  nfa->minmatchall = minmatch - 1;
3173  nfa->maxmatchall = maxmatch - 1;
3174  nfa->flags |= MATCHALL;
3175  }
3176  }
3177 
3178  /* Clean up */
3179  for (i = 0; i < nfa->nstates; i++)
3180  {
3181  if (haspaths[i] != NULL)
3182  FREE(haspaths[i]);
3183  }
3184  FREE(haspaths);
3185 }
3186 
3187 /*
3188  * checkmatchall_recurse - recursive search for checkmatchall
3189  *
3190  * s is the state to be examined in this recursion level.
3191  * haspaths[] is an array of per-state exit path length arrays.
3192  *
3193  * We return true if the search was performed successfully, false if
3194  * we had to fail because of multi-state loops or other internal reasons.
3195  * (Because "dead" states that can't reach the post state have been
3196  * eliminated, and we already verified that only RAINBOW and matching
3197  * pseudocolor arcs exist, every state should have RAINBOW path(s) to
3198  * the post state. Hence we take a false result from recursive calls
3199  * as meaning that we'd better fail altogether, not just that that
3200  * particular state can't reach the post state.)
3201  *
3202  * On success, we store a malloc'd result array in haspaths[s->no],
3203  * showing the possible path lengths from s to the post state.
3204  * Each state's haspath[] array is of length DUPINF+2. The entries from
3205  * k = 0 to DUPINF are true if there is an all-RAINBOW path of length k
3206  * from this state to the string end. haspath[DUPINF+1] is true if all
3207  * path lengths >= DUPINF+1 are possible. (Situations that cannot be
3208  * represented under these rules cause failure.)
3209  *
3210  * checkmatchall is responsible for eventually freeing the haspath[] arrays.
3211  */
3212 static bool
3213 checkmatchall_recurse(struct nfa *nfa, struct state *s, bool **haspaths)
3214 {
3215  bool result = false;
3216  bool foundloop = false;
3217  bool *haspath;
3218  struct arc *a;
3219 
3220  /*
3221  * Since this is recursive, it could be driven to stack overflow. But we
3222  * need not treat that as a hard failure; just deem the NFA non-matchall.
3223  */
3224  if (STACK_TOO_DEEP(nfa->v->re))
3225  return false;
3226 
3227  /* In case the search takes a long time, check for cancel */
3228  if (CANCEL_REQUESTED(nfa->v->re))
3229  {
3230  NERR(REG_CANCEL);
3231  return false;
3232  }
3233 
3234  /* Create a haspath array for this state */
3235  haspath = (bool *) MALLOC((DUPINF + 2) * sizeof(bool));
3236  if (haspath == NULL)
3237  return false; /* again, treat as non-matchall */
3238  memset(haspath, 0, (DUPINF + 2) * sizeof(bool));
3239 
3240  /* Mark this state as being visited */
3241  assert(s->tmp == NULL);
3242  s->tmp = s;
3243 
3244  for (a = s->outs; a != NULL; a = a->outchain)
3245  {
3246  if (a->co != RAINBOW)
3247  continue; /* ignore pseudocolor arcs */
3248  if (a->to == nfa->post)
3249  {
3250  /* We found an all-RAINBOW path to the post state */
3251  result = true;
3252 
3253  /*
3254  * Mark this state as being zero steps away from the string end
3255  * (the transition to the post state isn't counted).
3256  */
3257  haspath[0] = true;
3258  }
3259  else if (a->to == s)
3260  {
3261  /* We found a cycle of length 1, which we'll deal with below. */
3262  foundloop = true;
3263  }
3264  else if (a->to->tmp != NULL)
3265  {
3266  /* It's busy, so we found a cycle of length > 1, so fail. */
3267  result = false;
3268  break;
3269  }
3270  else
3271  {
3272  /* Consider paths forward through this to-state. */
3273  bool *nexthaspath;
3274  int i;
3275 
3276  /* If to-state was not already visited, recurse */
3277  if (haspaths[a->to->no] == NULL)
3278  {
3279  result = checkmatchall_recurse(nfa, a->to, haspaths);
3280  /* Fail if any recursive path fails */
3281  if (!result)
3282  break;
3283  }
3284  else
3285  {
3286  /* The previous visit must have found path(s) to the end */
3287  result = true;
3288  }
3289  assert(a->to->tmp == NULL);
3290  nexthaspath = haspaths[a->to->no];
3291  assert(nexthaspath != NULL);
3292 
3293  /*
3294  * Now, for every path of length i from a->to to the string end,
3295  * there is a path of length i + 1 from s to the string end.
3296  */
3297  if (nexthaspath[DUPINF] != nexthaspath[DUPINF + 1])
3298  {
3299  /*
3300  * a->to has a path of length exactly DUPINF, but not longer;
3301  * or it has paths of all lengths > DUPINF but not one of
3302  * exactly that length. In either case, we cannot represent
3303  * the possible path lengths from s correctly, so fail.
3304  */
3305  result = false;
3306  break;
3307  }
3308  /* Merge knowledge of these path lengths into what we have */
3309  for (i = 0; i < DUPINF; i++)
3310  haspath[i + 1] |= nexthaspath[i];
3311  /* Infinity + 1 is still infinity */
3312  haspath[DUPINF + 1] |= nexthaspath[DUPINF + 1];
3313  }
3314  }
3315 
3316  if (result && foundloop)
3317  {
3318  /*
3319  * If there is a length-1 loop at this state, then find the shortest
3320  * known path length to the end. The loop means that every larger
3321  * path length is possible, too. (It doesn't matter whether any of
3322  * the longer lengths were already known possible.)
3323  */
3324  int i;
3325 
3326  for (i = 0; i <= DUPINF; i++)
3327  {
3328  if (haspath[i])
3329  break;
3330  }
3331  for (i++; i <= DUPINF + 1; i++)
3332  haspath[i] = true;
3333  }
3334 
3335  /* Report out the completed path length map */
3336  assert(s->no < nfa->nstates);
3337  assert(haspaths[s->no] == NULL);
3338  haspaths[s->no] = haspath;
3339 
3340  /* Mark state no longer busy */
3341  s->tmp = NULL;
3342 
3343  return result;
3344 }
3345 
3346 /*
3347  * check_out_colors_match - subroutine for checkmatchall
3348  *
3349  * Check whether the set of states reachable from s by arcs of color co1
3350  * is equivalent to the set reachable by arcs of color co2.
3351  * checkmatchall already verified that all of the NFA's arcs are PLAIN,
3352  * so we need not examine arc types here.
3353  */
3354 static bool
3356 {
3357  bool result = true;
3358  struct arc *a;
3359 
3360  /*
3361  * To do this in linear time, we assume that the NFA contains no duplicate
3362  * arcs. Run through the out-arcs, marking states reachable by arcs of
3363  * color co1. Run through again, un-marking states reachable by arcs of
3364  * color co2; if we see a not-marked state, we know this co2 arc is
3365  * unmatched. Then run through again, checking for still-marked states,
3366  * and in any case leaving all the tmp fields reset to NULL.
3367  */
3368  for (a = s->outs; a != NULL; a = a->outchain)
3369  {
3370  if (a->co == co1)
3371  {
3372  assert(a->to->tmp == NULL);
3373  a->to->tmp = a->to;
3374  }
3375  }
3376  for (a = s->outs; a != NULL; a = a->outchain)
3377  {
3378  if (a->co == co2)
3379  {
3380  if (a->to->tmp != NULL)
3381  a->to->tmp = NULL;
3382  else
3383  result = false; /* unmatched co2 arc */
3384  }
3385  }
3386  for (a = s->outs; a != NULL; a = a->outchain)
3387  {
3388  if (a->co == co1)
3389  {
3390  if (a->to->tmp != NULL)
3391  {
3392  result = false; /* unmatched co1 arc */
3393  a->to->tmp = NULL;
3394  }
3395  }
3396  }
3397  return result;
3398 }
3399 
3400 /*
3401  * check_in_colors_match - subroutine for checkmatchall
3402  *
3403  * Check whether the set of states that can reach s by arcs of color co1
3404  * is equivalent to the set that can reach s by arcs of color co2.
3405  * checkmatchall already verified that all of the NFA's arcs are PLAIN,
3406  * so we need not examine arc types here.
3407  */
3408 static bool
3410 {
3411  bool result = true;
3412  struct arc *a;
3413 
3414  /*
3415  * Identical algorithm to check_out_colors_match, except examine the
3416  * from-states of s' inarcs.
3417  */
3418  for (a = s->ins; a != NULL; a = a->inchain)
3419  {
3420  if (a->co == co1)
3421  {
3422  assert(a->from->tmp == NULL);
3423  a->from->tmp = a->from;
3424  }
3425  }
3426  for (a = s->ins; a != NULL; a = a->inchain)
3427  {
3428  if (a->co == co2)
3429  {
3430  if (a->from->tmp != NULL)
3431  a->from->tmp = NULL;
3432  else
3433  result = false; /* unmatched co2 arc */
3434  }
3435  }
3436  for (a = s->ins; a != NULL; a = a->inchain)
3437  {
3438  if (a->co == co1)
3439  {
3440  if (a->from->tmp != NULL)
3441  {
3442  result = false; /* unmatched co1 arc */
3443  a->from->tmp = NULL;
3444  }
3445  }
3446  }
3447  return result;
3448 }
3449 
3450 /*
3451  * compact - construct the compact representation of an NFA
3452  */
3453 static void
3454 compact(struct nfa *nfa,
3455  struct cnfa *cnfa)
3456 {
3457  struct state *s;
3458  struct arc *a;
3459  size_t nstates;
3460  size_t narcs;
3461  struct carc *ca;
3462  struct carc *first;
3463 
3464  assert(!NISERR());
3465 
3466  nstates = 0;
3467  narcs = 0;
3468  for (s = nfa->states; s != NULL; s = s->next)
3469  {
3470  nstates++;
3471  narcs += s->nouts + 1; /* need one extra for endmarker */
3472  }
3473 
3474  cnfa->stflags = (char *) MALLOC(nstates * sizeof(char));
3475  cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *));
3476  cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc));
3477  if (cnfa->stflags == NULL || cnfa->states == NULL || cnfa->arcs == NULL)
3478  {
3479  if (cnfa->stflags != NULL)
3480  FREE(cnfa->stflags);
3481  if (cnfa->states != NULL)
3482  FREE(cnfa->states);
3483  if (cnfa->arcs != NULL)
3484  FREE(cnfa->arcs);
3485  NERR(REG_ESPACE);
3486  return;
3487  }
3488  cnfa->nstates = nstates;
3489  cnfa->pre = nfa->pre->no;
3490  cnfa->post = nfa->post->no;
3491  cnfa->bos[0] = nfa->bos[0];
3492  cnfa->bos[1] = nfa->bos[1];
3493  cnfa->eos[0] = nfa->eos[0];
3494  cnfa->eos[1] = nfa->eos[1];
3495  cnfa->ncolors = maxcolor(nfa->cm) + 1;
3496  cnfa->flags = nfa->flags;
3497  cnfa->minmatchall = nfa->minmatchall;
3498  cnfa->maxmatchall = nfa->maxmatchall;
3499 
3500  ca = cnfa->arcs;
3501  for (s = nfa->states; s != NULL; s = s->next)
3502  {
3503  assert((size_t) s->no < nstates);
3504  cnfa->stflags[s->no] = 0;
3505  cnfa->states[s->no] = ca;
3506  first = ca;
3507  for (a = s->outs; a != NULL; a = a->outchain)
3508  switch (a->type)
3509  {
3510  case PLAIN:
3511  ca->co = a->co;
3512  ca->to = a->to->no;
3513  ca++;
3514  break;
3515  case LACON:
3516  assert(s->no != cnfa->pre);
3517  assert(a->co >= 0);
3518  ca->co = (color) (cnfa->ncolors + a->co);
3519  ca->to = a->to->no;
3520  ca++;
3521  cnfa->flags |= HASLACONS;
3522  break;
3523  default:
3524  NERR(REG_ASSERT);
3525  return;
3526  }
3527  carcsort(first, ca - first);
3528  ca->co = COLORLESS;
3529  ca->to = 0;
3530  ca++;
3531  }
3532  assert(ca == &cnfa->arcs[narcs]);
3533  assert(cnfa->nstates != 0);
3534 
3535  /* mark no-progress states */
3536  for (a = nfa->pre->outs; a != NULL; a = a->outchain)
3537  cnfa->stflags[a->to->no] = CNFA_NOPROGRESS;
3538  cnfa->stflags[nfa->pre->no] = CNFA_NOPROGRESS;
3539 }
3540 
3541 /*
3542  * carcsort - sort compacted-NFA arcs by color
3543  */
3544 static void
3545 carcsort(struct carc *first, size_t n)
3546 {
3547  if (n > 1)
3548  qsort(first, n, sizeof(struct carc), carc_cmp);
3549 }
3550 
3551 static int
3552 carc_cmp(const void *a, const void *b)
3553 {
3554  const struct carc *aa = (const struct carc *) a;
3555  const struct carc *bb = (const struct carc *) b;
3556 
3557  if (aa->co < bb->co)
3558  return -1;
3559  if (aa->co > bb->co)
3560  return +1;
3561  if (aa->to < bb->to)
3562  return -1;
3563  if (aa->to > bb->to)
3564  return +1;
3565  return 0;
3566 }
3567 
3568 /*
3569  * freecnfa - free a compacted NFA
3570  */
3571 static void
3573 {
3574  assert(!NULLCNFA(*cnfa)); /* not empty already */
3575  FREE(cnfa->stflags);
3576  FREE(cnfa->states);
3577  FREE(cnfa->arcs);
3578  ZAPCNFA(*cnfa);
3579 }
3580 
3581 /*
3582  * dumpnfa - dump an NFA in human-readable form
3583  */
3584 static void
3585 dumpnfa(struct nfa *nfa,
3586  FILE *f)
3587 {
3588 #ifdef REG_DEBUG
3589  struct state *s;
3590  int nstates = 0;
3591  int narcs = 0;
3592 
3593  fprintf(f, "pre %d, post %d", nfa->pre->no, nfa->post->no);
3594  if (nfa->bos[0] != COLORLESS)
3595  fprintf(f, ", bos [%ld]", (long) nfa->bos[0]);
3596  if (nfa->bos[1] != COLORLESS)
3597  fprintf(f, ", bol [%ld]", (long) nfa->bos[1]);
3598  if (nfa->eos[0] != COLORLESS)
3599  fprintf(f, ", eos [%ld]", (long) nfa->eos[0]);
3600  if (nfa->eos[1] != COLORLESS)
3601  fprintf(f, ", eol [%ld]", (long) nfa->eos[1]);
3602  if (nfa->flags & HASLACONS)
3603  fprintf(f, ", haslacons");
3604  if (nfa->flags & MATCHALL)
3605  fprintf(f, ", minmatchall %d, maxmatchall %d",
3606  nfa->minmatchall, nfa->maxmatchall);
3607  fprintf(f, "\n");
3608  for (s = nfa->states; s != NULL; s = s->next)
3609  {
3610  dumpstate(s, f);
3611  nstates++;
3612  narcs += s->nouts;
3613  }
3614  fprintf(f, "total of %d states, %d arcs\n", nstates, narcs);
3615  if (nfa->parent == NULL)
3616  dumpcolors(nfa->cm, f);
3617  fflush(f);
3618 #endif
3619 }
3620 
3621 #ifdef REG_DEBUG /* subordinates of dumpnfa */
3622 
3623 /*
3624  * dumpstate - dump an NFA state in human-readable form
3625  */
3626 static void
3627 dumpstate(struct state *s,
3628  FILE *f)
3629 {
3630  struct arc *a;
3631 
3632  fprintf(f, "%d%s%c", s->no, (s->tmp != NULL) ? "T" : "",
3633  (s->flag) ? s->flag : '.');
3634  if (s->prev != NULL && s->prev->next != s)
3635  fprintf(f, "\tstate chain bad\n");
3636  if (s->nouts == 0)
3637  fprintf(f, "\tno out arcs\n");
3638  else
3639  dumparcs(s, f);
3640  for (a = s->ins; a != NULL; a = a->inchain)
3641  {
3642  if (a->to != s)
3643  fprintf(f, "\tlink from %d to %d on %d's in-chain\n",
3644  a->from->no, a->to->no, s->no);
3645  }
3646  fflush(f);
3647 }
3648 
3649 /*
3650  * dumparcs - dump out-arcs in human-readable form
3651  */
3652 static void
3653 dumparcs(struct state *s,
3654  FILE *f)
3655 {
3656  int pos;
3657  struct arc *a;
3658 
3659  /* printing oldest arcs first is usually clearer */
3660  a = s->outs;
3661  assert(a != NULL);
3662  while (a->outchain != NULL)
3663  a = a->outchain;
3664  pos = 1;
3665  do
3666  {
3667  dumparc(a, s, f);
3668  if (pos == 5)
3669  {
3670  fprintf(f, "\n");
3671  pos = 1;
3672  }
3673  else
3674  pos++;
3675  a = a->outchainRev;
3676  } while (a != NULL);
3677  if (pos != 1)
3678  fprintf(f, "\n");
3679 }
3680 
3681 /*
3682  * dumparc - dump one outarc in readable form, including prefixing tab
3683  */
3684 static void
3685 dumparc(struct arc *a,
3686  struct state *s,
3687  FILE *f)
3688 {
3689  struct arc *aa;
3690 
3691  fprintf(f, "\t");
3692  switch (a->type)
3693  {
3694  case PLAIN:
3695  if (a->co == RAINBOW)
3696  fprintf(f, "[*]");
3697  else
3698  fprintf(f, "[%ld]", (long) a->co);
3699  break;
3700  case AHEAD:
3701  if (a->co == RAINBOW)
3702  fprintf(f, ">*>");
3703  else
3704  fprintf(f, ">%ld>", (long) a->co);
3705  break;
3706  case BEHIND:
3707  if (a->co == RAINBOW)
3708  fprintf(f, "<*<");
3709  else
3710  fprintf(f, "<%ld<", (long) a->co);
3711  break;
3712  case LACON:
3713  fprintf(f, ":%ld:", (long) a->co);
3714  break;
3715  case '^':
3716  case '$':
3717  fprintf(f, "%c%d", a->type, (int) a->co);
3718  break;
3719  case EMPTY:
3720  break;
3721  default:
3722  fprintf(f, "0x%x/0%lo", a->type, (long) a->co);
3723  break;
3724  }
3725  if (a->from != s)
3726  fprintf(f, "?%d?", a->from->no);
3727  for (aa = a->from->outs; aa != NULL; aa = aa->outchain)
3728  if (aa == a)
3729  break; /* NOTE BREAK OUT */
3730  if (aa == NULL)
3731  fprintf(f, "?!?"); /* missing from out-chain */
3732  fprintf(f, "->");
3733  if (a->to == NULL)
3734  {
3735  fprintf(f, "NULL");
3736  return;
3737  }
3738  fprintf(f, "%d", a->to->no);
3739  for (aa = a->to->ins; aa != NULL; aa = aa->inchain)
3740  if (aa == a)
3741  break; /* NOTE BREAK OUT */
3742  if (aa == NULL)
3743  fprintf(f, "?!?"); /* missing from in-chain */
3744 }
3745 #endif /* REG_DEBUG */
3746 
3747 /*
3748  * dumpcnfa - dump a compacted NFA in human-readable form
3749  */
3750 #ifdef REG_DEBUG
3751 static void
3752 dumpcnfa(struct cnfa *cnfa,
3753  FILE *f)
3754 {
3755  int st;
3756 
3757  fprintf(f, "pre %d, post %d", cnfa->pre, cnfa->post);
3758  if (cnfa->bos[0] != COLORLESS)
3759  fprintf(f, ", bos [%ld]", (long) cnfa->bos[0]);
3760  if (cnfa->bos[1] != COLORLESS)
3761  fprintf(f, ", bol [%ld]", (long) cnfa->bos[1]);
3762  if (cnfa->eos[0] != COLORLESS)
3763  fprintf(f, ", eos [%ld]", (long) cnfa->eos[0]);
3764  if (cnfa->eos[1] != COLORLESS)
3765  fprintf(f, ", eol [%ld]", (long) cnfa->eos[1]);
3766  if (cnfa->flags & HASLACONS)
3767  fprintf(f, ", haslacons");
3768  if (cnfa->flags & MATCHALL)
3769  fprintf(f, ", minmatchall %d, maxmatchall %d",
3770  cnfa->minmatchall, cnfa->maxmatchall);
3771  fprintf(f, "\n");
3772  for (st = 0; st < cnfa->nstates; st++)
3773  dumpcstate(st, cnfa, f);
3774  fflush(f);
3775 }
3776 #endif
3777 
3778 #ifdef REG_DEBUG /* subordinates of dumpcnfa */
3779 
3780 /*
3781  * dumpcstate - dump a compacted-NFA state in human-readable form
3782  */
3783 static void
3784 dumpcstate(int st,
3785  struct cnfa *cnfa,
3786  FILE *f)
3787 {
3788  struct carc *ca;
3789  int pos;
3790 
3791  fprintf(f, "%d%s", st, (cnfa->stflags[st] & CNFA_NOPROGRESS) ? ":" : ".");
3792  pos = 1;
3793  for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
3794  {
3795  if (ca->co == RAINBOW)
3796  fprintf(f, "\t[*]->%d", ca->to);
3797  else if (ca->co < cnfa->ncolors)
3798  fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to);
3799  else
3800  fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to);
3801  if (pos == 5)
3802  {
3803  fprintf(f, "\n");
3804  pos = 1;
3805  }
3806  else
3807  pos++;
3808  }
3809  if (ca == cnfa->states[st] || pos != 1)
3810  fprintf(f, "\n");
3811  fflush(f);
3812 }
3813 
3814 #endif /* REG_DEBUG */
size_t lastabused
Definition: regguts.h:357
#define MAXSBSIZE
Definition: regguts.h:341
#define REG_UIMPOSSIBLE
Definition: regex.h:74
#define NISERR()
Definition: regc_nfa.c:39
int no
Definition: regguts.h:319
static void fixconstraintloops(struct nfa *nfa, FILE *f)
Definition: regc_nfa.c:2334
struct state * from
Definition: regguts.h:294
static void markreachable(struct nfa *nfa, struct state *s, struct state *okay, struct state *mark)
Definition: regc_nfa.c:2935
size_t lastsbused
Definition: regguts.h:356
int type
Definition: regguts.h:292
static void moveouts(struct nfa *nfa, struct state *oldState, struct state *newState)
Definition: regc_nfa.c:1055
#define NERR(e)
Definition: regc_nfa.c:40
struct state s[FLEXIBLE_ARRAY_MEMBER]
Definition: regguts.h:336
static long optimize(struct nfa *nfa, FILE *f)
Definition: regc_nfa.c:1566
#define ERR
Definition: _int.h:161
int maxmatchall
Definition: regguts.h:419
#define FREESTATE
Definition: regguts.h:320
static void fixempties(struct nfa *nfa, FILE *f)
Definition: regc_nfa.c:2040
struct state * final
Definition: regguts.h:347
static bool check_in_colors_match(struct state *s, color co1, color co2)
Definition: regc_nfa.c:3409
static struct nfa * newnfa(struct vars *v, struct colormap *cm, struct nfa *parent)
Definition: regc_nfa.c:47
#define CNFA_NOPROGRESS
Definition: regguts.h:413
static int push(struct nfa *nfa, struct arc *con, struct state **intermediates)
Definition: regc_nfa.c:1857
static void pullback(struct nfa *nfa, FILE *f)
Definition: regc_nfa.c:1606
#define REG_ESPACE
Definition: regex.h:151
static int carc_cmp(const void *a, const void *b)
Definition: regc_nfa.c:3552
Definition: regguts.h:290
static void changearctarget(struct arc *a, struct state *newto)
Definition: regc_nfa.c:541
static void removetraverse(struct nfa *nfa, struct state *s)
Definition: regc_nfa.c:1411
struct state * states
Definition: regguts.h:350
static void changearcsource(struct arc *a, struct state *newfrom)
Definition: regc_nfa.c:497
#define RAINBOW
Definition: regguts.h:154
#define NULLCNFA(cnfa)
Definition: regguts.h:431
Definition: regguts.h:401
short color
Definition: regguts.h:150
int nins
Definition: regguts.h:322
static void deltraverse(struct nfa *nfa, struct state *leftend, struct state *s)
Definition: regc_nfa.c:1277
int pre
Definition: regguts.h:408
static void copyins(struct nfa *nfa, struct state *oldState, struct state *newState)
Definition: regc_nfa.c:874
color eos[2]
Definition: regguts.h:360
#define CA(ct, at)
int flags
Definition: regguts.h:361
struct state * post
Definition: regguts.h:348
#define FREE(ptr)
Definition: cryptohash.c:37
Definition: regguts.h:343
#define ARCBATCHSIZE(n)
Definition: regguts.h:312
static struct state * emptyreachable(struct nfa *nfa, struct state *s, struct state *lastfound, struct arc **inarcsorig)
Definition: regc_nfa.c:2267
#define COLORED(a)
Definition: regcomp.c:307
static void dumpnfa(struct nfa *nfa, FILE *f)
Definition: regc_nfa.c:3585
#define ISERR()
Definition: regcomp.c:273
#define fprintf
Definition: port.h:220
#define BEHIND
Definition: regcomp.c:299
struct carc ** states
Definition: regguts.h:414
struct colordesc * cd
Definition: regguts.h:231
static struct arc * allocarc(struct nfa *nfa)
Definition: regc_nfa.c:376
#define NOTREACHED
Definition: regguts.h:91
static void mergeins(struct nfa *nfa, struct state *s, struct arc **arcarray, int arccount)
Definition: regc_nfa.c:956
size_t narcs
Definition: regguts.h:309
#define MALLOC(n)
Definition: regcustom.h:60
static void freecnfa(struct cnfa *cnfa)
Definition: regc_nfa.c:3572
int maxmatchall
Definition: regguts.h:363
int nstates
Definition: regguts.h:403
#define REG_MAX_COMPILE_SPACE
Definition: regguts.h:444
static struct state * newfstate(struct nfa *nfa, int flag)
Definition: regc_nfa.c:216
struct vars * v
Definition: regguts.h:364
struct arcbatch * next
Definition: regguts.h:308
static const FormData_pg_attribute a2
Definition: heap.c:166
static color pseudocolor(struct colormap *cm)
Definition: regc_color.c:312
#define BULK_ARC_OP_USE_SORT(nsrcarcs, ndestarcs)
Definition: regc_nfa.c:766
#define ZAPCNFA(cnfa)
Definition: regguts.h:429
static void pushfwd(struct nfa *nfa, FILE *f)
Definition: regc_nfa.c:1777
static void markcanreach(struct nfa *nfa, struct state *s, struct state *okay, struct state *mark)
Definition: regc_nfa.c:2961
#define SATISFIED
Definition: regcomp.c:165
char * s1
#define LACON
Definition: regcomp.c:297
struct arc * inchainRev
Definition: regguts.h:300
static void copyouts(struct nfa *nfa, struct state *oldState, struct state *newState)
Definition: regc_nfa.c:1147
regex_t * re
Definition: regcomp.c:239
#define REPLACEARC
Definition: regcomp.c:167
static void breakconstraintloop(struct nfa *nfa, struct state *sinitial)
Definition: regc_nfa.c:2522
static void cloneouts(struct nfa *nfa, struct state *old, struct state *from, struct state *to, int type)
Definition: regc_nfa.c:1229
static void dupnfa(struct nfa *nfa, struct state *start, struct state *stop, struct state *from, struct state *to)
Definition: regc_nfa.c:1328
static int hasnonemptyout(struct state *s)
Definition: regc_nfa.c:583
static void colorchain(struct colormap *cm, struct arc *a)
Definition: regc_color.c:984
struct arc * outchain
Definition: regguts.h:296
char * flag(int b)
Definition: test-ctype.c:33
struct statebatch * next
Definition: regguts.h:334
color bos[2]
Definition: regguts.h:359
static int isconstraintarc(struct arc *a)
Definition: regc_nfa.c:2295
#define assert(TEST)
Definition: imath.c:73
struct colormap * cm
Definition: regguts.h:358
struct state * next
Definition: regguts.h:327
static struct state * single_color_transition(struct state *s1, struct state *s2)
Definition: regc_nfa.c:1497
int nstates
Definition: regguts.h:349
char * stflags
Definition: regguts.h:412
struct arc * outs
Definition: regguts.h:325
static int hasconstraintout(struct state *s)
Definition: regc_nfa.c:2313
static void clonesuccessorstates(struct nfa *nfa, struct state *ssource, struct state *sclone, struct state *spredecessor, struct arc *refarc, char *curdonemap, char *outerdonemap, int nstates)
Definition: regc_nfa.c:2668
static int sortins_cmp(const void *a, const void *b)
Definition: regc_nfa.c:670
#define FIRSTSBSIZE
Definition: regguts.h:340
static void freearc(struct nfa *nfa, struct arc *victim)
Definition: regc_nfa.c:426
static void freenfa(struct nfa *nfa)
Definition: regc_nfa.c:107
static void specialcolors(struct nfa *nfa)
Definition: regc_nfa.c:1527
char flag
Definition: regguts.h:321
static void cleartraverse(struct nfa *nfa, struct state *s)
Definition: regc_nfa.c:1460
#define MATCHALL
Definition: regguts.h:407
static void rainbow(struct nfa *nfa, struct colormap *cm, int type, color but, struct state *from, struct state *to)
Definition: regc_color.c:1031
struct state * tmp
Definition: regguts.h:326
struct state * freestates
Definition: regguts.h:352
static void moveins(struct nfa *nfa, struct state *oldState, struct state *newState)
Definition: regc_nfa.c:782
static int verbose
int minmatchall
Definition: regguts.h:362
int progress
Definition: pgbench.c:270
color bos[2]
Definition: regguts.h:410
static struct arc * findarc(struct state *s, int type, color co)
Definition: regc_nfa.c:600
static int findconstraintloop(struct nfa *nfa, struct state *s)
Definition: regc_nfa.c:2433
struct state * to
Definition: regguts.h:295
char * s2
#define COMPATIBLE
Definition: regcomp.c:166
static struct state * newstate(struct nfa *nfa)
Definition: regc_nfa.c:137
#define PLAIN
Definition: regcomp.c:287
static void delsub(struct nfa *nfa, struct state *lp, struct state *rp)
Definition: regc_nfa.c:1254
static void createarc(struct nfa *nfa, int t, color co, struct state *from, struct state *to)
Definition: regc_nfa.c:331
static bool checkmatchall_recurse(struct nfa *nfa, struct state *s, bool **haspaths)
Definition: regc_nfa.c:3213
struct arcbatch * lastab
Definition: regguts.h:355
#define REG_UEMPTYMATCH
Definition: regex.h:73
int to
Definition: regguts.h:398
color co
Definition: regguts.h:293
#define DUPINF
Definition: regguts.h:94
#define REG_ASSERT
Definition: regex.h:153
Definition: regguts.h:317
int flags
Definition: regguts.h:179
static int pull(struct nfa *nfa, struct arc *con, struct state **intermediates)
Definition: regc_nfa.c:1686
static void cleanup(struct nfa *nfa)
Definition: regc_nfa.c:2900
#define FIRSTABSIZE
Definition: regguts.h:314
int flags
Definition: regguts.h:405
static void newarc(struct nfa *nfa, int t, color co, struct state *from, struct state *to)
Definition: regc_nfa.c:285
struct carc * arcs
Definition: regguts.h:416
struct arc a[FLEXIBLE_ARRAY_MEMBER]
Definition: regguts.h:310
static void sortouts(struct nfa *nfa, struct state *s)
Definition: regc_nfa.c:695
#define REG_ETOOBIG
Definition: regex.h:157
size_t nstates
Definition: regguts.h:335
int minmatchall
Definition: regguts.h:418
#define EMPTY
Definition: regcomp.c:285
static bool check_out_colors_match(struct state *s, color co1, color co2)
Definition: regc_nfa.c:3355
struct state * prev
Definition: regguts.h:329
struct arc * freearcs
Definition: regguts.h:353
static int sortouts_cmp(const void *a, const void *b)
Definition: regc_nfa.c:737
static void dropstate(struct nfa *nfa, struct state *s)
Definition: regc_nfa.c:230
int post
Definition: regguts.h:409
struct state * slast
Definition: regguts.h:351
int nouts
Definition: regguts.h:323
static void compact(struct nfa *nfa, struct cnfa *cnfa)
Definition: regc_nfa.c:3454
#define PSEUDO
Definition: regguts.h:181
static int combine(struct nfa *nfa, struct arc *con, struct arc *a)
Definition: regc_nfa.c:1953
static void uncolorchain(struct colormap *cm, struct arc *a)
Definition: regc_color.c:1001
color co
Definition: regguts.h:397
static void duptraverse(struct nfa *nfa, struct state *s, struct state *stmp)
Definition: regc_nfa.c:1352
struct nfa * parent
Definition: regguts.h:365
struct state * init
Definition: regguts.h:346
static void cparc(struct nfa *nfa, struct arc *oa, struct state *from, struct state *to)
Definition: regc_nfa.c:616
struct arc * inchain
Definition: regguts.h:299
static color maxcolor(struct colormap *cm)
Definition: regc_color.c:172
static void carcsort(struct carc *first, size_t n)
Definition: regc_nfa.c:3545
#define CANCEL_REQUESTED(re)
Definition: regguts.h:516
int i
#define COLORLESS
Definition: regguts.h:153
#define STATEBATCHSIZE(n)
Definition: regguts.h:338
#define MAXABSIZE
Definition: regguts.h:315
#define INCOMPATIBLE
Definition: regcomp.c:164
#define qsort(a, b, c, d)
Definition: port.h:504
size_t spaceused
Definition: regcomp.c:265
struct arc * outchainRev
Definition: regguts.h:297
Definition: regcomp.c:237
static void freestate(struct nfa *nfa, struct state *s)
Definition: regc_nfa.c:246
#define HASLACONS
Definition: regguts.h:406
static long analyze(struct nfa *nfa)
Definition: regc_nfa.c:2987
#define STACK_TOO_DEEP(re)
Definition: regguts.h:519
#define AHEAD
Definition: regcomp.c:298
struct arc * ins
Definition: regguts.h:324
struct state * pre
Definition: regguts.h:345
static void removeconstraints(struct nfa *nfa, struct state *start, struct state *stop)
Definition: regc_nfa.c:1392
static void checkmatchall(struct nfa *nfa)
Definition: regc_nfa.c:3033
color eos[2]
Definition: regguts.h:411
int ncolors
Definition: regguts.h:404
unsigned char bool
Definition: c.h:391
#define REG_CANCEL
Definition: regex.h:159
struct statebatch * lastsb
Definition: regguts.h:354
static void sortins(struct nfa *nfa, struct state *s)
Definition: regc_nfa.c:628
Definition: regguts.h:395