PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
regc_lex.c
Go to the documentation of this file.
1 /*
2  * lexical analyzer
3  * This file is #included by regcomp.c.
4  *
5  * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6  *
7  * Development of this software was funded, in part, by Cray Research Inc.,
8  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9  * Corporation, none of whom are responsible for the results. The author
10  * thanks all of them.
11  *
12  * Redistribution and use in source and binary forms -- with or without
13  * modification -- are permitted for any purpose, provided that
14  * redistributions in source form retain this entire copyright notice and
15  * indicate the origin and nature of any modifications.
16  *
17  * I'd appreciate being given credit for this package in the documentation
18  * of software which uses it, but that is not a requirement.
19  *
20  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  * src/backend/regex/regc_lex.c
32  *
33  */
34 
/*
 * Scanning macros.
 *
 * Every macro here expands to code that refers to a variable named "v",
 * which must be a pointer to the lexer state (it is accessed as v->now,
 * v->stop, v->nexttype, v->nextvalue, v->lasttype).
 */
#define ATEOS()         (v->now >= v->stop)         /* input exhausted? */
#define HAVE(n)         (v->stop - v->now >= (n))   /* at least n chrs left? */
/* do the next 1/2/3 chrs match the given character constants? */
#define NEXT1(c)        (!ATEOS() && *v->now == CHR(c))
#define NEXT2(a,b)      (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))
#define NEXT3(a,b,c)    (HAVE(3) && *v->now == CHR(a) && \
                         *(v->now+1) == CHR(b) && \
                         *(v->now+2) == CHR(c))
/* record the token type (and optionally its value) */
#define SET(c)          (v->nexttype = (c))
#define SETV(c, n)      (v->nexttype = (c), v->nextvalue = (n))
/* record the token and return 1 from the enclosing function */
#define RET(c)          return (SET(c), 1)
#define RETV(c, n)      return (SETV(c, n), 1)
/* report error e and return 0 from the enclosing function */
#define FAILW(e)        return (ERR(e), 0)          /* ERR does SET(EOS) */
#define LASTTYPE(t)     (v->lasttype == (t))

/* lexical contexts (values stored in v->lexcon) */
#define L_ERE   1               /* mainline ERE/ARE */
#define L_BRE   2               /* mainline BRE */
#define L_Q     3               /* REG_QUOTE */
#define L_EBND  4               /* ERE/ARE bound */
#define L_BBND  5               /* BRE bound */
#define L_BRACK 6               /* brackets */
#define L_CEL   7               /* collating element */
#define L_ECL   8               /* equivalence class */
#define L_CCL   9               /* character class */
#define INTOCON(c)      (v->lexcon = (c))           /* switch context */
#define INCON(con)      (v->lexcon == (con))        /* test context */

/*
 * Construct pointer just past the end of an array.
 *
 * The element size is taken from the array itself rather than being
 * hard-wired to sizeof(chr), so the result is correct for an array of any
 * element type.  The argument must be a real array, not a pointer.
 */
#define ENDOF(array)    ((array) + sizeof(array) / sizeof((array)[0]))
65 
66 /*
67  * lexstart - set up lexical stuff, scan leading options
68  */
69 static void
70 lexstart(struct vars * v)
71 {
72  prefixes(v); /* may turn on new type bits etc. */
73  NOERR();
74 
75  if (v->cflags & REG_QUOTE)
76  {
78  INTOCON(L_Q);
79  }
80  else if (v->cflags & REG_EXTENDED)
81  {
82  assert(!(v->cflags & REG_QUOTE));
83  INTOCON(L_ERE);
84  }
85  else
86  {
87  assert(!(v->cflags & (REG_QUOTE | REG_ADVF)));
88  INTOCON(L_BRE);
89  }
90 
91  v->nexttype = EMPTY; /* remember we were at the start */
92  next(v); /* set up the first token */
93 }
94 
95 /*
96  * prefixes - implement various special prefixes
97  */
98 static void
99 prefixes(struct vars * v)
100 {
101  /* literal string doesn't get any of this stuff */
102  if (v->cflags & REG_QUOTE)
103  return;
104 
105  /* initial "***" gets special things */
106  if (HAVE(4) && NEXT3('*', '*', '*'))
107  switch (*(v->now + 3))
108  {
109  case CHR('?'): /* "***?" error, msg shows version */
110  ERR(REG_BADPAT);
111  return; /* proceed no further */
112  break;
113  case CHR('='): /* "***=" shifts to literal string */
115  v->cflags |= REG_QUOTE;
117  v->now += 4;
118  return; /* and there can be no more prefixes */
119  break;
120  case CHR(':'): /* "***:" shifts to AREs */
122  v->cflags |= REG_ADVANCED;
123  v->now += 4;
124  break;
125  default: /* otherwise *** is just an error */
126  ERR(REG_BADRPT);
127  return;
128  break;
129  }
130 
131  /* BREs and EREs don't get embedded options */
132  if ((v->cflags & REG_ADVANCED) != REG_ADVANCED)
133  return;
134 
135  /* embedded options (AREs only) */
136  if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2)))
137  {
139  v->now += 2;
140  for (; !ATEOS() && iscalpha(*v->now); v->now++)
141  switch (*v->now)
142  {
143  case CHR('b'): /* BREs (but why???) */
144  v->cflags &= ~(REG_ADVANCED | REG_QUOTE);
145  break;
146  case CHR('c'): /* case sensitive */
147  v->cflags &= ~REG_ICASE;
148  break;
149  case CHR('e'): /* plain EREs */
150  v->cflags |= REG_EXTENDED;
151  v->cflags &= ~(REG_ADVF | REG_QUOTE);
152  break;
153  case CHR('i'): /* case insensitive */
154  v->cflags |= REG_ICASE;
155  break;
156  case CHR('m'): /* Perloid synonym for n */
157  case CHR('n'): /* \n affects ^ $ . [^ */
158  v->cflags |= REG_NEWLINE;
159  break;
160  case CHR('p'): /* ~Perl, \n affects . [^ */
161  v->cflags |= REG_NLSTOP;
162  v->cflags &= ~REG_NLANCH;
163  break;
164  case CHR('q'): /* literal string */
165  v->cflags |= REG_QUOTE;
166  v->cflags &= ~REG_ADVANCED;
167  break;
168  case CHR('s'): /* single line, \n ordinary */
169  v->cflags &= ~REG_NEWLINE;
170  break;
171  case CHR('t'): /* tight syntax */
172  v->cflags &= ~REG_EXPANDED;
173  break;
174  case CHR('w'): /* weird, \n affects ^ $ only */
175  v->cflags &= ~REG_NLSTOP;
176  v->cflags |= REG_NLANCH;
177  break;
178  case CHR('x'): /* expanded syntax */
179  v->cflags |= REG_EXPANDED;
180  break;
181  default:
182  ERR(REG_BADOPT);
183  return;
184  }
185  if (!NEXT1(')'))
186  {
187  ERR(REG_BADOPT);
188  return;
189  }
190  v->now++;
191  if (v->cflags & REG_QUOTE)
192  v->cflags &= ~(REG_EXPANDED | REG_NEWLINE);
193  }
194 }
195 
196 /*
197  * lexnest - "call a subroutine", interpolating string at the lexical level
198  *
199  * Note, this is not a very general facility. There are a number of
200  * implicit assumptions about what sorts of strings can be subroutines.
201  */
202 static void
203 lexnest(struct vars * v,
204  const chr *beginp, /* start of interpolation */
205  const chr *endp) /* one past end of interpolation */
206 {
207  assert(v->savenow == NULL); /* only one level of nesting */
208  v->savenow = v->now;
209  v->savestop = v->stop;
210  v->now = beginp;
211  v->stop = endp;
212 }
213 
214 /*
215  * string constants to interpolate as expansions of things like \d
216  */
217 static const chr backd[] = { /* \d */
218  CHR('['), CHR('['), CHR(':'),
219  CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
220  CHR(':'), CHR(']'), CHR(']')
221 };
222 static const chr backD[] = { /* \D */
223  CHR('['), CHR('^'), CHR('['), CHR(':'),
224  CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
225  CHR(':'), CHR(']'), CHR(']')
226 };
227 static const chr brbackd[] = { /* \d within brackets */
228  CHR('['), CHR(':'),
229  CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
230  CHR(':'), CHR(']')
231 };
232 static const chr backs[] = { /* \s */
233  CHR('['), CHR('['), CHR(':'),
234  CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
235  CHR(':'), CHR(']'), CHR(']')
236 };
237 static const chr backS[] = { /* \S */
238  CHR('['), CHR('^'), CHR('['), CHR(':'),
239  CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
240  CHR(':'), CHR(']'), CHR(']')
241 };
242 static const chr brbacks[] = { /* \s within brackets */
243  CHR('['), CHR(':'),
244  CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
245  CHR(':'), CHR(']')
246 };
247 static const chr backw[] = { /* \w */
248  CHR('['), CHR('['), CHR(':'),
249  CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
250  CHR(':'), CHR(']'), CHR('_'), CHR(']')
251 };
252 static const chr backW[] = { /* \W */
253  CHR('['), CHR('^'), CHR('['), CHR(':'),
254  CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
255  CHR(':'), CHR(']'), CHR('_'), CHR(']')
256 };
257 static const chr brbackw[] = { /* \w within brackets */
258  CHR('['), CHR(':'),
259  CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
260  CHR(':'), CHR(']'), CHR('_')
261 };
262 
263 /*
264  * lexword - interpolate a bracket expression for word characters
265  * Possibly ought to inquire whether there is a "word" character class.
266  */
267 static void
268 lexword(struct vars * v)
269 {
270  lexnest(v, backw, ENDOF(backw));
271 }
272 
273 /*
274  * next - get next token
275  */
276 static int /* 1 normal, 0 failure */
277 next(struct vars * v)
278 {
279  chr c;
280 
281  /* errors yield an infinite sequence of failures */
282  if (ISERR())
283  return 0; /* the error has set nexttype to EOS */
284 
285  /* remember flavor of last token */
286  v->lasttype = v->nexttype;
287 
288  /* REG_BOSONLY */
289  if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY))
290  {
291  /* at start of a REG_BOSONLY RE */
292  RETV(SBEGIN, 0); /* same as \A */
293  }
294 
295  /* if we're nested and we've hit end, return to outer level */
296  if (v->savenow != NULL && ATEOS())
297  {
298  v->now = v->savenow;
299  v->stop = v->savestop;
300  v->savenow = v->savestop = NULL;
301  }
302 
303  /* skip white space etc. if appropriate (not in literal or []) */
304  if (v->cflags & REG_EXPANDED)
305  switch (v->lexcon)
306  {
307  case L_ERE:
308  case L_BRE:
309  case L_EBND:
310  case L_BBND:
311  skip(v);
312  break;
313  }
314 
315  /* handle EOS, depending on context */
316  if (ATEOS())
317  {
318  switch (v->lexcon)
319  {
320  case L_ERE:
321  case L_BRE:
322  case L_Q:
323  RET(EOS);
324  break;
325  case L_EBND:
326  case L_BBND:
327  FAILW(REG_EBRACE);
328  break;
329  case L_BRACK:
330  case L_CEL:
331  case L_ECL:
332  case L_CCL:
333  FAILW(REG_EBRACK);
334  break;
335  }
337  }
338 
339  /* okay, time to actually get a character */
340  c = *v->now++;
341 
342  /* deal with the easy contexts, punt EREs to code below */
343  switch (v->lexcon)
344  {
345  case L_BRE: /* punt BREs to separate function */
346  return brenext(v, c);
347  break;
348  case L_ERE: /* see below */
349  break;
350  case L_Q: /* literal strings are easy */
351  RETV(PLAIN, c);
352  break;
353  case L_BBND: /* bounds are fairly simple */
354  case L_EBND:
355  switch (c)
356  {
357  case CHR('0'):
358  case CHR('1'):
359  case CHR('2'):
360  case CHR('3'):
361  case CHR('4'):
362  case CHR('5'):
363  case CHR('6'):
364  case CHR('7'):
365  case CHR('8'):
366  case CHR('9'):
367  RETV(DIGIT, (chr) DIGITVAL(c));
368  break;
369  case CHR(','):
370  RET(',');
371  break;
372  case CHR('}'): /* ERE bound ends with } */
373  if (INCON(L_EBND))
374  {
375  INTOCON(L_ERE);
376  if ((v->cflags & REG_ADVF) && NEXT1('?'))
377  {
378  v->now++;
380  RETV('}', 0);
381  }
382  RETV('}', 1);
383  }
384  else
385  FAILW(REG_BADBR);
386  break;
387  case CHR('\\'): /* BRE bound ends with \} */
388  if (INCON(L_BBND) && NEXT1('}'))
389  {
390  v->now++;
391  INTOCON(L_BRE);
392  RET('}');
393  }
394  else
395  FAILW(REG_BADBR);
396  break;
397  default:
398  FAILW(REG_BADBR);
399  break;
400  }
402  break;
403  case L_BRACK: /* brackets are not too hard */
404  switch (c)
405  {
406  case CHR(']'):
407  if (LASTTYPE('['))
408  RETV(PLAIN, c);
409  else
410  {
411  INTOCON((v->cflags & REG_EXTENDED) ?
412  L_ERE : L_BRE);
413  RET(']');
414  }
415  break;
416  case CHR('\\'):
417  NOTE(REG_UBBS);
418  if (!(v->cflags & REG_ADVF))
419  RETV(PLAIN, c);
421  if (ATEOS())
423  (DISCARD) lexescape(v);
424  switch (v->nexttype)
425  { /* not all escapes okay here */
426  case PLAIN:
427  return 1;
428  break;
429  case CCLASS:
430  switch (v->nextvalue)
431  {
432  case 'd':
434  break;
435  case 's':
437  break;
438  case 'w':
440  break;
441  default:
443  break;
444  }
445  /* lexnest done, back up and try again */
446  v->nexttype = v->lasttype;
447  return next(v);
448  break;
449  }
450  /* not one of the acceptable escapes */
452  break;
453  case CHR('-'):
454  if (LASTTYPE('[') || NEXT1(']'))
455  RETV(PLAIN, c);
456  else
457  RETV(RANGE, c);
458  break;
459  case CHR('['):
460  if (ATEOS())
461  FAILW(REG_EBRACK);
462  switch (*v->now++)
463  {
464  case CHR('.'):
465  INTOCON(L_CEL);
466  /* might or might not be locale-specific */
467  RET(COLLEL);
468  break;
469  case CHR('='):
470  INTOCON(L_ECL);
471  NOTE(REG_ULOCALE);
472  RET(ECLASS);
473  break;
474  case CHR(':'):
475  INTOCON(L_CCL);
476  NOTE(REG_ULOCALE);
477  RET(CCLASS);
478  break;
479  default: /* oops */
480  v->now--;
481  RETV(PLAIN, c);
482  break;
483  }
485  break;
486  default:
487  RETV(PLAIN, c);
488  break;
489  }
491  break;
492  case L_CEL: /* collating elements are easy */
493  if (c == CHR('.') && NEXT1(']'))
494  {
495  v->now++;
496  INTOCON(L_BRACK);
497  RETV(END, '.');
498  }
499  else
500  RETV(PLAIN, c);
501  break;
502  case L_ECL: /* ditto equivalence classes */
503  if (c == CHR('=') && NEXT1(']'))
504  {
505  v->now++;
506  INTOCON(L_BRACK);
507  RETV(END, '=');
508  }
509  else
510  RETV(PLAIN, c);
511  break;
512  case L_CCL: /* ditto character classes */
513  if (c == CHR(':') && NEXT1(']'))
514  {
515  v->now++;
516  INTOCON(L_BRACK);
517  RETV(END, ':');
518  }
519  else
520  RETV(PLAIN, c);
521  break;
522  default:
524  break;
525  }
526 
527  /* that got rid of everything except EREs and AREs */
528  assert(INCON(L_ERE));
529 
530  /* deal with EREs and AREs, except for backslashes */
531  switch (c)
532  {
533  case CHR('|'):
534  RET('|');
535  break;
536  case CHR('*'):
537  if ((v->cflags & REG_ADVF) && NEXT1('?'))
538  {
539  v->now++;
541  RETV('*', 0);
542  }
543  RETV('*', 1);
544  break;
545  case CHR('+'):
546  if ((v->cflags & REG_ADVF) && NEXT1('?'))
547  {
548  v->now++;
550  RETV('+', 0);
551  }
552  RETV('+', 1);
553  break;
554  case CHR('?'):
555  if ((v->cflags & REG_ADVF) && NEXT1('?'))
556  {
557  v->now++;
559  RETV('?', 0);
560  }
561  RETV('?', 1);
562  break;
563  case CHR('{'): /* bounds start or plain character */
564  if (v->cflags & REG_EXPANDED)
565  skip(v);
566  if (ATEOS() || !iscdigit(*v->now))
567  {
568  NOTE(REG_UBRACES);
569  NOTE(REG_UUNSPEC);
570  RETV(PLAIN, c);
571  }
572  else
573  {
574  NOTE(REG_UBOUNDS);
575  INTOCON(L_EBND);
576  RET('{');
577  }
579  break;
580  case CHR('('): /* parenthesis, or advanced extension */
581  if ((v->cflags & REG_ADVF) && NEXT1('?'))
582  {
584  v->now++;
585  if (ATEOS())
586  FAILW(REG_BADRPT);
587  switch (*v->now++)
588  {
589  case CHR(':'): /* non-capturing paren */
590  RETV('(', 0);
591  break;
592  case CHR('#'): /* comment */
593  while (!ATEOS() && *v->now != CHR(')'))
594  v->now++;
595  if (!ATEOS())
596  v->now++;
597  assert(v->nexttype == v->lasttype);
598  return next(v);
599  break;
600  case CHR('='): /* positive lookahead */
603  break;
604  case CHR('!'): /* negative lookahead */
607  break;
608  case CHR('<'):
609  if (ATEOS())
610  FAILW(REG_BADRPT);
611  switch (*v->now++)
612  {
613  case CHR('='): /* positive lookbehind */
616  break;
617  case CHR('!'): /* negative lookbehind */
620  break;
621  default:
622  FAILW(REG_BADRPT);
623  break;
624  }
626  break;
627  default:
628  FAILW(REG_BADRPT);
629  break;
630  }
632  }
633  if (v->cflags & REG_NOSUB)
634  RETV('(', 0); /* all parens non-capturing */
635  else
636  RETV('(', 1);
637  break;
638  case CHR(')'):
639  if (LASTTYPE('('))
640  NOTE(REG_UUNSPEC);
641  RETV(')', c);
642  break;
643  case CHR('['): /* easy except for [[:<:]] and [[:>:]] */
644  if (HAVE(6) && *(v->now + 0) == CHR('[') &&
645  *(v->now + 1) == CHR(':') &&
646  (*(v->now + 2) == CHR('<') ||
647  *(v->now + 2) == CHR('>')) &&
648  *(v->now + 3) == CHR(':') &&
649  *(v->now + 4) == CHR(']') &&
650  *(v->now + 5) == CHR(']'))
651  {
652  c = *(v->now + 2);
653  v->now += 6;
655  RET((c == CHR('<')) ? '<' : '>');
656  }
657  INTOCON(L_BRACK);
658  if (NEXT1('^'))
659  {
660  v->now++;
661  RETV('[', 0);
662  }
663  RETV('[', 1);
664  break;
665  case CHR('.'):
666  RET('.');
667  break;
668  case CHR('^'):
669  RET('^');
670  break;
671  case CHR('$'):
672  RET('$');
673  break;
674  case CHR('\\'): /* mostly punt backslashes to code below */
675  if (ATEOS())
677  break;
678  default: /* ordinary character */
679  RETV(PLAIN, c);
680  break;
681  }
682 
683  /* ERE/ARE backslash handling; backslash already eaten */
684  assert(!ATEOS());
685  if (!(v->cflags & REG_ADVF))
686  { /* only AREs have non-trivial escapes */
687  if (iscalnum(*v->now))
688  {
690  NOTE(REG_UUNSPEC);
691  }
692  RETV(PLAIN, *v->now++);
693  }
694  (DISCARD) lexescape(v);
695  if (ISERR())
697  if (v->nexttype == CCLASS)
698  { /* fudge at lexical level */
699  switch (v->nextvalue)
700  {
701  case 'd':
702  lexnest(v, backd, ENDOF(backd));
703  break;
704  case 'D':
705  lexnest(v, backD, ENDOF(backD));
706  break;
707  case 's':
708  lexnest(v, backs, ENDOF(backs));
709  break;
710  case 'S':
711  lexnest(v, backS, ENDOF(backS));
712  break;
713  case 'w':
714  lexnest(v, backw, ENDOF(backw));
715  break;
716  case 'W':
717  lexnest(v, backW, ENDOF(backW));
718  break;
719  default:
721  FAILW(REG_ASSERT);
722  break;
723  }
724  /* lexnest done, back up and try again */
725  v->nexttype = v->lasttype;
726  return next(v);
727  }
728  /* otherwise, lexescape has already done the work */
729  return !ISERR();
730 }
731 
732 /*
733  * lexescape - parse an ARE backslash escape (backslash already eaten)
734  * Note slightly nonstandard use of the CCLASS type code.
735  */
736 static int /* not actually used, but convenient for RETV */
737 lexescape(struct vars * v)
738 {
739  chr c;
740  static const chr alert[] = {
741  CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
742  };
743  static const chr esc[] = {
744  CHR('E'), CHR('S'), CHR('C')
745  };
746  const chr *save;
747 
748  assert(v->cflags & REG_ADVF);
749 
750  assert(!ATEOS());
751  c = *v->now++;
752  if (!iscalnum(c))
753  RETV(PLAIN, c);
754 
756  switch (c)
757  {
758  case CHR('a'):
759  RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007')));
760  break;
761  case CHR('A'):
762  RETV(SBEGIN, 0);
763  break;
764  case CHR('b'):
765  RETV(PLAIN, CHR('\b'));
766  break;
767  case CHR('B'):
768  RETV(PLAIN, CHR('\\'));
769  break;
770  case CHR('c'):
771  NOTE(REG_UUNPORT);
772  if (ATEOS())
774  RETV(PLAIN, (chr) (*v->now++ & 037));
775  break;
776  case CHR('d'):
777  NOTE(REG_ULOCALE);
778  RETV(CCLASS, 'd');
779  break;
780  case CHR('D'):
781  NOTE(REG_ULOCALE);
782  RETV(CCLASS, 'D');
783  break;
784  case CHR('e'):
785  NOTE(REG_UUNPORT);
786  RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033')));
787  break;
788  case CHR('f'):
789  RETV(PLAIN, CHR('\f'));
790  break;
791  case CHR('m'):
792  RET('<');
793  break;
794  case CHR('M'):
795  RET('>');
796  break;
797  case CHR('n'):
798  RETV(PLAIN, CHR('\n'));
799  break;
800  case CHR('r'):
801  RETV(PLAIN, CHR('\r'));
802  break;
803  case CHR('s'):
804  NOTE(REG_ULOCALE);
805  RETV(CCLASS, 's');
806  break;
807  case CHR('S'):
808  NOTE(REG_ULOCALE);
809  RETV(CCLASS, 'S');
810  break;
811  case CHR('t'):
812  RETV(PLAIN, CHR('\t'));
813  break;
814  case CHR('u'):
815  c = lexdigits(v, 16, 4, 4);
816  if (ISERR() || !CHR_IS_IN_RANGE(c))
818  RETV(PLAIN, c);
819  break;
820  case CHR('U'):
821  c = lexdigits(v, 16, 8, 8);
822  if (ISERR() || !CHR_IS_IN_RANGE(c))
824  RETV(PLAIN, c);
825  break;
826  case CHR('v'):
827  RETV(PLAIN, CHR('\v'));
828  break;
829  case CHR('w'):
830  NOTE(REG_ULOCALE);
831  RETV(CCLASS, 'w');
832  break;
833  case CHR('W'):
834  NOTE(REG_ULOCALE);
835  RETV(CCLASS, 'W');
836  break;
837  case CHR('x'):
838  NOTE(REG_UUNPORT);
839  c = lexdigits(v, 16, 1, 255); /* REs >255 long outside spec */
840  if (ISERR() || !CHR_IS_IN_RANGE(c))
842  RETV(PLAIN, c);
843  break;
844  case CHR('y'):
845  NOTE(REG_ULOCALE);
846  RETV(WBDRY, 0);
847  break;
848  case CHR('Y'):
849  NOTE(REG_ULOCALE);
850  RETV(NWBDRY, 0);
851  break;
852  case CHR('Z'):
853  RETV(SEND, 0);
854  break;
855  case CHR('1'):
856  case CHR('2'):
857  case CHR('3'):
858  case CHR('4'):
859  case CHR('5'):
860  case CHR('6'):
861  case CHR('7'):
862  case CHR('8'):
863  case CHR('9'):
864  save = v->now;
865  v->now--; /* put first digit back */
866  c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */
867  if (ISERR())
869  /* ugly heuristic (first test is "exactly 1 digit?") */
870  if (v->now == save || ((int) c > 0 && (int) c <= v->nsubexp))
871  {
873  RETV(BACKREF, c);
874  }
875  /* oops, doesn't look like it's a backref after all... */
876  v->now = save;
877  /* and fall through into octal number */
878  case CHR('0'):
879  NOTE(REG_UUNPORT);
880  v->now--; /* put first digit back */
881  c = lexdigits(v, 8, 1, 3);
882  if (ISERR())
884  if (c > 0xff)
885  {
886  /* out of range, so we handled one digit too much */
887  v->now--;
888  c >>= 3;
889  }
890  RETV(PLAIN, c);
891  break;
892  default:
893  assert(iscalpha(c));
894  FAILW(REG_EESCAPE); /* unknown alphabetic escape */
895  break;
896  }
898 }
899 
900 /*
901  * lexdigits - slurp up digits and return chr value
902  *
903  * This does not account for overflow; callers should range-check the result
904  * if maxlen is large enough to make that possible.
905  */
906 static chr /* chr value; errors signalled via ERR */
907 lexdigits(struct vars * v,
908  int base,
909  int minlen,
910  int maxlen)
911 {
912  uchr n; /* unsigned to avoid overflow misbehavior */
913  int len;
914  chr c;
915  int d;
916  const uchr ub = (uchr) base;
917 
918  n = 0;
919  for (len = 0; len < maxlen && !ATEOS(); len++)
920  {
921  c = *v->now++;
922  switch (c)
923  {
924  case CHR('0'):
925  case CHR('1'):
926  case CHR('2'):
927  case CHR('3'):
928  case CHR('4'):
929  case CHR('5'):
930  case CHR('6'):
931  case CHR('7'):
932  case CHR('8'):
933  case CHR('9'):
934  d = DIGITVAL(c);
935  break;
936  case CHR('a'):
937  case CHR('A'):
938  d = 10;
939  break;
940  case CHR('b'):
941  case CHR('B'):
942  d = 11;
943  break;
944  case CHR('c'):
945  case CHR('C'):
946  d = 12;
947  break;
948  case CHR('d'):
949  case CHR('D'):
950  d = 13;
951  break;
952  case CHR('e'):
953  case CHR('E'):
954  d = 14;
955  break;
956  case CHR('f'):
957  case CHR('F'):
958  d = 15;
959  break;
960  default:
961  v->now--; /* oops, not a digit at all */
962  d = -1;
963  break;
964  }
965 
966  if (d >= base)
967  { /* not a plausible digit */
968  v->now--;
969  d = -1;
970  }
971  if (d < 0)
972  break; /* NOTE BREAK OUT */
973  n = n * ub + (uchr) d;
974  }
975  if (len < minlen)
976  ERR(REG_EESCAPE);
977 
978  return (chr) n;
979 }
980 
981 /*
982  * brenext - get next BRE token
983  *
984  * This is much like EREs except for all the stupid backslashes and the
985  * context-dependency of some things.
986  */
987 static int /* 1 normal, 0 failure */
988 brenext(struct vars * v,
989  chr c)
990 {
991  switch (c)
992  {
993  case CHR('*'):
994  if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^'))
995  RETV(PLAIN, c);
996  RET('*');
997  break;
998  case CHR('['):
999  if (HAVE(6) && *(v->now + 0) == CHR('[') &&
1000  *(v->now + 1) == CHR(':') &&
1001  (*(v->now + 2) == CHR('<') ||
1002  *(v->now + 2) == CHR('>')) &&
1003  *(v->now + 3) == CHR(':') &&
1004  *(v->now + 4) == CHR(']') &&
1005  *(v->now + 5) == CHR(']'))
1006  {
1007  c = *(v->now + 2);
1008  v->now += 6;
1010  RET((c == CHR('<')) ? '<' : '>');
1011  }
1012  INTOCON(L_BRACK);
1013  if (NEXT1('^'))
1014  {
1015  v->now++;
1016  RETV('[', 0);
1017  }
1018  RETV('[', 1);
1019  break;
1020  case CHR('.'):
1021  RET('.');
1022  break;
1023  case CHR('^'):
1024  if (LASTTYPE(EMPTY))
1025  RET('^');
1026  if (LASTTYPE('('))
1027  {
1028  NOTE(REG_UUNSPEC);
1029  RET('^');
1030  }
1031  RETV(PLAIN, c);
1032  break;
1033  case CHR('$'):
1034  if (v->cflags & REG_EXPANDED)
1035  skip(v);
1036  if (ATEOS())
1037  RET('$');
1038  if (NEXT2('\\', ')'))
1039  {
1040  NOTE(REG_UUNSPEC);
1041  RET('$');
1042  }
1043  RETV(PLAIN, c);
1044  break;
1045  case CHR('\\'):
1046  break; /* see below */
1047  default:
1048  RETV(PLAIN, c);
1049  break;
1050  }
1051 
1052  assert(c == CHR('\\'));
1053 
1054  if (ATEOS())
1055  FAILW(REG_EESCAPE);
1056 
1057  c = *v->now++;
1058  switch (c)
1059  {
1060  case CHR('{'):
1061  INTOCON(L_BBND);
1062  NOTE(REG_UBOUNDS);
1063  RET('{');
1064  break;
1065  case CHR('('):
1066  RETV('(', 1);
1067  break;
1068  case CHR(')'):
1069  RETV(')', c);
1070  break;
1071  case CHR('<'):
1073  RET('<');
1074  break;
1075  case CHR('>'):
1077  RET('>');
1078  break;
1079  case CHR('1'):
1080  case CHR('2'):
1081  case CHR('3'):
1082  case CHR('4'):
1083  case CHR('5'):
1084  case CHR('6'):
1085  case CHR('7'):
1086  case CHR('8'):
1087  case CHR('9'):
1088  NOTE(REG_UBACKREF);
1089  RETV(BACKREF, (chr) DIGITVAL(c));
1090  break;
1091  default:
1092  if (iscalnum(c))
1093  {
1094  NOTE(REG_UBSALNUM);
1095  NOTE(REG_UUNSPEC);
1096  }
1097  RETV(PLAIN, c);
1098  break;
1099  }
1100 
1101  assert(NOTREACHED);
1102  return 0;
1103 }
1104 
1105 /*
1106  * skip - skip white space and comments in expanded form
1107  */
1108 static void
1109 skip(struct vars * v)
1110 {
1111  const chr *start = v->now;
1112 
1113  assert(v->cflags & REG_EXPANDED);
1114 
1115  for (;;)
1116  {
1117  while (!ATEOS() && iscspace(*v->now))
1118  v->now++;
1119  if (ATEOS() || *v->now != CHR('#'))
1120  break; /* NOTE BREAK OUT */
1121  assert(NEXT1('#'));
1122  while (!ATEOS() && *v->now != CHR('\n'))
1123  v->now++;
1124  /* leave the newline to be picked up by the iscspace loop */
1125  }
1126 
1127  if (v->now != start)
1129 }
1130 
1131 /*
1132  * newline - return the chr for a newline
1133  *
1134  * This helps confine use of CHR to this source file.
1135  */
1136 static chr
1137 newline(void)
1138 {
1139  return CHR('\n');
1140 }
1141 
1142 /*
1143  * chrnamed - return the chr known by a given (chr string) name
1144  *
1145  * The code is a bit clumsy, but this routine gets only such specialized
1146  * use that it hardly matters.
1147  */
1148 static chr
1149 chrnamed(struct vars * v,
1150  const chr *startp, /* start of name */
1151  const chr *endp, /* just past end of name */
1152  chr lastresort) /* what to return if name lookup fails */
1153 {
1154  chr c;
1155  int errsave;
1156  int e;
1157  struct cvec *cv;
1158 
1159  errsave = v->err;
1160  v->err = 0;
1161  c = element(v, startp, endp);
1162  e = v->err;
1163  v->err = errsave;
1164 
1165  if (e != 0)
1166  return lastresort;
1167 
1168  cv = range(v, c, c, 0);
1169  if (cv->nchrs == 0)
1170  return lastresort;
1171  return cv->chrs[0];
1172 }
#define RANGE
Definition: regcomp.c:283
static void skip(struct vars *v)
Definition: regc_lex.c:1109
int nexttype
Definition: regcomp.c:234
#define REG_NLSTOP
Definition: regex.h:109
#define DIGITVAL(c)
Definition: regcustom.h:72
#define REG_UBSALNUM
Definition: regex.h:64
#define NEXT1(c)
Definition: regc_lex.c:38
#define CCLASS
Definition: regcomp.c:281
#define NEXT2(a, b)
Definition: regc_lex.c:39
#define L_BRACK
Definition: regc_lex.c:56
static const chr backs[]
Definition: regc_lex.c:232
#define ERR
Definition: _int.h:146
static int lexescape(struct vars *v)
Definition: regc_lex.c:737
chr nextvalue
Definition: regcomp.c:235
#define REG_BADOPT
Definition: regex.h:154
static void lexword(struct vars *v)
Definition: regc_lex.c:268
static const chr brbackw[]
Definition: regc_lex.c:257
#define REG_QUOTE
Definition: regex.h:104
const chr * savenow
Definition: regcomp.c:229
#define LATYPE_BEHIND_POS
Definition: regguts.h:101
#define REG_EBRACE
Definition: regex.h:146
int lasttype
Definition: regcomp.c:233
#define REG_ULOCALE
Definition: regex.h:70
#define LATYPE_AHEAD_NEG
Definition: regguts.h:100
#define ISERR()
Definition: regcomp.c:262
#define RET(c)
Definition: regc_lex.c:45
static chr newline(void)
Definition: regc_lex.c:1137
#define REG_ICASE
Definition: regex.h:106
#define NOTREACHED
Definition: regguts.h:91
static chr chrnamed(struct vars *v, const chr *startp, const chr *endp, chr lastresort)
Definition: regc_lex.c:1149
#define LATYPE_AHEAD_POS
Definition: regguts.h:99
static void lexstart(struct vars *v)
Definition: regc_lex.c:70
#define ENDOF(array)
Definition: regc_lex.c:64
#define iscspace(x)
Definition: regcustom.h:103
#define REG_EBRACK
Definition: regex.h:144
#define CHR_IS_IN_RANGE(c)
Definition: regcustom.h:87
pg_wchar chr
Definition: regcustom.h:68
#define iscalnum(x)
Definition: regcustom.h:100
static chr lexdigits(struct vars *v, int base, int minlen, int maxlen)
Definition: regc_lex.c:907
#define END
Definition: _int.h:145
#define INCON(con)
Definition: regc_lex.c:61
#define LACON
Definition: regcomp.c:284
#define REG_BOSONLY
Definition: regex.h:114
#define L_ERE
Definition: regc_lex.c:51
int nchrs
Definition: regguts.h:258
#define REG_BADBR
Definition: regex.h:147
static int next(struct vars *v)
Definition: regc_lex.c:277
char * c
static void prefixes(struct vars *v)
Definition: regc_lex.c:99
#define REG_BADRPT
Definition: regex.h:150
static const chr brbacks[]
Definition: regc_lex.c:242
#define assert(TEST)
Definition: imath.c:37
#define REG_NEWLINE
Definition: regex.h:111
#define REG_UBBS
Definition: regex.h:66
static struct cvec * range(struct vars *v, chr a, chr b, int cases)
Definition: regc_locale.c:416
#define REG_ADVANCED
Definition: regex.h:103
#define NEXT3(a, b, c)
Definition: regc_lex.c:40
#define REG_EESCAPE
Definition: regex.h:142
#define SBEGIN
Definition: regcomp.c:289
#define LASTTYPE(t)
Definition: regc_lex.c:48
#define BACKREF
Definition: regcomp.c:278
static const chr backS[]
Definition: regc_lex.c:237
Definition: regguts.h:256
int err
Definition: regcomp.c:231
static chr element(struct vars *v, const chr *startp, const chr *endp)
Definition: regc_locale.c:380
static const chr backD[]
Definition: regc_lex.c:222
#define LATYPE_BEHIND_NEG
Definition: regguts.h:102
#define REG_EXTENDED
Definition: regex.h:101
#define iscalpha(x)
Definition: regcustom.h:101
#define INTOCON(c)
Definition: regc_lex.c:60
#define REG_UUNPORT
Definition: regex.h:69
#define REG_ADVF
Definition: regex.h:102
const chr * savestop
Definition: regcomp.c:230
#define PLAIN
Definition: regcomp.c:276
#define L_Q
Definition: regc_lex.c:53
#define L_ECL
Definition: regc_lex.c:58
#define REG_UBACKREF
Definition: regex.h:60
#define EOS
Definition: regcomp.c:275
#define RETV(c, n)
Definition: regc_lex.c:46
chr * chrs
Definition: regguts.h:260
static int brenext(struct vars *v, chr c)
Definition: regc_lex.c:988
#define NWBDRY
Definition: regcomp.c:288
#define REG_ASSERT
Definition: regex.h:151
#define NULL
Definition: c.h:229
#define L_CEL
Definition: regc_lex.c:57
#define REG_UUNSPEC
Definition: regex.h:68
#define L_BRE
Definition: regc_lex.c:52
unsigned uchr
Definition: regcustom.h:69
const chr * now
Definition: regcomp.c:227
#define REG_NLANCH
Definition: regex.h:110
#define L_CCL
Definition: regc_lex.c:59
#define REG_UBOUNDS
Definition: regex.h:62
#define COLLEL
Definition: regcomp.c:279
#define REG_NOSUB
Definition: regex.h:107
#define EMPTY
Definition: regcomp.c:274
#define NOTE(b)
Definition: regcomp.c:270
int lexcon
Definition: regcomp.c:236
const chr * stop
Definition: regcomp.c:228
#define FAILW(e)
Definition: regc_lex.c:47
static void lexnest(struct vars *v, const chr *beginp, const chr *endp)
Definition: regc_lex.c:203
e
Definition: preproc-init.c:82
#define NOERR()
Definition: regcomp.c:266
static const chr backw[]
Definition: regc_lex.c:247
#define SEND
Definition: regcomp.c:290
int cflags
Definition: regcomp.c:232
#define REG_EXPANDED
Definition: regex.h:108
#define HAVE(n)
Definition: regc_lex.c:37
#define DISCARD
Definition: regguts.h:58
static const chr backW[]
Definition: regc_lex.c:252
#define L_BBND
Definition: regc_lex.c:55
#define iscdigit(x)
Definition: regcustom.h:102
#define ATEOS()
Definition: regc_lex.c:36
#define REG_UNONPOSIX
Definition: regex.h:67
#define WBDRY
Definition: regcomp.c:287
#define REG_UBRACES
Definition: regex.h:63
#define CHR(c)
Definition: regcustom.h:71
#define L_EBND
Definition: regc_lex.c:54
#define REG_ULOOKAROUND
Definition: regex.h:61
static const chr brbackd[]
Definition: regc_lex.c:227
Definition: regcomp.c:224
#define DIGIT
Definition: regcomp.c:277
#define REG_BADPAT
Definition: regex.h:139
static const chr backd[]
Definition: regc_lex.c:217
#define ECLASS
Definition: regcomp.c:280