PostgreSQL Source Code  git master
regc_lex.c
Go to the documentation of this file.
1 /*
2  * lexical analyzer
3  * This file is #included by regcomp.c.
4  *
5  * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6  *
7  * Development of this software was funded, in part, by Cray Research Inc.,
8  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9  * Corporation, none of whom are responsible for the results. The author
10  * thanks all of them.
11  *
12  * Redistribution and use in source and binary forms -- with or without
13  * modification -- are permitted for any purpose, provided that
14  * redistributions in source form retain this entire copyright notice and
15  * indicate the origin and nature of any modifications.
16  *
17  * I'd appreciate being given credit for this package in the documentation
18  * of software which uses it, but that is not a requirement.
19  *
20  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  * src/backend/regex/regc_lex.c
32  *
33  */
34 
/*
 * Scanning macros.
 *
 * All of these implicitly operate on a variable named "v" (the struct vars
 * pointer) that must be in scope wherever they are used.  v->now is the next
 * unread chr and v->stop is one past the last chr of the input.
 *
 * RET, RETV and FAILW expand to "return" statements, so they leave the
 * enclosing function: RET/RETV record the token and return 1, FAILW reports
 * an error and returns 0.
 */
#define ATEOS()         (v->now >= v->stop)
#define HAVE(n)         (v->stop - v->now >= (n))
#define NEXT1(c)        (!ATEOS() && *v->now == CHR(c))
#define NEXT2(a,b)      (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))
#define NEXT3(a,b,c)    (HAVE(3) && *v->now == CHR(a) && \
                         *(v->now+1) == CHR(b) && \
                         *(v->now+2) == CHR(c))
#define SET(c)          (v->nexttype = (c))
#define SETV(c, n)      (v->nexttype = (c), v->nextvalue = (n))
#define RET(c)          return (SET(c), 1)
#define RETV(c, n)      return (SETV(c, n), 1)
#define FAILW(e)        return (ERR(e), 0)  /* ERR does SET(EOS) */
#define LASTTYPE(t)     (v->lasttype == (t))

/* lexical contexts (values of v->lexcon) */
#define L_ERE   1               /* mainline ERE/ARE */
#define L_BRE   2               /* mainline BRE */
#define L_Q     3               /* REG_QUOTE */
#define L_EBND  4               /* ERE/ARE bound */
#define L_BBND  5               /* BRE bound */
#define L_BRACK 6               /* brackets */
#define L_CEL   7               /* collating element */
#define L_ECL   8               /* equivalence class */
#define L_CCL   9               /* character class */
#define INTOCON(c)      (v->lexcon = (c))   /* switch lexical context */
#define INCON(con)      (v->lexcon == (con))    /* test lexical context */

/*
 * Construct pointer just past the end of an array.  The element size is
 * taken from the array itself, so the argument must be a real array (not a
 * pointer); in this file it is only applied to chr arrays.
 */
#define ENDOF(array)    ((array) + sizeof(array) / sizeof((array)[0]))
65 
66 /*
67  * lexstart - set up lexical stuff, scan leading options
68  */
69 static void
70 lexstart(struct vars *v)
71 {
72  prefixes(v); /* may turn on new type bits etc. */
73  NOERR();
74 
75  if (v->cflags & REG_QUOTE)
76  {
78  INTOCON(L_Q);
79  }
80  else if (v->cflags & REG_EXTENDED)
81  {
82  assert(!(v->cflags & REG_QUOTE));
83  INTOCON(L_ERE);
84  }
85  else
86  {
87  assert(!(v->cflags & (REG_QUOTE | REG_ADVF)));
88  INTOCON(L_BRE);
89  }
90 
91  v->nexttype = EMPTY; /* remember we were at the start */
92  next(v); /* set up the first token */
93 }
94 
95 /*
96  * prefixes - implement various special prefixes
97  */
98 static void
99 prefixes(struct vars *v)
100 {
101  /* literal string doesn't get any of this stuff */
102  if (v->cflags & REG_QUOTE)
103  return;
104 
105  /* initial "***" gets special things */
106  if (HAVE(4) && NEXT3('*', '*', '*'))
107  switch (*(v->now + 3))
108  {
109  case CHR('?'): /* "***?" error, msg shows version */
110  ERR(REG_BADPAT);
111  return; /* proceed no further */
112  break;
113  case CHR('='): /* "***=" shifts to literal string */
115  v->cflags |= REG_QUOTE;
117  v->now += 4;
118  return; /* and there can be no more prefixes */
119  break;
120  case CHR(':'): /* "***:" shifts to AREs */
122  v->cflags |= REG_ADVANCED;
123  v->now += 4;
124  break;
125  default: /* otherwise *** is just an error */
126  ERR(REG_BADRPT);
127  return;
128  break;
129  }
130 
131  /* BREs and EREs don't get embedded options */
132  if ((v->cflags & REG_ADVANCED) != REG_ADVANCED)
133  return;
134 
135  /* embedded options (AREs only) */
136  if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2)))
137  {
139  v->now += 2;
140  for (; !ATEOS() && iscalpha(*v->now); v->now++)
141  switch (*v->now)
142  {
143  case CHR('b'): /* BREs (but why???) */
144  v->cflags &= ~(REG_ADVANCED | REG_QUOTE);
145  break;
146  case CHR('c'): /* case sensitive */
147  v->cflags &= ~REG_ICASE;
148  break;
149  case CHR('e'): /* plain EREs */
150  v->cflags |= REG_EXTENDED;
151  v->cflags &= ~(REG_ADVF | REG_QUOTE);
152  break;
153  case CHR('i'): /* case insensitive */
154  v->cflags |= REG_ICASE;
155  break;
156  case CHR('m'): /* Perloid synonym for n */
157  case CHR('n'): /* \n affects ^ $ . [^ */
158  v->cflags |= REG_NEWLINE;
159  break;
160  case CHR('p'): /* ~Perl, \n affects . [^ */
161  v->cflags |= REG_NLSTOP;
162  v->cflags &= ~REG_NLANCH;
163  break;
164  case CHR('q'): /* literal string */
165  v->cflags |= REG_QUOTE;
166  v->cflags &= ~REG_ADVANCED;
167  break;
168  case CHR('s'): /* single line, \n ordinary */
169  v->cflags &= ~REG_NEWLINE;
170  break;
171  case CHR('t'): /* tight syntax */
172  v->cflags &= ~REG_EXPANDED;
173  break;
174  case CHR('w'): /* weird, \n affects ^ $ only */
175  v->cflags &= ~REG_NLSTOP;
176  v->cflags |= REG_NLANCH;
177  break;
178  case CHR('x'): /* expanded syntax */
179  v->cflags |= REG_EXPANDED;
180  break;
181  default:
182  ERR(REG_BADOPT);
183  return;
184  }
185  if (!NEXT1(')'))
186  {
187  ERR(REG_BADOPT);
188  return;
189  }
190  v->now++;
191  if (v->cflags & REG_QUOTE)
192  v->cflags &= ~(REG_EXPANDED | REG_NEWLINE);
193  }
194 }
195 
196 /*
197  * next - get next token
198  */
199 static int /* 1 normal, 0 failure */
200 next(struct vars *v)
201 {
202  chr c;
203 
204 next_restart: /* loop here after eating a comment */
205 
206  /* errors yield an infinite sequence of failures */
207  if (ISERR())
208  return 0; /* the error has set nexttype to EOS */
209 
210  /* remember flavor of last token */
211  v->lasttype = v->nexttype;
212 
213  /* REG_BOSONLY */
214  if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY))
215  {
216  /* at start of a REG_BOSONLY RE */
217  RETV(SBEGIN, 0); /* same as \A */
218  }
219 
220  /* skip white space etc. if appropriate (not in literal or []) */
221  if (v->cflags & REG_EXPANDED)
222  switch (v->lexcon)
223  {
224  case L_ERE:
225  case L_BRE:
226  case L_EBND:
227  case L_BBND:
228  skip(v);
229  break;
230  }
231 
232  /* handle EOS, depending on context */
233  if (ATEOS())
234  {
235  switch (v->lexcon)
236  {
237  case L_ERE:
238  case L_BRE:
239  case L_Q:
240  RET(EOS);
241  break;
242  case L_EBND:
243  case L_BBND:
244  FAILW(REG_EBRACE);
245  break;
246  case L_BRACK:
247  case L_CEL:
248  case L_ECL:
249  case L_CCL:
250  FAILW(REG_EBRACK);
251  break;
252  }
254  }
255 
256  /* okay, time to actually get a character */
257  c = *v->now++;
258 
259  /* deal with the easy contexts, punt EREs to code below */
260  switch (v->lexcon)
261  {
262  case L_BRE: /* punt BREs to separate function */
263  return brenext(v, c);
264  break;
265  case L_ERE: /* see below */
266  break;
267  case L_Q: /* literal strings are easy */
268  RETV(PLAIN, c);
269  break;
270  case L_BBND: /* bounds are fairly simple */
271  case L_EBND:
272  switch (c)
273  {
274  case CHR('0'):
275  case CHR('1'):
276  case CHR('2'):
277  case CHR('3'):
278  case CHR('4'):
279  case CHR('5'):
280  case CHR('6'):
281  case CHR('7'):
282  case CHR('8'):
283  case CHR('9'):
284  RETV(DIGIT, (chr) DIGITVAL(c));
285  break;
286  case CHR(','):
287  RET(',');
288  break;
289  case CHR('}'): /* ERE bound ends with } */
290  if (INCON(L_EBND))
291  {
292  INTOCON(L_ERE);
293  if ((v->cflags & REG_ADVF) && NEXT1('?'))
294  {
295  v->now++;
297  RETV('}', 0);
298  }
299  RETV('}', 1);
300  }
301  else
302  FAILW(REG_BADBR);
303  break;
304  case CHR('\\'): /* BRE bound ends with \} */
305  if (INCON(L_BBND) && NEXT1('}'))
306  {
307  v->now++;
308  INTOCON(L_BRE);
309  RETV('}', 1);
310  }
311  else
312  FAILW(REG_BADBR);
313  break;
314  default:
315  FAILW(REG_BADBR);
316  break;
317  }
319  break;
320  case L_BRACK: /* brackets are not too hard */
321  switch (c)
322  {
323  case CHR(']'):
324  if (LASTTYPE('['))
325  RETV(PLAIN, c);
326  else
327  {
328  INTOCON((v->cflags & REG_EXTENDED) ?
329  L_ERE : L_BRE);
330  RET(']');
331  }
332  break;
333  case CHR('\\'):
334  NOTE(REG_UBBS);
335  if (!(v->cflags & REG_ADVF))
336  RETV(PLAIN, c);
338  if (ATEOS())
340  if (!lexescape(v))
341  return 0;
342  switch (v->nexttype)
343  { /* not all escapes okay here */
344  case PLAIN:
345  case CCLASSS:
346  case CCLASSC:
347  return 1;
348  break;
349  }
350  /* not one of the acceptable escapes */
352  break;
353  case CHR('-'):
354  if (LASTTYPE('[') || NEXT1(']'))
355  RETV(PLAIN, c);
356  else
357  RETV(RANGE, c);
358  break;
359  case CHR('['):
360  if (ATEOS())
361  FAILW(REG_EBRACK);
362  switch (*v->now++)
363  {
364  case CHR('.'):
365  INTOCON(L_CEL);
366  /* might or might not be locale-specific */
367  RET(COLLEL);
368  break;
369  case CHR('='):
370  INTOCON(L_ECL);
371  NOTE(REG_ULOCALE);
372  RET(ECLASS);
373  break;
374  case CHR(':'):
375  INTOCON(L_CCL);
376  NOTE(REG_ULOCALE);
377  RET(CCLASS);
378  break;
379  default: /* oops */
380  v->now--;
381  RETV(PLAIN, c);
382  break;
383  }
385  break;
386  default:
387  RETV(PLAIN, c);
388  break;
389  }
391  break;
392  case L_CEL: /* collating elements are easy */
393  if (c == CHR('.') && NEXT1(']'))
394  {
395  v->now++;
396  INTOCON(L_BRACK);
397  RETV(END, '.');
398  }
399  else
400  RETV(PLAIN, c);
401  break;
402  case L_ECL: /* ditto equivalence classes */
403  if (c == CHR('=') && NEXT1(']'))
404  {
405  v->now++;
406  INTOCON(L_BRACK);
407  RETV(END, '=');
408  }
409  else
410  RETV(PLAIN, c);
411  break;
412  case L_CCL: /* ditto character classes */
413  if (c == CHR(':') && NEXT1(']'))
414  {
415  v->now++;
416  INTOCON(L_BRACK);
417  RETV(END, ':');
418  }
419  else
420  RETV(PLAIN, c);
421  break;
422  default:
424  break;
425  }
426 
427  /* that got rid of everything except EREs and AREs */
428  assert(INCON(L_ERE));
429 
430  /* deal with EREs and AREs, except for backslashes */
431  switch (c)
432  {
433  case CHR('|'):
434  RET('|');
435  break;
436  case CHR('*'):
437  if ((v->cflags & REG_ADVF) && NEXT1('?'))
438  {
439  v->now++;
441  RETV('*', 0);
442  }
443  RETV('*', 1);
444  break;
445  case CHR('+'):
446  if ((v->cflags & REG_ADVF) && NEXT1('?'))
447  {
448  v->now++;
450  RETV('+', 0);
451  }
452  RETV('+', 1);
453  break;
454  case CHR('?'):
455  if ((v->cflags & REG_ADVF) && NEXT1('?'))
456  {
457  v->now++;
459  RETV('?', 0);
460  }
461  RETV('?', 1);
462  break;
463  case CHR('{'): /* bounds start or plain character */
464  if (v->cflags & REG_EXPANDED)
465  skip(v);
466  if (ATEOS() || !iscdigit(*v->now))
467  {
468  NOTE(REG_UBRACES);
469  NOTE(REG_UUNSPEC);
470  RETV(PLAIN, c);
471  }
472  else
473  {
474  NOTE(REG_UBOUNDS);
475  INTOCON(L_EBND);
476  RET('{');
477  }
479  break;
480  case CHR('('): /* parenthesis, or advanced extension */
481  if ((v->cflags & REG_ADVF) && NEXT1('?'))
482  {
484  v->now++;
485  if (ATEOS())
486  FAILW(REG_BADRPT);
487  switch (*v->now++)
488  {
489  case CHR(':'): /* non-capturing paren */
490  RETV('(', 0);
491  break;
492  case CHR('#'): /* comment */
493  while (!ATEOS() && *v->now != CHR(')'))
494  v->now++;
495  if (!ATEOS())
496  v->now++;
497  assert(v->nexttype == v->lasttype);
498  goto next_restart;
499  case CHR('='): /* positive lookahead */
502  break;
503  case CHR('!'): /* negative lookahead */
506  break;
507  case CHR('<'):
508  if (ATEOS())
509  FAILW(REG_BADRPT);
510  switch (*v->now++)
511  {
512  case CHR('='): /* positive lookbehind */
515  break;
516  case CHR('!'): /* negative lookbehind */
519  break;
520  default:
521  FAILW(REG_BADRPT);
522  break;
523  }
525  break;
526  default:
527  FAILW(REG_BADRPT);
528  break;
529  }
531  }
532  RETV('(', 1);
533  break;
534  case CHR(')'):
535  if (LASTTYPE('('))
536  NOTE(REG_UUNSPEC);
537  RETV(')', c);
538  break;
539  case CHR('['): /* easy except for [[:<:]] and [[:>:]] */
540  if (HAVE(6) && *(v->now + 0) == CHR('[') &&
541  *(v->now + 1) == CHR(':') &&
542  (*(v->now + 2) == CHR('<') ||
543  *(v->now + 2) == CHR('>')) &&
544  *(v->now + 3) == CHR(':') &&
545  *(v->now + 4) == CHR(']') &&
546  *(v->now + 5) == CHR(']'))
547  {
548  c = *(v->now + 2);
549  v->now += 6;
551  RET((c == CHR('<')) ? '<' : '>');
552  }
553  INTOCON(L_BRACK);
554  if (NEXT1('^'))
555  {
556  v->now++;
557  RETV('[', 0);
558  }
559  RETV('[', 1);
560  break;
561  case CHR('.'):
562  RET('.');
563  break;
564  case CHR('^'):
565  RET('^');
566  break;
567  case CHR('$'):
568  RET('$');
569  break;
570  case CHR('\\'): /* mostly punt backslashes to code below */
571  if (ATEOS())
573  break;
574  default: /* ordinary character */
575  RETV(PLAIN, c);
576  break;
577  }
578 
579  /* ERE/ARE backslash handling; backslash already eaten */
580  assert(!ATEOS());
581  if (!(v->cflags & REG_ADVF))
582  { /* only AREs have non-trivial escapes */
583  if (iscalnum(*v->now))
584  {
586  NOTE(REG_UUNSPEC);
587  }
588  RETV(PLAIN, *v->now++);
589  }
590  return lexescape(v);
591 }
592 
593 /*
594  * lexescape - parse an ARE backslash escape (backslash already eaten)
595  *
596  * This is used for ARE backslashes both normally and inside bracket
597  * expressions. In the latter case, not all escape types are allowed,
598  * but the caller must reject unwanted ones after we return.
599  */
600 static int
601 lexescape(struct vars *v)
602 {
603  chr c;
604  static const chr alert[] = {
605  CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
606  };
607  static const chr esc[] = {
608  CHR('E'), CHR('S'), CHR('C')
609  };
610  const chr *save;
611 
612  assert(v->cflags & REG_ADVF);
613 
614  assert(!ATEOS());
615  c = *v->now++;
616 
617  /* if it's not alphanumeric ASCII, treat it as a plain character */
618  if (!('a' <= c && c <= 'z') &&
619  !('A' <= c && c <= 'Z') &&
620  !('0' <= c && c <= '9'))
621  RETV(PLAIN, c);
622 
624  switch (c)
625  {
626  case CHR('a'):
627  RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007')));
628  break;
629  case CHR('A'):
630  RETV(SBEGIN, 0);
631  break;
632  case CHR('b'):
633  RETV(PLAIN, CHR('\b'));
634  break;
635  case CHR('B'):
636  RETV(PLAIN, CHR('\\'));
637  break;
638  case CHR('c'):
639  NOTE(REG_UUNPORT);
640  if (ATEOS())
642  RETV(PLAIN, (chr) (*v->now++ & 037));
643  break;
644  case CHR('d'):
645  NOTE(REG_ULOCALE);
647  break;
648  case CHR('D'):
649  NOTE(REG_ULOCALE);
651  break;
652  case CHR('e'):
653  NOTE(REG_UUNPORT);
654  RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033')));
655  break;
656  case CHR('f'):
657  RETV(PLAIN, CHR('\f'));
658  break;
659  case CHR('m'):
660  RET('<');
661  break;
662  case CHR('M'):
663  RET('>');
664  break;
665  case CHR('n'):
666  RETV(PLAIN, CHR('\n'));
667  break;
668  case CHR('r'):
669  RETV(PLAIN, CHR('\r'));
670  break;
671  case CHR('s'):
672  NOTE(REG_ULOCALE);
674  break;
675  case CHR('S'):
676  NOTE(REG_ULOCALE);
678  break;
679  case CHR('t'):
680  RETV(PLAIN, CHR('\t'));
681  break;
682  case CHR('u'):
683  c = lexdigits(v, 16, 4, 4);
684  if (ISERR() || !CHR_IS_IN_RANGE(c))
686  RETV(PLAIN, c);
687  break;
688  case CHR('U'):
689  c = lexdigits(v, 16, 8, 8);
690  if (ISERR() || !CHR_IS_IN_RANGE(c))
692  RETV(PLAIN, c);
693  break;
694  case CHR('v'):
695  RETV(PLAIN, CHR('\v'));
696  break;
697  case CHR('w'):
698  NOTE(REG_ULOCALE);
699  RETV(CCLASSS, CC_WORD);
700  break;
701  case CHR('W'):
702  NOTE(REG_ULOCALE);
703  RETV(CCLASSC, CC_WORD);
704  break;
705  case CHR('x'):
706  NOTE(REG_UUNPORT);
707  c = lexdigits(v, 16, 1, 255); /* REs >255 long outside spec */
708  if (ISERR() || !CHR_IS_IN_RANGE(c))
710  RETV(PLAIN, c);
711  break;
712  case CHR('y'):
713  NOTE(REG_ULOCALE);
714  RETV(WBDRY, 0);
715  break;
716  case CHR('Y'):
717  NOTE(REG_ULOCALE);
718  RETV(NWBDRY, 0);
719  break;
720  case CHR('Z'):
721  RETV(SEND, 0);
722  break;
723  case CHR('1'):
724  case CHR('2'):
725  case CHR('3'):
726  case CHR('4'):
727  case CHR('5'):
728  case CHR('6'):
729  case CHR('7'):
730  case CHR('8'):
731  case CHR('9'):
732  save = v->now;
733  v->now--; /* put first digit back */
734  c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */
735  if (ISERR())
737  /* ugly heuristic (first test is "exactly 1 digit?") */
738  if (v->now == save || ((int) c > 0 && (int) c <= v->nsubexp))
739  {
741  RETV(BACKREF, c);
742  }
743  /* oops, doesn't look like it's a backref after all... */
744  v->now = save;
745  /* and fall through into octal number */
746  /* FALLTHROUGH */
747  case CHR('0'):
748  NOTE(REG_UUNPORT);
749  v->now--; /* put first digit back */
750  c = lexdigits(v, 8, 1, 3);
751  if (ISERR())
753  if (c > 0xff)
754  {
755  /* out of range, so we handled one digit too much */
756  v->now--;
757  c >>= 3;
758  }
759  RETV(PLAIN, c);
760  break;
761  default:
762 
763  /*
764  * Throw an error for unrecognized ASCII alpha escape sequences,
765  * which reserves them for future use if needed.
766  */
768  break;
769  }
771 }
772 
773 /*
774  * lexdigits - slurp up digits and return chr value
775  *
776  * This does not account for overflow; callers should range-check the result
777  * if maxlen is large enough to make that possible.
778  */
779 static chr /* chr value; errors signalled via ERR */
780 lexdigits(struct vars *v,
781  int base,
782  int minlen,
783  int maxlen)
784 {
785  uchr n; /* unsigned to avoid overflow misbehavior */
786  int len;
787  chr c;
788  int d;
789  const uchr ub = (uchr) base;
790 
791  n = 0;
792  for (len = 0; len < maxlen && !ATEOS(); len++)
793  {
794  c = *v->now++;
795  switch (c)
796  {
797  case CHR('0'):
798  case CHR('1'):
799  case CHR('2'):
800  case CHR('3'):
801  case CHR('4'):
802  case CHR('5'):
803  case CHR('6'):
804  case CHR('7'):
805  case CHR('8'):
806  case CHR('9'):
807  d = DIGITVAL(c);
808  break;
809  case CHR('a'):
810  case CHR('A'):
811  d = 10;
812  break;
813  case CHR('b'):
814  case CHR('B'):
815  d = 11;
816  break;
817  case CHR('c'):
818  case CHR('C'):
819  d = 12;
820  break;
821  case CHR('d'):
822  case CHR('D'):
823  d = 13;
824  break;
825  case CHR('e'):
826  case CHR('E'):
827  d = 14;
828  break;
829  case CHR('f'):
830  case CHR('F'):
831  d = 15;
832  break;
833  default:
834  v->now--; /* oops, not a digit at all */
835  d = -1;
836  break;
837  }
838 
839  if (d >= base)
840  { /* not a plausible digit */
841  v->now--;
842  d = -1;
843  }
844  if (d < 0)
845  break; /* NOTE BREAK OUT */
846  n = n * ub + (uchr) d;
847  }
848  if (len < minlen)
849  ERR(REG_EESCAPE);
850 
851  return (chr) n;
852 }
853 
854 /*
855  * brenext - get next BRE token
856  *
857  * This is much like EREs except for all the stupid backslashes and the
858  * context-dependency of some things.
859  */
860 static int /* 1 normal, 0 failure */
861 brenext(struct vars *v,
862  chr c)
863 {
864  switch (c)
865  {
866  case CHR('*'):
867  if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^'))
868  RETV(PLAIN, c);
869  RETV('*', 1);
870  break;
871  case CHR('['):
872  if (HAVE(6) && *(v->now + 0) == CHR('[') &&
873  *(v->now + 1) == CHR(':') &&
874  (*(v->now + 2) == CHR('<') ||
875  *(v->now + 2) == CHR('>')) &&
876  *(v->now + 3) == CHR(':') &&
877  *(v->now + 4) == CHR(']') &&
878  *(v->now + 5) == CHR(']'))
879  {
880  c = *(v->now + 2);
881  v->now += 6;
883  RET((c == CHR('<')) ? '<' : '>');
884  }
885  INTOCON(L_BRACK);
886  if (NEXT1('^'))
887  {
888  v->now++;
889  RETV('[', 0);
890  }
891  RETV('[', 1);
892  break;
893  case CHR('.'):
894  RET('.');
895  break;
896  case CHR('^'):
897  if (LASTTYPE(EMPTY))
898  RET('^');
899  if (LASTTYPE('('))
900  {
901  NOTE(REG_UUNSPEC);
902  RET('^');
903  }
904  RETV(PLAIN, c);
905  break;
906  case CHR('$'):
907  if (v->cflags & REG_EXPANDED)
908  skip(v);
909  if (ATEOS())
910  RET('$');
911  if (NEXT2('\\', ')'))
912  {
913  NOTE(REG_UUNSPEC);
914  RET('$');
915  }
916  RETV(PLAIN, c);
917  break;
918  case CHR('\\'):
919  break; /* see below */
920  default:
921  RETV(PLAIN, c);
922  break;
923  }
924 
925  assert(c == CHR('\\'));
926 
927  if (ATEOS())
929 
930  c = *v->now++;
931  switch (c)
932  {
933  case CHR('{'):
934  INTOCON(L_BBND);
935  NOTE(REG_UBOUNDS);
936  RET('{');
937  break;
938  case CHR('('):
939  RETV('(', 1);
940  break;
941  case CHR(')'):
942  RETV(')', c);
943  break;
944  case CHR('<'):
946  RET('<');
947  break;
948  case CHR('>'):
950  RET('>');
951  break;
952  case CHR('1'):
953  case CHR('2'):
954  case CHR('3'):
955  case CHR('4'):
956  case CHR('5'):
957  case CHR('6'):
958  case CHR('7'):
959  case CHR('8'):
960  case CHR('9'):
962  RETV(BACKREF, (chr) DIGITVAL(c));
963  break;
964  default:
965  if (iscalnum(c))
966  {
968  NOTE(REG_UUNSPEC);
969  }
970  RETV(PLAIN, c);
971  break;
972  }
973 
975  return 0;
976 }
977 
978 /*
979  * skip - skip white space and comments in expanded form
980  */
981 static void
982 skip(struct vars *v)
983 {
984  const chr *start = v->now;
985 
986  assert(v->cflags & REG_EXPANDED);
987 
988  for (;;)
989  {
990  while (!ATEOS() && iscspace(*v->now))
991  v->now++;
992  if (ATEOS() || *v->now != CHR('#'))
993  break; /* NOTE BREAK OUT */
994  assert(NEXT1('#'));
995  while (!ATEOS() && *v->now != CHR('\n'))
996  v->now++;
997  /* leave the newline to be picked up by the iscspace loop */
998  }
999 
1000  if (v->now != start)
1002 }
1003 
1004 /*
1005  * newline - return the chr for a newline
1006  *
1007  * This helps confine use of CHR to this source file.
1008  */
1009 static chr
1010 newline(void)
1011 {
1012  return CHR('\n');
1013 }
1014 
1015 /*
1016  * chrnamed - return the chr known by a given (chr string) name
1017  *
1018  * The code is a bit clumsy, but this routine gets only such specialized
1019  * use that it hardly matters.
1020  */
1021 static chr
1022 chrnamed(struct vars *v,
1023  const chr *startp, /* start of name */
1024  const chr *endp, /* just past end of name */
1025  chr lastresort) /* what to return if name lookup fails */
1026 {
1027  chr c;
1028  int errsave;
1029  int e;
1030  struct cvec *cv;
1031 
1032  errsave = v->err;
1033  v->err = 0;
1034  c = element(v, startp, endp);
1035  e = v->err;
1036  v->err = errsave;
1037 
1038  if (e != 0)
1039  return lastresort;
1040 
1041  cv = range(v, c, c, 0);
1042  if (cv->nchrs == 0)
1043  return lastresort;
1044  return cv->chrs[0];
1045 }
#define END
Definition: _int.h:160
#define ERR
Definition: _int.h:161
#define errsave(context,...)
Definition: elog.h:260
const void size_t len
@ NOTE
Definition: pg_regress.c:89
char * c
e
Definition: preproc-init.c:82
#define L_ERE
Definition: regc_lex.c:51
#define NEXT2(a, b)
Definition: regc_lex.c:39
#define RET(c)
Definition: regc_lex.c:45
#define INTOCON(c)
Definition: regc_lex.c:60
#define INCON(con)
Definition: regc_lex.c:61
static int lexescape(struct vars *v)
Definition: regc_lex.c:601
#define L_BBND
Definition: regc_lex.c:55
#define ATEOS()
Definition: regc_lex.c:36
#define L_Q
Definition: regc_lex.c:53
static void skip(struct vars *v)
Definition: regc_lex.c:982
static chr lexdigits(struct vars *v, int base, int minlen, int maxlen)
Definition: regc_lex.c:780
#define HAVE(n)
Definition: regc_lex.c:37
#define LASTTYPE(t)
Definition: regc_lex.c:48
#define RETV(c, n)
Definition: regc_lex.c:46
static chr newline(void)
Definition: regc_lex.c:1010
#define L_CEL
Definition: regc_lex.c:57
#define FAILW(e)
Definition: regc_lex.c:47
#define L_EBND
Definition: regc_lex.c:54
#define L_ECL
Definition: regc_lex.c:58
static int brenext(struct vars *v, chr c)
Definition: regc_lex.c:861
static void lexstart(struct vars *v)
Definition: regc_lex.c:70
static void prefixes(struct vars *v)
Definition: regc_lex.c:99
#define ENDOF(array)
Definition: regc_lex.c:64
#define L_CCL
Definition: regc_lex.c:59
#define NEXT3(a, b, c)
Definition: regc_lex.c:40
#define L_BRACK
Definition: regc_lex.c:56
static chr chrnamed(struct vars *v, const chr *startp, const chr *endp, chr lastresort)
Definition: regc_lex.c:1022
#define NEXT1(c)
Definition: regc_lex.c:38
#define L_BRE
Definition: regc_lex.c:52
static int next(struct vars *v)
Definition: regc_lex.c:200
static struct cvec * range(struct vars *v, chr a, chr b, int cases)
Definition: regc_locale.c:412
static chr element(struct vars *v, const chr *startp, const chr *endp)
Definition: regc_locale.c:376
#define COLLEL
Definition: regcomp.c:333
#define NWBDRY
Definition: regcomp.c:344
#define NOERR()
Definition: regcomp.c:320
#define EMPTY
Definition: regcomp.c:328
#define SBEGIN
Definition: regcomp.c:345
#define ISERR()
Definition: regcomp.c:316
#define CCLASSS
Definition: regcomp.c:337
#define CCLASS
Definition: regcomp.c:335
#define WBDRY
Definition: regcomp.c:343
#define DIGIT
Definition: regcomp.c:331
#define CCLASSC
Definition: regcomp.c:338
#define ECLASS
Definition: regcomp.c:334
#define BACKREF
Definition: regcomp.c:332
#define LACON
Definition: regcomp.c:340
#define EOS
Definition: regcomp.c:329
#define PLAIN
Definition: regcomp.c:330
#define SEND
Definition: regcomp.c:346
#define RANGE
Definition: regcomp.c:339
unsigned uchr
Definition: regcustom.h:60
#define DIGITVAL(c)
Definition: regcustom.h:63
#define iscalnum(x)
Definition: regcustom.h:90
#define iscdigit(x)
Definition: regcustom.h:92
#define CHR_IS_IN_RANGE(c)
Definition: regcustom.h:77
pg_wchar chr
Definition: regcustom.h:59
#define CHR(c)
Definition: regcustom.h:62
#define iscspace(x)
Definition: regcustom.h:93
#define iscalpha(x)
Definition: regcustom.h:91
#define assert(x)
Definition: regcustom.h:56
#define REG_BADOPT
Definition: regex.h:154
#define REG_ICASE
Definition: regex.h:106
#define REG_EBRACK
Definition: regex.h:144
#define REG_UBOUNDS
Definition: regex.h:62
#define REG_BADRPT
Definition: regex.h:150
#define REG_EESCAPE
Definition: regex.h:142
#define REG_ULOOKAROUND
Definition: regex.h:61
#define REG_UBBS
Definition: regex.h:66
#define REG_ADVANCED
Definition: regex.h:103
#define REG_EXPANDED
Definition: regex.h:108
#define REG_NLANCH
Definition: regex.h:110
#define REG_EXTENDED
Definition: regex.h:101
#define REG_NLSTOP
Definition: regex.h:109
#define REG_ADVF
Definition: regex.h:102
#define REG_UUNSPEC
Definition: regex.h:68
#define REG_UNONPOSIX
Definition: regex.h:67
#define REG_BADBR
Definition: regex.h:147
#define REG_NEWLINE
Definition: regex.h:111
#define REG_UBSALNUM
Definition: regex.h:64
#define REG_ULOCALE
Definition: regex.h:70
#define REG_UUNPORT
Definition: regex.h:69
#define REG_EBRACE
Definition: regex.h:146
#define REG_BADPAT
Definition: regex.h:139
#define REG_BOSONLY
Definition: regex.h:114
#define REG_UBRACES
Definition: regex.h:63
#define REG_UBACKREF
Definition: regex.h:60
#define REG_QUOTE
Definition: regex.h:104
#define LATYPE_AHEAD_NEG
Definition: regguts.h:105
#define LATYPE_BEHIND_POS
Definition: regguts.h:106
#define NOTREACHED
Definition: regguts.h:96
#define LATYPE_BEHIND_NEG
Definition: regguts.h:107
@ CC_WORD
Definition: regguts.h:141
@ CC_SPACE
Definition: regguts.h:141
@ CC_DIGIT
Definition: regguts.h:140
#define LATYPE_AHEAD_POS
Definition: regguts.h:104
Definition: regguts.h:279
int nchrs
Definition: regguts.h:280
chr * chrs
Definition: regguts.h:282
Definition: regcomp.c:281
const chr * now
Definition: regcomp.c:283
int err
Definition: regcomp.c:285
int cflags
Definition: regcomp.c:286
int lexcon
Definition: regcomp.c:290
int nexttype
Definition: regcomp.c:288
int lasttype
Definition: regcomp.c:287