PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
regc_locale.c
Go to the documentation of this file.
1 /*
2  * regc_locale.c --
3  *
4  * This file contains locale-specific regexp routines.
5  * This file is #included by regcomp.c.
6  *
7  * Copyright (c) 1998 by Scriptics Corporation.
8  *
9  * This software is copyrighted by the Regents of the University of
10  * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
11  * Corporation and other parties. The following terms apply to all files
12  * associated with the software unless explicitly disclaimed in
13  * individual files.
14  *
15  * The authors hereby grant permission to use, copy, modify, distribute,
16  * and license this software and its documentation for any purpose, provided
17  * that existing copyright notices are retained in all copies and that this
18  * notice is included verbatim in any distributions. No written agreement,
19  * license, or royalty fee is required for any of the authorized uses.
20  * Modifications to this software may be copyrighted by their authors
21  * and need not follow the licensing terms described here, provided that
22  * the new terms are clearly indicated on the first page of each file where
23  * they apply.
24  *
25  * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
26  * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
27  * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
28  * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  *
31  * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
32  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
33  * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
34  * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
35  * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
36  * MODIFICATIONS.
37  *
38  * GOVERNMENT USE: If you are acquiring this software on behalf of the
39  * U.S. government, the Government shall have only "Restricted Rights"
40  * in the software and related documentation as defined in the Federal
41  * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
42  * are acquiring the software on behalf of the Department of Defense, the
43  * software shall be classified as "Commercial Computer Software" and the
44  * Government shall have only "Restricted Rights" as defined in Clause
45  * 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
46  * authors grant the U.S. Government and others acting in its behalf
47  * permission to use and distribute the software in accordance with the
48  * terms specified in this license.
49  *
50  * src/backend/regex/regc_locale.c
51  */
52 
53 /* ASCII character-name table */
54 
55 static const struct cname
56 {
57  const char *name;
58  const char code;
59 } cnames[] =
60 
61 {
62  {
63  "NUL", '\0'
64  },
65  {
66  "SOH", '\001'
67  },
68  {
69  "STX", '\002'
70  },
71  {
72  "ETX", '\003'
73  },
74  {
75  "EOT", '\004'
76  },
77  {
78  "ENQ", '\005'
79  },
80  {
81  "ACK", '\006'
82  },
83  {
84  "BEL", '\007'
85  },
86  {
87  "alert", '\007'
88  },
89  {
90  "BS", '\010'
91  },
92  {
93  "backspace", '\b'
94  },
95  {
96  "HT", '\011'
97  },
98  {
99  "tab", '\t'
100  },
101  {
102  "LF", '\012'
103  },
104  {
105  "newline", '\n'
106  },
107  {
108  "VT", '\013'
109  },
110  {
111  "vertical-tab", '\v'
112  },
113  {
114  "FF", '\014'
115  },
116  {
117  "form-feed", '\f'
118  },
119  {
120  "CR", '\015'
121  },
122  {
123  "carriage-return", '\r'
124  },
125  {
126  "SO", '\016'
127  },
128  {
129  "SI", '\017'
130  },
131  {
132  "DLE", '\020'
133  },
134  {
135  "DC1", '\021'
136  },
137  {
138  "DC2", '\022'
139  },
140  {
141  "DC3", '\023'
142  },
143  {
144  "DC4", '\024'
145  },
146  {
147  "NAK", '\025'
148  },
149  {
150  "SYN", '\026'
151  },
152  {
153  "ETB", '\027'
154  },
155  {
156  "CAN", '\030'
157  },
158  {
159  "EM", '\031'
160  },
161  {
162  "SUB", '\032'
163  },
164  {
165  "ESC", '\033'
166  },
167  {
168  "IS4", '\034'
169  },
170  {
171  "FS", '\034'
172  },
173  {
174  "IS3", '\035'
175  },
176  {
177  "GS", '\035'
178  },
179  {
180  "IS2", '\036'
181  },
182  {
183  "RS", '\036'
184  },
185  {
186  "IS1", '\037'
187  },
188  {
189  "US", '\037'
190  },
191  {
192  "space", ' '
193  },
194  {
195  "exclamation-mark", '!'
196  },
197  {
198  "quotation-mark", '"'
199  },
200  {
201  "number-sign", '#'
202  },
203  {
204  "dollar-sign", '$'
205  },
206  {
207  "percent-sign", '%'
208  },
209  {
210  "ampersand", '&'
211  },
212  {
213  "apostrophe", '\''
214  },
215  {
216  "left-parenthesis", '('
217  },
218  {
219  "right-parenthesis", ')'
220  },
221  {
222  "asterisk", '*'
223  },
224  {
225  "plus-sign", '+'
226  },
227  {
228  "comma", ','
229  },
230  {
231  "hyphen", '-'
232  },
233  {
234  "hyphen-minus", '-'
235  },
236  {
237  "period", '.'
238  },
239  {
240  "full-stop", '.'
241  },
242  {
243  "slash", '/'
244  },
245  {
246  "solidus", '/'
247  },
248  {
249  "zero", '0'
250  },
251  {
252  "one", '1'
253  },
254  {
255  "two", '2'
256  },
257  {
258  "three", '3'
259  },
260  {
261  "four", '4'
262  },
263  {
264  "five", '5'
265  },
266  {
267  "six", '6'
268  },
269  {
270  "seven", '7'
271  },
272  {
273  "eight", '8'
274  },
275  {
276  "nine", '9'
277  },
278  {
279  "colon", ':'
280  },
281  {
282  "semicolon", ';'
283  },
284  {
285  "less-than-sign", '<'
286  },
287  {
288  "equals-sign", '='
289  },
290  {
291  "greater-than-sign", '>'
292  },
293  {
294  "question-mark", '?'
295  },
296  {
297  "commercial-at", '@'
298  },
299  {
300  "left-square-bracket", '['
301  },
302  {
303  "backslash", '\\'
304  },
305  {
306  "reverse-solidus", '\\'
307  },
308  {
309  "right-square-bracket", ']'
310  },
311  {
312  "circumflex", '^'
313  },
314  {
315  "circumflex-accent", '^'
316  },
317  {
318  "underscore", '_'
319  },
320  {
321  "low-line", '_'
322  },
323  {
324  "grave-accent", '`'
325  },
326  {
327  "left-brace", '{'
328  },
329  {
330  "left-curly-bracket", '{'
331  },
332  {
333  "vertical-line", '|'
334  },
335  {
336  "right-brace", '}'
337  },
338  {
339  "right-curly-bracket", '}'
340  },
341  {
342  "tilde", '~'
343  },
344  {
345  "DEL", '\177'
346  },
347  {
348  NULL, 0
349  }
350 };
351 
352 /*
353  * The following arrays define the valid character class names.
354  */
355 static const char *const classNames[NUM_CCLASSES + 1] = {
356  "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
357  "lower", "print", "punct", "space", "upper", "xdigit", NULL
358 };
359 
360 enum classes
361 {
364 };
365 
366 /*
367  * We do not use the hard-wired Unicode classification tables that Tcl does.
368  * This is because (a) we need to deal with other encodings besides Unicode,
369  * and (b) we want to track the behavior of the libc locale routines as
370  * closely as possible. For example, it wouldn't be unreasonable for a
371  * locale to not consider every Unicode letter as a letter. So we build
372  * character classification cvecs by asking libc, even for Unicode.
373  */
374 
375 
376 /*
377  * element - map collating-element name to chr
378  */
379 static chr
380 element(struct vars * v, /* context */
381  const chr *startp, /* points to start of name */
382  const chr *endp) /* points just past end of name */
383 {
384  const struct cname *cn;
385  size_t len;
386 
387  /* generic: one-chr names stand for themselves */
388  assert(startp < endp);
389  len = endp - startp;
390  if (len == 1)
391  return *startp;
392 
393  NOTE(REG_ULOCALE);
394 
395  /* search table */
396  for (cn = cnames; cn->name != NULL; cn++)
397  {
398  if (strlen(cn->name) == len &&
399  pg_char_and_wchar_strncmp(cn->name, startp, len) == 0)
400  {
401  break; /* NOTE BREAK OUT */
402  }
403  }
404  if (cn->name != NULL)
405  return CHR(cn->code);
406 
407  /* couldn't find it */
408  ERR(REG_ECOLLATE);
409  return 0;
410 }
411 
412 /*
413  * range - supply cvec for a range, including legality check
414  */
415 static struct cvec *
416 range(struct vars * v, /* context */
417  chr a, /* range start */
418  chr b, /* range end, might equal a */
419  int cases) /* case-independent? */
420 {
421  int nchrs;
422  struct cvec *cv;
423  chr c,
424  cc;
425 
426  if (a != b && !before(a, b))
427  {
428  ERR(REG_ERANGE);
429  return NULL;
430  }
431 
432  if (!cases)
433  { /* easy version */
434  cv = getcvec(v, 0, 1);
435  NOERRN();
436  addrange(cv, a, b);
437  return cv;
438  }
439 
440  /*
441  * When case-independent, it's hard to decide when cvec ranges are usable,
442  * so for now at least, we won't try. We use a range for the originally
443  * specified chrs and then add on any case-equivalents that are outside
444  * that range as individual chrs.
445  *
446  * To ensure sane behavior if someone specifies a very large range, limit
447  * the allocation size to 100000 chrs (arbitrary) and check for overrun
448  * inside the loop below.
449  */
450  nchrs = b - a + 1;
451  if (nchrs <= 0 || nchrs > 100000)
452  nchrs = 100000;
453 
454  cv = getcvec(v, nchrs, 1);
455  NOERRN();
456  addrange(cv, a, b);
457 
458  for (c = a; c <= b; c++)
459  {
460  cc = pg_wc_tolower(c);
461  if (cc != c &&
462  (before(cc, a) || before(b, cc)))
463  {
464  if (cv->nchrs >= cv->chrspace)
465  {
466  ERR(REG_ETOOBIG);
467  return NULL;
468  }
469  addchr(cv, cc);
470  }
471  cc = pg_wc_toupper(c);
472  if (cc != c &&
473  (before(cc, a) || before(b, cc)))
474  {
475  if (cv->nchrs >= cv->chrspace)
476  {
477  ERR(REG_ETOOBIG);
478  return NULL;
479  }
480  addchr(cv, cc);
481  }
482  if (CANCEL_REQUESTED(v->re))
483  {
484  ERR(REG_CANCEL);
485  return NULL;
486  }
487  }
488 
489  return cv;
490 }
491 
492 /*
493  * before - is chr x before chr y, for purposes of range legality?
494  */
495 static int /* predicate */
497 {
498  if (x < y)
499  return 1;
500  return 0;
501 }
502 
503 /*
504  * eclass - supply cvec for an equivalence class
505  * Must include case counterparts on request.
506  */
507 static struct cvec *
508 eclass(struct vars * v, /* context */
509  chr c, /* Collating element representing the
510  * equivalence class. */
511  int cases) /* all cases? */
512 {
513  struct cvec *cv;
514 
515  /* crude fake equivalence class for testing */
516  if ((v->cflags & REG_FAKE) && c == 'x')
517  {
518  cv = getcvec(v, 4, 0);
519  addchr(cv, CHR('x'));
520  addchr(cv, CHR('y'));
521  if (cases)
522  {
523  addchr(cv, CHR('X'));
524  addchr(cv, CHR('Y'));
525  }
526  return cv;
527  }
528 
529  /* otherwise, none */
530  if (cases)
531  return allcases(v, c);
532  cv = getcvec(v, 1, 0);
533  assert(cv != NULL);
534  addchr(cv, c);
535  return cv;
536 }
537 
538 /*
539  * cclass - supply cvec for a character class
540  *
541  * Must include case counterparts if "cases" is true.
542  *
543  * The returned cvec might be either a transient cvec gotten from getcvec(),
544  * or a permanently cached one from pg_ctype_get_cache(). This is okay
545  * because callers are not supposed to explicitly free the result either way.
546  */
547 static struct cvec *
548 cclass(struct vars * v, /* context */
549  const chr *startp, /* where the name starts */
550  const chr *endp, /* just past the end of the name */
551  int cases) /* case-independent? */
552 {
553  size_t len;
554  struct cvec *cv = NULL;
555  const char *const * namePtr;
556  int i,
557  index;
558 
559  /*
560  * Map the name to the corresponding enumerated value.
561  */
562  len = endp - startp;
563  index = -1;
564  for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
565  {
566  if (strlen(*namePtr) == len &&
567  pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0)
568  {
569  index = i;
570  break;
571  }
572  }
573  if (index == -1)
574  {
575  ERR(REG_ECTYPE);
576  return NULL;
577  }
578 
579  /*
580  * Remap lower and upper to alpha if the match is case insensitive.
581  */
582 
583  if (cases &&
584  ((enum classes) index == CC_LOWER ||
585  (enum classes) index == CC_UPPER))
586  index = (int) CC_ALPHA;
587 
588  /*
589  * Now compute the character class contents. For classes that are based
590  * on the behavior of a <wctype.h> or <ctype.h> function, we use
591  * pg_ctype_get_cache so that we can cache the results. Other classes
592  * have definitions that are hard-wired here, and for those we just
593  * construct a transient cvec on the fly.
594  *
595  * NB: keep this code in sync with cclass_column_index(), below.
596  */
597 
598  switch ((enum classes) index)
599  {
600  case CC_PRINT:
601  cv = pg_ctype_get_cache(pg_wc_isprint, index);
602  break;
603  case CC_ALNUM:
604  cv = pg_ctype_get_cache(pg_wc_isalnum, index);
605  break;
606  case CC_ALPHA:
607  cv = pg_ctype_get_cache(pg_wc_isalpha, index);
608  break;
609  case CC_ASCII:
610  /* hard-wired meaning */
611  cv = getcvec(v, 0, 1);
612  if (cv)
613  addrange(cv, 0, 0x7f);
614  break;
615  case CC_BLANK:
616  /* hard-wired meaning */
617  cv = getcvec(v, 2, 0);
618  addchr(cv, '\t');
619  addchr(cv, ' ');
620  break;
621  case CC_CNTRL:
622  /* hard-wired meaning */
623  cv = getcvec(v, 0, 2);
624  addrange(cv, 0x0, 0x1f);
625  addrange(cv, 0x7f, 0x9f);
626  break;
627  case CC_DIGIT:
628  cv = pg_ctype_get_cache(pg_wc_isdigit, index);
629  break;
630  case CC_PUNCT:
631  cv = pg_ctype_get_cache(pg_wc_ispunct, index);
632  break;
633  case CC_XDIGIT:
634 
635  /*
636  * It's not clear how to define this in non-western locales, and
637  * even less clear that there's any particular use in trying. So
638  * just hard-wire the meaning.
639  */
640  cv = getcvec(v, 0, 3);
641  if (cv)
642  {
643  addrange(cv, '0', '9');
644  addrange(cv, 'a', 'f');
645  addrange(cv, 'A', 'F');
646  }
647  break;
648  case CC_SPACE:
649  cv = pg_ctype_get_cache(pg_wc_isspace, index);
650  break;
651  case CC_LOWER:
652  cv = pg_ctype_get_cache(pg_wc_islower, index);
653  break;
654  case CC_UPPER:
655  cv = pg_ctype_get_cache(pg_wc_isupper, index);
656  break;
657  case CC_GRAPH:
658  cv = pg_ctype_get_cache(pg_wc_isgraph, index);
659  break;
660  }
661 
662  /* If cv is NULL now, the reason must be "out of memory" */
663  if (cv == NULL)
664  ERR(REG_ESPACE);
665  return cv;
666 }
667 
668 /*
669  * cclass_column_index - get appropriate high colormap column index for chr
670  */
671 static int
673 {
674  int colnum = 0;
675 
676  /* Shouldn't go through all these pushups for simple chrs */
677  assert(c > MAX_SIMPLE_CHR);
678 
679  /*
680  * Note: we should not see requests to consider cclasses that are not
681  * treated as locale-specific by cclass(), above.
682  */
683  if (cm->classbits[CC_PRINT] && pg_wc_isprint(c))
684  colnum |= cm->classbits[CC_PRINT];
685  if (cm->classbits[CC_ALNUM] && pg_wc_isalnum(c))
686  colnum |= cm->classbits[CC_ALNUM];
687  if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c))
688  colnum |= cm->classbits[CC_ALPHA];
689  assert(cm->classbits[CC_ASCII] == 0);
690  assert(cm->classbits[CC_BLANK] == 0);
691  assert(cm->classbits[CC_CNTRL] == 0);
692  if (cm->classbits[CC_DIGIT] && pg_wc_isdigit(c))
693  colnum |= cm->classbits[CC_DIGIT];
694  if (cm->classbits[CC_PUNCT] && pg_wc_ispunct(c))
695  colnum |= cm->classbits[CC_PUNCT];
696  assert(cm->classbits[CC_XDIGIT] == 0);
697  if (cm->classbits[CC_SPACE] && pg_wc_isspace(c))
698  colnum |= cm->classbits[CC_SPACE];
699  if (cm->classbits[CC_LOWER] && pg_wc_islower(c))
700  colnum |= cm->classbits[CC_LOWER];
701  if (cm->classbits[CC_UPPER] && pg_wc_isupper(c))
702  colnum |= cm->classbits[CC_UPPER];
703  if (cm->classbits[CC_GRAPH] && pg_wc_isgraph(c))
704  colnum |= cm->classbits[CC_GRAPH];
705 
706  return colnum;
707 }
708 
709 /*
710  * allcases - supply cvec for all case counterparts of a chr (including itself)
711  *
712  * This is a shortcut, preferably an efficient one, for simple characters;
713  * messy cases are done via range().
714  */
715 static struct cvec *
716 allcases(struct vars * v, /* context */
717  chr c) /* character to get case equivs of */
718 {
719  struct cvec *cv;
720  chr lc,
721  uc;
722 
723  lc = pg_wc_tolower(c);
724  uc = pg_wc_toupper(c);
725 
726  cv = getcvec(v, 2, 0);
727  addchr(cv, lc);
728  if (lc != uc)
729  addchr(cv, uc);
730  return cv;
731 }
732 
733 /*
734  * cmp - chr-substring compare
735  *
736  * Backrefs need this. It should preferably be efficient.
737  * Note that it does not need to report anything except equal/unequal.
738  * Note also that the length is exact, and the comparison should not
739  * stop at embedded NULs!
740  */
741 static int /* 0 for equal, nonzero for unequal */
742 cmp(const chr *x, const chr *y, /* strings to compare */
743  size_t len) /* exact length of comparison */
744 {
745  return memcmp(VS(x), VS(y), len * sizeof(chr));
746 }
747 
748 /*
749  * casecmp - case-independent chr-substring compare
750  *
751  * REG_ICASE backrefs need this. It should preferably be efficient.
752  * Note that it does not need to report anything except equal/unequal.
753  * Note also that the length is exact, and the comparison should not
754  * stop at embedded NULs!
755  */
756 static int /* 0 for equal, nonzero for unequal */
757 casecmp(const chr *x, const chr *y, /* strings to compare */
758  size_t len) /* exact length of comparison */
759 {
760  for (; len > 0; len--, x++, y++)
761  {
762  if ((*x != *y) && (pg_wc_tolower(*x) != pg_wc_tolower(*y)))
763  return 1;
764  }
765  return 0;
766 }
static int pg_wc_ispunct(pg_wchar c)
#define ERR
Definition: _int.h:146
classes
Definition: regc_locale.c:360
#define NOERRN()
Definition: regcomp.c:267
#define REG_ESPACE
Definition: regex.h:149
static void addrange(struct cvec *cv, chr from, chr to)
Definition: regc_cvec.c:90
int classbits[NUM_CCLASSES]
Definition: regguts.h:221
const char * name
Definition: regc_locale.c:57
static pg_wchar pg_wc_toupper(pg_wchar c)
#define REG_ULOCALE
Definition: regex.h:70
int chrspace
Definition: regguts.h:259
static struct cvec * eclass(struct vars *v, chr c, int cases)
Definition: regc_locale.c:508
Definition: type.h:90
#define NUM_CCLASSES
Definition: regguts.h:197
static int before(chr x, chr y)
Definition: regc_locale.c:496
pg_wchar chr
Definition: regcustom.h:68
static int pg_wc_isalpha(pg_wchar c)
const char code
Definition: regc_locale.c:58
int nchrs
Definition: regguts.h:258
regex_t * re
Definition: regcomp.c:226
static int pg_wc_isspace(pg_wchar c)
char * c
int pg_char_and_wchar_strncmp(const char *s1, const pg_wchar *s2, size_t n)
Definition: wstrncmp.c:55
#define assert(TEST)
Definition: imath.c:37
#define REG_FAKE
Definition: regex.h:116
static struct cvec * pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
static struct cvec * range(struct vars *v, chr a, chr b, int cases)
Definition: regc_locale.c:416
#define REG_ECOLLATE
Definition: regex.h:140
static int pg_wc_islower(pg_wchar c)
Definition: regguts.h:256
static pg_wchar pg_wc_tolower(pg_wchar c)
static int pg_wc_isalnum(pg_wchar c)
static chr element(struct vars *v, const chr *startp, const chr *endp)
Definition: regc_locale.c:380
static const char *const classNames[NUM_CCLASSES+1]
Definition: regc_locale.c:355
static struct cvec * allcases(struct vars *v, chr c)
Definition: regc_locale.c:716
static int pg_wc_isdigit(pg_wchar c)
#define REG_ECTYPE
Definition: regex.h:141
#define NULL
Definition: c.h:229
static struct cvec * cclass(struct vars *v, const chr *startp, const chr *endp, int cases)
Definition: regc_locale.c:548
#define REG_ERANGE
Definition: regex.h:148
#define REG_ETOOBIG
Definition: regex.h:155
#define VS(x)
Definition: regguts.h:61
#define NOTE(b)
Definition: regcomp.c:270
static const struct cname cnames[]
static int pg_wc_isprint(pg_wchar c)
int cflags
Definition: regcomp.c:232
#define CANCEL_REQUESTED(re)
Definition: regguts.h:452
int i
static int cclass_column_index(struct colormap *cm, chr c)
Definition: regc_locale.c:672
#define CHR(c)
Definition: regcustom.h:71
static void addchr(struct cvec *cv, chr c)
Definition: regc_cvec.c:79
Definition: regcomp.c:224
static struct cvec * getcvec(struct vars *v, int nchrs, int nranges)
Definition: regc_cvec.c:112
static int pg_wc_isupper(pg_wchar c)
static int pg_wc_isgraph(pg_wchar c)
static int casecmp(const chr *x, const chr *y, size_t len)
Definition: regc_locale.c:757
static int cmp(const chr *x, const chr *y, size_t len)
Definition: regc_locale.c:742
#define MAX_SIMPLE_CHR
Definition: regcustom.h:97
#define REG_CANCEL
Definition: regex.h:157