PostgreSQL Source Code git master
regc_locale.c
Go to the documentation of this file.
1/*
2 * regc_locale.c --
3 *
4 * This file contains locale-specific regexp routines.
5 * This file is #included by regcomp.c.
6 *
7 * Copyright (c) 1998 by Scriptics Corporation.
8 *
9 * This software is copyrighted by the Regents of the University of
10 * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
11 * Corporation and other parties. The following terms apply to all files
12 * associated with the software unless explicitly disclaimed in
13 * individual files.
14 *
15 * The authors hereby grant permission to use, copy, modify, distribute,
16 * and license this software and its documentation for any purpose, provided
17 * that existing copyright notices are retained in all copies and that this
18 * notice is included verbatim in any distributions. No written agreement,
19 * license, or royalty fee is required for any of the authorized uses.
20 * Modifications to this software may be copyrighted by their authors
21 * and need not follow the licensing terms described here, provided that
22 * the new terms are clearly indicated on the first page of each file where
23 * they apply.
24 *
25 * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
26 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
27 * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
28 * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 *
31 * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
32 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
33 * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
34 * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
35 * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
36 * MODIFICATIONS.
37 *
38 * GOVERNMENT USE: If you are acquiring this software on behalf of the
39 * U.S. government, the Government shall have only "Restricted Rights"
40 * in the software and related documentation as defined in the Federal
41 * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
42 * are acquiring the software on behalf of the Department of Defense, the
43 * software shall be classified as "Commercial Computer Software" and the
44 * Government shall have only "Restricted Rights" as defined in Clause
45 * 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
46 * authors grant the U.S. Government and others acting in its behalf
47 * permission to use and distribute the software in accordance with the
48 * terms specified in this license.
49 *
50 * src/backend/regex/regc_locale.c
51 */
52
/*
 * ASCII character-name table.
 *
 * Each entry pairs a symbolic name with the ASCII character it denotes.
 * Several names may denote the same character (e.g. "BEL" and "alert").
 * element() scans this table front to back, so the list is terminated by
 * an entry whose name is NULL.
 */
static const struct cname
{
    const char *name;           /* symbolic name, NULL in the terminator */
    const char  code;           /* ASCII character the name stands for */
} cnames[] = {
    /* control characters */
    {"NUL", '\0'},
    {"SOH", '\001'},
    {"STX", '\002'},
    {"ETX", '\003'},
    {"EOT", '\004'},
    {"ENQ", '\005'},
    {"ACK", '\006'},
    {"BEL", '\007'},
    {"alert", '\007'},
    {"BS", '\010'},
    {"backspace", '\b'},
    {"HT", '\011'},
    {"tab", '\t'},
    {"LF", '\012'},
    {"newline", '\n'},
    {"VT", '\013'},
    {"vertical-tab", '\v'},
    {"FF", '\014'},
    {"form-feed", '\f'},
    {"CR", '\015'},
    {"carriage-return", '\r'},
    {"SO", '\016'},
    {"SI", '\017'},
    {"DLE", '\020'},
    {"DC1", '\021'},
    {"DC2", '\022'},
    {"DC3", '\023'},
    {"DC4", '\024'},
    {"NAK", '\025'},
    {"SYN", '\026'},
    {"ETB", '\027'},
    {"CAN", '\030'},
    {"EM", '\031'},
    {"SUB", '\032'},
    {"ESC", '\033'},
    {"IS4", '\034'},
    {"FS", '\034'},
    {"IS3", '\035'},
    {"GS", '\035'},
    {"IS2", '\036'},
    {"RS", '\036'},
    {"IS1", '\037'},
    {"US", '\037'},

    /* printable characters */
    {"space", ' '},
    {"exclamation-mark", '!'},
    {"quotation-mark", '"'},
    {"number-sign", '#'},
    {"dollar-sign", '$'},
    {"percent-sign", '%'},
    {"ampersand", '&'},
    {"apostrophe", '\''},
    {"left-parenthesis", '('},
    {"right-parenthesis", ')'},
    {"asterisk", '*'},
    {"plus-sign", '+'},
    {"comma", ','},
    {"hyphen", '-'},
    {"hyphen-minus", '-'},
    {"period", '.'},
    {"full-stop", '.'},
    {"slash", '/'},
    {"solidus", '/'},
    {"zero", '0'},
    {"one", '1'},
    {"two", '2'},
    {"three", '3'},
    {"four", '4'},
    {"five", '5'},
    {"six", '6'},
    {"seven", '7'},
    {"eight", '8'},
    {"nine", '9'},
    {"colon", ':'},
    {"semicolon", ';'},
    {"less-than-sign", '<'},
    {"equals-sign", '='},
    {"greater-than-sign", '>'},
    {"question-mark", '?'},
    {"commercial-at", '@'},
    {"left-square-bracket", '['},
    {"backslash", '\\'},
    {"reverse-solidus", '\\'},
    {"right-square-bracket", ']'},
    {"circumflex", '^'},
    {"circumflex-accent", '^'},
    {"underscore", '_'},
    {"low-line", '_'},
    {"grave-accent", '`'},
    {"left-brace", '{'},
    {"left-curly-bracket", '{'},
    {"vertical-line", '|'},
    {"right-brace", '}'},
    {"right-curly-bracket", '}'},
    {"tilde", '~'},
    {"DEL", '\177'},

    /* end-of-table marker */
    {NULL, 0}
};
351
352/*
353 * The following array defines the valid character class names.
354 * The entries must match enum char_classes in regguts.h.
355 */
356static const char *const classNames[NUM_CCLASSES + 1] = {
357 "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
358 "lower", "print", "punct", "space", "upper", "xdigit", "word",
359 NULL
360};
361
362/*
363 * We do not use the hard-wired Unicode classification tables that Tcl does.
364 * This is because (a) we need to deal with other encodings besides Unicode,
365 * and (b) we want to track the behavior of the libc locale routines as
366 * closely as possible. For example, it wouldn't be unreasonable for a
367 * locale to not consider every Unicode letter as a letter. So we build
368 * character classification cvecs by asking libc, even for Unicode.
369 */
370
371
372/*
373 * element - map collating-element name to chr
374 */
375static chr
376element(struct vars *v, /* context */
377 const chr *startp, /* points to start of name */
378 const chr *endp) /* points just past end of name */
379{
380 const struct cname *cn;
381 size_t len;
382
383 /* generic: one-chr names stand for themselves */
384 assert(startp < endp);
385 len = endp - startp;
386 if (len == 1)
387 return *startp;
388
390
391 /* search table */
392 for (cn = cnames; cn->name != NULL; cn++)
393 {
394 if (strlen(cn->name) == len &&
395 pg_char_and_wchar_strncmp(cn->name, startp, len) == 0)
396 {
397 break; /* NOTE BREAK OUT */
398 }
399 }
400 if (cn->name != NULL)
401 return CHR(cn->code);
402
403 /* couldn't find it */
405 return 0;
406}
407
408/*
409 * range - supply cvec for a range, including legality check
410 */
411static struct cvec *
412range(struct vars *v, /* context */
413 chr a, /* range start */
414 chr b, /* range end, might equal a */
415 int cases) /* case-independent? */
416{
417 int nchrs;
418 struct cvec *cv;
419 chr c,
420 cc;
421
422 if (a != b && !before(a, b))
423 {
425 return NULL;
426 }
427
428 if (!cases)
429 { /* easy version */
430 cv = getcvec(v, 0, 1);
431 NOERRN();
432 addrange(cv, a, b);
433 return cv;
434 }
435
436 /*
437 * When case-independent, it's hard to decide when cvec ranges are usable,
438 * so for now at least, we won't try. We use a range for the originally
439 * specified chrs and then add on any case-equivalents that are outside
440 * that range as individual chrs.
441 *
442 * To ensure sane behavior if someone specifies a very large range, limit
443 * the allocation size to 100000 chrs (arbitrary) and check for overrun
444 * inside the loop below.
445 */
446 nchrs = b - a + 1;
447 if (nchrs <= 0 || nchrs > 100000)
448 nchrs = 100000;
449
450 cv = getcvec(v, nchrs, 1);
451 NOERRN();
452 addrange(cv, a, b);
453
454 for (c = a; c <= b; c++)
455 {
456 cc = pg_wc_tolower(c);
457 if (cc != c &&
458 (before(cc, a) || before(b, cc)))
459 {
460 if (cv->nchrs >= cv->chrspace)
461 {
463 return NULL;
464 }
465 addchr(cv, cc);
466 }
467 cc = pg_wc_toupper(c);
468 if (cc != c &&
469 (before(cc, a) || before(b, cc)))
470 {
471 if (cv->nchrs >= cv->chrspace)
472 {
474 return NULL;
475 }
476 addchr(cv, cc);
477 }
478 INTERRUPT(v->re);
479 }
480
481 return cv;
482}
483
484/*
485 * before - is chr x before chr y, for purposes of range legality?
486 */
487static int /* predicate */
489{
490 if (x < y)
491 return 1;
492 return 0;
493}
494
495/*
496 * eclass - supply cvec for an equivalence class
497 * Must include case counterparts on request.
498 */
499static struct cvec *
500eclass(struct vars *v, /* context */
501 chr c, /* Collating element representing the
502 * equivalence class. */
503 int cases) /* all cases? */
504{
505 struct cvec *cv;
506
507 /* crude fake equivalence class for testing */
508 if ((v->cflags & REG_FAKE) && c == 'x')
509 {
510 cv = getcvec(v, 4, 0);
511 addchr(cv, CHR('x'));
512 addchr(cv, CHR('y'));
513 if (cases)
514 {
515 addchr(cv, CHR('X'));
516 addchr(cv, CHR('Y'));
517 }
518 return cv;
519 }
520
521 /* otherwise, none */
522 if (cases)
523 return allcases(v, c);
524 cv = getcvec(v, 1, 0);
525 assert(cv != NULL);
526 addchr(cv, c);
527 return cv;
528}
529
530/*
531 * lookupcclass - lookup a character class identified by name
532 *
533 * On failure, sets an error code in *v; the result is then garbage.
534 */
535static enum char_classes
536lookupcclass(struct vars *v, /* context (for returning errors) */
537 const chr *startp, /* where the name starts */
538 const chr *endp) /* just past the end of the name */
539{
540 size_t len;
541 const char *const *namePtr;
542 int i;
543
544 /*
545 * Map the name to the corresponding enumerated value.
546 */
547 len = endp - startp;
548 for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
549 {
550 if (strlen(*namePtr) == len &&
551 pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0)
552 return (enum char_classes) i;
553 }
554
556 return (enum char_classes) 0;
557}
558
559/*
560 * cclasscvec - supply cvec for a character class
561 *
562 * Must include case counterparts if "cases" is true.
563 *
564 * The returned cvec might be either a transient cvec gotten from getcvec(),
565 * or a permanently cached one from pg_ctype_get_cache(). This is okay
566 * because callers are not supposed to explicitly free the result either way.
567 */
568static struct cvec *
569cclasscvec(struct vars *v, /* context */
570 enum char_classes cclasscode, /* class to build a cvec for */
571 int cases) /* case-independent? */
572{
573 struct cvec *cv = NULL;
574
575 /*
576 * Remap lower and upper to alpha if the match is case insensitive.
577 */
578
579 if (cases &&
580 (cclasscode == CC_LOWER ||
583
584 /*
585 * Now compute the character class contents. For classes that are based
586 * on the behavior of a <wctype.h> or <ctype.h> function, we use
587 * pg_ctype_get_cache so that we can cache the results. Other classes
588 * have definitions that are hard-wired here, and for those we just
589 * construct a transient cvec on the fly.
590 *
591 * NB: keep this code in sync with cclass_column_index(), below.
592 */
593
594 switch (cclasscode)
595 {
596 case CC_PRINT:
598 break;
599 case CC_ALNUM:
601 break;
602 case CC_ALPHA:
604 break;
605 case CC_WORD:
607 break;
608 case CC_ASCII:
609 /* hard-wired meaning */
610 cv = getcvec(v, 0, 1);
611 if (cv)
612 addrange(cv, 0, 0x7f);
613 break;
614 case CC_BLANK:
615 /* hard-wired meaning */
616 cv = getcvec(v, 2, 0);
617 addchr(cv, '\t');
618 addchr(cv, ' ');
619 break;
620 case CC_CNTRL:
621 /* hard-wired meaning */
622 cv = getcvec(v, 0, 2);
623 addrange(cv, 0x0, 0x1f);
624 addrange(cv, 0x7f, 0x9f);
625 break;
626 case CC_DIGIT:
628 break;
629 case CC_PUNCT:
631 break;
632 case CC_XDIGIT:
633
634 /*
635 * It's not clear how to define this in non-western locales, and
636 * even less clear that there's any particular use in trying. So
637 * just hard-wire the meaning.
638 */
639 cv = getcvec(v, 0, 3);
640 if (cv)
641 {
642 addrange(cv, '0', '9');
643 addrange(cv, 'a', 'f');
644 addrange(cv, 'A', 'F');
645 }
646 break;
647 case CC_SPACE:
649 break;
650 case CC_LOWER:
652 break;
653 case CC_UPPER:
655 break;
656 case CC_GRAPH:
658 break;
659 }
660
661 /* If cv is NULL now, the reason must be "out of memory" */
662 if (cv == NULL)
664 return cv;
665}
666
667/*
668 * cclass_column_index - get appropriate high colormap column index for chr
669 */
670static int
672{
673 int colnum = 0;
674
675 /* Shouldn't go through all these pushups for simple chrs */
677
678 /*
679 * Note: we should not see requests to consider cclasses that are not
680 * treated as locale-specific by cclasscvec(), above.
681 */
682 if (cm->classbits[CC_PRINT] && pg_wc_isprint(c))
683 colnum |= cm->classbits[CC_PRINT];
684 if (cm->classbits[CC_ALNUM] && pg_wc_isalnum(c))
685 colnum |= cm->classbits[CC_ALNUM];
686 if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c))
687 colnum |= cm->classbits[CC_ALPHA];
688 if (cm->classbits[CC_WORD] && pg_wc_isword(c))
689 colnum |= cm->classbits[CC_WORD];
690 assert(cm->classbits[CC_ASCII] == 0);
691 assert(cm->classbits[CC_BLANK] == 0);
692 assert(cm->classbits[CC_CNTRL] == 0);
693 if (cm->classbits[CC_DIGIT] && pg_wc_isdigit(c))
694 colnum |= cm->classbits[CC_DIGIT];
695 if (cm->classbits[CC_PUNCT] && pg_wc_ispunct(c))
696 colnum |= cm->classbits[CC_PUNCT];
697 assert(cm->classbits[CC_XDIGIT] == 0);
698 if (cm->classbits[CC_SPACE] && pg_wc_isspace(c))
699 colnum |= cm->classbits[CC_SPACE];
700 if (cm->classbits[CC_LOWER] && pg_wc_islower(c))
701 colnum |= cm->classbits[CC_LOWER];
702 if (cm->classbits[CC_UPPER] && pg_wc_isupper(c))
703 colnum |= cm->classbits[CC_UPPER];
704 if (cm->classbits[CC_GRAPH] && pg_wc_isgraph(c))
705 colnum |= cm->classbits[CC_GRAPH];
706
707 return colnum;
708}
709
710/*
711 * allcases - supply cvec for all case counterparts of a chr (including itself)
712 *
713 * This is a shortcut, preferably an efficient one, for simple characters;
714 * messy cases are done via range().
715 */
716static struct cvec *
717allcases(struct vars *v, /* context */
718 chr c) /* character to get case equivs of */
719{
720 struct cvec *cv;
721 chr lc,
722 uc;
723
724 lc = pg_wc_tolower(c);
725 uc = pg_wc_toupper(c);
726
727 cv = getcvec(v, 2, 0);
728 addchr(cv, lc);
729 if (lc != uc)
730 addchr(cv, uc);
731 return cv;
732}
733
734/*
735 * cmp - chr-substring compare
736 *
737 * Backrefs need this. It should preferably be efficient.
738 * Note that it does not need to report anything except equal/unequal.
739 * Note also that the length is exact, and the comparison should not
740 * stop at embedded NULs!
741 */
742static int /* 0 for equal, nonzero for unequal */
743cmp(const chr *x, const chr *y, /* strings to compare */
744 size_t len) /* exact length of comparison */
745{
746 return memcmp(VS(x), VS(y), len * sizeof(chr));
747}
748
749/*
750 * casecmp - case-independent chr-substring compare
751 *
752 * REG_ICASE backrefs need this. It should preferably be efficient.
753 * Note that it does not need to report anything except equal/unequal.
754 * Note also that the length is exact, and the comparison should not
755 * stop at embedded NULs!
756 */
757static int /* 0 for equal, nonzero for unequal */
758casecmp(const chr *x, const chr *y, /* strings to compare */
759 size_t len) /* exact length of comparison */
760{
761 for (; len > 0; len--, x++, y++)
762 {
763 if ((*x != *y) && (pg_wc_tolower(*x) != pg_wc_tolower(*y)))
764 return 1;
765 }
766 return 0;
767}
#define ERR
Definition: _int.h:161
int y
Definition: isn.c:73
int b
Definition: isn.c:71
int x
Definition: isn.c:72
int a
Definition: isn.c:70
int i
Definition: isn.c:74
const void size_t len
@ NOTE
Definition: pg_regress.c:88
char * c
static void addchr(struct cvec *cv, chr c)
Definition: regc_cvec.c:79
static void addrange(struct cvec *cv, chr from, chr to)
Definition: regc_cvec.c:90
static struct cvec * getcvec(struct vars *v, int nchrs, int nranges)
Definition: regc_cvec.c:112
static int casecmp(const chr *x, const chr *y, size_t len)
Definition: regc_locale.c:758
static enum char_classes lookupcclass(struct vars *v, const chr *startp, const chr *endp)
Definition: regc_locale.c:536
static struct cvec * eclass(struct vars *v, chr c, int cases)
Definition: regc_locale.c:500
static int cmp(const chr *x, const chr *y, size_t len)
Definition: regc_locale.c:743
static struct cvec * cclasscvec(struct vars *v, enum char_classes cclasscode, int cases)
Definition: regc_locale.c:569
static const char *const classNames[NUM_CCLASSES+1]
Definition: regc_locale.c:356
static struct cvec * range(struct vars *v, chr a, chr b, int cases)
Definition: regc_locale.c:412
static chr element(struct vars *v, const chr *startp, const chr *endp)
Definition: regc_locale.c:376
static struct cvec * allcases(struct vars *v, chr c)
Definition: regc_locale.c:717
static const struct cname cnames[]
static int before(chr x, chr y)
Definition: regc_locale.c:488
static int cclass_column_index(struct colormap *cm, chr c)
Definition: regc_locale.c:671
static int pg_wc_islower(pg_wchar c)
static int pg_wc_isword(pg_wchar c)
static int pg_wc_isspace(pg_wchar c)
static pg_wchar pg_wc_tolower(pg_wchar c)
static int pg_wc_ispunct(pg_wchar c)
static int pg_wc_isgraph(pg_wchar c)
static pg_wchar pg_wc_toupper(pg_wchar c)
static int pg_wc_isprint(pg_wchar c)
static int pg_wc_isalnum(pg_wchar c)
static int pg_wc_isdigit(pg_wchar c)
static struct cvec * pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
static int pg_wc_isupper(pg_wchar c)
static int pg_wc_isalpha(pg_wchar c)
#define NOERRN()
Definition: regcomp.c:322
#define INTERRUPT(re)
Definition: regcustom.h:55
#define MAX_SIMPLE_CHR
Definition: regcustom.h:87
pg_wchar chr
Definition: regcustom.h:59
#define CHR(c)
Definition: regcustom.h:62
#define assert(x)
Definition: regcustom.h:56
#define REG_ECTYPE
Definition: regex.h:219
#define REG_ETOOBIG
Definition: regex.h:233
#define REG_ERANGE
Definition: regex.h:226
#define REG_ULOCALE
Definition: regex.h:148
#define REG_ECOLLATE
Definition: regex.h:218
#define REG_FAKE
Definition: regex.h:194
#define REG_ESPACE
Definition: regex.h:227
#define NUM_CCLASSES
Definition: regguts.h:144
char_classes
Definition: regguts.h:139
@ CC_UPPER
Definition: regguts.h:141
@ CC_WORD
Definition: regguts.h:141
@ CC_LOWER
Definition: regguts.h:141
@ CC_ASCII
Definition: regguts.h:140
@ CC_ALNUM
Definition: regguts.h:140
@ CC_XDIGIT
Definition: regguts.h:141
@ CC_PRINT
Definition: regguts.h:141
@ CC_BLANK
Definition: regguts.h:140
@ CC_GRAPH
Definition: regguts.h:140
@ CC_CNTRL
Definition: regguts.h:140
@ CC_SPACE
Definition: regguts.h:141
@ CC_DIGIT
Definition: regguts.h:140
@ CC_ALPHA
Definition: regguts.h:140
@ CC_PUNCT
Definition: regguts.h:141
#define VS(x)
Definition: regguts.h:61
const char * name
Definition: regc_locale.c:57
const char code
Definition: regc_locale.c:58
int classbits[NUM_CCLASSES]
Definition: regguts.h:243
Definition: regguts.h:279
int chrspace
Definition: regguts.h:281
int nchrs
Definition: regguts.h:280
int cclasscode
Definition: regguts.h:286
Definition: regcomp.c:282
int cflags
Definition: regcomp.c:287
regex_t * re
Definition: regcomp.c:283
int pg_char_and_wchar_strncmp(const char *s1, const pg_wchar *s2, size_t n)
Definition: wstrncmp.c:55