PostgreSQL Source Code  git master
unicode_category.h
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * unicode_category.h
4  * Routines for determining the category of Unicode characters.
5  *
6  * These definitions can be used by both frontend and backend code.
7  *
8  * Copyright (c) 2017-2024, PostgreSQL Global Development Group
9  *
10  * src/include/common/unicode_category.h
11  *
12  *-------------------------------------------------------------------------
13  */
14 #ifndef UNICODE_CATEGORY_H
15 #define UNICODE_CATEGORY_H
16 
17 #include "mb/pg_wchar.h"
18 
/*
 * Unicode General Category Values
 *
 * See: https://www.unicode.org/reports/tr44/#General_Category_Values
 *
 * The Unicode stability policy guarantees: "The enumeration of
 * General_Category property values is fixed. No new values will be
 * added". See: https://www.unicode.org/policies/stability_policy.html
 *
 * Numeric values chosen to match corresponding ICU UCharCategory.
 *
 * The values form a dense range 0..29; the comment after each member is
 * its two-letter General_Category abbreviation.
 */
typedef enum pg_unicode_category
{
	PG_U_UNASSIGNED = 0,		/* Cn */
	PG_U_UPPERCASE_LETTER = 1,	/* Lu */
	PG_U_LOWERCASE_LETTER = 2,	/* Ll */
	PG_U_TITLECASE_LETTER = 3,	/* Lt */
	PG_U_MODIFIER_LETTER = 4,	/* Lm */
	PG_U_OTHER_LETTER = 5,		/* Lo */
	PG_U_NONSPACING_MARK = 6,	/* Mn */
	PG_U_ENCLOSING_MARK = 7,	/* Me */
	PG_U_SPACING_MARK = 8,		/* Mc */
	PG_U_DECIMAL_NUMBER = 9,	/* Nd */
	PG_U_LETTER_NUMBER = 10,	/* Nl */
	PG_U_OTHER_NUMBER = 11,		/* No */
	PG_U_SPACE_SEPARATOR = 12,	/* Zs */
	PG_U_LINE_SEPARATOR = 13,	/* Zl */
	PG_U_PARAGRAPH_SEPARATOR = 14,	/* Zp */
	PG_U_CONTROL = 15,			/* Cc */
	PG_U_FORMAT = 16,			/* Cf */
	PG_U_PRIVATE_USE = 17,		/* Co */
	PG_U_SURROGATE = 18,		/* Cs */
	PG_U_DASH_PUNCTUATION = 19, /* Pd */
	PG_U_OPEN_PUNCTUATION = 20, /* Ps */
	PG_U_CLOSE_PUNCTUATION = 21,	/* Pe */
	PG_U_CONNECTOR_PUNCTUATION = 22,	/* Pc */
	PG_U_OTHER_PUNCTUATION = 23,	/* Po */
	PG_U_MATH_SYMBOL = 24,		/* Sm */
	PG_U_CURRENCY_SYMBOL = 25,	/* Sc */
	PG_U_MODIFIER_SYMBOL = 26,	/* Sk */
	PG_U_OTHER_SYMBOL = 27,		/* So */
	PG_U_INITIAL_PUNCTUATION = 28,	/* Pi */
	PG_U_FINAL_PUNCTUATION = 29 /* Pf */
} pg_unicode_category;
63 
65 extern const char *unicode_category_string(pg_unicode_category category);
66 extern const char *unicode_category_abbrev(pg_unicode_category category);
67 
68 extern bool pg_u_prop_alphabetic(pg_wchar c);
69 extern bool pg_u_prop_lowercase(pg_wchar c);
70 extern bool pg_u_prop_uppercase(pg_wchar c);
71 extern bool pg_u_prop_cased(pg_wchar c);
73 extern bool pg_u_prop_white_space(pg_wchar c);
74 extern bool pg_u_prop_hex_digit(pg_wchar c);
75 extern bool pg_u_prop_join_control(pg_wchar c);
76 
77 extern bool pg_u_isdigit(pg_wchar c, bool posix);
78 extern bool pg_u_isalpha(pg_wchar c);
79 extern bool pg_u_isalnum(pg_wchar c, bool posix);
80 extern bool pg_u_isword(pg_wchar c);
81 extern bool pg_u_isupper(pg_wchar c);
82 extern bool pg_u_islower(pg_wchar c);
83 extern bool pg_u_isblank(pg_wchar c);
84 extern bool pg_u_iscntrl(pg_wchar c);
85 extern bool pg_u_isgraph(pg_wchar c);
86 extern bool pg_u_isprint(pg_wchar c);
87 extern bool pg_u_ispunct(pg_wchar c, bool posix);
88 extern bool pg_u_isspace(pg_wchar c);
89 extern bool pg_u_isxdigit(pg_wchar c, bool posix);
90 
91 #endif /* UNICODE_CATEGORY_H */
unsigned int pg_wchar
Definition: mbprint.c:31
char * c
const char * unicode_category_string(pg_unicode_category category)
bool pg_u_prop_alphabetic(pg_wchar c)
bool pg_u_isalnum(pg_wchar c, bool posix)
pg_unicode_category
@ PG_U_CONNECTOR_PUNCTUATION
@ PG_U_OTHER_SYMBOL
@ PG_U_DASH_PUNCTUATION
@ PG_U_UPPERCASE_LETTER
@ PG_U_DECIMAL_NUMBER
@ PG_U_CLOSE_PUNCTUATION
@ PG_U_NONSPACING_MARK
@ PG_U_INITIAL_PUNCTUATION
@ PG_U_CURRENCY_SYMBOL
@ PG_U_LETTER_NUMBER
@ PG_U_MODIFIER_SYMBOL
@ PG_U_SPACE_SEPARATOR
@ PG_U_OPEN_PUNCTUATION
@ PG_U_FORMAT
@ PG_U_PRIVATE_USE
@ PG_U_OTHER_LETTER
@ PG_U_PARAGRAPH_SEPARATOR
@ PG_U_CONTROL
@ PG_U_SPACING_MARK
@ PG_U_TITLECASE_LETTER
@ PG_U_OTHER_NUMBER
@ PG_U_MATH_SYMBOL
@ PG_U_LOWERCASE_LETTER
@ PG_U_LINE_SEPARATOR
@ PG_U_UNASSIGNED
@ PG_U_SURROGATE
@ PG_U_FINAL_PUNCTUATION
@ PG_U_MODIFIER_LETTER
@ PG_U_OTHER_PUNCTUATION
@ PG_U_ENCLOSING_MARK
bool pg_u_iscntrl(pg_wchar c)
bool pg_u_ispunct(pg_wchar c, bool posix)
bool pg_u_prop_cased(pg_wchar c)
bool pg_u_isprint(pg_wchar c)
bool pg_u_prop_uppercase(pg_wchar c)
bool pg_u_isblank(pg_wchar c)
bool pg_u_prop_white_space(pg_wchar c)
bool pg_u_isword(pg_wchar c)
bool pg_u_isspace(pg_wchar c)
bool pg_u_isupper(pg_wchar c)
bool pg_u_prop_join_control(pg_wchar c)
bool pg_u_isxdigit(pg_wchar c, bool posix)
const char * unicode_category_abbrev(pg_unicode_category category)
bool pg_u_isdigit(pg_wchar c, bool posix)
bool pg_u_islower(pg_wchar c)
bool pg_u_prop_case_ignorable(pg_wchar c)
bool pg_u_prop_hex_digit(pg_wchar c)
bool pg_u_prop_lowercase(pg_wchar c)
bool pg_u_isalpha(pg_wchar c)
pg_unicode_category unicode_category(pg_wchar ucs)
bool pg_u_isgraph(pg_wchar c)