PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
category_test.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 * category_test.c
3 * Program to test Unicode general category and character properties.
4 *
5 * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
6 *
7 * IDENTIFICATION
8 * src/common/unicode/category_test.c
9 *
10 *-------------------------------------------------------------------------
11 */
12#include "postgres_fe.h"
13
14#include <stdio.h>
15#include <stdlib.h>
16#include <string.h>
17#include <wctype.h>
18
19#ifdef USE_ICU
20#include <unicode/uchar.h>
21#endif
22
25
/*
 * Unicode versions held as integers in the form produced by
 * parse_unicode_version() (major * 100 + minor), so that they can be compared
 * numerically.  Both start out as zero.
 */
static int	pg_unicode_version = 0;
#ifdef USE_ICU
static int	icu_unicode_version = 0;
#endif
30
/*
 * Parse a Unicode version string into an integer for easy comparison.
 *
 * The string must begin with "<major>.<minor>"; anything after the minor
 * number is ignored.  The result is major * 100 + minor, so the minor number
 * must be in the range 0..99 for results to compare correctly.
 *
 * A string that cannot be parsed, or whose minor number is out of range, is
 * reported on stdout and terminates the program with exit status 1.  This is
 * checked unconditionally (rather than only with Assert) so that major and
 * minor are never read uninitialized in builds without assertions.
 */
static int
parse_unicode_version(const char *version)
{
	int			n;
	int			major;
	int			minor;

	n = sscanf(version, "%d.%d", &major, &minor);

	/* sscanf returns the number of fields converted; we need both */
	if (n != 2 || minor < 0 || minor >= 100)
	{
		printf("category_test: invalid Unicode version string \"%s\"\n",
			   version);
		exit(1);
	}

	return major * 100 + minor;
}
48
49#ifdef USE_ICU
50/*
51 * Test Postgres Unicode tables by comparing with ICU. Test the General
52 * Category, as well as the properties Alphabetic, Lowercase, Uppercase,
53 * White_Space, and Hex_Digit.
54 */
55static void
56icu_test()
57{
58 int successful = 0;
59 int pg_skipped_codepoints = 0;
60 int icu_skipped_codepoints = 0;
61
62 for (pg_wchar code = 0; code <= 0x10ffff; code++)
63 {
64 uint8_t pg_category = unicode_category(code);
65 uint8_t icu_category = u_charType(code);
66
67 /* Property tests */
68 bool prop_alphabetic = pg_u_prop_alphabetic(code);
69 bool prop_lowercase = pg_u_prop_lowercase(code);
70 bool prop_uppercase = pg_u_prop_uppercase(code);
71 bool prop_cased = pg_u_prop_cased(code);
72 bool prop_case_ignorable = pg_u_prop_case_ignorable(code);
73 bool prop_white_space = pg_u_prop_white_space(code);
74 bool prop_hex_digit = pg_u_prop_hex_digit(code);
75 bool prop_join_control = pg_u_prop_join_control(code);
76
77 bool icu_prop_alphabetic = u_hasBinaryProperty(code, UCHAR_ALPHABETIC);
78 bool icu_prop_lowercase = u_hasBinaryProperty(code, UCHAR_LOWERCASE);
79 bool icu_prop_uppercase = u_hasBinaryProperty(code, UCHAR_UPPERCASE);
80 bool icu_prop_cased = u_hasBinaryProperty(code, UCHAR_CASED);
81 bool icu_prop_case_ignorable = u_hasBinaryProperty(code, UCHAR_CASE_IGNORABLE);
82 bool icu_prop_white_space = u_hasBinaryProperty(code, UCHAR_WHITE_SPACE);
83 bool icu_prop_hex_digit = u_hasBinaryProperty(code, UCHAR_HEX_DIGIT);
84 bool icu_prop_join_control = u_hasBinaryProperty(code, UCHAR_JOIN_CONTROL);
85
86 /*
87 * Compare with ICU for character classes using:
88 *
89 * https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/uchar_8h.html#details
90 *
91 * which describes how to use ICU to test for membership in regex
92 * character classes.
93 *
94 * NB: the document suggests testing for some properties such as
95 * UCHAR_POSIX_ALNUM, but that doesn't mean that we're testing for the
96 * "POSIX Compatible" character classes.
97 */
98 bool isalpha = pg_u_isalpha(code);
99 bool islower = pg_u_islower(code);
100 bool isupper = pg_u_isupper(code);
101 bool ispunct = pg_u_ispunct(code, false);
102 bool isdigit = pg_u_isdigit(code, false);
103 bool isxdigit = pg_u_isxdigit(code, false);
104 bool isalnum = pg_u_isalnum(code, false);
105 bool isspace = pg_u_isspace(code);
106 bool isblank = pg_u_isblank(code);
107 bool iscntrl = pg_u_iscntrl(code);
108 bool isgraph = pg_u_isgraph(code);
109 bool isprint = pg_u_isprint(code);
110
111 bool icu_isalpha = u_isUAlphabetic(code);
112 bool icu_islower = u_isULowercase(code);
113 bool icu_isupper = u_isUUppercase(code);
114 bool icu_ispunct = u_ispunct(code);
115 bool icu_isdigit = u_isdigit(code);
116 bool icu_isxdigit = u_hasBinaryProperty(code,
117 UCHAR_POSIX_XDIGIT);
118 bool icu_isalnum = u_hasBinaryProperty(code,
119 UCHAR_POSIX_ALNUM);
120 bool icu_isspace = u_isUWhiteSpace(code);
121 bool icu_isblank = u_isblank(code);
122 bool icu_iscntrl = icu_category == PG_U_CONTROL;
123 bool icu_isgraph = u_hasBinaryProperty(code,
124 UCHAR_POSIX_GRAPH);
125 bool icu_isprint = u_hasBinaryProperty(code,
126 UCHAR_POSIX_PRINT);
127
128 /*
129 * A version mismatch means that some assigned codepoints in the newer
130 * version may be unassigned in the older version. That's OK, though
131 * the test will not cover those codepoints marked unassigned in the
132 * older version (that is, it will no longer be an exhaustive test).
133 */
134 if (pg_category == PG_U_UNASSIGNED &&
135 icu_category != PG_U_UNASSIGNED &&
137 {
138 pg_skipped_codepoints++;
139 continue;
140 }
141
142 if (icu_category == PG_U_UNASSIGNED &&
143 pg_category != PG_U_UNASSIGNED &&
145 {
146 icu_skipped_codepoints++;
147 continue;
148 }
149
150 if (pg_category != icu_category)
151 {
152 printf("category_test: FAILURE for codepoint 0x%06x\n", code);
153 printf("category_test: Postgres category: %02d %s %s\n", pg_category,
154 unicode_category_abbrev(pg_category),
155 unicode_category_string(pg_category));
156 printf("category_test: ICU category: %02d %s %s\n", icu_category,
157 unicode_category_abbrev(icu_category),
158 unicode_category_string(icu_category));
159 printf("\n");
160 exit(1);
161 }
162
163 if (prop_alphabetic != icu_prop_alphabetic ||
164 prop_lowercase != icu_prop_lowercase ||
165 prop_uppercase != icu_prop_uppercase ||
166 prop_cased != icu_prop_cased ||
167 prop_case_ignorable != icu_prop_case_ignorable ||
168 prop_white_space != icu_prop_white_space ||
169 prop_hex_digit != icu_prop_hex_digit ||
170 prop_join_control != icu_prop_join_control)
171 {
172 printf("category_test: FAILURE for codepoint 0x%06x\n", code);
173 printf("category_test: Postgres property alphabetic/lowercase/uppercase/cased/case_ignorable/white_space/hex_digit/join_control: %d/%d/%d/%d/%d/%d/%d/%d\n",
174 prop_alphabetic, prop_lowercase, prop_uppercase,
175 prop_cased, prop_case_ignorable,
176 prop_white_space, prop_hex_digit, prop_join_control);
177 printf("category_test: ICU property alphabetic/lowercase/uppercase/cased/case_ignorable/white_space/hex_digit/join_control: %d/%d/%d/%d/%d/%d/%d/%d\n",
178 icu_prop_alphabetic, icu_prop_lowercase, icu_prop_uppercase,
179 icu_prop_cased, icu_prop_case_ignorable,
180 icu_prop_white_space, icu_prop_hex_digit, icu_prop_join_control);
181 printf("\n");
182 exit(1);
183 }
184
185 if (isalpha != icu_isalpha ||
186 islower != icu_islower ||
187 isupper != icu_isupper ||
188 ispunct != icu_ispunct ||
189 isdigit != icu_isdigit ||
190 isxdigit != icu_isxdigit ||
191 isalnum != icu_isalnum ||
192 isspace != icu_isspace ||
193 isblank != icu_isblank ||
194 iscntrl != icu_iscntrl ||
195 isgraph != icu_isgraph ||
196 isprint != icu_isprint)
197 {
198 printf("category_test: FAILURE for codepoint 0x%06x\n", code);
199 printf("category_test: Postgres class alpha/lower/upper/punct/digit/xdigit/alnum/space/blank/cntrl/graph/print: %d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d\n",
200 isalpha, islower, isupper, ispunct, isdigit, isxdigit, isalnum, isspace, isblank, iscntrl, isgraph, isprint);
201 printf("category_test: ICU class alpha/lower/upper/punct/digit/xdigit/alnum/space/blank/cntrl/graph/print: %d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d\n",
202 icu_isalpha, icu_islower, icu_isupper, icu_ispunct, icu_isdigit, icu_isxdigit, icu_isalnum, icu_isspace, icu_isblank, icu_iscntrl, icu_isgraph, icu_isprint);
203 printf("\n");
204 exit(1);
205 }
206
207 if (pg_category != PG_U_UNASSIGNED)
208 successful++;
209 }
210
211 if (pg_skipped_codepoints > 0)
212 printf("category_test: skipped %d codepoints unassigned in Postgres due to Unicode version mismatch\n",
213 pg_skipped_codepoints);
214 if (icu_skipped_codepoints > 0)
215 printf("category_test: skipped %d codepoints unassigned in ICU due to Unicode version mismatch\n",
216 icu_skipped_codepoints);
217
218 printf("category_test: ICU test: %d codepoints successful\n", successful);
219}
220#endif
221
222int
223main(int argc, char **argv)
224{
226 printf("category_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);
227
228#ifdef USE_ICU
229 icu_unicode_version = parse_unicode_version(U_UNICODE_VERSION);
230 printf("category_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
231
232 icu_test();
233#else
234 printf("category_test: ICU not available; skipping\n");
235#endif
236}
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:201
#define Assert(condition)
Definition: c.h:812
static int pg_unicode_version
Definition: category_test.c:26
int main(int argc, char **argv)
static int parse_unicode_version(const char *version)
Definition: category_test.c:35
exit(1)
unsigned int pg_wchar
Definition: mbprint.c:31
#define printf(...)
Definition: port.h:244
bool pg_u_prop_uppercase(pg_wchar code)
bool pg_u_isspace(pg_wchar code)
bool pg_u_isxdigit(pg_wchar code, bool posix)
bool pg_u_ispunct(pg_wchar code, bool posix)
const char * unicode_category_string(pg_unicode_category category)
bool pg_u_isprint(pg_wchar code)
bool pg_u_islower(pg_wchar code)
const char * unicode_category_abbrev(pg_unicode_category category)
bool pg_u_prop_white_space(pg_wchar code)
bool pg_u_isblank(pg_wchar code)
bool pg_u_prop_cased(pg_wchar code)
bool pg_u_isalpha(pg_wchar code)
bool pg_u_prop_lowercase(pg_wchar code)
bool pg_u_isalnum(pg_wchar code, bool posix)
bool pg_u_isupper(pg_wchar code)
bool pg_u_prop_alphabetic(pg_wchar code)
bool pg_u_isdigit(pg_wchar code, bool posix)
bool pg_u_iscntrl(pg_wchar code)
bool pg_u_prop_join_control(pg_wchar code)
bool pg_u_isgraph(pg_wchar code)
bool pg_u_prop_case_ignorable(pg_wchar code)
bool pg_u_prop_hex_digit(pg_wchar code)
pg_unicode_category unicode_category(pg_wchar code)
@ PG_U_CONTROL
@ PG_U_UNASSIGNED
#define PG_UNICODE_VERSION
Datum icu_unicode_version(PG_FUNCTION_ARGS)
Definition: varlena.c:6315