PostgreSQL Source Code git master
category_test.c
Go to the documentation of this file.
/*-------------------------------------------------------------------------
 * category_test.c
 *		Program to test Unicode general category and character properties.
 *
 * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/common/unicode/category_test.c
 *
 *-------------------------------------------------------------------------
 */
12#include "postgres_fe.h"
13
14#include <stdio.h>
15#include <stdlib.h>
16#include <string.h>
17#include <wctype.h>
18
19#ifdef USE_ICU
20#include <unicode/uchar.h>
21#endif
22
25#include "mb/pg_wchar.h"
26
/*
 * Unicode versions encoded as major * 100 + minor (the form returned by
 * parse_unicode_version()); both are 0 until assigned.
 */
static int	pg_unicode_version = 0;
#ifdef USE_ICU
static int	icu_unicode_version = 0;
#endif
31
/*
 * Parse version into integer for easy comparison.
 *
 * "version" must have the form "<major>.<minor>" with 0 <= minor < 100.
 * Returns major * 100 + minor, so that e.g. "15.1" becomes 1501.
 *
 * If the string cannot be parsed, report the problem and exit with status 1
 * (the same way test failures are reported in this program) rather than
 * computing a result from uninitialized values.
 */
static int
parse_unicode_version(const char *version)
{
	int			n;
	int			major = 0;
	int			minor = 0;

	n = sscanf(version, "%d.%d", &major, &minor);

	/* both fields must be present, and minor must fit in two digits */
	if (n != 2 || major < 0 || minor < 0 || minor >= 100)
	{
		printf("category_test: could not parse Unicode version \"%s\"\n",
			   version);
		exit(1);
	}

	return major * 100 + minor;
}
49
50#ifdef USE_ICU
51/*
52 * Test Postgres Unicode tables by comparing with ICU. Test the General
53 * Category, as well as the properties Alphabetic, Lowercase, Uppercase,
54 * White_Space, and Hex_Digit.
55 */
56static void
57icu_test()
58{
59 int successful = 0;
60 int pg_skipped_codepoints = 0;
61 int icu_skipped_codepoints = 0;
62
63 for (char32_t code = 0; code <= 0x10ffff; code++)
64 {
65 uint8_t pg_category = unicode_category(code);
66 uint8_t icu_category = u_charType(code);
67
68 /* Property tests */
69 bool prop_alphabetic = pg_u_prop_alphabetic(code);
70 bool prop_lowercase = pg_u_prop_lowercase(code);
71 bool prop_uppercase = pg_u_prop_uppercase(code);
72 bool prop_cased = pg_u_prop_cased(code);
73 bool prop_case_ignorable = pg_u_prop_case_ignorable(code);
74 bool prop_white_space = pg_u_prop_white_space(code);
75 bool prop_hex_digit = pg_u_prop_hex_digit(code);
76 bool prop_join_control = pg_u_prop_join_control(code);
77
78 bool icu_prop_alphabetic = u_hasBinaryProperty(code, UCHAR_ALPHABETIC);
79 bool icu_prop_lowercase = u_hasBinaryProperty(code, UCHAR_LOWERCASE);
80 bool icu_prop_uppercase = u_hasBinaryProperty(code, UCHAR_UPPERCASE);
81 bool icu_prop_cased = u_hasBinaryProperty(code, UCHAR_CASED);
82 bool icu_prop_case_ignorable = u_hasBinaryProperty(code, UCHAR_CASE_IGNORABLE);
83 bool icu_prop_white_space = u_hasBinaryProperty(code, UCHAR_WHITE_SPACE);
84 bool icu_prop_hex_digit = u_hasBinaryProperty(code, UCHAR_HEX_DIGIT);
85 bool icu_prop_join_control = u_hasBinaryProperty(code, UCHAR_JOIN_CONTROL);
86
87 /*
88 * Compare with ICU for character classes using:
89 *
90 * https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/uchar_8h.html#details
91 *
92 * which describes how to use ICU to test for membership in regex
93 * character classes.
94 *
95 * NB: the document suggests testing for some properties such as
96 * UCHAR_POSIX_ALNUM, but that doesn't mean that we're testing for the
97 * "POSIX Compatible" character classes.
98 */
99 bool isalpha = pg_u_isalpha(code);
100 bool islower = pg_u_islower(code);
101 bool isupper = pg_u_isupper(code);
102 bool ispunct = pg_u_ispunct(code, false);
103 bool isdigit = pg_u_isdigit(code, false);
104 bool isxdigit = pg_u_isxdigit(code, false);
105 bool isalnum = pg_u_isalnum(code, false);
106 bool isspace = pg_u_isspace(code);
107 bool isblank = pg_u_isblank(code);
108 bool iscntrl = pg_u_iscntrl(code);
109 bool isgraph = pg_u_isgraph(code);
110 bool isprint = pg_u_isprint(code);
111
112 bool icu_isalpha = u_isUAlphabetic(code);
113 bool icu_islower = u_isULowercase(code);
114 bool icu_isupper = u_isUUppercase(code);
115 bool icu_ispunct = u_ispunct(code);
116 bool icu_isdigit = u_isdigit(code);
117 bool icu_isxdigit = u_hasBinaryProperty(code,
118 UCHAR_POSIX_XDIGIT);
119 bool icu_isalnum = u_hasBinaryProperty(code,
120 UCHAR_POSIX_ALNUM);
121 bool icu_isspace = u_isUWhiteSpace(code);
122 bool icu_isblank = u_isblank(code);
123 bool icu_iscntrl = icu_category == PG_U_CONTROL;
124 bool icu_isgraph = u_hasBinaryProperty(code,
125 UCHAR_POSIX_GRAPH);
126 bool icu_isprint = u_hasBinaryProperty(code,
127 UCHAR_POSIX_PRINT);
128
129 /*
130 * A version mismatch means that some assigned codepoints in the newer
131 * version may be unassigned in the older version. That's OK, though
132 * the test will not cover those codepoints marked unassigned in the
133 * older version (that is, it will no longer be an exhaustive test).
134 */
135 if (pg_category == PG_U_UNASSIGNED &&
136 icu_category != PG_U_UNASSIGNED &&
138 {
139 pg_skipped_codepoints++;
140 continue;
141 }
142
143 if (icu_category == PG_U_UNASSIGNED &&
144 pg_category != PG_U_UNASSIGNED &&
146 {
147 icu_skipped_codepoints++;
148 continue;
149 }
150
151 if (pg_category != icu_category)
152 {
153 printf("category_test: FAILURE for codepoint 0x%06x\n", code);
154 printf("category_test: Postgres category: %02d %s %s\n", pg_category,
155 unicode_category_abbrev(pg_category),
156 unicode_category_string(pg_category));
157 printf("category_test: ICU category: %02d %s %s\n", icu_category,
158 unicode_category_abbrev(icu_category),
159 unicode_category_string(icu_category));
160 printf("\n");
161 exit(1);
162 }
163
164 if (prop_alphabetic != icu_prop_alphabetic ||
165 prop_lowercase != icu_prop_lowercase ||
166 prop_uppercase != icu_prop_uppercase ||
167 prop_cased != icu_prop_cased ||
168 prop_case_ignorable != icu_prop_case_ignorable ||
169 prop_white_space != icu_prop_white_space ||
170 prop_hex_digit != icu_prop_hex_digit ||
171 prop_join_control != icu_prop_join_control)
172 {
173 printf("category_test: FAILURE for codepoint 0x%06x\n", code);
174 printf("category_test: Postgres property alphabetic/lowercase/uppercase/cased/case_ignorable/white_space/hex_digit/join_control: %d/%d/%d/%d/%d/%d/%d/%d\n",
175 prop_alphabetic, prop_lowercase, prop_uppercase,
176 prop_cased, prop_case_ignorable,
177 prop_white_space, prop_hex_digit, prop_join_control);
178 printf("category_test: ICU property alphabetic/lowercase/uppercase/cased/case_ignorable/white_space/hex_digit/join_control: %d/%d/%d/%d/%d/%d/%d/%d\n",
179 icu_prop_alphabetic, icu_prop_lowercase, icu_prop_uppercase,
180 icu_prop_cased, icu_prop_case_ignorable,
181 icu_prop_white_space, icu_prop_hex_digit, icu_prop_join_control);
182 printf("\n");
183 exit(1);
184 }
185
186 if (isalpha != icu_isalpha ||
187 islower != icu_islower ||
188 isupper != icu_isupper ||
189 ispunct != icu_ispunct ||
190 isdigit != icu_isdigit ||
191 isxdigit != icu_isxdigit ||
192 isalnum != icu_isalnum ||
193 isspace != icu_isspace ||
194 isblank != icu_isblank ||
195 iscntrl != icu_iscntrl ||
196 isgraph != icu_isgraph ||
197 isprint != icu_isprint)
198 {
199 printf("category_test: FAILURE for codepoint 0x%06x\n", code);
200 printf("category_test: Postgres class alpha/lower/upper/punct/digit/xdigit/alnum/space/blank/cntrl/graph/print: %d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d\n",
201 isalpha, islower, isupper, ispunct, isdigit, isxdigit, isalnum, isspace, isblank, iscntrl, isgraph, isprint);
202 printf("category_test: ICU class alpha/lower/upper/punct/digit/xdigit/alnum/space/blank/cntrl/graph/print: %d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d\n",
203 icu_isalpha, icu_islower, icu_isupper, icu_ispunct, icu_isdigit, icu_isxdigit, icu_isalnum, icu_isspace, icu_isblank, icu_iscntrl, icu_isgraph, icu_isprint);
204 printf("\n");
205 exit(1);
206 }
207
208 if (pg_category != PG_U_UNASSIGNED)
209 successful++;
210 }
211
212 if (pg_skipped_codepoints > 0)
213 printf("category_test: skipped %d codepoints unassigned in Postgres due to Unicode version mismatch\n",
214 pg_skipped_codepoints);
215 if (icu_skipped_codepoints > 0)
216 printf("category_test: skipped %d codepoints unassigned in ICU due to Unicode version mismatch\n",
217 icu_skipped_codepoints);
218
219 printf("category_test: ICU test: %d codepoints successful\n", successful);
220}
221#endif
222
223int
224main(int argc, char **argv)
225{
227 printf("category_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);
228
229#ifdef USE_ICU
230 icu_unicode_version = parse_unicode_version(U_UNICODE_VERSION);
231 printf("category_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
232
233 icu_test();
234#else
235 printf("category_test: ICU not available; skipping\n");
236#endif
237}
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:228
static int pg_unicode_version
Definition: category_test.c:27
int main(int argc, char **argv)
static int parse_unicode_version(const char *version)
Definition: category_test.c:36
Assert(PointerIsAligned(start, uint64))
#define printf(...)
Definition: port.h:266
bool pg_u_isalnum(char32_t code, bool posix)
const char * unicode_category_string(pg_unicode_category category)
bool pg_u_prop_cased(char32_t code)
bool pg_u_prop_white_space(char32_t code)
bool pg_u_isprint(char32_t code)
bool pg_u_islower(char32_t code)
const char * unicode_category_abbrev(pg_unicode_category category)
bool pg_u_iscntrl(char32_t code)
pg_unicode_category unicode_category(char32_t code)
bool pg_u_prop_lowercase(char32_t code)
bool pg_u_prop_join_control(char32_t code)
bool pg_u_isdigit(char32_t code, bool posix)
bool pg_u_isalpha(char32_t code)
bool pg_u_prop_uppercase(char32_t code)
bool pg_u_isxdigit(char32_t code, bool posix)
bool pg_u_prop_case_ignorable(char32_t code)
bool pg_u_ispunct(char32_t code, bool posix)
bool pg_u_prop_hex_digit(char32_t code)
bool pg_u_isblank(char32_t code)
bool pg_u_isgraph(char32_t code)
bool pg_u_isspace(char32_t code)
bool pg_u_isupper(char32_t code)
bool pg_u_prop_alphabetic(char32_t code)
@ PG_U_CONTROL
@ PG_U_UNASSIGNED
#define PG_UNICODE_VERSION
Datum icu_unicode_version(PG_FUNCTION_ARGS)
Definition: varlena.c:5398