PostgreSQL Source Code git master
case_test.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 * case_test.c
3 * Program to test Unicode case mapping functions.
4 *
5 * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
6 *
7 * IDENTIFICATION
8 * src/common/unicode/case_test.c
9 *
10 *-------------------------------------------------------------------------
11 */
12#include "postgres_fe.h"
13
14#include <locale.h>
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <wctype.h>
19
20#ifdef USE_ICU
21#include <unicode/ucasemap.h>
22#include <unicode/uchar.h>
23#endif
24#include "common/unicode_case.h"
27#include "mb/pg_wchar.h"
28
/*
 * Size in bytes of each fixed scratch buffer used by the tests; it has to
 * accommodate the longest input or output string plus the terminating NUL.
 */
#define BUFSZ 256

#ifdef USE_ICU
/* ICU case map handle; opened and closed by main() */
static UCaseMap *casemap = NULL;
#endif

/*
 * Common signature of the case-conversion wrappers passed to test_convert():
 * convert src (srclen bytes) into dst (dstsize bytes) and return a size_t.
 */
typedef size_t (*TestFunc) (char *dst, size_t dstsize, const char *src,
							ssize_t srclen);
38
/*
 * Simple word-boundary iterator state, copied from pg_locale_builtin.c.
 *
 * An instance is handed (as an opaque pointer) to the word-boundary callback
 * used for titlecasing; the callback advances "offset" through "str".
 */
struct WordBoundaryState
{
	const char *str;			/* string being scanned */
	size_t		len;			/* upper bound on the bytes of str to scan */
	size_t		offset;			/* byte offset of the next character to look at */
	bool		posix;			/* forwarded to pg_u_isalnum() */
	bool		init;			/* has the first boundary been reported? */
	bool		prev_alnum;		/* alnum status saved at the last boundary */
};
49
50static size_t
52{
53 struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
54
55 while (wbstate->offset < wbstate->len &&
56 wbstate->str[wbstate->offset] != '\0')
57 {
58 char32_t u = utf8_to_unicode((unsigned char *) wbstate->str +
59 wbstate->offset);
60 bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
61
62 if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
63 {
64 size_t prev_offset = wbstate->offset;
65
66 wbstate->init = true;
67 wbstate->offset += unicode_utf8len(u);
68 wbstate->prev_alnum = curr_alnum;
69 return prev_offset;
70 }
71
72 wbstate->offset += unicode_utf8len(u);
73 }
74
75 return wbstate->len;
76}
77
78#ifdef USE_ICU
79
80static void
81icu_test_simple(char32_t code)
82{
83 char32_t lower = unicode_lowercase_simple(code);
84 char32_t title = unicode_titlecase_simple(code);
85 char32_t upper = unicode_uppercase_simple(code);
86 char32_t fold = unicode_casefold_simple(code);
87 char32_t iculower = u_tolower(code);
88 char32_t icutitle = u_totitle(code);
89 char32_t icuupper = u_toupper(code);
90 char32_t icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT);
91
92 if (lower != iculower || title != icutitle || upper != icuupper ||
93 fold != icufold)
94 {
95 printf("case_test: FAILURE for codepoint 0x%06x\n", code);
96 printf("case_test: Postgres lower/title/upper/fold: 0x%06x/0x%06x/0x%06x/0x%06x\n",
97 lower, title, upper, fold);
98 printf("case_test: ICU lower/title/upper/fold: 0x%06x/0x%06x/0x%06x/0x%06x\n",
99 iculower, icutitle, icuupper, icufold);
100 printf("\n");
101 exit(1);
102 }
103}
104
105static void
106icu_test_full(char *str)
107{
108 char lower[BUFSZ];
109 char title[BUFSZ];
110 char upper[BUFSZ];
111 char fold[BUFSZ];
112 char icu_lower[BUFSZ];
113 char icu_title[BUFSZ];
114 char icu_upper[BUFSZ];
115 char icu_fold[BUFSZ];
116 UErrorCode status;
117
118 /* full case mapping doesn't use posix semantics */
119 struct WordBoundaryState wbstate = {
120 .str = str,
121 .len = strlen(str),
122 .offset = 0,
123 .posix = false,
124 .init = false,
125 .prev_alnum = false,
126 };
127
128 unicode_strlower(lower, BUFSZ, str, -1, true);
129 unicode_strtitle(title, BUFSZ, str, -1, true, initcap_wbnext, &wbstate);
130 unicode_strupper(upper, BUFSZ, str, -1, true);
131 unicode_strfold(fold, BUFSZ, str, -1, true);
132 status = U_ZERO_ERROR;
133 ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, -1, &status);
134 status = U_ZERO_ERROR;
135 ucasemap_utf8ToTitle(casemap, icu_title, BUFSZ, str, -1, &status);
136 status = U_ZERO_ERROR;
137 ucasemap_utf8ToUpper(casemap, icu_upper, BUFSZ, str, -1, &status);
138 status = U_ZERO_ERROR;
139 ucasemap_utf8FoldCase(casemap, icu_fold, BUFSZ, str, -1, &status);
140
141 if (strcmp(lower, icu_lower) != 0)
142 {
143 printf("case_test: str='%s' lower='%s' icu_lower='%s'\n", str, lower,
144 icu_lower);
145 exit(1);
146 }
147 if (strcmp(title, icu_title) != 0)
148 {
149 printf("case_test: str='%s' title='%s' icu_title='%s'\n", str, title,
150 icu_title);
151 exit(1);
152 }
153 if (strcmp(upper, icu_upper) != 0)
154 {
155 printf("case_test: str='%s' upper='%s' icu_upper='%s'\n", str, upper,
156 icu_upper);
157 exit(1);
158 }
159 if (strcmp(fold, icu_fold) != 0)
160 {
161 printf("case_test: str='%s' fold='%s' icu_fold='%s'\n", str, fold,
162 icu_fold);
163 exit(1);
164 }
165}
166
167/*
168 * Exhaustively compare case mappings with the results from ICU.
169 */
170static void
171test_icu(void)
172{
173 int successful = 0;
174 int skipped_mismatch = 0;
175
176 for (char32_t code = 0; code <= 0x10ffff; code++)
177 {
178 pg_unicode_category category = unicode_category(code);
179
180 if (category != PG_U_UNASSIGNED)
181 {
182 uint8_t icu_category = u_charType(code);
183 char code_str[5] = {0};
184
185 if (icu_category == PG_U_UNASSIGNED)
186 {
187 skipped_mismatch++;
188 continue;
189 }
190
191 icu_test_simple(code);
192 unicode_to_utf8(code, (unsigned char *) code_str);
193 icu_test_full(code_str);
194
195 successful++;
196 }
197 }
198
199 if (skipped_mismatch > 0)
200 printf("case_test: skipped %d codepoints unassigned in ICU due to Unicode version mismatch\n",
201 skipped_mismatch);
202
203 printf("case_test: ICU simple mapping test: %d codepoints successful\n",
204 successful);
205}
206#endif
207
208static void
209test_convert(TestFunc tfunc, const char *test_string, const char *expected)
210{
211 size_t src1len = strlen(test_string);
212 size_t src2len = -1; /* NUL-terminated */
213 size_t dst1len = strlen(expected);
214 size_t dst2len = strlen(expected) + 1; /* NUL-terminated */
215 char *src1 = malloc(src1len);
216 char *dst1 = malloc(dst1len);
217 char *src2 = strdup(test_string);
218 char *dst2 = malloc(dst2len);
219 size_t needed;
220
221 memcpy(src1, test_string, src1len); /* not NUL-terminated */
222
223 /* neither source nor destination are NUL-terminated */
224 memset(dst1, 0x7F, dst1len);
225 needed = tfunc(dst1, dst1len, src1, src1len);
226 if (needed != strlen(expected))
227 {
228 printf("case_test: convert_case test1 FAILURE: '%s' needed %zu expected %zu\n",
229 test_string, needed, strlen(expected));
230 exit(1);
231 }
232 if (memcmp(dst1, expected, dst1len) != 0)
233 {
234 printf("case_test: convert_case test1 FAILURE: test: '%s' result: '%.*s' expected: '%s'\n",
235 test_string, (int) dst1len, dst1, expected);
236 exit(1);
237 }
238
239 /* destination is NUL-terminated and source is not */
240 memset(dst2, 0x7F, dst2len);
241 needed = tfunc(dst2, dst2len, src1, src1len);
242 if (needed != strlen(expected))
243 {
244 printf("case_test: convert_case test2 FAILURE: '%s' needed %zu expected %zu\n",
245 test_string, needed, strlen(expected));
246 exit(1);
247 }
248 if (strcmp(dst2, expected) != 0)
249 {
250 printf("case_test: convert_case test2 FAILURE: test: '%s' result: '%s' expected: '%s'\n",
251 test_string, dst2, expected);
252 exit(1);
253 }
254
255 /* source is NUL-terminated and destination is not */
256 memset(dst1, 0x7F, dst1len);
257 needed = tfunc(dst1, dst1len, src2, src2len);
258 if (needed != strlen(expected))
259 {
260 printf("case_test: convert_case test3 FAILURE: '%s' needed %zu expected %zu\n",
261 test_string, needed, strlen(expected));
262 printf("case_test: convert_case test3 FAILURE: needed %zu\n", needed);
263 exit(1);
264 }
265 if (memcmp(dst1, expected, dst1len) != 0)
266 {
267 printf("case_test: convert_case test3 FAILURE: test: '%s' result: '%.*s' expected: '%s'\n",
268 test_string, (int) dst1len, dst1, expected);
269 exit(1);
270 }
271
272 /* both source and destination are NUL-terminated */
273 memset(dst2, 0x7F, dst2len);
274 needed = tfunc(dst2, dst2len, src2, src2len);
275 if (needed != strlen(expected))
276 {
277 printf("case_test: convert_case test4 FAILURE: '%s' needed %zu expected %zu\n",
278 test_string, needed, strlen(expected));
279 exit(1);
280 }
281 if (strcmp(dst2, expected) != 0)
282 {
283 printf("case_test: convert_case test4 FAILURE: test: '%s' result: '%s' expected: '%s'\n",
284 test_string, dst2, expected);
285 exit(1);
286 }
287
288 free(src1);
289 free(dst1);
290 free(src2);
291 free(dst2);
292}
293
/*
 * TestFunc adapter around unicode_strlower(), always requesting full case
 * mapping.
 */
static size_t
tfunc_lower(char *dst, size_t dstsize, const char *src,
			ssize_t srclen)
{
	const bool	full = true;
	size_t		needed;

	needed = unicode_strlower(dst, dstsize, src, srclen, full);
	return needed;
}
300
301static size_t
302tfunc_title(char *dst, size_t dstsize, const char *src,
303 ssize_t srclen)
304{
305 struct WordBoundaryState wbstate = {
306 .str = src,
307 .len = srclen,
308 .offset = 0,
309 .init = false,
310 .prev_alnum = false,
311 };
312
313 return unicode_strtitle(dst, dstsize, src, srclen, true, initcap_wbnext,
314 &wbstate);
315}
316
/*
 * TestFunc adapter around unicode_strupper(), always requesting full case
 * mapping.
 */
static size_t
tfunc_upper(char *dst, size_t dstsize, const char *src,
			ssize_t srclen)
{
	const bool	full = true;
	size_t		needed;

	needed = unicode_strupper(dst, dstsize, src, srclen, full);
	return needed;
}
323
/*
 * TestFunc adapter around unicode_strfold(), always requesting full case
 * mapping.
 */
static size_t
tfunc_fold(char *dst, size_t dstsize, const char *src,
		   ssize_t srclen)
{
	const bool	full = true;
	size_t		needed;

	needed = unicode_strfold(dst, dstsize, src, srclen, full);
	return needed;
}
330
331static void
333{
334 /* test string with no case changes */
335 test_convert(tfunc_lower, "√∞", "√∞");
336 /* test adjust-to-cased behavior */
337 test_convert(tfunc_title, "abc 123xyz", "Abc 123xyz");
338 /* test string with case changes */
339 test_convert(tfunc_upper, "abc", "ABC");
340 /* test string with case changes and byte length changes */
341 test_convert(tfunc_lower, "ȺȺȺ", "ⱥⱥⱥ");
342 /* test special case conversions */
343 test_convert(tfunc_upper, "ß", "SS");
344 test_convert(tfunc_lower, "ıiIİ", "ıiii\u0307");
345 test_convert(tfunc_upper, "ıiIİ", "IIIİ");
346 test_convert(tfunc_fold, "ıiIİ", "ıiii\u0307");
347 /* test final sigma */
348 test_convert(tfunc_lower, "σςΣ ΣΣΣ", "σςς σσς");
349 test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
350 test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
351 test_convert(tfunc_fold, "σςΣ ΣΣΣ", "σσσ σσσ");
352 /* test that alphanumerics are word characters */
353 test_convert(tfunc_title, "λλ", "Λλ");
354 test_convert(tfunc_title, "1a", "1a");
355 /* U+FF11 FULLWIDTH ONE is alphanumeric for full case mapping */
356 test_convert(tfunc_title, "\uFF11a", "\uFF11a");
357
358
359#ifdef USE_ICU
360 icu_test_full("");
361 icu_test_full("ȺȺȺ");
362 icu_test_full("ßßß");
363 icu_test_full("√∞");
364 icu_test_full("a b");
365 icu_test_full("abc 123xyz");
366 icu_test_full("σςΣ ΣΣΣ");
367 icu_test_full("ıiIİ");
368 icu_test_full("\uFF11a");
369 /* test <alpha><iota_subscript><acute> */
370 icu_test_full("\u0391\u0345\u0301");
371#endif
372
373 printf("case_test: convert_case: success\n");
374}
375
376int
377main(int argc, char **argv)
378{
379#ifdef USE_ICU
380 UErrorCode status = U_ZERO_ERROR;
381
382 /*
383 * Disable ICU's word break adjustment for titlecase to match the expected
384 * behavior of unicode_strtitle().
385 */
386 casemap = ucasemap_open("und", U_TITLECASE_NO_BREAK_ADJUSTMENT, &status);
387 if (U_FAILURE(status))
388 {
389 printf("case_test: failure opening UCaseMap: %s\n",
390 u_errorName(status));
391 exit(1);
392 }
393#endif
394
395 printf("case_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);
396#ifdef USE_ICU
397 printf("case_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
398 test_icu();
399#else
400 printf("case_test: ICU not available; skipping\n");
401#endif
402
404
405#ifdef USE_ICU
406 ucasemap_close(casemap);
407#endif
408 exit(0);
409}
static void test_convert_case()
Definition: case_test.c:332
static void test_convert(TestFunc tfunc, const char *test_string, const char *expected)
Definition: case_test.c:209
int main(int argc, char **argv)
Definition: case_test.c:377
static size_t tfunc_lower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: case_test.c:295
static size_t initcap_wbnext(void *state)
Definition: case_test.c:51
static size_t tfunc_title(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: case_test.c:302
static size_t tfunc_upper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: case_test.c:318
#define BUFSZ
Definition: case_test.c:30
static size_t tfunc_fold(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: case_test.c:325
size_t(* TestFunc)(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: case_test.c:36
const char * str
#define free(a)
Definition: header.h:65
#define malloc(a)
Definition: header.h:50
static char32_t utf8_to_unicode(const unsigned char *c)
Definition: mbprint.c:53
Datum lower(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:49
Datum upper(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:80
static unsigned char * unicode_to_utf8(char32_t c, unsigned char *utf8string)
Definition: pg_wchar.h:575
static int unicode_utf8len(char32_t c)
Definition: pg_wchar.h:607
#define printf(...)
Definition: port.h:266
Definition: regguts.h:323
char32_t unicode_titlecase_simple(char32_t code)
Definition: unicode_case.c:58
size_t unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)
Definition: unicode_case.c:165
size_t unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)
Definition: unicode_case.c:101
char32_t unicode_casefold_simple(char32_t code)
Definition: unicode_case.c:74
char32_t unicode_lowercase_simple(char32_t code)
Definition: unicode_case.c:50
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full, WordBoundaryNext wbnext, void *wbstate)
Definition: unicode_case.c:138
size_t unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)
Definition: unicode_case.c:189
static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full, const char *src, size_t srclen, size_t srcoff, char32_t *simple, const char32_t **special)
Definition: unicode_case.c:397
char32_t unicode_uppercase_simple(char32_t code)
Definition: unicode_case.c:66
bool pg_u_isalnum(char32_t code, bool posix)
pg_unicode_category unicode_category(char32_t code)
pg_unicode_category
@ PG_U_UNASSIGNED
#define PG_UNICODE_VERSION