PostgreSQL Source Code git master
case_test.c
Go to the documentation of this file.
/*-------------------------------------------------------------------------
 * case_test.c
 *		Program to test Unicode case mapping functions.
 *
 * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/common/unicode/case_test.c
 *
 *-------------------------------------------------------------------------
 */
12#include "postgres_fe.h"
13
14#include <locale.h>
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <wctype.h>
19
20#ifdef USE_ICU
21#include <unicode/ucasemap.h>
22#include <unicode/uchar.h>
23#endif
24#include "common/unicode_case.h"
27
/*
 * Size in bytes of every fixed scratch buffer used by the tests: enough to
 * hold the largest source or result string, including the terminating NUL.
 */
#define BUFSZ 256

#ifdef USE_ICU
/* ICU case-mapping handle; opened in main() and used by icu_test_full(). */
static UCaseMap *casemap = NULL;
#endif

/*
 * Common signature of the conversion wrappers handed to test_convert().
 *
 * dst/dstsize describe the destination buffer and src/srclen the source;
 * test_convert() passes srclen = -1 when the source is NUL-terminated.  The
 * return value is compared by test_convert() against strlen() of the
 * expected result.
 */
typedef size_t (*TestFunc) (char *dst, size_t dstsize, const char *src,
							ssize_t srclen);
37
/*
 * State for the simple word-boundary iterator (copied from
 * pg_locale_builtin.c).  An instance is handed to initcap_wbnext() through
 * its opaque "state" argument.
 */
struct WordBoundaryState
{
	const char *str;			/* string being scanned */
	size_t		len;			/* upper bound for offset, in bytes */
	size_t		offset;			/* byte offset of next character to examine */
	bool		init;			/* true once the first boundary was reported */
	bool		prev_alnum;		/* alnum class recorded at the last boundary */
};
47
48static size_t
50{
51 struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
52
53 while (wbstate->offset < wbstate->len &&
54 wbstate->str[wbstate->offset] != '\0')
55 {
56 pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
57 wbstate->offset);
58 bool curr_alnum = pg_u_isalnum(u, true);
59
60 if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
61 {
62 size_t prev_offset = wbstate->offset;
63
64 wbstate->init = true;
65 wbstate->offset += unicode_utf8len(u);
66 wbstate->prev_alnum = curr_alnum;
67 return prev_offset;
68 }
69
70 wbstate->offset += unicode_utf8len(u);
71 }
72
73 return wbstate->len;
74}
75
76#ifdef USE_ICU
77
78static void
79icu_test_simple(pg_wchar code)
80{
85 pg_wchar iculower = u_tolower(code);
86 pg_wchar icutitle = u_totitle(code);
87 pg_wchar icuupper = u_toupper(code);
88 pg_wchar icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT);
89
90 if (lower != iculower || title != icutitle || upper != icuupper ||
91 fold != icufold)
92 {
93 printf("case_test: FAILURE for codepoint 0x%06x\n", code);
94 printf("case_test: Postgres lower/title/upper/fold: 0x%06x/0x%06x/0x%06x/0x%06x\n",
95 lower, title, upper, fold);
96 printf("case_test: ICU lower/title/upper/fold: 0x%06x/0x%06x/0x%06x/0x%06x\n",
97 iculower, icutitle, icuupper, icufold);
98 printf("\n");
99 exit(1);
100 }
101}
102
103static void
104icu_test_full(char *str)
105{
106 char lower[BUFSZ];
107 char title[BUFSZ];
108 char upper[BUFSZ];
109 char fold[BUFSZ];
110 char icu_lower[BUFSZ];
111 char icu_title[BUFSZ];
112 char icu_upper[BUFSZ];
113 char icu_fold[BUFSZ];
114 UErrorCode status;
115 struct WordBoundaryState wbstate = {
116 .str = str,
117 .len = strlen(str),
118 .offset = 0,
119 .init = false,
120 .prev_alnum = false,
121 };
122
123 unicode_strlower(lower, BUFSZ, str, -1, true);
124 unicode_strtitle(title, BUFSZ, str, -1, true, initcap_wbnext, &wbstate);
125 unicode_strupper(upper, BUFSZ, str, -1, true);
126 unicode_strfold(fold, BUFSZ, str, -1, true);
127 status = U_ZERO_ERROR;
128 ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, -1, &status);
129 status = U_ZERO_ERROR;
130 ucasemap_utf8ToTitle(casemap, icu_title, BUFSZ, str, -1, &status);
131 status = U_ZERO_ERROR;
132 ucasemap_utf8ToUpper(casemap, icu_upper, BUFSZ, str, -1, &status);
133 status = U_ZERO_ERROR;
134 ucasemap_utf8FoldCase(casemap, icu_fold, BUFSZ, str, -1, &status);
135
136 if (strcmp(lower, icu_lower) != 0)
137 {
138 printf("case_test: str='%s' lower='%s' icu_lower='%s'\n", str, lower,
139 icu_lower);
140 exit(1);
141 }
142 if (strcmp(title, icu_title) != 0)
143 {
144 printf("case_test: str='%s' title='%s' icu_title='%s'\n", str, title,
145 icu_title);
146 exit(1);
147 }
148 if (strcmp(upper, icu_upper) != 0)
149 {
150 printf("case_test: str='%s' upper='%s' icu_upper='%s'\n", str, upper,
151 icu_upper);
152 exit(1);
153 }
154 if (strcmp(fold, icu_fold) != 0)
155 {
156 printf("case_test: str='%s' fold='%s' icu_fold='%s'\n", str, fold,
157 icu_fold);
158 exit(1);
159 }
160}
161
162/*
163 * Exhaustively compare case mappings with the results from ICU.
164 */
165static void
166test_icu(void)
167{
168 int successful = 0;
169 int skipped_mismatch = 0;
170
171 for (pg_wchar code = 0; code <= 0x10ffff; code++)
172 {
173 pg_unicode_category category = unicode_category(code);
174
175 if (category != PG_U_UNASSIGNED)
176 {
177 uint8_t icu_category = u_charType(code);
178 char code_str[5] = {0};
179
180 if (icu_category == PG_U_UNASSIGNED)
181 {
182 skipped_mismatch++;
183 continue;
184 }
185
186 icu_test_simple(code);
187 unicode_to_utf8(code, (unsigned char *) code_str);
188 icu_test_full(code_str);
189
190 successful++;
191 }
192 }
193
194 if (skipped_mismatch > 0)
195 printf("case_test: skipped %d codepoints unassigned in ICU due to Unicode version mismatch\n",
196 skipped_mismatch);
197
198 printf("case_test: ICU simple mapping test: %d codepoints successful\n",
199 successful);
200}
201#endif
202
203static void
204test_convert(TestFunc tfunc, const char *test_string, const char *expected)
205{
206 size_t src1len = strlen(test_string);
207 size_t src2len = -1; /* NUL-terminated */
208 size_t dst1len = strlen(expected);
209 size_t dst2len = strlen(expected) + 1; /* NUL-terminated */
210 char *src1 = malloc(src1len);
211 char *dst1 = malloc(dst1len);
212 char *src2 = strdup(test_string);
213 char *dst2 = malloc(dst2len);
214 size_t needed;
215
216 memcpy(src1, test_string, src1len); /* not NUL-terminated */
217
218 /* neither source nor destination are NUL-terminated */
219 memset(dst1, 0x7F, dst1len);
220 needed = tfunc(dst1, dst1len, src1, src1len);
221 if (needed != strlen(expected))
222 {
223 printf("case_test: convert_case test1 FAILURE: '%s' needed %zu expected %zu\n",
224 test_string, needed, strlen(expected));
225 exit(1);
226 }
227 if (memcmp(dst1, expected, dst1len) != 0)
228 {
229 printf("case_test: convert_case test1 FAILURE: test: '%s' result: '%.*s' expected: '%s'\n",
230 test_string, (int) dst1len, dst1, expected);
231 exit(1);
232 }
233
234 /* destination is NUL-terminated and source is not */
235 memset(dst2, 0x7F, dst2len);
236 needed = tfunc(dst2, dst2len, src1, src1len);
237 if (needed != strlen(expected))
238 {
239 printf("case_test: convert_case test2 FAILURE: '%s' needed %zu expected %zu\n",
240 test_string, needed, strlen(expected));
241 exit(1);
242 }
243 if (strcmp(dst2, expected) != 0)
244 {
245 printf("case_test: convert_case test2 FAILURE: test: '%s' result: '%s' expected: '%s'\n",
246 test_string, dst2, expected);
247 exit(1);
248 }
249
250 /* source is NUL-terminated and destination is not */
251 memset(dst1, 0x7F, dst1len);
252 needed = tfunc(dst1, dst1len, src2, src2len);
253 if (needed != strlen(expected))
254 {
255 printf("case_test: convert_case test3 FAILURE: '%s' needed %zu expected %zu\n",
256 test_string, needed, strlen(expected));
257 printf("case_test: convert_case test3 FAILURE: needed %zu\n", needed);
258 exit(1);
259 }
260 if (memcmp(dst1, expected, dst1len) != 0)
261 {
262 printf("case_test: convert_case test3 FAILURE: test: '%s' result: '%.*s' expected: '%s'\n",
263 test_string, (int) dst1len, dst1, expected);
264 exit(1);
265 }
266
267 /* both source and destination are NUL-terminated */
268 memset(dst2, 0x7F, dst2len);
269 needed = tfunc(dst2, dst2len, src2, src2len);
270 if (needed != strlen(expected))
271 {
272 printf("case_test: convert_case test4 FAILURE: '%s' needed %zu expected %zu\n",
273 test_string, needed, strlen(expected));
274 exit(1);
275 }
276 if (strcmp(dst2, expected) != 0)
277 {
278 printf("case_test: convert_case test4 FAILURE: test: '%s' result: '%s' expected: '%s'\n",
279 test_string, dst2, expected);
280 exit(1);
281 }
282
283 free(src1);
284 free(dst1);
285 free(src2);
286 free(dst2);
287}
288
/*
 * TestFunc adapter: convert src to lower case with unicode_strlower(),
 * passing true for its "full" argument, and hand back its return value.
 */
static size_t
tfunc_lower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
{
	size_t		needed = unicode_strlower(dst, dstsize, src, srclen, true);

	return needed;
}
295
296static size_t
297tfunc_title(char *dst, size_t dstsize, const char *src,
298 ssize_t srclen)
299{
300 struct WordBoundaryState wbstate = {
301 .str = src,
302 .len = srclen,
303 .offset = 0,
304 .init = false,
305 .prev_alnum = false,
306 };
307
308 return unicode_strtitle(dst, dstsize, src, srclen, true, initcap_wbnext,
309 &wbstate);
310}
311
/*
 * TestFunc adapter: convert src to upper case with unicode_strupper(),
 * passing true for its "full" argument, and hand back its return value.
 */
static size_t
tfunc_upper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
{
	size_t		needed = unicode_strupper(dst, dstsize, src, srclen, true);

	return needed;
}
318
/*
 * TestFunc adapter: case-fold src with unicode_strfold(), passing true for
 * its "full" argument, and hand back its return value.
 */
static size_t
tfunc_fold(char *dst, size_t dstsize, const char *src, ssize_t srclen)
{
	size_t		needed = unicode_strfold(dst, dstsize, src, srclen, true);

	return needed;
}
325
326static void
328{
329 /* test string with no case changes */
330 test_convert(tfunc_lower, "√∞", "√∞");
331 /* test adjust-to-cased behavior */
332 test_convert(tfunc_title, "abc 123xyz", "Abc 123xyz");
333 /* test string with case changes */
334 test_convert(tfunc_upper, "abc", "ABC");
335 /* test string with case changes and byte length changes */
336 test_convert(tfunc_lower, "ȺȺȺ", "ⱥⱥⱥ");
337 /* test special case conversions */
338 test_convert(tfunc_upper, "ß", "SS");
339 test_convert(tfunc_lower, "ıiIİ", "ıiii\u0307");
340 test_convert(tfunc_upper, "ıiIİ", "IIIİ");
341 test_convert(tfunc_fold, "ıiIİ", "ıiii\u0307");
342 /* test final sigma */
343 test_convert(tfunc_lower, "σςΣ ΣΣΣ", "σςς σσς");
344 test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
345 test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
346 test_convert(tfunc_fold, "σςΣ ΣΣΣ", "σσσ σσσ");
347
348#ifdef USE_ICU
349 icu_test_full("");
350 icu_test_full("ȺȺȺ");
351 icu_test_full("ßßß");
352 icu_test_full("√∞");
353 icu_test_full("a b");
354 icu_test_full("abc 123xyz");
355 icu_test_full("σςΣ ΣΣΣ");
356 icu_test_full("ıiIİ");
357 /* test <alpha><iota_subscript><acute> */
358 icu_test_full("\u0391\u0345\u0301");
359#endif
360
361 printf("case_test: convert_case: success\n");
362}
363
364int
365main(int argc, char **argv)
366{
367#ifdef USE_ICU
368 UErrorCode status = U_ZERO_ERROR;
369
370 /*
371 * Disable ICU's word break adjustment for titlecase to match the expected
372 * behavior of unicode_strtitle().
373 */
374 casemap = ucasemap_open("und", U_TITLECASE_NO_BREAK_ADJUSTMENT, &status);
375 if (U_FAILURE(status))
376 {
377 printf("case_test: failure opening UCaseMap: %s\n",
378 u_errorName(status));
379 exit(1);
380 }
381#endif
382
383 printf("case_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);
384#ifdef USE_ICU
385 printf("case_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
386 test_icu();
387#else
388 printf("case_test: ICU not available; skipping\n");
389#endif
390
392
393#ifdef USE_ICU
394 ucasemap_close(casemap);
395#endif
396 exit(0);
397}
static void test_convert_case()
Definition: case_test.c:327
static void test_convert(TestFunc tfunc, const char *test_string, const char *expected)
Definition: case_test.c:204
int main(int argc, char **argv)
Definition: case_test.c:365
static size_t tfunc_lower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: case_test.c:290
static size_t initcap_wbnext(void *state)
Definition: case_test.c:49
static size_t tfunc_title(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: case_test.c:297
static size_t tfunc_upper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: case_test.c:313
#define BUFSZ
Definition: case_test.c:29
static size_t tfunc_fold(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: case_test.c:320
size_t(* TestFunc)(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: case_test.c:35
const char * str
#define free(a)
Definition: header.h:65
#define malloc(a)
Definition: header.h:50
exit(1)
static pg_wchar utf8_to_unicode(const unsigned char *c)
Definition: mbprint.c:53
unsigned int pg_wchar
Definition: mbprint.c:31
Datum lower(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:49
Datum upper(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:80
static unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
Definition: pg_wchar.h:575
static int unicode_utf8len(pg_wchar c)
Definition: pg_wchar.h:607
#define printf(...)
Definition: port.h:245
Definition: regguts.h:323
pg_wchar unicode_uppercase_simple(pg_wchar code)
Definition: unicode_case.c:47
pg_wchar unicode_titlecase_simple(pg_wchar code)
Definition: unicode_case.c:39
size_t unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)
Definition: unicode_case.c:146
size_t unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)
Definition: unicode_case.c:82
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full, WordBoundaryNext wbnext, void *wbstate)
Definition: unicode_case.c:119
pg_wchar unicode_lowercase_simple(pg_wchar code)
Definition: unicode_case.c:31
size_t unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)
Definition: unicode_case.c:170
pg_wchar unicode_casefold_simple(pg_wchar code)
Definition: unicode_case.c:55
bool pg_u_isalnum(pg_wchar code, bool posix)
pg_unicode_category unicode_category(pg_wchar code)
pg_unicode_category
@ PG_U_UNASSIGNED
#define PG_UNICODE_VERSION