PostgreSQL Source Code  git master
category_test.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  * category_test.c
3  * Program to test Unicode general category and character properties.
4  *
5  * Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group
6  *
7  * IDENTIFICATION
8  * src/common/unicode/category_test.c
9  *
10  *-------------------------------------------------------------------------
11  */
12 #include "postgres_fe.h"
13 
14 #include <stdio.h>
15 #include <stdlib.h>
16 #include <string.h>
17 #include <wctype.h>
18 
19 #ifdef USE_ICU
20 #include <unicode/uchar.h>
21 #endif
22 
24 #include "common/unicode_version.h"
25 
/*
 * Unicode versions in the integer encoding returned by
 * parse_unicode_version() (major * 100 + minor), so that two versions can be
 * compared with ordinary integer comparison.  Both start out as 0.
 *
 * pg_unicode_version is for the Unicode version of the Postgres tables;
 * icu_unicode_version is for the Unicode version of the ICU library and only
 * exists when building with ICU.
 */
static int	pg_unicode_version = 0;
#ifdef USE_ICU
static int	icu_unicode_version = 0;
#endif
30 
/*
 * Parse a Unicode version string of the form "major.minor" into a single
 * integer, major * 100 + minor, for easy comparison.
 *
 * Anything following the minor number (such as a third component) is
 * ignored.
 *
 * If the string does not begin with two dot-separated integers, or the minor
 * number does not fit in two decimal digits, report the problem and exit with
 * status 1.  The check is done unconditionally, rather than with Assert(), so
 * that a build without assertions can never return a value computed from
 * unset variables.
 */
static int
parse_unicode_version(const char *version)
{
	int			major = 0;
	int			minor = 0;

	/* sscanf returns the number of conversions stored; we need both */
	if (sscanf(version, "%d.%d", &major, &minor) != 2 ||
		major < 0 || minor < 0 || minor >= 100)
	{
		printf("category_test: could not parse Unicode version \"%s\"\n",
			   version);
		exit(1);
	}

	return major * 100 + minor;
}
48 
#ifdef USE_ICU
/*
 * Test Postgres Unicode tables by comparing with ICU, for every codepoint
 * from 0 through 0x10ffff.
 *
 * Three things are compared for each codepoint:
 *   - the General Category;
 *   - the properties Alphabetic, Lowercase, Uppercase, Cased,
 *     Case_Ignorable, White_Space, Hex_Digit, and Join_Control;
 *   - the regex character classes alpha, lower, upper, punct, digit, xdigit,
 *     alnum, space, blank, cntrl, graph, and print.
 *
 * On the first disagreement, both sides' values are printed and the program
 * exits with status 1.  Otherwise a summary is printed: how many codepoints
 * were skipped because of a Unicode version mismatch, and how many codepoints
 * that are assigned in Postgres compared equal.
 */
static void
icu_test(void)
{
	int			successful = 0;
	int			pg_skipped_codepoints = 0;
	int			icu_skipped_codepoints = 0;

	for (pg_wchar code = 0; code <= 0x10ffff; code++)
	{
		uint8_t		pg_category = unicode_category(code);
		uint8_t		icu_category = u_charType(code);

		/* Property tests */
		bool		prop_alphabetic = pg_u_prop_alphabetic(code);
		bool		prop_lowercase = pg_u_prop_lowercase(code);
		bool		prop_uppercase = pg_u_prop_uppercase(code);
		bool		prop_cased = pg_u_prop_cased(code);
		bool		prop_case_ignorable = pg_u_prop_case_ignorable(code);
		bool		prop_white_space = pg_u_prop_white_space(code);
		bool		prop_hex_digit = pg_u_prop_hex_digit(code);
		bool		prop_join_control = pg_u_prop_join_control(code);

		bool		icu_prop_alphabetic = u_hasBinaryProperty(
			code, UCHAR_ALPHABETIC);
		bool		icu_prop_lowercase = u_hasBinaryProperty(
			code, UCHAR_LOWERCASE);
		bool		icu_prop_uppercase = u_hasBinaryProperty(
			code, UCHAR_UPPERCASE);
		bool		icu_prop_cased = u_hasBinaryProperty(
			code, UCHAR_CASED);
		bool		icu_prop_case_ignorable = u_hasBinaryProperty(
			code, UCHAR_CASE_IGNORABLE);
		bool		icu_prop_white_space = u_hasBinaryProperty(
			code, UCHAR_WHITE_SPACE);
		bool		icu_prop_hex_digit = u_hasBinaryProperty(
			code, UCHAR_HEX_DIGIT);
		bool		icu_prop_join_control = u_hasBinaryProperty(
			code, UCHAR_JOIN_CONTROL);

		/*
		 * Compare with ICU for character classes using:
		 *
		 * https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/uchar_8h.html#details
		 *
		 * which describes how to use ICU to test for membership in regex
		 * character classes.
		 *
		 * NB: the document suggests testing for some properties such as
		 * UCHAR_POSIX_ALNUM, but that doesn't mean that we're testing for the
		 * "POSIX Compatible" character classes.
		 */
		bool		isalpha = pg_u_isalpha(code);
		bool		islower = pg_u_islower(code);
		bool		isupper = pg_u_isupper(code);
		bool		ispunct = pg_u_ispunct(code, false);
		bool		isdigit = pg_u_isdigit(code, false);
		bool		isxdigit = pg_u_isxdigit(code, false);
		bool		isalnum = pg_u_isalnum(code, false);
		bool		isspace = pg_u_isspace(code);
		bool		isblank = pg_u_isblank(code);
		bool		iscntrl = pg_u_iscntrl(code);
		bool		isgraph = pg_u_isgraph(code);
		bool		isprint = pg_u_isprint(code);

		bool		icu_isalpha = u_isUAlphabetic(code);
		bool		icu_islower = u_isULowercase(code);
		bool		icu_isupper = u_isUUppercase(code);
		bool		icu_ispunct = u_ispunct(code);
		bool		icu_isdigit = u_isdigit(code);
		bool		icu_isxdigit = u_hasBinaryProperty(code,
													   UCHAR_POSIX_XDIGIT);
		bool		icu_isalnum = u_hasBinaryProperty(code,
													  UCHAR_POSIX_ALNUM);
		bool		icu_isspace = u_isUWhiteSpace(code);
		bool		icu_isblank = u_isblank(code);

		/*
		 * The ICU category is tested against the Postgres constant; like the
		 * direct category comparison below, this treats the two category
		 * numberings as interchangeable.
		 */
		bool		icu_iscntrl = icu_category == PG_U_CONTROL;
		bool		icu_isgraph = u_hasBinaryProperty(code,
													  UCHAR_POSIX_GRAPH);
		bool		icu_isprint = u_hasBinaryProperty(code,
													  UCHAR_POSIX_PRINT);

		/*
		 * A version mismatch means that some assigned codepoints in the newer
		 * version may be unassigned in the older version. That's OK, though
		 * the test will not cover those codepoints marked unassigned in the
		 * older version (that is, it will no longer be an exhaustive test).
		 */
		if (pg_category == PG_U_UNASSIGNED &&
			icu_category != PG_U_UNASSIGNED &&
			pg_unicode_version < icu_unicode_version)
		{
			pg_skipped_codepoints++;
			continue;
		}

		if (icu_category == PG_U_UNASSIGNED &&
			pg_category != PG_U_UNASSIGNED &&
			icu_unicode_version < pg_unicode_version)
		{
			icu_skipped_codepoints++;
			continue;
		}

		if (pg_category != icu_category)
		{
			printf("category_test: FAILURE for codepoint 0x%06x\n", code);
			printf("category_test: Postgres category: %02d %s %s\n", pg_category,
				   unicode_category_abbrev(pg_category),
				   unicode_category_string(pg_category));
			printf("category_test: ICU category: %02d %s %s\n", icu_category,
				   unicode_category_abbrev(icu_category),
				   unicode_category_string(icu_category));
			printf("\n");
			exit(1);
		}

		if (prop_alphabetic != icu_prop_alphabetic ||
			prop_lowercase != icu_prop_lowercase ||
			prop_uppercase != icu_prop_uppercase ||
			prop_cased != icu_prop_cased ||
			prop_case_ignorable != icu_prop_case_ignorable ||
			prop_white_space != icu_prop_white_space ||
			prop_hex_digit != icu_prop_hex_digit ||
			prop_join_control != icu_prop_join_control)
		{
			printf("category_test: FAILURE for codepoint 0x%06x\n", code);
			printf("category_test: Postgres property alphabetic/lowercase/uppercase/cased/case_ignorable/white_space/hex_digit/join_control: %d/%d/%d/%d/%d/%d/%d/%d\n",
				   prop_alphabetic, prop_lowercase, prop_uppercase,
				   prop_cased, prop_case_ignorable,
				   prop_white_space, prop_hex_digit, prop_join_control);
			printf("category_test: ICU property alphabetic/lowercase/uppercase/cased/case_ignorable/white_space/hex_digit/join_control: %d/%d/%d/%d/%d/%d/%d/%d\n",
				   icu_prop_alphabetic, icu_prop_lowercase, icu_prop_uppercase,
				   icu_prop_cased, icu_prop_case_ignorable,
				   icu_prop_white_space, icu_prop_hex_digit, icu_prop_join_control);
			printf("\n");
			exit(1);
		}

		if (isalpha != icu_isalpha ||
			islower != icu_islower ||
			isupper != icu_isupper ||
			ispunct != icu_ispunct ||
			isdigit != icu_isdigit ||
			isxdigit != icu_isxdigit ||
			isalnum != icu_isalnum ||
			isspace != icu_isspace ||
			isblank != icu_isblank ||
			iscntrl != icu_iscntrl ||
			isgraph != icu_isgraph ||
			isprint != icu_isprint)
		{
			printf("category_test: FAILURE for codepoint 0x%06x\n", code);
			printf("category_test: Postgres class alpha/lower/upper/punct/digit/xdigit/alnum/space/blank/cntrl/graph/print: %d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d\n",
				   isalpha, islower, isupper, ispunct, isdigit, isxdigit, isalnum, isspace, isblank, iscntrl, isgraph, isprint);
			printf("category_test: ICU class alpha/lower/upper/punct/digit/xdigit/alnum/space/blank/cntrl/graph/print: %d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d\n",
				   icu_isalpha, icu_islower, icu_isupper, icu_ispunct, icu_isdigit, icu_isxdigit, icu_isalnum, icu_isspace, icu_isblank, icu_iscntrl, icu_isgraph, icu_isprint);
			printf("\n");
			exit(1);
		}

		/* Codepoints unassigned on both sides agree, but are not counted */
		if (pg_category != PG_U_UNASSIGNED)
			successful++;
	}

	if (pg_skipped_codepoints > 0)
		printf("category_test: skipped %d codepoints unassigned in Postgres due to Unicode version mismatch\n",
			   pg_skipped_codepoints);
	if (icu_skipped_codepoints > 0)
		printf("category_test: skipped %d codepoints unassigned in ICU due to Unicode version mismatch\n",
			   icu_skipped_codepoints);

	printf("category_test: ICU test: %d codepoints successful\n", successful);
}
#endif
229 
230 int
231 main(int argc, char **argv)
232 {
234  printf("category_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);
235 
236 #ifdef USE_ICU
237  icu_unicode_version = parse_unicode_version(U_UNICODE_VERSION);
238  printf("category_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
239 
240  icu_test();
241 #else
242  printf("category_test: ICU not available; skipping\n");
243 #endif
244 }
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:182
#define Assert(condition)
Definition: c.h:858
static int pg_unicode_version
Definition: category_test.c:26
int main(int argc, char **argv)
static int parse_unicode_version(const char *version)
Definition: category_test.c:35
exit(1)
unsigned int pg_wchar
Definition: mbprint.c:31
#define printf(...)
Definition: port.h:244
bool pg_u_prop_uppercase(pg_wchar code)
bool pg_u_isspace(pg_wchar code)
bool pg_u_isxdigit(pg_wchar code, bool posix)
bool pg_u_ispunct(pg_wchar code, bool posix)
bool pg_u_isprint(pg_wchar code)
const char * unicode_category_string(pg_unicode_category category)
bool pg_u_islower(pg_wchar code)
bool pg_u_prop_white_space(pg_wchar code)
bool pg_u_isblank(pg_wchar code)
bool pg_u_prop_cased(pg_wchar code)
bool pg_u_isalpha(pg_wchar code)
bool pg_u_prop_lowercase(pg_wchar code)
const char * unicode_category_abbrev(pg_unicode_category category)
bool pg_u_isalnum(pg_wchar code, bool posix)
bool pg_u_isupper(pg_wchar code)
bool pg_u_prop_alphabetic(pg_wchar code)
bool pg_u_isdigit(pg_wchar code, bool posix)
bool pg_u_iscntrl(pg_wchar code)
bool pg_u_prop_join_control(pg_wchar code)
bool pg_u_isgraph(pg_wchar code)
bool pg_u_prop_case_ignorable(pg_wchar code)
bool pg_u_prop_hex_digit(pg_wchar code)
pg_unicode_category unicode_category(pg_wchar code)
@ PG_U_CONTROL
@ PG_U_UNASSIGNED
#define PG_UNICODE_VERSION
Datum icu_unicode_version(PG_FUNCTION_ARGS)
Definition: varlena.c:6302