PostgreSQL Source Code git master
unicode_category.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 * unicode_category.c
3 * Determine general category and character properties of Unicode
4 * characters. Encoding must be UTF8, where we assume that the pg_wchar
5 * representation is a code point.
6 *
7 * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
8 *
9 * IDENTIFICATION
10 * src/common/unicode_category.c
11 *
12 *-------------------------------------------------------------------------
13 */
14#ifndef FRONTEND
15#include "postgres.h"
16#else
17#include "postgres_fe.h"
18#endif
19
22
/*
 * Turn a pg_unicode_category value into a single-bit mask, so that membership
 * in any of several categories can be tested with one bitwise AND.  For
 * instance, PG_U_MN_MASK is a bitmask representing the general category Mn;
 * and PG_U_M_MASK represents general categories Mn, Me, and Mc.
 *
 * The number of Unicode General Categories should never grow, so a 32-bit
 * mask is fine.
 *
 * The constant is converted to uint32 before shifting, not after: shifting a
 * plain (signed) int into its sign bit is undefined behavior, whereas the
 * unsigned shift is well defined for every bit position 0..31.
 */
#define PG_U_CATEGORY_MASK(X) (((uint32) 1) << (X))
33
/*
 * One mask per general category, followed by the union mask(s) for the
 * major class the categories belong to.
 */

/* L: letters; LC is the cased subset (Lu, Ll, Lt) */
#define PG_U_LU_MASK	PG_U_CATEGORY_MASK(PG_U_UPPERCASE_LETTER)
#define PG_U_LL_MASK	PG_U_CATEGORY_MASK(PG_U_LOWERCASE_LETTER)
#define PG_U_LT_MASK	PG_U_CATEGORY_MASK(PG_U_TITLECASE_LETTER)
#define PG_U_LM_MASK	PG_U_CATEGORY_MASK(PG_U_MODIFIER_LETTER)
#define PG_U_LO_MASK	PG_U_CATEGORY_MASK(PG_U_OTHER_LETTER)
#define PG_U_LC_MASK	(PG_U_LU_MASK | PG_U_LL_MASK | PG_U_LT_MASK)
#define PG_U_L_MASK		(PG_U_LC_MASK | PG_U_LM_MASK | PG_U_LO_MASK)

/* M: marks */
#define PG_U_MN_MASK	PG_U_CATEGORY_MASK(PG_U_NONSPACING_MARK)
#define PG_U_ME_MASK	PG_U_CATEGORY_MASK(PG_U_ENCLOSING_MARK)
#define PG_U_MC_MASK	PG_U_CATEGORY_MASK(PG_U_SPACING_MARK)
#define PG_U_M_MASK		(PG_U_MN_MASK | PG_U_ME_MASK | PG_U_MC_MASK)

/* N: numbers */
#define PG_U_ND_MASK	PG_U_CATEGORY_MASK(PG_U_DECIMAL_NUMBER)
#define PG_U_NL_MASK	PG_U_CATEGORY_MASK(PG_U_LETTER_NUMBER)
#define PG_U_NO_MASK	PG_U_CATEGORY_MASK(PG_U_OTHER_NUMBER)
#define PG_U_N_MASK		(PG_U_ND_MASK | PG_U_NL_MASK | PG_U_NO_MASK)

/* P: punctuation */
#define PG_U_PC_MASK	PG_U_CATEGORY_MASK(PG_U_CONNECTOR_PUNCTUATION)
#define PG_U_PD_MASK	PG_U_CATEGORY_MASK(PG_U_DASH_PUNCTUATION)
#define PG_U_PS_MASK	PG_U_CATEGORY_MASK(PG_U_OPEN_PUNCTUATION)
#define PG_U_PE_MASK	PG_U_CATEGORY_MASK(PG_U_CLOSE_PUNCTUATION)
#define PG_U_PI_MASK	PG_U_CATEGORY_MASK(PG_U_INITIAL_PUNCTUATION)
#define PG_U_PF_MASK	PG_U_CATEGORY_MASK(PG_U_FINAL_PUNCTUATION)
#define PG_U_PO_MASK	PG_U_CATEGORY_MASK(PG_U_OTHER_PUNCTUATION)
#define PG_U_P_MASK		(PG_U_PC_MASK | PG_U_PD_MASK | PG_U_PS_MASK | \
						 PG_U_PE_MASK | PG_U_PI_MASK | PG_U_PF_MASK | \
						 PG_U_PO_MASK)

/* S: symbols */
#define PG_U_SM_MASK	PG_U_CATEGORY_MASK(PG_U_MATH_SYMBOL)
#define PG_U_SC_MASK	PG_U_CATEGORY_MASK(PG_U_CURRENCY_SYMBOL)
#define PG_U_SK_MASK	PG_U_CATEGORY_MASK(PG_U_MODIFIER_SYMBOL)
#define PG_U_SO_MASK	PG_U_CATEGORY_MASK(PG_U_OTHER_SYMBOL)
#define PG_U_S_MASK		(PG_U_SM_MASK | PG_U_SC_MASK | PG_U_SK_MASK | \
						 PG_U_SO_MASK)

/* Z: separators */
#define PG_U_ZS_MASK	PG_U_CATEGORY_MASK(PG_U_SPACE_SEPARATOR)
#define PG_U_ZL_MASK	PG_U_CATEGORY_MASK(PG_U_LINE_SEPARATOR)
#define PG_U_ZP_MASK	PG_U_CATEGORY_MASK(PG_U_PARAGRAPH_SEPARATOR)
#define PG_U_Z_MASK		(PG_U_ZS_MASK | PG_U_ZL_MASK | PG_U_ZP_MASK)

/* C: control, format, surrogate, private use, unassigned */
#define PG_U_CC_MASK	PG_U_CATEGORY_MASK(PG_U_CONTROL)
#define PG_U_CF_MASK	PG_U_CATEGORY_MASK(PG_U_FORMAT)
#define PG_U_CS_MASK	PG_U_CATEGORY_MASK(PG_U_SURROGATE)
#define PG_U_CO_MASK	PG_U_CATEGORY_MASK(PG_U_PRIVATE_USE)
#define PG_U_CN_MASK	PG_U_CATEGORY_MASK(PG_U_UNASSIGNED)
#define PG_U_C_MASK		(PG_U_CC_MASK | PG_U_CF_MASK | PG_U_CS_MASK | \
						 PG_U_CO_MASK | PG_U_CN_MASK)

/* Code point of the tab character (0x09) */
#define PG_U_CHARACTER_TAB	0x09
77
78static bool range_search(const pg_unicode_range *tbl, size_t size,
79 pg_wchar code);
80
81/*
82 * Unicode general category for the given codepoint.
83 */
86{
87 int min = 0;
88 int mid;
89 int max = lengthof(unicode_categories) - 1;
90
91 Assert(code <= 0x10ffff);
92
93 if (code < 0x80)
94 return unicode_opt_ascii[code].category;
95
96 while (max >= min)
97 {
98 mid = (min + max) / 2;
99 if (code > unicode_categories[mid].last)
100 min = mid + 1;
101 else if (code < unicode_categories[mid].first)
102 max = mid - 1;
103 else
104 return unicode_categories[mid].category;
105 }
106
107 return PG_U_UNASSIGNED;
108}
109
110bool
112{
113 if (code < 0x80)
115
118 code);
119}
120
121bool
123{
124 if (code < 0x80)
126
129 code);
130}
131
132bool
134{
135 if (code < 0x80)
137
140 code);
141}
142
143bool
145{
146 uint32 category_mask;
147
148 if (code < 0x80)
150
151 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
152
153 return category_mask & PG_U_LT_MASK ||
154 pg_u_prop_lowercase(code) ||
156}
157
158bool
160{
161 if (code < 0x80)
163
166 code);
167}
168
169bool
171{
172 if (code < 0x80)
174
177 code);
178}
179
180bool
182{
183 if (code < 0x80)
185
188 code);
189}
190
191bool
193{
194 if (code < 0x80)
196
199 code);
200}
201
202/*
203 * The following functions implement the Compatibility Properties described
204 * at: http://www.unicode.org/reports/tr18/#Compatibility_Properties
205 *
206 * If 'posix' is true, implements the "POSIX Compatible" variant, otherwise
207 * the "Standard" variant.
208 */
209
210bool
211pg_u_isdigit(pg_wchar code, bool posix)
212{
213 if (posix)
214 return ('0' <= code && code <= '9');
215 else
217}
218
219bool
221{
222 return pg_u_prop_alphabetic(code);
223}
224
225bool
226pg_u_isalnum(pg_wchar code, bool posix)
227{
228 return pg_u_isalpha(code) || pg_u_isdigit(code, posix);
229}
230
231bool
233{
234 uint32 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
235
236 return
237 category_mask & (PG_U_M_MASK | PG_U_ND_MASK | PG_U_PC_MASK) ||
238 pg_u_isalpha(code) ||
240}
241
242bool
244{
245 return pg_u_prop_uppercase(code);
246}
247
248bool
250{
251 return pg_u_prop_lowercase(code);
252}
253
254bool
256{
257 return code == PG_U_CHARACTER_TAB ||
259}
260
261bool
263{
264 return unicode_category(code) == PG_U_CONTROL;
265}
266
267bool
269{
270 uint32 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
271
272 if (category_mask & (PG_U_CC_MASK | PG_U_CS_MASK | PG_U_CN_MASK) ||
273 pg_u_isspace(code))
274 return false;
275 return true;
276}
277
278bool
280{
281 pg_unicode_category category = unicode_category(code);
282
283 if (category == PG_U_CONTROL)
284 return false;
285
286 return pg_u_isgraph(code) || pg_u_isblank(code);
287}
288
289bool
290pg_u_ispunct(pg_wchar code, bool posix)
291{
292 uint32 category_mask;
293
294 if (posix)
295 {
296 if (pg_u_isalpha(code))
297 return false;
298
299 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
300 return category_mask & (PG_U_P_MASK | PG_U_S_MASK);
301 }
302 else
303 {
304 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
305
306 return category_mask & PG_U_P_MASK;
307 }
308}
309
310bool
312{
313 return pg_u_prop_white_space(code);
314}
315
316bool
317pg_u_isxdigit(pg_wchar code, bool posix)
318{
319 if (posix)
320 return (('0' <= code && code <= '9') ||
321 ('A' <= code && code <= 'F') ||
322 ('a' <= code && code <= 'f'));
323 else
324 return unicode_category(code) == PG_U_DECIMAL_NUMBER ||
326}
327
328/*
329 * Description of Unicode general category.
330 */
331const char *
333{
334 switch (category)
335 {
336 case PG_U_UNASSIGNED:
337 return "Unassigned";
339 return "Uppercase_Letter";
341 return "Lowercase_Letter";
343 return "Titlecase_Letter";
345 return "Modifier_Letter";
347 return "Other_Letter";
349 return "Nonspacing_Mark";
351 return "Enclosing_Mark";
353 return "Spacing_Mark";
355 return "Decimal_Number";
357 return "Letter_Number";
359 return "Other_Number";
361 return "Space_Separator";
363 return "Line_Separator";
365 return "Paragraph_Separator";
366 case PG_U_CONTROL:
367 return "Control";
368 case PG_U_FORMAT:
369 return "Format";
370 case PG_U_PRIVATE_USE:
371 return "Private_Use";
372 case PG_U_SURROGATE:
373 return "Surrogate";
375 return "Dash_Punctuation";
377 return "Open_Punctuation";
379 return "Close_Punctuation";
381 return "Connector_Punctuation";
383 return "Other_Punctuation";
384 case PG_U_MATH_SYMBOL:
385 return "Math_Symbol";
387 return "Currency_Symbol";
389 return "Modifier_Symbol";
391 return "Other_Symbol";
393 return "Initial_Punctuation";
395 return "Final_Punctuation";
396 }
397
398 Assert(false);
399 return "Unrecognized"; /* keep compiler quiet */
400}
401
402/*
403 * Short code for Unicode general category.
404 */
405const char *
407{
408 switch (category)
409 {
410 case PG_U_UNASSIGNED:
411 return "Cn";
413 return "Lu";
415 return "Ll";
417 return "Lt";
419 return "Lm";
421 return "Lo";
423 return "Mn";
425 return "Me";
427 return "Mc";
429 return "Nd";
431 return "Nl";
433 return "No";
435 return "Zs";
437 return "Zl";
439 return "Zp";
440 case PG_U_CONTROL:
441 return "Cc";
442 case PG_U_FORMAT:
443 return "Cf";
444 case PG_U_PRIVATE_USE:
445 return "Co";
446 case PG_U_SURROGATE:
447 return "Cs";
449 return "Pd";
451 return "Ps";
453 return "Pe";
455 return "Pc";
457 return "Po";
458 case PG_U_MATH_SYMBOL:
459 return "Sm";
461 return "Sc";
463 return "Sk";
465 return "So";
467 return "Pi";
469 return "Pf";
470 }
471
472 Assert(false);
473 return "??"; /* keep compiler quiet */
474}
475
476/*
477 * Binary search to test if given codepoint exists in one of the ranges in the
478 * given table.
479 */
480static bool
481range_search(const pg_unicode_range *tbl, size_t size, pg_wchar code)
482{
483 int min = 0;
484 int mid;
485 int max = size - 1;
486
487 Assert(code <= 0x10ffff);
488
489 while (max >= min)
490 {
491 mid = (min + max) / 2;
492 if (code > tbl[mid].last)
493 min = mid + 1;
494 else if (code < tbl[mid].first)
495 max = mid - 1;
496 else
497 return true;
498 }
499
500 return false;
501}
uint32_t uint32
Definition: c.h:502
#define lengthof(array)
Definition: c.h:759
Assert(PointerIsAligned(start, uint64))
unsigned int pg_wchar
Definition: mbprint.c:31
bool pg_u_prop_uppercase(pg_wchar code)
#define PG_U_CC_MASK
bool pg_u_isspace(pg_wchar code)
bool pg_u_isxdigit(pg_wchar code, bool posix)
bool pg_u_ispunct(pg_wchar code, bool posix)
const char * unicode_category_string(pg_unicode_category category)
bool pg_u_isprint(pg_wchar code)
bool pg_u_islower(pg_wchar code)
const char * unicode_category_abbrev(pg_unicode_category category)
#define PG_U_CATEGORY_MASK(X)
#define PG_U_CHARACTER_TAB
#define PG_U_CS_MASK
bool pg_u_prop_white_space(pg_wchar code)
#define PG_U_PC_MASK
bool pg_u_isblank(pg_wchar code)
bool pg_u_prop_cased(pg_wchar code)
bool pg_u_isalpha(pg_wchar code)
bool pg_u_prop_lowercase(pg_wchar code)
#define PG_U_ND_MASK
#define PG_U_M_MASK
static bool range_search(const pg_unicode_range *tbl, size_t size, pg_wchar code)
#define PG_U_P_MASK
bool pg_u_isalnum(pg_wchar code, bool posix)
bool pg_u_isupper(pg_wchar code)
bool pg_u_prop_alphabetic(pg_wchar code)
#define PG_U_LT_MASK
bool pg_u_isdigit(pg_wchar code, bool posix)
bool pg_u_iscntrl(pg_wchar code)
bool pg_u_prop_join_control(pg_wchar code)
bool pg_u_isgraph(pg_wchar code)
bool pg_u_isword(pg_wchar code)
bool pg_u_prop_case_ignorable(pg_wchar code)
bool pg_u_prop_hex_digit(pg_wchar code)
pg_unicode_category unicode_category(pg_wchar code)
#define PG_U_S_MASK
#define PG_U_CN_MASK
pg_unicode_category
@ PG_U_CONNECTOR_PUNCTUATION
@ PG_U_OTHER_SYMBOL
@ PG_U_DASH_PUNCTUATION
@ PG_U_UPPERCASE_LETTER
@ PG_U_DECIMAL_NUMBER
@ PG_U_CLOSE_PUNCTUATION
@ PG_U_NONSPACING_MARK
@ PG_U_INITIAL_PUNCTUATION
@ PG_U_CURRENCY_SYMBOL
@ PG_U_LETTER_NUMBER
@ PG_U_MODIFIER_SYMBOL
@ PG_U_SPACE_SEPARATOR
@ PG_U_OPEN_PUNCTUATION
@ PG_U_FORMAT
@ PG_U_PRIVATE_USE
@ PG_U_OTHER_LETTER
@ PG_U_PARAGRAPH_SEPARATOR
@ PG_U_CONTROL
@ PG_U_SPACING_MARK
@ PG_U_TITLECASE_LETTER
@ PG_U_OTHER_NUMBER
@ PG_U_MATH_SYMBOL
@ PG_U_LOWERCASE_LETTER
@ PG_U_LINE_SEPARATOR
@ PG_U_UNASSIGNED
@ PG_U_SURROGATE
@ PG_U_FINAL_PUNCTUATION
@ PG_U_MODIFIER_LETTER
@ PG_U_OTHER_PUNCTUATION
@ PG_U_ENCLOSING_MARK
#define PG_U_PROP_ALPHABETIC
static const pg_unicode_range unicode_white_space[11]
#define PG_U_PROP_LOWERCASE
#define PG_U_PROP_JOIN_CONTROL
#define PG_U_PROP_UPPERCASE
#define PG_U_PROP_HEX_DIGIT
static const pg_unicode_range unicode_join_control[1]
static const pg_category_range unicode_categories[3302]
static const pg_unicode_range unicode_alphabetic[1141]
static const pg_unicode_properties unicode_opt_ascii[128]
static const pg_unicode_range unicode_case_ignorable[491]
static const pg_unicode_range unicode_uppercase[651]
#define PG_U_PROP_WHITE_SPACE
static const pg_unicode_range unicode_hex_digit[6]
#define PG_U_PROP_CASED
static const pg_unicode_range unicode_lowercase[686]
#define PG_U_PROP_CASE_IGNORABLE