PostgreSQL Source Code  git master
unicode_category.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  * unicode_category.c
3  * Determine general category and character properties of Unicode
4  * characters. Encoding must be UTF8, where we assume that the pg_wchar
5  * representation is a code point.
6  *
7  * Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group
8  *
9  * IDENTIFICATION
10  * src/common/unicode_category.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 #ifndef FRONTEND
15 #include "postgres.h"
16 #else
17 #include "postgres_fe.h"
18 #endif
19 
22 
/*
 * Create bitmasks from pg_unicode_category values for efficient comparison of
 * multiple categories. For instance, PG_U_MN_MASK is a bitmask representing
 * the general category Mn; and PG_U_M_MASK represents general categories Mn,
 * Me, and Mc.
 *
 * The number of Unicode General Categories should never grow, so a 32-bit
 * mask is fine.
 *
 * The constant 1 is widened to uint32 before it is shifted, so the shift is
 * carried out in unsigned arithmetic and is well defined for every bit
 * position 0..31 (shifting a signed 1 into the sign bit would be undefined
 * behavior).
 */
#define PG_U_CATEGORY_MASK(X) ((uint32) 1 << (X))
33 
/*
 * One single-bit mask per general category, followed in each group by the
 * union mask for that group. Every single-bit mask is built with
 * PG_U_CATEGORY_MASK().
 */

/* Letters: Lu, Ll, Lt, Lm, Lo. LC is the union of Lu, Ll and Lt; L is all five. */
34 #define PG_U_LU_MASK PG_U_CATEGORY_MASK(PG_U_UPPERCASE_LETTER)
35 #define PG_U_LL_MASK PG_U_CATEGORY_MASK(PG_U_LOWERCASE_LETTER)
36 #define PG_U_LT_MASK PG_U_CATEGORY_MASK(PG_U_TITLECASE_LETTER)
37 #define PG_U_LC_MASK (PG_U_LU_MASK|PG_U_LL_MASK|PG_U_LT_MASK)
38 #define PG_U_LM_MASK PG_U_CATEGORY_MASK(PG_U_MODIFIER_LETTER)
39 #define PG_U_LO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_LETTER)
40 #define PG_U_L_MASK (PG_U_LU_MASK|PG_U_LL_MASK|PG_U_LT_MASK|PG_U_LM_MASK|\
41  PG_U_LO_MASK)
/* Marks: Mn, Me, Mc; M is the union of the three. */
42 #define PG_U_MN_MASK PG_U_CATEGORY_MASK(PG_U_NONSPACING_MARK)
43 #define PG_U_ME_MASK PG_U_CATEGORY_MASK(PG_U_ENCLOSING_MARK)
44 #define PG_U_MC_MASK PG_U_CATEGORY_MASK(PG_U_SPACING_MARK)
45 #define PG_U_M_MASK (PG_U_MN_MASK|PG_U_MC_MASK|PG_U_ME_MASK)
/* Numbers: Nd, Nl, No; N is the union of the three. */
46 #define PG_U_ND_MASK PG_U_CATEGORY_MASK(PG_U_DECIMAL_NUMBER)
47 #define PG_U_NL_MASK PG_U_CATEGORY_MASK(PG_U_LETTER_NUMBER)
48 #define PG_U_NO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_NUMBER)
49 #define PG_U_N_MASK (PG_U_ND_MASK|PG_U_NL_MASK|PG_U_NO_MASK)
/* Punctuation: Pc, Pd, Ps, Pe, Pi, Pf, Po; P is the union of all seven. */
50 #define PG_U_PC_MASK PG_U_CATEGORY_MASK(PG_U_CONNECTOR_PUNCTUATION)
51 #define PG_U_PD_MASK PG_U_CATEGORY_MASK(PG_U_DASH_PUNCTUATION)
52 #define PG_U_PS_MASK PG_U_CATEGORY_MASK(PG_U_OPEN_PUNCTUATION)
53 #define PG_U_PE_MASK PG_U_CATEGORY_MASK(PG_U_CLOSE_PUNCTUATION)
54 #define PG_U_PI_MASK PG_U_CATEGORY_MASK(PG_U_INITIAL_PUNCTUATION)
55 #define PG_U_PF_MASK PG_U_CATEGORY_MASK(PG_U_FINAL_PUNCTUATION)
56 #define PG_U_PO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_PUNCTUATION)
57 #define PG_U_P_MASK (PG_U_PC_MASK|PG_U_PD_MASK|PG_U_PS_MASK|PG_U_PE_MASK|\
58  PG_U_PI_MASK|PG_U_PF_MASK|PG_U_PO_MASK)
/* Symbols: Sm, Sc, Sk, So; S is the union of the four. */
59 #define PG_U_SM_MASK PG_U_CATEGORY_MASK(PG_U_MATH_SYMBOL)
60 #define PG_U_SC_MASK PG_U_CATEGORY_MASK(PG_U_CURRENCY_SYMBOL)
61 #define PG_U_SK_MASK PG_U_CATEGORY_MASK(PG_U_MODIFIER_SYMBOL)
62 #define PG_U_SO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_SYMBOL)
63 #define PG_U_S_MASK (PG_U_SM_MASK|PG_U_SC_MASK|PG_U_SK_MASK|PG_U_SO_MASK)
/* Separators: Zs, Zl, Zp; Z is the union of the three. */
64 #define PG_U_ZS_MASK PG_U_CATEGORY_MASK(PG_U_SPACE_SEPARATOR)
65 #define PG_U_ZL_MASK PG_U_CATEGORY_MASK(PG_U_LINE_SEPARATOR)
66 #define PG_U_ZP_MASK PG_U_CATEGORY_MASK(PG_U_PARAGRAPH_SEPARATOR)
67 #define PG_U_Z_MASK (PG_U_ZS_MASK|PG_U_ZL_MASK|PG_U_ZP_MASK)
/* Other: Cc, Cf, Cs, Co, Cn; C is the union of all five. */
68 #define PG_U_CC_MASK PG_U_CATEGORY_MASK(PG_U_CONTROL)
69 #define PG_U_CF_MASK PG_U_CATEGORY_MASK(PG_U_FORMAT)
70 #define PG_U_CS_MASK PG_U_CATEGORY_MASK(PG_U_SURROGATE)
71 #define PG_U_CO_MASK PG_U_CATEGORY_MASK(PG_U_PRIVATE_USE)
72 #define PG_U_CN_MASK PG_U_CATEGORY_MASK(PG_U_UNASSIGNED)
73 #define PG_U_C_MASK (PG_U_CC_MASK|PG_U_CF_MASK|PG_U_CS_MASK|PG_U_CO_MASK|\
74  PG_U_CN_MASK)
75 
/* Codepoint U+0009; this is a codepoint value, not a category mask. */
76 #define PG_U_CHARACTER_TAB 0x09
77 
78 static bool range_search(const pg_unicode_range *tbl, size_t size,
79  pg_wchar code);
80 
81 /*
82  * Unicode general category for the given codepoint.
83  */
86 {
87  int min = 0;
88  int mid;
89  int max = lengthof(unicode_categories) - 1;
90 
91  Assert(code <= 0x10ffff);
92 
93  if (code < 0x80)
94  return unicode_opt_ascii[code].category;
95 
96  while (max >= min)
97  {
98  mid = (min + max) / 2;
99  if (code > unicode_categories[mid].last)
100  min = mid + 1;
101  else if (code < unicode_categories[mid].first)
102  max = mid - 1;
103  else
104  return unicode_categories[mid].category;
105  }
106 
107  return PG_U_UNASSIGNED;
108 }
109 
110 bool
112 {
113  if (code < 0x80)
115 
118  code);
119 }
120 
121 bool
123 {
124  if (code < 0x80)
126 
129  code);
130 }
131 
132 bool
134 {
135  if (code < 0x80)
137 
140  code);
141 }
142 
143 bool
145 {
146  uint32 category_mask;
147 
148  if (code < 0x80)
150 
151  category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
152 
153  return category_mask & PG_U_LT_MASK ||
154  pg_u_prop_lowercase(code) ||
155  pg_u_prop_uppercase(code);
156 }
157 
158 bool
160 {
161  if (code < 0x80)
163 
166  code);
167 }
168 
169 bool
171 {
172  if (code < 0x80)
174 
177  code);
178 }
179 
180 bool
182 {
183  if (code < 0x80)
185 
188  code);
189 }
190 
191 bool
193 {
194  if (code < 0x80)
196 
199  code);
200 }
201 
202 /*
203  * The following functions implement the Compatibility Properties described
204  * at: http://www.unicode.org/reports/tr18/#Compatibility_Properties
205  *
206  * If 'posix' is true, implements the "POSIX Compatible" variant, otherwise
207  * the "Standard" variant.
208  */
209 
210 bool
211 pg_u_isdigit(pg_wchar code, bool posix)
212 {
213  if (posix)
214  return ('0' <= code && code <= '9');
215  else
216  return unicode_category(code) == PG_U_DECIMAL_NUMBER;
217 }
218 
219 bool
221 {
222  return pg_u_prop_alphabetic(code);
223 }
224 
225 bool
226 pg_u_isalnum(pg_wchar code, bool posix)
227 {
228  return pg_u_isalpha(code) || pg_u_isdigit(code, posix);
229 }
230 
231 bool
233 {
234  uint32 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
235 
236  return
237  category_mask & (PG_U_M_MASK | PG_U_ND_MASK | PG_U_PC_MASK) ||
238  pg_u_isalpha(code) ||
240 }
241 
242 bool
244 {
245  return pg_u_prop_uppercase(code);
246 }
247 
248 bool
250 {
251  return pg_u_prop_lowercase(code);
252 }
253 
254 bool
256 {
257  return code == PG_U_CHARACTER_TAB ||
259 }
260 
261 bool
263 {
264  return unicode_category(code) == PG_U_CONTROL;
265 }
266 
267 bool
269 {
270  uint32 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
271 
272  if (category_mask & (PG_U_CC_MASK | PG_U_CS_MASK | PG_U_CN_MASK) ||
273  pg_u_isspace(code))
274  return false;
275  return true;
276 }
277 
278 bool
280 {
281  pg_unicode_category category = unicode_category(code);
282 
283  if (category == PG_U_CONTROL)
284  return false;
285 
286  return pg_u_isgraph(code) || pg_u_isblank(code);
287 }
288 
289 bool
290 pg_u_ispunct(pg_wchar code, bool posix)
291 {
292  uint32 category_mask;
293 
294  if (posix)
295  {
296  if (pg_u_isalpha(code))
297  return false;
298 
299  category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
300  return category_mask & (PG_U_P_MASK | PG_U_S_MASK);
301  }
302  else
303  {
304  category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
305 
306  return category_mask & PG_U_P_MASK;
307  }
308 }
309 
310 bool
312 {
313  return pg_u_prop_white_space(code);
314 }
315 
316 bool
317 pg_u_isxdigit(pg_wchar code, bool posix)
318 {
319  if (posix)
320  return (('0' <= code && code <= '9') ||
321  ('A' <= code && code <= 'F') ||
322  ('a' <= code && code <= 'f'));
323  else
324  return unicode_category(code) == PG_U_DECIMAL_NUMBER ||
325  pg_u_prop_hex_digit(code);
326 }
327 
328 /*
329  * Description of Unicode general category.
330  */
331 const char *
333 {
334  switch (category)
335  {
336  case PG_U_UNASSIGNED:
337  return "Unassigned";
339  return "Uppercase_Letter";
341  return "Lowercase_Letter";
343  return "Titlecase_Letter";
345  return "Modifier_Letter";
346  case PG_U_OTHER_LETTER:
347  return "Other_Letter";
349  return "Nonspacing_Mark";
350  case PG_U_ENCLOSING_MARK:
351  return "Enclosing_Mark";
352  case PG_U_SPACING_MARK:
353  return "Spacing_Mark";
354  case PG_U_DECIMAL_NUMBER:
355  return "Decimal_Number";
356  case PG_U_LETTER_NUMBER:
357  return "Letter_Number";
358  case PG_U_OTHER_NUMBER:
359  return "Other_Number";
361  return "Space_Separator";
362  case PG_U_LINE_SEPARATOR:
363  return "Line_Separator";
365  return "Paragraph_Separator";
366  case PG_U_CONTROL:
367  return "Control";
368  case PG_U_FORMAT:
369  return "Format";
370  case PG_U_PRIVATE_USE:
371  return "Private_Use";
372  case PG_U_SURROGATE:
373  return "Surrogate";
375  return "Dash_Punctuation";
377  return "Open_Punctuation";
379  return "Close_Punctuation";
381  return "Connector_Punctuation";
383  return "Other_Punctuation";
384  case PG_U_MATH_SYMBOL:
385  return "Math_Symbol";
387  return "Currency_Symbol";
389  return "Modifier_Symbol";
390  case PG_U_OTHER_SYMBOL:
391  return "Other_Symbol";
393  return "Initial_Punctuation";
395  return "Final_Punctuation";
396  }
397 
398  Assert(false);
399  return "Unrecognized"; /* keep compiler quiet */
400 }
401 
402 /*
403  * Short code for Unicode general category.
404  */
405 const char *
407 {
408  switch (category)
409  {
410  case PG_U_UNASSIGNED:
411  return "Cn";
413  return "Lu";
415  return "Ll";
417  return "Lt";
419  return "Lm";
420  case PG_U_OTHER_LETTER:
421  return "Lo";
423  return "Mn";
424  case PG_U_ENCLOSING_MARK:
425  return "Me";
426  case PG_U_SPACING_MARK:
427  return "Mc";
428  case PG_U_DECIMAL_NUMBER:
429  return "Nd";
430  case PG_U_LETTER_NUMBER:
431  return "Nl";
432  case PG_U_OTHER_NUMBER:
433  return "No";
435  return "Zs";
436  case PG_U_LINE_SEPARATOR:
437  return "Zl";
439  return "Zp";
440  case PG_U_CONTROL:
441  return "Cc";
442  case PG_U_FORMAT:
443  return "Cf";
444  case PG_U_PRIVATE_USE:
445  return "Co";
446  case PG_U_SURROGATE:
447  return "Cs";
449  return "Pd";
451  return "Ps";
453  return "Pe";
455  return "Pc";
457  return "Po";
458  case PG_U_MATH_SYMBOL:
459  return "Sm";
461  return "Sc";
463  return "Sk";
464  case PG_U_OTHER_SYMBOL:
465  return "So";
467  return "Pi";
469  return "Pf";
470  }
471 
472  Assert(false);
473  return "??"; /* keep compiler quiet */
474 }
475 
476 /*
477  * Binary search to test if given codepoint exists in one of the ranges in the
478  * given table.
479  */
480 static bool
481 range_search(const pg_unicode_range *tbl, size_t size, pg_wchar code)
482 {
483  int min = 0;
484  int mid;
485  int max = size - 1;
486 
487  Assert(code <= 0x10ffff);
488 
489  while (max >= min)
490  {
491  mid = (min + max) / 2;
492  if (code > tbl[mid].last)
493  min = mid + 1;
494  else if (code < tbl[mid].first)
495  max = mid - 1;
496  else
497  return true;
498  }
499 
500  return false;
501 }
unsigned int uint32
Definition: c.h:506
#define Assert(condition)
Definition: c.h:858
#define lengthof(array)
Definition: c.h:788
unsigned int pg_wchar
Definition: mbprint.c:31
static pg_noinline void Size size
Definition: slab.c:607
bool pg_u_prop_uppercase(pg_wchar code)
#define PG_U_CC_MASK
bool pg_u_isspace(pg_wchar code)
bool pg_u_isxdigit(pg_wchar code, bool posix)
bool pg_u_ispunct(pg_wchar code, bool posix)
bool pg_u_isprint(pg_wchar code)
const char * unicode_category_string(pg_unicode_category category)
bool pg_u_islower(pg_wchar code)
#define PG_U_CATEGORY_MASK(X)
#define PG_U_CHARACTER_TAB
#define PG_U_CS_MASK
bool pg_u_prop_white_space(pg_wchar code)
#define PG_U_PC_MASK
bool pg_u_isblank(pg_wchar code)
bool pg_u_prop_cased(pg_wchar code)
bool pg_u_isalpha(pg_wchar code)
bool pg_u_prop_lowercase(pg_wchar code)
#define PG_U_ND_MASK
#define PG_U_M_MASK
static bool range_search(const pg_unicode_range *tbl, size_t size, pg_wchar code)
const char * unicode_category_abbrev(pg_unicode_category category)
#define PG_U_P_MASK
bool pg_u_isalnum(pg_wchar code, bool posix)
bool pg_u_isupper(pg_wchar code)
bool pg_u_prop_alphabetic(pg_wchar code)
#define PG_U_LT_MASK
bool pg_u_isdigit(pg_wchar code, bool posix)
bool pg_u_iscntrl(pg_wchar code)
bool pg_u_prop_join_control(pg_wchar code)
bool pg_u_isgraph(pg_wchar code)
bool pg_u_isword(pg_wchar code)
bool pg_u_prop_case_ignorable(pg_wchar code)
bool pg_u_prop_hex_digit(pg_wchar code)
pg_unicode_category unicode_category(pg_wchar code)
#define PG_U_S_MASK
#define PG_U_CN_MASK
pg_unicode_category
@ PG_U_CONNECTOR_PUNCTUATION
@ PG_U_OTHER_SYMBOL
@ PG_U_DASH_PUNCTUATION
@ PG_U_UPPERCASE_LETTER
@ PG_U_DECIMAL_NUMBER
@ PG_U_CLOSE_PUNCTUATION
@ PG_U_NONSPACING_MARK
@ PG_U_INITIAL_PUNCTUATION
@ PG_U_CURRENCY_SYMBOL
@ PG_U_LETTER_NUMBER
@ PG_U_MODIFIER_SYMBOL
@ PG_U_SPACE_SEPARATOR
@ PG_U_OPEN_PUNCTUATION
@ PG_U_FORMAT
@ PG_U_PRIVATE_USE
@ PG_U_OTHER_LETTER
@ PG_U_PARAGRAPH_SEPARATOR
@ PG_U_CONTROL
@ PG_U_SPACING_MARK
@ PG_U_TITLECASE_LETTER
@ PG_U_OTHER_NUMBER
@ PG_U_MATH_SYMBOL
@ PG_U_LOWERCASE_LETTER
@ PG_U_LINE_SEPARATOR
@ PG_U_UNASSIGNED
@ PG_U_SURROGATE
@ PG_U_FINAL_PUNCTUATION
@ PG_U_MODIFIER_LETTER
@ PG_U_OTHER_PUNCTUATION
@ PG_U_ENCLOSING_MARK
#define PG_U_PROP_ALPHABETIC
static const pg_unicode_range unicode_white_space[11]
#define PG_U_PROP_LOWERCASE
#define PG_U_PROP_JOIN_CONTROL
#define PG_U_PROP_UPPERCASE
#define PG_U_PROP_HEX_DIGIT
static const pg_unicode_range unicode_join_control[1]
static const pg_category_range unicode_categories[3302]
static const pg_unicode_range unicode_alphabetic[1141]
static const pg_unicode_properties unicode_opt_ascii[128]
static const pg_unicode_range unicode_case_ignorable[491]
static const pg_unicode_range unicode_uppercase[651]
#define PG_U_PROP_WHITE_SPACE
static const pg_unicode_range unicode_hex_digit[6]
#define PG_U_PROP_CASED
static const pg_unicode_range unicode_lowercase[686]
#define PG_U_PROP_CASE_IGNORABLE