PostgreSQL Source Code git master
Loading...
Searching...
No Matches
unicode_category.c File Reference
Include dependency graph for unicode_category.c:

Go to the source code of this file.

Macros

#define PG_U_CATEGORY_MASK(X)   ((uint32)(1 << (X)))
 
#define PG_U_LU_MASK   PG_U_CATEGORY_MASK(PG_U_UPPERCASE_LETTER)
 
#define PG_U_LL_MASK   PG_U_CATEGORY_MASK(PG_U_LOWERCASE_LETTER)
 
#define PG_U_LT_MASK   PG_U_CATEGORY_MASK(PG_U_TITLECASE_LETTER)
 
#define PG_U_LC_MASK   (PG_U_LU_MASK|PG_U_LL_MASK|PG_U_LT_MASK)
 
#define PG_U_LM_MASK   PG_U_CATEGORY_MASK(PG_U_MODIFIER_LETTER)
 
#define PG_U_LO_MASK   PG_U_CATEGORY_MASK(PG_U_OTHER_LETTER)
 
#define PG_U_L_MASK
 
#define PG_U_MN_MASK   PG_U_CATEGORY_MASK(PG_U_NONSPACING_MARK)
 
#define PG_U_ME_MASK   PG_U_CATEGORY_MASK(PG_U_ENCLOSING_MARK)
 
#define PG_U_MC_MASK   PG_U_CATEGORY_MASK(PG_U_SPACING_MARK)
 
#define PG_U_M_MASK   (PG_U_MN_MASK|PG_U_MC_MASK|PG_U_ME_MASK)
 
#define PG_U_ND_MASK   PG_U_CATEGORY_MASK(PG_U_DECIMAL_NUMBER)
 
#define PG_U_NL_MASK   PG_U_CATEGORY_MASK(PG_U_LETTER_NUMBER)
 
#define PG_U_NO_MASK   PG_U_CATEGORY_MASK(PG_U_OTHER_NUMBER)
 
#define PG_U_N_MASK   (PG_U_ND_MASK|PG_U_NL_MASK|PG_U_NO_MASK)
 
#define PG_U_PC_MASK   PG_U_CATEGORY_MASK(PG_U_CONNECTOR_PUNCTUATION)
 
#define PG_U_PD_MASK   PG_U_CATEGORY_MASK(PG_U_DASH_PUNCTUATION)
 
#define PG_U_PS_MASK   PG_U_CATEGORY_MASK(PG_U_OPEN_PUNCTUATION)
 
#define PG_U_PE_MASK   PG_U_CATEGORY_MASK(PG_U_CLOSE_PUNCTUATION)
 
#define PG_U_PI_MASK   PG_U_CATEGORY_MASK(PG_U_INITIAL_PUNCTUATION)
 
#define PG_U_PF_MASK   PG_U_CATEGORY_MASK(PG_U_FINAL_PUNCTUATION)
 
#define PG_U_PO_MASK   PG_U_CATEGORY_MASK(PG_U_OTHER_PUNCTUATION)
 
#define PG_U_P_MASK
 
#define PG_U_SM_MASK   PG_U_CATEGORY_MASK(PG_U_MATH_SYMBOL)
 
#define PG_U_SC_MASK   PG_U_CATEGORY_MASK(PG_U_CURRENCY_SYMBOL)
 
#define PG_U_SK_MASK   PG_U_CATEGORY_MASK(PG_U_MODIFIER_SYMBOL)
 
#define PG_U_SO_MASK   PG_U_CATEGORY_MASK(PG_U_OTHER_SYMBOL)
 
#define PG_U_S_MASK   (PG_U_SM_MASK|PG_U_SC_MASK|PG_U_SK_MASK|PG_U_SO_MASK)
 
#define PG_U_ZS_MASK   PG_U_CATEGORY_MASK(PG_U_SPACE_SEPARATOR)
 
#define PG_U_ZL_MASK   PG_U_CATEGORY_MASK(PG_U_LINE_SEPARATOR)
 
#define PG_U_ZP_MASK   PG_U_CATEGORY_MASK(PG_U_PARAGRAPH_SEPARATOR)
 
#define PG_U_Z_MASK   (PG_U_ZS_MASK|PG_U_ZL_MASK|PG_U_ZP_MASK)
 
#define PG_U_CC_MASK   PG_U_CATEGORY_MASK(PG_U_CONTROL)
 
#define PG_U_CF_MASK   PG_U_CATEGORY_MASK(PG_U_FORMAT)
 
#define PG_U_CS_MASK   PG_U_CATEGORY_MASK(PG_U_SURROGATE)
 
#define PG_U_CO_MASK   PG_U_CATEGORY_MASK(PG_U_PRIVATE_USE)
 
#define PG_U_CN_MASK   PG_U_CATEGORY_MASK(PG_U_UNASSIGNED)
 
#define PG_U_C_MASK
 
#define PG_U_CHARACTER_TAB   0x09
 

Functions

static bool range_search (const pg_unicode_range *tbl, size_t size, char32_t code)
 
pg_unicode_category unicode_category (char32_t code)
 
bool pg_u_prop_alphabetic (char32_t code)
 
bool pg_u_prop_lowercase (char32_t code)
 
bool pg_u_prop_uppercase (char32_t code)
 
bool pg_u_prop_cased (char32_t code)
 
bool pg_u_prop_case_ignorable (char32_t code)
 
bool pg_u_prop_white_space (char32_t code)
 
bool pg_u_prop_hex_digit (char32_t code)
 
bool pg_u_prop_join_control (char32_t code)
 
bool pg_u_isdigit (char32_t code, bool posix)
 
bool pg_u_isalpha (char32_t code)
 
bool pg_u_isalnum (char32_t code, bool posix)
 
bool pg_u_isword (char32_t code)
 
bool pg_u_isupper (char32_t code)
 
bool pg_u_islower (char32_t code)
 
bool pg_u_isblank (char32_t code)
 
bool pg_u_iscntrl (char32_t code)
 
bool pg_u_isgraph (char32_t code)
 
bool pg_u_isprint (char32_t code)
 
bool pg_u_ispunct (char32_t code, bool posix)
 
bool pg_u_isspace (char32_t code)
 
bool pg_u_isxdigit (char32_t code, bool posix)
 
const char * unicode_category_string (pg_unicode_category category)
 
const char * unicode_category_abbrev (pg_unicode_category category)
 

Macro Definition Documentation

◆ PG_U_C_MASK

#define PG_U_C_MASK
Value:
(PG_U_CC_MASK|PG_U_CF_MASK|PG_U_CS_MASK|PG_U_CO_MASK|PG_U_CN_MASK)

Definition at line 73 of file unicode_category.c.

85{
86 int min = 0;
87 int mid;
88 int max = lengthof(unicode_categories) - 1;
89
90 Assert(code <= 0x10ffff);
91
92 if (code < 0x80)
93 return unicode_opt_ascii[code].category;
94
95 while (max >= min)
96 {
97 mid = (min + max) / 2;
98 if (code > unicode_categories[mid].last)
99 min = mid + 1;
100 else if (code < unicode_categories[mid].first)
101 max = mid - 1;
102 else
103 return unicode_categories[mid].category;
104 }
105
106 return PG_U_UNASSIGNED;
107}
108
109bool
110pg_u_prop_alphabetic(char32_t code)
111{
112 if (code < 0x80)
114
117 code);
118}
119
120bool
121pg_u_prop_lowercase(char32_t code)
122{
123 if (code < 0x80)
125
128 code);
129}
130
131bool
132pg_u_prop_uppercase(char32_t code)
133{
134 if (code < 0x80)
136
139 code);
140}
141
142bool
143pg_u_prop_cased(char32_t code)
144{
146
147 if (code < 0x80)
149
151
152 return category_mask & PG_U_LT_MASK ||
153 pg_u_prop_lowercase(code) ||
155}
156
157bool
158pg_u_prop_case_ignorable(char32_t code)
159{
160 if (code < 0x80)
162
165 code);
166}
167
168bool
169pg_u_prop_white_space(char32_t code)
170{
171 if (code < 0x80)
173
176 code);
177}
178
179bool
180pg_u_prop_hex_digit(char32_t code)
181{
182 if (code < 0x80)
184
187 code);
188}
189
190bool
191pg_u_prop_join_control(char32_t code)
192{
193 if (code < 0x80)
195
198 code);
199}
200
201/*
202 * The following functions implement the Compatibility Properties described
203 * at: http://www.unicode.org/reports/tr18/#Compatibility_Properties
204 *
205 * If 'posix' is true, implements the "POSIX Compatible" variant, otherwise
206 * the "Standard" variant.
207 */
208
209bool
210pg_u_isdigit(char32_t code, bool posix)
211{
212 if (posix)
213 return ('0' <= code && code <= '9');
214 else
215 return unicode_category(code) == PG_U_DECIMAL_NUMBER;
216}
217
218bool
219pg_u_isalpha(char32_t code)
220{
221 return pg_u_prop_alphabetic(code);
222}
223
224bool
225pg_u_isalnum(char32_t code, bool posix)
226{
227 return pg_u_isalpha(code) || pg_u_isdigit(code, posix);
228}
229
230bool
231pg_u_isword(char32_t code)
232{
234
235 return
237 pg_u_isalpha(code) ||
239}
240
241bool
242pg_u_isupper(char32_t code)
243{
244 return pg_u_prop_uppercase(code);
245}
246
247bool
248pg_u_islower(char32_t code)
249{
250 return pg_u_prop_lowercase(code);
251}
252
253bool
254pg_u_isblank(char32_t code)
255{
256 return code == PG_U_CHARACTER_TAB ||
257 unicode_category(code) == PG_U_SPACE_SEPARATOR;
258}
259
260bool
261pg_u_iscntrl(char32_t code)
262{
263 return unicode_category(code) == PG_U_CONTROL;
264}
265
266bool
267pg_u_isgraph(char32_t code)
268{
270
272 pg_u_isspace(code))
273 return false;
274 return true;
275}
276
277bool
278pg_u_isprint(char32_t code)
279{
280 pg_unicode_category category = unicode_category(code);
281
282 if (category == PG_U_CONTROL)
283 return false;
284
285 return pg_u_isgraph(code) || pg_u_isblank(code);
286}
287
288bool
289pg_u_ispunct(char32_t code, bool posix)
290{
292
293 if (posix)
294 {
295 if (pg_u_isalpha(code))
296 return false;
297
300 }
301 else
302 {
304
305 return category_mask & PG_U_P_MASK;
306 }
307}
308
309bool
310pg_u_isspace(char32_t code)
311{
312 return pg_u_prop_white_space(code);
313}
314
315bool
316pg_u_isxdigit(char32_t code, bool posix)
317{
318 if (posix)
319 return (('0' <= code && code <= '9') ||
320 ('A' <= code && code <= 'F') ||
321 ('a' <= code && code <= 'f'));
322 else
323 return unicode_category(code) == PG_U_DECIMAL_NUMBER ||
324 pg_u_prop_hex_digit(code);
325}
326
327/*
328 * Description of Unicode general category.
329 */
330const char *
332{
333 switch (category)
334 {
335 case PG_U_UNASSIGNED:
336 return "Unassigned";
338 return "Uppercase_Letter";
340 return "Lowercase_Letter";
342 return "Titlecase_Letter";
344 return "Modifier_Letter";
346 return "Other_Letter";
348 return "Nonspacing_Mark";
350 return "Enclosing_Mark";
352 return "Spacing_Mark";
354 return "Decimal_Number";
356 return "Letter_Number";
358 return "Other_Number";
360 return "Space_Separator";
362 return "Line_Separator";
364 return "Paragraph_Separator";
365 case PG_U_CONTROL:
366 return "Control";
367 case PG_U_FORMAT:
368 return "Format";
369 case PG_U_PRIVATE_USE:
370 return "Private_Use";
371 case PG_U_SURROGATE:
372 return "Surrogate";
374 return "Dash_Punctuation";
376 return "Open_Punctuation";
378 return "Close_Punctuation";
380 return "Connector_Punctuation";
382 return "Other_Punctuation";
383 case PG_U_MATH_SYMBOL:
384 return "Math_Symbol";
386 return "Currency_Symbol";
388 return "Modifier_Symbol";
390 return "Other_Symbol";
392 return "Initial_Punctuation";
394 return "Final_Punctuation";
395 }
396
397 Assert(false);
398 return "Unrecognized"; /* keep compiler quiet */
399}
400
401/*
402 * Short code for Unicode general category.
403 */
404const char *
406{
407 switch (category)
408 {
409 case PG_U_UNASSIGNED:
410 return "Cn";
412 return "Lu";
414 return "Ll";
416 return "Lt";
418 return "Lm";
420 return "Lo";
422 return "Mn";
424 return "Me";
426 return "Mc";
428 return "Nd";
430 return "Nl";
432 return "No";
434 return "Zs";
436 return "Zl";
438 return "Zp";
439 case PG_U_CONTROL:
440 return "Cc";
441 case PG_U_FORMAT:
442 return "Cf";
443 case PG_U_PRIVATE_USE:
444 return "Co";
445 case PG_U_SURROGATE:
446 return "Cs";
448 return "Pd";
450 return "Ps";
452 return "Pe";
454 return "Pc";
456 return "Po";
457 case PG_U_MATH_SYMBOL:
458 return "Sm";
460 return "Sc";
462 return "Sk";
464 return "So";
466 return "Pi";
468 return "Pf";
469 }
470
471 Assert(false);
472 return "??"; /* keep compiler quiet */
473}
474
475/*
476 * Binary search to test if given codepoint exists in one of the ranges in the
477 * given table.
478 */
479static bool
480range_search(const pg_unicode_range *tbl, size_t size, char32_t code)
481{
482 int min = 0;
483 int mid;
484 int max = size - 1;
485
486 Assert(code <= 0x10ffff);
487
488 while (max >= min)
489 {
490 mid = (min + max) / 2;
491 if (code > tbl[mid].last)
492 min = mid + 1;
493 else if (code < tbl[mid].first)
494 max = mid - 1;
495 else
496 return true;
497 }
498
499 return false;
500}
#define Assert(condition)
Definition c.h:873
uint32_t uint32
Definition c.h:546
#define lengthof(array)
Definition c.h:803
static int fb(int x)
bool pg_u_isalnum(char32_t code, bool posix)
const char * unicode_category_string(pg_unicode_category category)
bool pg_u_prop_cased(char32_t code)
bool pg_u_prop_white_space(char32_t code)
bool pg_u_isprint(char32_t code)
bool pg_u_islower(char32_t code)
const char * unicode_category_abbrev(pg_unicode_category category)
#define PG_U_CATEGORY_MASK(X)
bool pg_u_iscntrl(char32_t code)
#define PG_U_CHARACTER_TAB
pg_unicode_category unicode_category(char32_t code)
bool pg_u_prop_lowercase(char32_t code)
#define PG_U_PC_MASK
bool pg_u_prop_join_control(char32_t code)
bool pg_u_isdigit(char32_t code, bool posix)
bool pg_u_isalpha(char32_t code)
bool pg_u_prop_uppercase(char32_t code)
bool pg_u_isword(char32_t code)
#define PG_U_ND_MASK
#define PG_U_M_MASK
bool pg_u_isxdigit(char32_t code, bool posix)
bool pg_u_prop_case_ignorable(char32_t code)
#define PG_U_P_MASK
bool pg_u_ispunct(char32_t code, bool posix)
bool pg_u_prop_hex_digit(char32_t code)
bool pg_u_isblank(char32_t code)
#define PG_U_LT_MASK
bool pg_u_isgraph(char32_t code)
bool pg_u_isspace(char32_t code)
static bool range_search(const pg_unicode_range *tbl, size_t size, char32_t code)
bool pg_u_isupper(char32_t code)
#define PG_U_S_MASK
bool pg_u_prop_alphabetic(char32_t code)
pg_unicode_category
@ PG_U_CONNECTOR_PUNCTUATION
@ PG_U_OTHER_SYMBOL
@ PG_U_DASH_PUNCTUATION
@ PG_U_UPPERCASE_LETTER
@ PG_U_DECIMAL_NUMBER
@ PG_U_CLOSE_PUNCTUATION
@ PG_U_NONSPACING_MARK
@ PG_U_INITIAL_PUNCTUATION
@ PG_U_CURRENCY_SYMBOL
@ PG_U_LETTER_NUMBER
@ PG_U_MODIFIER_SYMBOL
@ PG_U_SPACE_SEPARATOR
@ PG_U_OPEN_PUNCTUATION
@ PG_U_FORMAT
@ PG_U_PRIVATE_USE
@ PG_U_OTHER_LETTER
@ PG_U_PARAGRAPH_SEPARATOR
@ PG_U_CONTROL
@ PG_U_SPACING_MARK
@ PG_U_TITLECASE_LETTER
@ PG_U_OTHER_NUMBER
@ PG_U_MATH_SYMBOL
@ PG_U_LOWERCASE_LETTER
@ PG_U_LINE_SEPARATOR
@ PG_U_UNASSIGNED
@ PG_U_SURROGATE
@ PG_U_FINAL_PUNCTUATION
@ PG_U_MODIFIER_LETTER
@ PG_U_OTHER_PUNCTUATION
@ PG_U_ENCLOSING_MARK
#define PG_U_PROP_ALPHABETIC
static const pg_unicode_range unicode_white_space[11]
#define PG_U_PROP_LOWERCASE
#define PG_U_PROP_JOIN_CONTROL
#define PG_U_PROP_UPPERCASE
#define PG_U_PROP_HEX_DIGIT
static const pg_unicode_range unicode_join_control[1]
static const pg_unicode_range unicode_alphabetic[1179]
static const pg_unicode_range unicode_lowercase[690]
static const pg_unicode_range unicode_uppercase[656]
static const pg_unicode_range unicode_case_ignorable[506]
static const pg_unicode_properties unicode_opt_ascii[128]
#define PG_U_PROP_WHITE_SPACE
static const pg_unicode_range unicode_hex_digit[6]
#define PG_U_PROP_CASED
static const pg_category_range unicode_categories[3368]
#define PG_U_PROP_CASE_IGNORABLE

◆ PG_U_CATEGORY_MASK

#define PG_U_CATEGORY_MASK (   X)    ((uint32)(1 << (X)))

Definition at line 32 of file unicode_category.c.

◆ PG_U_CC_MASK

#define PG_U_CC_MASK   PG_U_CATEGORY_MASK(PG_U_CONTROL)

Definition at line 68 of file unicode_category.c.

◆ PG_U_CF_MASK

#define PG_U_CF_MASK   PG_U_CATEGORY_MASK(PG_U_FORMAT)

Definition at line 69 of file unicode_category.c.

◆ PG_U_CHARACTER_TAB

#define PG_U_CHARACTER_TAB   0x09

Definition at line 76 of file unicode_category.c.

◆ PG_U_CN_MASK

Definition at line 72 of file unicode_category.c.

◆ PG_U_CO_MASK

Definition at line 71 of file unicode_category.c.

◆ PG_U_CS_MASK

Definition at line 70 of file unicode_category.c.

◆ PG_U_L_MASK

#define PG_U_L_MASK
Value:
(PG_U_LU_MASK|PG_U_LL_MASK|PG_U_LT_MASK|PG_U_LM_MASK|PG_U_LO_MASK)

Definition at line 40 of file unicode_category.c.

◆ PG_U_LC_MASK

Definition at line 37 of file unicode_category.c.

◆ PG_U_LL_MASK

Definition at line 35 of file unicode_category.c.

◆ PG_U_LM_MASK

Definition at line 38 of file unicode_category.c.

◆ PG_U_LO_MASK

Definition at line 39 of file unicode_category.c.

◆ PG_U_LT_MASK

Definition at line 36 of file unicode_category.c.

◆ PG_U_LU_MASK

Definition at line 34 of file unicode_category.c.

◆ PG_U_M_MASK

Definition at line 45 of file unicode_category.c.

◆ PG_U_MC_MASK

Definition at line 44 of file unicode_category.c.

◆ PG_U_ME_MASK

Definition at line 43 of file unicode_category.c.

◆ PG_U_MN_MASK

Definition at line 42 of file unicode_category.c.

◆ PG_U_N_MASK

Definition at line 49 of file unicode_category.c.

◆ PG_U_ND_MASK

Definition at line 46 of file unicode_category.c.

◆ PG_U_NL_MASK

Definition at line 47 of file unicode_category.c.

◆ PG_U_NO_MASK

Definition at line 48 of file unicode_category.c.

◆ PG_U_P_MASK

#define PG_U_P_MASK
Value:
(PG_U_PC_MASK|PG_U_PD_MASK|PG_U_PS_MASK|PG_U_PE_MASK|PG_U_PI_MASK|PG_U_PF_MASK|PG_U_PO_MASK)

Definition at line 57 of file unicode_category.c.

◆ PG_U_PC_MASK

Definition at line 50 of file unicode_category.c.

◆ PG_U_PD_MASK

Definition at line 51 of file unicode_category.c.

◆ PG_U_PE_MASK

Definition at line 53 of file unicode_category.c.

◆ PG_U_PF_MASK

Definition at line 55 of file unicode_category.c.

◆ PG_U_PI_MASK

Definition at line 54 of file unicode_category.c.

◆ PG_U_PO_MASK

Definition at line 56 of file unicode_category.c.

◆ PG_U_PS_MASK

Definition at line 52 of file unicode_category.c.

◆ PG_U_S_MASK

Definition at line 63 of file unicode_category.c.

◆ PG_U_SC_MASK

Definition at line 60 of file unicode_category.c.

◆ PG_U_SK_MASK

Definition at line 61 of file unicode_category.c.

◆ PG_U_SM_MASK

Definition at line 59 of file unicode_category.c.

◆ PG_U_SO_MASK

Definition at line 62 of file unicode_category.c.

◆ PG_U_Z_MASK

Definition at line 67 of file unicode_category.c.

◆ PG_U_ZL_MASK

Definition at line 65 of file unicode_category.c.

◆ PG_U_ZP_MASK

Definition at line 66 of file unicode_category.c.

◆ PG_U_ZS_MASK

Definition at line 64 of file unicode_category.c.

Function Documentation

◆ pg_u_isalnum()

bool pg_u_isalnum ( char32_t  code,
bool  posix 
)

Definition at line 226 of file unicode_category.c.

227{
228 return pg_u_isalpha(code) || pg_u_isdigit(code, posix);
229}

References pg_u_isalpha(), and pg_u_isdigit().

Referenced by initcap_wbnext(), initcap_wbnext(), and wc_isalnum_builtin().

◆ pg_u_isalpha()

bool pg_u_isalpha ( char32_t  code)

Definition at line 220 of file unicode_category.c.

221{
222 return pg_u_prop_alphabetic(code);
223}

References pg_u_prop_alphabetic().

Referenced by pg_u_isalnum(), pg_u_ispunct(), pg_u_isword(), and wc_isalpha_builtin().

◆ pg_u_isblank()

bool pg_u_isblank ( char32_t  code)

Definition at line 255 of file unicode_category.c.

256{
257 return code == PG_U_CHARACTER_TAB ||
258 unicode_category(code) == PG_U_SPACE_SEPARATOR;
259}

References PG_U_CHARACTER_TAB, PG_U_SPACE_SEPARATOR, and unicode_category().

Referenced by pg_u_isprint().

◆ pg_u_iscntrl()

bool pg_u_iscntrl ( char32_t  code)

Definition at line 262 of file unicode_category.c.

263{
264 return unicode_category(code) == PG_U_CONTROL;
265}

References PG_U_CONTROL, and unicode_category().

◆ pg_u_isdigit()

bool pg_u_isdigit ( char32_t  code,
bool  posix 
)

Definition at line 211 of file unicode_category.c.

212{
213 if (posix)
214 return ('0' <= code && code <= '9');
215 else
216 return unicode_category(code) == PG_U_DECIMAL_NUMBER;
217}

References PG_U_DECIMAL_NUMBER, and unicode_category().

Referenced by pg_u_isalnum(), and wc_isdigit_builtin().

◆ pg_u_isgraph()

bool pg_u_isgraph ( char32_t  code)

Definition at line 268 of file unicode_category.c.

269{
271
273 pg_u_isspace(code))
274 return false;
275 return true;
276}

References fb(), PG_U_CATEGORY_MASK, PG_U_CC_MASK, PG_U_CN_MASK, PG_U_CS_MASK, pg_u_isspace(), and unicode_category().

Referenced by pg_u_isprint(), and wc_isgraph_builtin().

◆ pg_u_islower()

bool pg_u_islower ( char32_t  code)

Definition at line 249 of file unicode_category.c.

250{
251 return pg_u_prop_lowercase(code);
252}

References pg_u_prop_lowercase().

Referenced by wc_islower_builtin().

◆ pg_u_isprint()

bool pg_u_isprint ( char32_t  code)

Definition at line 279 of file unicode_category.c.

280{
281 pg_unicode_category category = unicode_category(code);
282
283 if (category == PG_U_CONTROL)
284 return false;
285
286 return pg_u_isgraph(code) || pg_u_isblank(code);
287}

References PG_U_CONTROL, pg_u_isblank(), pg_u_isgraph(), and unicode_category().

Referenced by wc_isprint_builtin().

◆ pg_u_ispunct()

bool pg_u_ispunct ( char32_t  code,
bool  posix 
)

Definition at line 290 of file unicode_category.c.

291{
293
294 if (posix)
295 {
296 if (pg_u_isalpha(code))
297 return false;
298
301 }
302 else
303 {
305
306 return category_mask & PG_U_P_MASK;
307 }
308}

References fb(), PG_U_CATEGORY_MASK, pg_u_isalpha(), PG_U_P_MASK, PG_U_S_MASK, and unicode_category().

Referenced by wc_ispunct_builtin().

◆ pg_u_isspace()

bool pg_u_isspace ( char32_t  code)

Definition at line 311 of file unicode_category.c.

312{
313 return pg_u_prop_white_space(code);
314}

References pg_u_prop_white_space().

Referenced by pg_u_isgraph(), and wc_isspace_builtin().

◆ pg_u_isupper()

bool pg_u_isupper ( char32_t  code)

Definition at line 243 of file unicode_category.c.

244{
245 return pg_u_prop_uppercase(code);
246}

References pg_u_prop_uppercase().

Referenced by wc_isupper_builtin().

◆ pg_u_isword()

◆ pg_u_isxdigit()

bool pg_u_isxdigit ( char32_t  code,
bool  posix 
)

Definition at line 317 of file unicode_category.c.

318{
319 if (posix)
320 return (('0' <= code && code <= '9') ||
321 ('A' <= code && code <= 'F') ||
322 ('a' <= code && code <= 'f'));
323 else
324 return unicode_category(code) == PG_U_DECIMAL_NUMBER ||
325 pg_u_prop_hex_digit(code);
326}

References PG_U_DECIMAL_NUMBER, pg_u_prop_hex_digit(), and unicode_category().

Referenced by wc_isxdigit_builtin().

◆ pg_u_prop_alphabetic()

bool pg_u_prop_alphabetic ( char32_t  code)

◆ pg_u_prop_case_ignorable()

bool pg_u_prop_case_ignorable ( char32_t  code)

◆ pg_u_prop_cased()

◆ pg_u_prop_hex_digit()

bool pg_u_prop_hex_digit ( char32_t  code)

Definition at line 181 of file unicode_category.c.

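182{
183 if (code < 0x80)
184 return unicode_opt_ascii[code].properties & PG_U_PROP_HEX_DIGIT;
185
186 return range_search(unicode_hex_digit,
187 lengthof(unicode_hex_digit),
188 code);
189}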

References lengthof, PG_U_PROP_HEX_DIGIT, pg_unicode_properties::properties, range_search(), unicode_hex_digit, and unicode_opt_ascii.

Referenced by pg_u_isxdigit().

◆ pg_u_prop_join_control()

bool pg_u_prop_join_control ( char32_t  code)

◆ pg_u_prop_lowercase()

bool pg_u_prop_lowercase ( char32_t  code)

◆ pg_u_prop_uppercase()

bool pg_u_prop_uppercase ( char32_t  code)

◆ pg_u_prop_white_space()

bool pg_u_prop_white_space ( char32_t  code)

◆ range_search()

static bool range_search ( const pg_unicode_range *  tbl,
size_t  size,
char32_t  code 
)
static

Definition at line 481 of file unicode_category.c.

482{
483 int min = 0;
484 int mid;
485 int max = size - 1;
486
487 Assert(code <= 0x10ffff);
488
489 while (max >= min)
490 {
491 mid = (min + max) / 2;
492 if (code > tbl[mid].last)
493 min = mid + 1;
494 else if (code < tbl[mid].first)
495 max = mid - 1;
496 else
497 return true;
498 }
499
500 return false;
501}

References Assert, and fb().

Referenced by pg_u_prop_alphabetic(), pg_u_prop_case_ignorable(), pg_u_prop_hex_digit(), pg_u_prop_join_control(), pg_u_prop_lowercase(), pg_u_prop_uppercase(), and pg_u_prop_white_space().

◆ unicode_category()

pg_unicode_category unicode_category ( char32_t  code)

Definition at line 85 of file unicode_category.c.

86{
87 int min = 0;
88 int mid;
89 int max = lengthof(unicode_categories) - 1;
90
91 Assert(code <= 0x10ffff);
92
93 if (code < 0x80)
94 return unicode_opt_ascii[code].category;
95
96 while (max >= min)
97 {
98 mid = (min + max) / 2;
99 if (code > unicode_categories[mid].last)
100 min = mid + 1;
101 else if (code < unicode_categories[mid].first)
102 max = mid - 1;
103 else
104 return unicode_categories[mid].category;
105 }
106
107 return PG_U_UNASSIGNED;
108}

References Assert, pg_category_range::category, pg_unicode_properties::category, lengthof, PG_U_UNASSIGNED, unicode_categories, and unicode_opt_ascii.

Referenced by pg_u_isblank(), pg_u_iscntrl(), pg_u_isdigit(), pg_u_isgraph(), pg_u_isprint(), pg_u_ispunct(), pg_u_isword(), pg_u_isxdigit(), pg_u_prop_cased(), and unicode_assigned().

◆ unicode_category_abbrev()

const char * unicode_category_abbrev ( pg_unicode_category  category)

Definition at line 406 of file unicode_category.c.

407{
408 switch (category)
409 {
410 case PG_U_UNASSIGNED:
411 return "Cn";
413 return "Lu";
415 return "Ll";
417 return "Lt";
419 return "Lm";
421 return "Lo";
423 return "Mn";
425 return "Me";
427 return "Mc";
429 return "Nd";
431 return "Nl";
433 return "No";
435 return "Zs";
437 return "Zl";
439 return "Zp";
440 case PG_U_CONTROL:
441 return "Cc";
442 case PG_U_FORMAT:
443 return "Cf";
444 case PG_U_PRIVATE_USE:
445 return "Co";
446 case PG_U_SURROGATE:
447 return "Cs";
449 return "Pd";
451 return "Ps";
453 return "Pe";
455 return "Pc";
457 return "Po";
458 case PG_U_MATH_SYMBOL:
459 return "Sm";
461 return "Sc";
463 return "Sk";
465 return "So";
467 return "Pi";
469 return "Pf";
470 }
471
472 Assert(false);
473 return "??"; /* keep compiler quiet */
474}

References Assert, PG_U_CLOSE_PUNCTUATION, PG_U_CONNECTOR_PUNCTUATION, PG_U_CONTROL, PG_U_CURRENCY_SYMBOL, PG_U_DASH_PUNCTUATION, PG_U_DECIMAL_NUMBER, PG_U_ENCLOSING_MARK, PG_U_FINAL_PUNCTUATION, PG_U_FORMAT, PG_U_INITIAL_PUNCTUATION, PG_U_LETTER_NUMBER, PG_U_LINE_SEPARATOR, PG_U_LOWERCASE_LETTER, PG_U_MATH_SYMBOL, PG_U_MODIFIER_LETTER, PG_U_MODIFIER_SYMBOL, PG_U_NONSPACING_MARK, PG_U_OPEN_PUNCTUATION, PG_U_OTHER_LETTER, PG_U_OTHER_NUMBER, PG_U_OTHER_PUNCTUATION, PG_U_OTHER_SYMBOL, PG_U_PARAGRAPH_SEPARATOR, PG_U_PRIVATE_USE, PG_U_SPACE_SEPARATOR, PG_U_SPACING_MARK, PG_U_SURROGATE, PG_U_TITLECASE_LETTER, PG_U_UNASSIGNED, and PG_U_UPPERCASE_LETTER.

◆ unicode_category_string()

const char * unicode_category_string ( pg_unicode_category  category)

Definition at line 332 of file unicode_category.c.

333{
334 switch (category)
335 {
336 case PG_U_UNASSIGNED:
337 return "Unassigned";
339 return "Uppercase_Letter";
341 return "Lowercase_Letter";
343 return "Titlecase_Letter";
345 return "Modifier_Letter";
347 return "Other_Letter";
349 return "Nonspacing_Mark";
351 return "Enclosing_Mark";
353 return "Spacing_Mark";
355 return "Decimal_Number";
357 return "Letter_Number";
359 return "Other_Number";
361 return "Space_Separator";
363 return "Line_Separator";
365 return "Paragraph_Separator";
366 case PG_U_CONTROL:
367 return "Control";
368 case PG_U_FORMAT:
369 return "Format";
370 case PG_U_PRIVATE_USE:
371 return "Private_Use";
372 case PG_U_SURROGATE:
373 return "Surrogate";
375 return "Dash_Punctuation";
377 return "Open_Punctuation";
379 return "Close_Punctuation";
381 return "Connector_Punctuation";
383 return "Other_Punctuation";
384 case PG_U_MATH_SYMBOL:
385 return "Math_Symbol";
387 return "Currency_Symbol";
389 return "Modifier_Symbol";
391 return "Other_Symbol";
393 return "Initial_Punctuation";
395 return "Final_Punctuation";
396 }
397
398 Assert(false);
399 return "Unrecognized"; /* keep compiler quiet */
400}

References Assert, PG_U_CLOSE_PUNCTUATION, PG_U_CONNECTOR_PUNCTUATION, PG_U_CONTROL, PG_U_CURRENCY_SYMBOL, PG_U_DASH_PUNCTUATION, PG_U_DECIMAL_NUMBER, PG_U_ENCLOSING_MARK, PG_U_FINAL_PUNCTUATION, PG_U_FORMAT, PG_U_INITIAL_PUNCTUATION, PG_U_LETTER_NUMBER, PG_U_LINE_SEPARATOR, PG_U_LOWERCASE_LETTER, PG_U_MATH_SYMBOL, PG_U_MODIFIER_LETTER, PG_U_MODIFIER_SYMBOL, PG_U_NONSPACING_MARK, PG_U_OPEN_PUNCTUATION, PG_U_OTHER_LETTER, PG_U_OTHER_NUMBER, PG_U_OTHER_PUNCTUATION, PG_U_OTHER_SYMBOL, PG_U_PARAGRAPH_SEPARATOR, PG_U_PRIVATE_USE, PG_U_SPACE_SEPARATOR, PG_U_SPACING_MARK, PG_U_SURROGATE, PG_U_TITLECASE_LETTER, PG_U_UNASSIGNED, and PG_U_UPPERCASE_LETTER.