PostgreSQL Source Code git master
Loading...
Searching...
No Matches
unicode_norm.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 * unicode_norm.c
3 * Normalize a Unicode string
4 *
5 * This implements Unicode normalization, per the documentation at
6 * https://www.unicode.org/reports/tr15/.
7 *
8 * Portions Copyright (c) 2017-2026, PostgreSQL Global Development Group
9 *
10 * IDENTIFICATION
11 * src/common/unicode_norm.c
12 *
13 *-------------------------------------------------------------------------
14 */
15#ifndef FRONTEND
16#include "postgres.h"
17#else
18#include "postgres_fe.h"
19#endif
20
21#include "common/unicode_norm.h"
22#ifndef FRONTEND
25#include "port/pg_bswap.h"
26#include "utils/memutils.h"
27#else
29#endif
30
31#ifndef FRONTEND
32#define ALLOC(size) palloc(size)
33#define FREE(size) pfree(size)
34#else
35#define ALLOC(size) malloc(size)
36#define FREE(size) free(size)
37#endif
38
39/* Constants for calculations with Hangul characters */
40#define SBASE 0xAC00 /* U+AC00 */
41#define LBASE 0x1100 /* U+1100 */
42#define VBASE 0x1161 /* U+1161 */
43#define TBASE 0x11A7 /* U+11A7 */
44#define LCOUNT 19
45#define VCOUNT 21
46#define TCOUNT 28
47#define NCOUNT VCOUNT * TCOUNT
48#define SCOUNT LCOUNT * NCOUNT
49
50#ifdef FRONTEND
51/* comparison routine for bsearch() of decomposition lookup table. */
52static int
53conv_compare(const void *p1, const void *p2)
54{
55 uint32 v1,
56 v2;
57
58 v1 = *(const uint32 *) p1;
59 v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
60 return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
61}
62
63#endif
64
65/*
66 * get_code_entry
67 *
68 * Get the entry corresponding to code in the decomposition lookup table.
69 * The backend version of this code uses a perfect hash function for the
70 * lookup, while the frontend version uses a binary search.
71 */
72static const pg_unicode_decomposition *
73get_code_entry(char32_t code)
74{
75#ifndef FRONTEND
76 int h;
79
80 /*
81 * Compute the hash function. The hash key is the codepoint with the bytes
82 * in network order.
83 */
84 hashkey = pg_hton32(code);
85 h = decompinfo.hash(&hashkey);
86
87 /* An out-of-range result implies no match */
88 if (h < 0 || h >= decompinfo.num_decomps)
89 return NULL;
90
91 /*
92 * Since it's a perfect hash, we need only match to the specific codepoint
93 * it identifies.
94 */
95 if (code != decompinfo.decomps[h].codepoint)
96 return NULL;
97
98 /* Success! */
99 return &decompinfo.decomps[h];
100#else
101 return bsearch(&(code),
106#endif
107}
108
109/*
110 * Get the combining class of the given codepoint.
111 */
112static uint8
114{
115 const pg_unicode_decomposition *entry = get_code_entry(code);
116
117 /*
118 * If no entries are found, the character used is either a Hangul
119 * character or a character with a class of 0 and no decompositions.
120 */
121 if (!entry)
122 return 0;
123 else
124 return entry->comb_class;
125}
126
127/*
128 * Given a decomposition entry looked up earlier, get the decomposed
129 * characters.
130 *
131 * Note: the returned pointer can point to statically allocated buffer, and
132 * is only valid until next call to this function!
133 */
134static const char32_t *
136{
137 static char32_t x;
138
139 if (DECOMPOSITION_IS_INLINE(entry))
140 {
141 Assert(DECOMPOSITION_SIZE(entry) == 1);
142 x = (char32_t) entry->dec_index;
143 *dec_size = 1;
144 return &x;
145 }
146 else
147 {
149 return &UnicodeDecomp_codepoints[entry->dec_index];
150 }
151}
152
153/*
154 * Calculate how many characters a given character will decompose to.
155 *
156 * This needs to recurse, if the character decomposes into characters that
157 * are, in turn, decomposable.
158 */
159static int
160get_decomposed_size(char32_t code, bool compat)
161{
162 const pg_unicode_decomposition *entry;
163 int size = 0;
164 int i;
165 const uint32 *decomp;
166 int dec_size;
167
168 /*
169 * Fast path for Hangul characters not stored in tables to save memory as
170 * decomposition is algorithmic. See
171 * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
172 * on the matter.
173 */
174 if (code >= SBASE && code < SBASE + SCOUNT)
175 {
177 sindex;
178
179 sindex = code - SBASE;
180 tindex = sindex % TCOUNT;
181
182 if (tindex != 0)
183 return 3;
184 return 2;
185 }
186
187 entry = get_code_entry(code);
188
189 /*
190 * Just count current code if no other decompositions. A NULL entry is
191 * equivalent to a character with class 0 and no decompositions.
192 */
193 if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
194 (!compat && DECOMPOSITION_IS_COMPAT(entry)))
195 return 1;
196
197 /*
198 * If this entry has other decomposition codes look at them as well. First
199 * get its decomposition in the list of tables available.
200 */
202 for (i = 0; i < dec_size; i++)
203 {
204 uint32 lcode = decomp[i];
205
207 }
208
209 return size;
210}
211
212/*
213 * Recompose a set of characters. For hangul characters, the calculation
214 * is algorithmic. For others, an inverse lookup at the decomposition
215 * table is necessary. Returns true if a recomposition can be done, and
216 * false otherwise.
217 */
218static bool
220{
221 /*
222 * Handle Hangul characters algorithmically, per the Unicode spec.
223 *
224 * Check if two current characters are L and V.
225 */
226 if (start >= LBASE && start < LBASE + LCOUNT &&
227 code >= VBASE && code < VBASE + VCOUNT)
228 {
229 /* make syllable of form LV */
231 uint32 vindex = code - VBASE;
232
233 *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
234 return true;
235 }
236 /* Check if two current characters are LV and T */
237 else if (start >= SBASE && start < (SBASE + SCOUNT) &&
238 ((start - SBASE) % TCOUNT) == 0 &&
239 code >= TBASE && code < (TBASE + TCOUNT))
240 {
241 /* make syllable of form LVT */
242 uint32 tindex = code - TBASE;
243
244 *result = start + tindex;
245 return true;
246 }
247 else
248 {
249 const pg_unicode_decomposition *entry;
250
251 /*
252 * Do an inverse lookup of the decomposition tables to see if anything
253 * matches. The comparison just needs to be a perfect match on the
254 * sub-table of size two, because the start character has already been
255 * recomposed partially. This lookup uses a perfect hash function for
256 * the backend code.
257 */
258#ifndef FRONTEND
259
260 int h,
264
265 /*
266 * Compute the hash function. The hash key is formed by concatenating
267 * bytes of the two codepoints in network order. See also
268 * src/common/unicode/generate-unicode_norm_table.pl.
269 */
270 hashkey = pg_hton64(((uint64) start << 32) | (uint64) code);
271 h = recompinfo.hash(&hashkey);
272
273 /* An out-of-range result implies no match */
274 if (h < 0 || h >= recompinfo.num_recomps)
275 return false;
276
277 inv_lookup_index = recompinfo.inverse_lookup[h];
279
281 code == UnicodeDecomp_codepoints[entry->dec_index + 1])
282 {
283 *result = entry->codepoint;
284 return true;
285 }
286
287#else
288
289 int i;
290
291 for (i = 0; i < lengthof(UnicodeDecompMain); i++)
292 {
293 entry = &UnicodeDecompMain[i];
294
295 if (DECOMPOSITION_SIZE(entry) != 2)
296 continue;
297
298 if (DECOMPOSITION_NO_COMPOSE(entry))
299 continue;
300
302 code == UnicodeDecomp_codepoints[entry->dec_index + 1])
303 {
304 *result = entry->codepoint;
305 return true;
306 }
307 }
308#endif /* !FRONTEND */
309 }
310
311 return false;
312}
313
314/*
315 * Decompose the given code into the array given by caller. The
316 * decomposition begins at the position given by caller, saving one
317 * lookup on the decomposition table. The current position needs to be
318 * updated here to let the caller know from where to continue filling
319 * in the array result.
320 */
321static void
322decompose_code(char32_t code, bool compat, char32_t **result, int *current)
323{
324 const pg_unicode_decomposition *entry;
325 int i;
326 const uint32 *decomp;
327 int dec_size;
328
329 /*
330 * Fast path for Hangul characters not stored in tables to save memory as
331 * decomposition is algorithmic. See
332 * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
333 * on the matter.
334 */
335 if (code >= SBASE && code < SBASE + SCOUNT)
336 {
337 uint32 l,
338 v,
339 tindex,
340 sindex;
341 char32_t *res = *result;
342
343 sindex = code - SBASE;
344 l = LBASE + sindex / (VCOUNT * TCOUNT);
345 v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
346 tindex = sindex % TCOUNT;
347
348 res[*current] = l;
349 (*current)++;
350 res[*current] = v;
351 (*current)++;
352
353 if (tindex != 0)
354 {
355 res[*current] = TBASE + tindex;
356 (*current)++;
357 }
358
359 return;
360 }
361
362 entry = get_code_entry(code);
363
364 /*
365 * Just fill in with the current decomposition if there are no
366 * decomposition codes to recurse to. A NULL entry is equivalent to a
367 * character with class 0 and no decompositions, so just leave also in
368 * this case.
369 */
370 if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
371 (!compat && DECOMPOSITION_IS_COMPAT(entry)))
372 {
373 char32_t *res = *result;
374
375 res[*current] = code;
376 (*current)++;
377 return;
378 }
379
380 /*
381 * If this entry has other decomposition codes look at them as well.
382 */
384 for (i = 0; i < dec_size; i++)
385 {
386 char32_t lcode = (char32_t) decomp[i];
387
388 /* Leave if no more decompositions */
389 decompose_code(lcode, compat, result, current);
390 }
391}
392
393/*
394 * unicode_normalize - Normalize a Unicode string to the specified form.
395 *
396 * The input is a 0-terminated array of codepoints.
397 *
398 * In frontend, returns a 0-terminated array of codepoints, allocated with
399 * malloc. Or NULL if we run out of memory. In backend, the returned
400 * string is palloc'd instead, and OOM is reported with ereport().
401 */
402char32_t *
404{
405 bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
406 bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
407 char32_t *decomp_chars;
408 char32_t *recomp_chars;
409 int decomp_size,
411 int count;
412 const char32_t *p;
413
414 /* variables for recomposition */
415 int last_class;
416 int starter_pos;
417 int target_pos;
419
420 /* First, do character decomposition */
421
422 /*
423 * Calculate how many characters long the decomposed version will be.
424 *
425 * Some characters decompose to quite a few code points, so that the
426 * decomposed version's size could overrun MaxAllocSize, and even 32-bit
427 * size_t, even though the input string presumably fits in that. In
428 * frontend we want to just return NULL in that case, so monitor the sum
429 * and exit early once we'd need more than MaxAllocSize bytes.
430 */
431 decomp_size = 0;
432 for (p = input; *p; p++)
433 {
435 if (unlikely(decomp_size > MaxAllocSize / sizeof(char32_t)))
436 {
437#ifndef FRONTEND
438 /* Exit loop and let palloc() throw error below */
439 break;
440#else
441 /* Just return NULL with no explicit error */
442 return NULL;
443#endif
444 }
445 }
446
447 decomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t));
448 if (decomp_chars == NULL)
449 return NULL;
450
451 /*
452 * Now fill in each entry recursively. This needs a second pass on the
453 * decomposition table.
454 */
455 current_size = 0;
456 for (p = input; *p; p++)
460
461 /* Leave if there is nothing to decompose */
462 if (decomp_size == 0)
463 return decomp_chars;
464
465 /*
466 * Now apply canonical ordering.
467 */
468 for (count = 1; count < decomp_size; count++)
469 {
470 char32_t prev = decomp_chars[count - 1];
471 char32_t next = decomp_chars[count];
472 char32_t tmp;
473 const uint8 prevClass = get_canonical_class(prev);
475
476 /*
477 * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
478 * annex 4, a sequence of two adjacent characters in a string is an
479 * exchangeable pair if the combining class (from the Unicode
480 * Character Database) for the first character is greater than the
481 * combining class for the second, and the second is not a starter. A
482 * character is a starter if its combining class is 0.
483 */
484 if (prevClass == 0 || nextClass == 0)
485 continue;
486
487 if (prevClass <= nextClass)
488 continue;
489
490 /* exchange can happen */
491 tmp = decomp_chars[count - 1];
492 decomp_chars[count - 1] = decomp_chars[count];
493 decomp_chars[count] = tmp;
494
495 /* backtrack to check again */
496 if (count > 1)
497 count -= 2;
498 }
499
500 if (!recompose)
501 return decomp_chars;
502
503 /*
504 * The last phase of NFC and NFKC is the recomposition of the reordered
505 * Unicode string using combining classes. The recomposed string cannot be
506 * longer than the decomposed one, so make the allocation of the output
507 * string based on that assumption.
508 */
509 recomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t));
510 if (!recomp_chars)
511 {
513 return NULL;
514 }
515
516 last_class = -1; /* this eliminates a special check */
517 starter_pos = 0;
518 target_pos = 1;
520
521 for (count = 1; count < decomp_size; count++)
522 {
523 char32_t ch = decomp_chars[count];
525 char32_t composite;
526
527 if (last_class < ch_class &&
528 recompose_code(starter_ch, ch, &composite))
529 {
530 recomp_chars[starter_pos] = composite;
531 starter_ch = composite;
532 }
533 else if (ch_class == 0)
534 {
536 starter_ch = ch;
537 last_class = -1;
539 }
540 else
541 {
544 }
545 }
547
549
550 return recomp_chars;
551}
552
553/*
554 * Normalization "quick check" algorithm; see
555 * <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
556 */
557
558/* We only need this in the backend. */
559#ifndef FRONTEND
560
561static const pg_unicode_normprops *
563{
564 int h;
566
567 /*
568 * Compute the hash function. The hash key is the codepoint with the bytes
569 * in network order.
570 */
572 h = norminfo->hash(&hashkey);
573
574 /* An out-of-range result implies no match */
575 if (h < 0 || h >= norminfo->num_normprops)
576 return NULL;
577
578 /*
579 * Since it's a perfect hash, we need only match to the specific codepoint
580 * it identifies.
581 */
582 if (ch != norminfo->normprops[h].codepoint)
583 return NULL;
584
585 /* Success! */
586 return &norminfo->normprops[h];
587}
588
589/*
590 * Look up the normalization quick check character property
591 */
594{
595 const pg_unicode_normprops *found = NULL;
596
597 switch (form)
598 {
599 case UNICODE_NFC:
601 break;
602 case UNICODE_NFKC:
604 break;
605 default:
606 Assert(false);
607 break;
608 }
609
610 if (found)
611 return found->quickcheck;
612 else
613 return UNICODE_NORM_QC_YES;
614}
615
618{
621
622 /*
623 * For the "D" forms, we don't run the quickcheck. We don't include the
624 * lookup tables for those because they are huge, checking for these
625 * particular forms is less common, and running the slow path is faster
626 * for the "D" forms than the "C" forms because you don't need to
627 * recompose, which is slow.
628 */
629 if (form == UNICODE_NFD || form == UNICODE_NFKD)
631
632 for (const char32_t *p = input; *p; p++)
633 {
634 char32_t ch = *p;
637
640 return UNICODE_NORM_QC_NO;
641
642 check = qc_is_allowed(form, ch);
643 if (check == UNICODE_NORM_QC_NO)
644 return UNICODE_NORM_QC_NO;
645 else if (check == UNICODE_NORM_QC_MAYBE)
647
649 }
650 return result;
651}
652
653#endif /* !FRONTEND */
static int32 next
Definition blutils.c:225
uint8_t uint8
Definition c.h:622
#define Assert(condition)
Definition c.h:943
uint64_t uint64
Definition c.h:625
#define unlikely(x)
Definition c.h:438
uint32_t uint32
Definition c.h:624
#define lengthof(array)
Definition c.h:873
uint32_t char32_t
Definition c.h:1504
uint32 result
enum COMPAT_MODE compat
Definition ecpg.c:26
#define MaxAllocSize
Definition fe_memutils.h:22
return str start
FILE * input
int x
Definition isn.c:75
int i
Definition isn.c:77
#define pg_hton32(x)
Definition pg_bswap.h:121
#define pg_hton64(x)
Definition pg_bswap.h:122
static int64 current_size
static int fb(int x)
static void decompose_code(char32_t code, bool compat, char32_t **result, int *current)
#define TCOUNT
static uint8 get_canonical_class(char32_t code)
#define TBASE
#define VBASE
#define VCOUNT
UnicodeNormalizationQC unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const char32_t *input)
static const pg_unicode_normprops * qc_hash_lookup(char32_t ch, const pg_unicode_norminfo *norminfo)
#define LBASE
static const char32_t * get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
#define LCOUNT
char32_t * unicode_normalize(UnicodeNormalizationForm form, const char32_t *input)
#define ALLOC(size)
#define FREE(size)
static int get_decomposed_size(char32_t code, bool compat)
static UnicodeNormalizationQC qc_is_allowed(UnicodeNormalizationForm form, char32_t ch)
static const pg_unicode_decomposition * get_code_entry(char32_t code)
#define SBASE
#define SCOUNT
static bool recompose_code(uint32 start, uint32 code, uint32 *result)
UnicodeNormalizationForm
@ UNICODE_NFKD
@ UNICODE_NFD
@ UNICODE_NFC
@ UNICODE_NFKC
UnicodeNormalizationQC
@ UNICODE_NORM_QC_YES
@ UNICODE_NORM_QC_NO
@ UNICODE_NORM_QC_MAYBE
static const pg_unicode_decompinfo UnicodeDecompInfo
static const pg_unicode_recompinfo UnicodeRecompInfo
#define DECOMPOSITION_NO_COMPOSE(x)
static const uint32 UnicodeDecomp_codepoints[5138]
#define DECOMPOSITION_IS_INLINE(x)
#define DECOMPOSITION_IS_COMPAT(x)
static const pg_unicode_decomposition UnicodeDecompMain[6878]
#define DECOMPOSITION_SIZE(x)
static const pg_unicode_norminfo UnicodeNormInfo_NFKC_QC
static const pg_unicode_norminfo UnicodeNormInfo_NFC_QC