PostgreSQL Source Code  git master
unicode_norm.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  * unicode_norm.c
3  * Normalize a Unicode string
4  *
5  * This implements Unicode normalization, per the documentation at
6  * https://www.unicode.org/reports/tr15/.
7  *
8  * Portions Copyright (c) 2017-2020, PostgreSQL Global Development Group
9  *
10  * IDENTIFICATION
11  * src/common/unicode_norm.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #ifndef FRONTEND
16 #include "postgres.h"
17 #else
18 #include "postgres_fe.h"
19 #endif
20 
21 #include "common/unicode_norm.h"
23 #ifndef FRONTEND
25 #endif
26 
27 #ifndef FRONTEND
28 #define ALLOC(size) palloc(size)
29 #define FREE(size) pfree(size)
30 #else
31 #define ALLOC(size) malloc(size)
32 #define FREE(size) free(size)
33 #endif
34 
35 /* Constants for calculations with Hangul characters */
36 #define SBASE 0xAC00 /* U+AC00 */
37 #define LBASE 0x1100 /* U+1100 */
38 #define VBASE 0x1161 /* U+1161 */
39 #define TBASE 0x11A7 /* U+11A7 */
40 #define LCOUNT 19
41 #define VCOUNT 21
42 #define TCOUNT 28
43 #define NCOUNT VCOUNT * TCOUNT
44 #define SCOUNT LCOUNT * NCOUNT
45 
46 /* comparison routine for bsearch() of decomposition lookup table. */
47 static int
48 conv_compare(const void *p1, const void *p2)
49 {
50  uint32 v1,
51  v2;
52 
53  v1 = *(const uint32 *) p1;
54  v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
55  return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
56 }
57 
58 /*
59  * Get the entry corresponding to code in the decomposition lookup table.
60  */
63 {
64  return bsearch(&(code),
68  conv_compare);
69 }
70 
71 /*
72  * Given a decomposition entry looked up earlier, get the decomposed
73  * characters.
74  *
75  * Note: the returned pointer can point to statically allocated buffer, and
76  * is only valid until next call to this function!
77  */
78 static const pg_wchar *
80 {
81  static pg_wchar x;
82 
83  if (DECOMPOSITION_IS_INLINE(entry))
84  {
85  Assert(DECOMPOSITION_SIZE(entry) == 1);
86  x = (pg_wchar) entry->dec_index;
87  *dec_size = 1;
88  return &x;
89  }
90  else
91  {
92  *dec_size = DECOMPOSITION_SIZE(entry);
93  return &UnicodeDecomp_codepoints[entry->dec_index];
94  }
95 }
96 
97 /*
98  * Calculate how many characters a given character will decompose to.
99  *
100  * This needs to recurse, if the character decomposes into characters that
101  * are, in turn, decomposable.
102  */
103 static int
105 {
107  int size = 0;
108  int i;
109  const uint32 *decomp;
110  int dec_size;
111 
112  /*
113  * Fast path for Hangul characters not stored in tables to save memory as
114  * decomposition is algorithmic. See
115  * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
116  * on the matter.
117  */
118  if (code >= SBASE && code < SBASE + SCOUNT)
119  {
120  uint32 tindex,
121  sindex;
122 
123  sindex = code - SBASE;
124  tindex = sindex % TCOUNT;
125 
126  if (tindex != 0)
127  return 3;
128  return 2;
129  }
130 
131  entry = get_code_entry(code);
132 
133  /*
134  * Just count current code if no other decompositions. A NULL entry is
135  * equivalent to a character with class 0 and no decompositions.
136  */
137  if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
138  (!compat && DECOMPOSITION_IS_COMPAT(entry)))
139  return 1;
140 
141  /*
142  * If this entry has other decomposition codes look at them as well. First
143  * get its decomposition in the list of tables available.
144  */
145  decomp = get_code_decomposition(entry, &dec_size);
146  for (i = 0; i < dec_size; i++)
147  {
148  uint32 lcode = decomp[i];
149 
150  size += get_decomposed_size(lcode, compat);
151  }
152 
153  return size;
154 }
155 
156 /*
157  * Recompose a set of characters. For hangul characters, the calculation
158  * is algorithmic. For others, an inverse lookup at the decomposition
159  * table is necessary. Returns true if a recomposition can be done, and
160  * false otherwise.
161  */
162 static bool
163 recompose_code(uint32 start, uint32 code, uint32 *result)
164 {
165  /*
166  * Handle Hangul characters algorithmically, per the Unicode spec.
167  *
168  * Check if two current characters are L and V.
169  */
170  if (start >= LBASE && start < LBASE + LCOUNT &&
171  code >= VBASE && code < VBASE + VCOUNT)
172  {
173  /* make syllable of form LV */
174  uint32 lindex = start - LBASE;
175  uint32 vindex = code - VBASE;
176 
177  *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
178  return true;
179  }
180  /* Check if two current characters are LV and T */
181  else if (start >= SBASE && start < (SBASE + SCOUNT) &&
182  ((start - SBASE) % TCOUNT) == 0 &&
183  code >= TBASE && code < (TBASE + TCOUNT))
184  {
185  /* make syllable of form LVT */
186  uint32 tindex = code - TBASE;
187 
188  *result = start + tindex;
189  return true;
190  }
191  else
192  {
193  int i;
194 
195  /*
196  * Do an inverse lookup of the decomposition tables to see if anything
197  * matches. The comparison just needs to be a perfect match on the
198  * sub-table of size two, because the start character has already been
199  * recomposed partially.
200  */
201  for (i = 0; i < lengthof(UnicodeDecompMain); i++)
202  {
204 
205  if (DECOMPOSITION_SIZE(entry) != 2)
206  continue;
207 
208  if (DECOMPOSITION_NO_COMPOSE(entry))
209  continue;
210 
211  if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
212  code == UnicodeDecomp_codepoints[entry->dec_index + 1])
213  {
214  *result = entry->codepoint;
215  return true;
216  }
217  }
218  }
219 
220  return false;
221 }
222 
223 /*
224  * Decompose the given code into the array given by caller. The
225  * decomposition begins at the position given by caller, saving one
226  * lookup on the decomposition table. The current position needs to be
227  * updated here to let the caller know from where to continue filling
228  * in the array result.
229  */
230 static void
231 decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
232 {
234  int i;
235  const uint32 *decomp;
236  int dec_size;
237 
238  /*
239  * Fast path for Hangul characters not stored in tables to save memory as
240  * decomposition is algorithmic. See
241  * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
242  * on the matter.
243  */
244  if (code >= SBASE && code < SBASE + SCOUNT)
245  {
246  uint32 l,
247  v,
248  tindex,
249  sindex;
250  pg_wchar *res = *result;
251 
252  sindex = code - SBASE;
253  l = LBASE + sindex / (VCOUNT * TCOUNT);
254  v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
255  tindex = sindex % TCOUNT;
256 
257  res[*current] = l;
258  (*current)++;
259  res[*current] = v;
260  (*current)++;
261 
262  if (tindex != 0)
263  {
264  res[*current] = TBASE + tindex;
265  (*current)++;
266  }
267 
268  return;
269  }
270 
271  entry = get_code_entry(code);
272 
273  /*
274  * Just fill in with the current decomposition if there are no
275  * decomposition codes to recurse to. A NULL entry is equivalent to a
276  * character with class 0 and no decompositions, so just leave also in
277  * this case.
278  */
279  if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
280  (!compat && DECOMPOSITION_IS_COMPAT(entry)))
281  {
282  pg_wchar *res = *result;
283 
284  res[*current] = code;
285  (*current)++;
286  return;
287  }
288 
289  /*
290  * If this entry has other decomposition codes look at them as well.
291  */
292  decomp = get_code_decomposition(entry, &dec_size);
293  for (i = 0; i < dec_size; i++)
294  {
295  pg_wchar lcode = (pg_wchar) decomp[i];
296 
297  /* Leave if no more decompositions */
298  decompose_code(lcode, compat, result, current);
299  }
300 }
301 
302 /*
303  * unicode_normalize - Normalize a Unicode string to the specified form.
304  *
305  * The input is a 0-terminated array of codepoints.
306  *
307  * In frontend, returns a 0-terminated array of codepoints, allocated with
308  * malloc. Or NULL if we run out of memory. In backend, the returned
309  * string is palloc'd instead, and OOM is reported with ereport().
310  */
311 pg_wchar *
313 {
314  bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
315  bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
316  pg_wchar *decomp_chars;
317  pg_wchar *recomp_chars;
318  int decomp_size,
319  current_size;
320  int count;
321  const pg_wchar *p;
322 
323  /* variables for recomposition */
324  int last_class;
325  int starter_pos;
326  int target_pos;
327  uint32 starter_ch;
328 
329  /* First, do character decomposition */
330 
331  /*
332  * Calculate how many characters long the decomposed version will be.
333  */
334  decomp_size = 0;
335  for (p = input; *p; p++)
336  decomp_size += get_decomposed_size(*p, compat);
337 
338  decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
339  if (decomp_chars == NULL)
340  return NULL;
341 
342  /*
343  * Now fill in each entry recursively. This needs a second pass on the
344  * decomposition table.
345  */
346  current_size = 0;
347  for (p = input; *p; p++)
348  decompose_code(*p, compat, &decomp_chars, &current_size);
349  decomp_chars[decomp_size] = '\0';
350  Assert(decomp_size == current_size);
351 
352  /*
353  * Now apply canonical ordering.
354  */
355  for (count = 1; count < decomp_size; count++)
356  {
357  pg_wchar prev = decomp_chars[count - 1];
358  pg_wchar next = decomp_chars[count];
359  pg_wchar tmp;
360  pg_unicode_decomposition *prevEntry = get_code_entry(prev);
361  pg_unicode_decomposition *nextEntry = get_code_entry(next);
362 
363  /*
364  * If no entries are found, the character used is either an Hangul
365  * character or a character with a class of 0 and no decompositions,
366  * so move to next result.
367  */
368  if (prevEntry == NULL || nextEntry == NULL)
369  continue;
370 
371  /*
372  * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
373  * annex 4, a sequence of two adjacent characters in a string is an
374  * exchangeable pair if the combining class (from the Unicode
375  * Character Database) for the first character is greater than the
376  * combining class for the second, and the second is not a starter. A
377  * character is a starter if its combining class is 0.
378  */
379  if (nextEntry->comb_class == 0x0 || prevEntry->comb_class == 0x0)
380  continue;
381 
382  if (prevEntry->comb_class <= nextEntry->comb_class)
383  continue;
384 
385  /* exchange can happen */
386  tmp = decomp_chars[count - 1];
387  decomp_chars[count - 1] = decomp_chars[count];
388  decomp_chars[count] = tmp;
389 
390  /* backtrack to check again */
391  if (count > 1)
392  count -= 2;
393  }
394 
395  if (!recompose)
396  return decomp_chars;
397 
398  /*
399  * The last phase of NFC and NFKC is the recomposition of the reordered
400  * Unicode string using combining classes. The recomposed string cannot be
401  * longer than the decomposed one, so make the allocation of the output
402  * string based on that assumption.
403  */
404  recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
405  if (!recomp_chars)
406  {
407  FREE(decomp_chars);
408  return NULL;
409  }
410 
411  last_class = -1; /* this eliminates a special check */
412  starter_pos = 0;
413  target_pos = 1;
414  starter_ch = recomp_chars[0] = decomp_chars[0];
415 
416  for (count = 1; count < decomp_size; count++)
417  {
418  pg_wchar ch = decomp_chars[count];
420  int ch_class = (ch_entry == NULL) ? 0 : ch_entry->comb_class;
421  pg_wchar composite;
422 
423  if (last_class < ch_class &&
424  recompose_code(starter_ch, ch, &composite))
425  {
426  recomp_chars[starter_pos] = composite;
427  starter_ch = composite;
428  }
429  else if (ch_class == 0)
430  {
431  starter_pos = target_pos;
432  starter_ch = ch;
433  last_class = -1;
434  recomp_chars[target_pos++] = ch;
435  }
436  else
437  {
438  last_class = ch_class;
439  recomp_chars[target_pos++] = ch;
440  }
441  }
442  recomp_chars[target_pos] = (pg_wchar) '\0';
443 
444  FREE(decomp_chars);
445 
446  return recomp_chars;
447 }
448 
449 /*
450  * Normalization "quick check" algorithm; see
451  * <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
452  */
453 
454 /* We only need this in the backend. */
455 #ifndef FRONTEND
456 
457 static uint8
459 {
461 
462  if (!entry)
463  return 0;
464  else
465  return entry->comb_class;
466 }
467 
468 static int
469 qc_compare(const void *p1, const void *p2)
470 {
471  uint32 v1,
472  v2;
473 
474  v1 = ((const pg_unicode_normprops *) p1)->codepoint;
475  v2 = ((const pg_unicode_normprops *) p2)->codepoint;
476  return (v1 - v2);
477 }
478 
479 /*
480  * Look up the normalization quick check character property
481  */
484 {
486  pg_unicode_normprops *found = NULL;
487 
488  key.codepoint = ch;
489 
490  switch (form)
491  {
492  case UNICODE_NFC:
493  found = bsearch(&key,
496  sizeof(pg_unicode_normprops),
497  qc_compare);
498  break;
499  case UNICODE_NFKC:
500  found = bsearch(&key,
503  sizeof(pg_unicode_normprops),
504  qc_compare);
505  break;
506  default:
507  Assert(false);
508  break;
509  }
510 
511  if (found)
512  return found->quickcheck;
513  else
514  return UNICODE_NORM_QC_YES;
515 }
516 
519 {
520  uint8 lastCanonicalClass = 0;
522 
523  /*
524  * For the "D" forms, we don't run the quickcheck. We don't include the
525  * lookup tables for those because they are huge, checking for these
526  * particular forms is less common, and running the slow path is faster
527  * for the "D" forms than the "C" forms because you don't need to
528  * recompose, which is slow.
529  */
530  if (form == UNICODE_NFD || form == UNICODE_NFKD)
531  return UNICODE_NORM_QC_MAYBE;
532 
533  for (const pg_wchar *p = input; *p; p++)
534  {
535  pg_wchar ch = *p;
536  uint8 canonicalClass;
538 
539  canonicalClass = get_canonical_class(ch);
540  if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
541  return UNICODE_NORM_QC_NO;
542 
543  check = qc_is_allowed(form, ch);
544  if (check == UNICODE_NORM_QC_NO)
545  return UNICODE_NORM_QC_NO;
546  else if (check == UNICODE_NORM_QC_MAYBE)
547  result = UNICODE_NORM_QC_MAYBE;
548 
549  lastCanonicalClass = canonicalClass;
550  }
551  return result;
552 }
553 
554 #endif /* !FRONTEND */
static int qc_compare(const void *p1, const void *p2)
Definition: unicode_norm.c:469
static int32 next
Definition: blutils.c:218
unsigned char uint8
Definition: c.h:365
#define DECOMPOSITION_SIZE(x)
static uint8 get_canonical_class(pg_wchar ch)
Definition: unicode_norm.c:458
UnicodeNormalizationQC
Definition: unicode_norm.h:28
#define LCOUNT
Definition: unicode_norm.c:40
#define DECOMPOSITION_IS_COMPAT(x)
#define lengthof(array)
Definition: c.h:668
static const uint32 UnicodeDecomp_codepoints[5092]
UnicodeNormalizationQC unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
Definition: unicode_norm.c:518
#define SBASE
Definition: unicode_norm.c:36
UnicodeNormalizationForm
Definition: unicode_norm.h:19
static const pg_unicode_normprops UnicodeNormProps_NFKC_QC[]
#define VBASE
Definition: unicode_norm.c:38
#define ALLOC(size)
Definition: unicode_norm.c:28
static const pg_unicode_normprops UnicodeNormProps_NFC_QC[]
#define DECOMPOSITION_IS_INLINE(x)
static int get_decomposed_size(pg_wchar code, bool compat)
Definition: unicode_norm.c:104
#define TBASE
Definition: unicode_norm.c:39
unsigned int uint32
Definition: c.h:367
unsigned int pg_wchar
Definition: mbprint.c:31
enum COMPAT_MODE compat
Definition: ecpg.c:25
#define LBASE
Definition: unicode_norm.c:37
#define SCOUNT
Definition: unicode_norm.c:44
#define Assert(condition)
Definition: c.h:738
static UnicodeNormalizationQC qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
Definition: unicode_norm.c:483
#define FREE(size)
Definition: unicode_norm.c:29
static const pg_wchar * get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size)
Definition: unicode_norm.c:79
static const pg_unicode_decomposition UnicodeDecompMain[6604]
static pg_unicode_decomposition * get_code_entry(pg_wchar code)
Definition: unicode_norm.c:62
int i
static void decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
Definition: unicode_norm.c:231
#define TCOUNT
Definition: unicode_norm.c:42
int64 current_size
Definition: pg_checksums.c:69
static bool recompose_code(uint32 start, uint32 code, uint32 *result)
Definition: unicode_norm.c:163
static int conv_compare(const void *p1, const void *p2)
Definition: unicode_norm.c:48
pg_wchar * unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
Definition: unicode_norm.c:312
#define VCOUNT
Definition: unicode_norm.c:41
#define DECOMPOSITION_NO_COMPOSE(x)