PostgreSQL Source Code  git master
unicode_norm.h File Reference
#include "mb/pg_wchar.h"
Include dependency graph for unicode_norm.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Enumerations

enum  UnicodeNormalizationForm { UNICODE_NFC = 0, UNICODE_NFD = 1, UNICODE_NFKC = 2, UNICODE_NFKD = 3 }
 
enum  UnicodeNormalizationQC { UNICODE_NORM_QC_NO = 0, UNICODE_NORM_QC_YES = 1, UNICODE_NORM_QC_MAYBE = -1 }
 

Functions

pg_wchar * unicode_normalize (UnicodeNormalizationForm form, const pg_wchar *input)
 
UnicodeNormalizationQC unicode_is_normalized_quickcheck (UnicodeNormalizationForm form, const pg_wchar *input)
 

Enumeration Type Documentation

◆ UnicodeNormalizationForm

Enumerator
UNICODE_NFC 
UNICODE_NFD 
UNICODE_NFKC 
UNICODE_NFKD 

Definition at line 19 of file unicode_norm.h.

◆ UnicodeNormalizationQC

Enumerator
UNICODE_NORM_QC_NO 
UNICODE_NORM_QC_YES 
UNICODE_NORM_QC_MAYBE 

Definition at line 28 of file unicode_norm.h.

Function Documentation

◆ unicode_is_normalized_quickcheck()

UnicodeNormalizationQC unicode_is_normalized_quickcheck ( UnicodeNormalizationForm  form,
const pg_wchar * input 
)

Definition at line 518 of file unicode_norm.c.

References get_canonical_class(), qc_is_allowed(), UNICODE_NFD, UNICODE_NFKD, UNICODE_NORM_QC_MAYBE, UNICODE_NORM_QC_NO, and UNICODE_NORM_QC_YES.

Referenced by unicode_is_normalized().

519 {
520  uint8 lastCanonicalClass = 0;
521  UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
522 
523  /*
524  * For the "D" forms, we don't run the quickcheck. We don't include the
525  * lookup tables for those because they are huge, checking for these
526  * particular forms is less common, and running the slow path is faster
527  * for the "D" forms than the "C" forms because you don't need to
528  * recompose, which is slow.
529  */
530  if (form == UNICODE_NFD || form == UNICODE_NFKD)
531  return UNICODE_NORM_QC_MAYBE;
532 
533  for (const pg_wchar *p = input; *p; p++)
534  {
535  pg_wchar ch = *p;
536  uint8 canonicalClass;
537  UnicodeNormalizationQC check;
538 
539  canonicalClass = get_canonical_class(ch);
540  if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
541  return UNICODE_NORM_QC_NO;
542 
543  check = qc_is_allowed(form, ch);
544  if (check == UNICODE_NORM_QC_NO)
545  return UNICODE_NORM_QC_NO;
546  else if (check == UNICODE_NORM_QC_MAYBE)
547  result = UNICODE_NORM_QC_MAYBE;
548 
549  lastCanonicalClass = canonicalClass;
550  }
551  return result;
552 }
unsigned char uint8
Definition: c.h:365
static uint8 get_canonical_class(pg_wchar ch)
Definition: unicode_norm.c:458
UnicodeNormalizationQC
Definition: unicode_norm.h:28
unsigned int pg_wchar
Definition: mbprint.c:31
static UnicodeNormalizationQC qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
Definition: unicode_norm.c:483

◆ unicode_normalize()

pg_wchar* unicode_normalize ( UnicodeNormalizationForm  form,
const pg_wchar * input 
)

Definition at line 312 of file unicode_norm.c.

References ALLOC, Assert, pg_unicode_decomposition::comb_class, compat, current_size, decompose_code(), FREE, get_code_entry(), get_decomposed_size(), next, recompose_code(), UNICODE_NFC, UNICODE_NFKC, and UNICODE_NFKD.

Referenced by main(), pg_saslprep(), unicode_is_normalized(), and unicode_normalize_func().

313 {
314  bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
315  bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
316  pg_wchar *decomp_chars;
317  pg_wchar *recomp_chars;
318  int decomp_size,
319  current_size;
320  int count;
321  const pg_wchar *p;
322 
323  /* variables for recomposition */
324  int last_class;
325  int starter_pos;
326  int target_pos;
327  uint32 starter_ch;
328 
329  /* First, do character decomposition */
330 
331  /*
332  * Calculate how many characters long the decomposed version will be.
333  */
334  decomp_size = 0;
335  for (p = input; *p; p++)
336  decomp_size += get_decomposed_size(*p, compat);
337 
338  decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
339  if (decomp_chars == NULL)
340  return NULL;
341 
342  /*
343  * Now fill in each entry recursively. This needs a second pass on the
344  * decomposition table.
345  */
346  current_size = 0;
347  for (p = input; *p; p++)
348  decompose_code(*p, compat, &decomp_chars, &current_size);
349  decomp_chars[decomp_size] = '\0';
350  Assert(decomp_size == current_size);
351 
352  /*
353  * Now apply canonical ordering.
354  */
355  for (count = 1; count < decomp_size; count++)
356  {
357  pg_wchar prev = decomp_chars[count - 1];
358  pg_wchar next = decomp_chars[count];
359  pg_wchar tmp;
360  pg_unicode_decomposition *prevEntry = get_code_entry(prev);
361  pg_unicode_decomposition *nextEntry = get_code_entry(next);
362 
363  /*
364  * If no entries are found, the character used is either an Hangul
365  * character or a character with a class of 0 and no decompositions,
366  * so move to next result.
367  */
368  if (prevEntry == NULL || nextEntry == NULL)
369  continue;
370 
371  /*
372  * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
373  * annex 4, a sequence of two adjacent characters in a string is an
374  * exchangeable pair if the combining class (from the Unicode
375  * Character Database) for the first character is greater than the
376  * combining class for the second, and the second is not a starter. A
377  * character is a starter if its combining class is 0.
378  */
379  if (nextEntry->comb_class == 0x0 || prevEntry->comb_class == 0x0)
380  continue;
381 
382  if (prevEntry->comb_class <= nextEntry->comb_class)
383  continue;
384 
385  /* exchange can happen */
386  tmp = decomp_chars[count - 1];
387  decomp_chars[count - 1] = decomp_chars[count];
388  decomp_chars[count] = tmp;
389 
390  /* backtrack to check again */
391  if (count > 1)
392  count -= 2;
393  }
394 
395  if (!recompose)
396  return decomp_chars;
397 
398  /*
399  * The last phase of NFC and NFKC is the recomposition of the reordered
400  * Unicode string using combining classes. The recomposed string cannot be
401  * longer than the decomposed one, so make the allocation of the output
402  * string based on that assumption.
403  */
404  recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
405  if (!recomp_chars)
406  {
407  FREE(decomp_chars);
408  return NULL;
409  }
410 
411  last_class = -1; /* this eliminates a special check */
412  starter_pos = 0;
413  target_pos = 1;
414  starter_ch = recomp_chars[0] = decomp_chars[0];
415 
416  for (count = 1; count < decomp_size; count++)
417  {
418  pg_wchar ch = decomp_chars[count];
419  pg_unicode_decomposition *ch_entry = get_code_entry(ch);
420  int ch_class = (ch_entry == NULL) ? 0 : ch_entry->comb_class;
421  pg_wchar composite;
422 
423  if (last_class < ch_class &&
424  recompose_code(starter_ch, ch, &composite))
425  {
426  recomp_chars[starter_pos] = composite;
427  starter_ch = composite;
428  }
429  else if (ch_class == 0)
430  {
431  starter_pos = target_pos;
432  starter_ch = ch;
433  last_class = -1;
434  recomp_chars[target_pos++] = ch;
435  }
436  else
437  {
438  last_class = ch_class;
439  recomp_chars[target_pos++] = ch;
440  }
441  }
442  recomp_chars[target_pos] = (pg_wchar) '\0';
443 
444  FREE(decomp_chars);
445 
446  return recomp_chars;
447 }
static int32 next
Definition: blutils.c:218
#define ALLOC(size)
Definition: unicode_norm.c:28
static int get_decomposed_size(pg_wchar code, bool compat)
Definition: unicode_norm.c:104
unsigned int uint32
Definition: c.h:367
unsigned int pg_wchar
Definition: mbprint.c:31
enum COMPAT_MODE compat
Definition: ecpg.c:25
#define Assert(condition)
Definition: c.h:738
#define FREE(size)
Definition: unicode_norm.c:29
static pg_unicode_decomposition * get_code_entry(pg_wchar code)
Definition: unicode_norm.c:62
static void decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
Definition: unicode_norm.c:231
int64 current_size
Definition: pg_checksums.c:69
static bool recompose_code(uint32 start, uint32 code, uint32 *result)
Definition: unicode_norm.c:163