PostgreSQL Source Code  git master
unicode_norm.h File Reference
#include "mb/pg_wchar.h"
Include dependency graph for unicode_norm.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Enumerations

enum  UnicodeNormalizationForm { UNICODE_NFC = 0, UNICODE_NFD = 1, UNICODE_NFKC = 2, UNICODE_NFKD = 3 }
 
enum  UnicodeNormalizationQC { UNICODE_NORM_QC_NO = 0, UNICODE_NORM_QC_YES = 1, UNICODE_NORM_QC_MAYBE = -1 }
 

Functions

pg_wchar * unicode_normalize (UnicodeNormalizationForm form, const pg_wchar *input)
 
UnicodeNormalizationQC unicode_is_normalized_quickcheck (UnicodeNormalizationForm form, const pg_wchar *input)
 

Enumeration Type Documentation

◆ UnicodeNormalizationForm

Enumerator
UNICODE_NFC 
UNICODE_NFD 
UNICODE_NFKC 
UNICODE_NFKD 

Definition at line 19 of file unicode_norm.h.

◆ UnicodeNormalizationQC

Enumerator
UNICODE_NORM_QC_NO 
UNICODE_NORM_QC_YES 
UNICODE_NORM_QC_MAYBE 

Definition at line 28 of file unicode_norm.h.

Function Documentation

◆ unicode_is_normalized_quickcheck()

UnicodeNormalizationQC unicode_is_normalized_quickcheck ( UnicodeNormalizationForm  form,
const pg_wchar * input 
)

Definition at line 597 of file unicode_norm.c.

References get_canonical_class(), qc_is_allowed(), UNICODE_NFD, UNICODE_NFKD, UNICODE_NORM_QC_MAYBE, UNICODE_NORM_QC_NO, and UNICODE_NORM_QC_YES.

Referenced by unicode_is_normalized().

598 {
599  uint8 lastCanonicalClass = 0;
600  UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
601 
602  /*
603  * For the "D" forms, we don't run the quickcheck. We don't include the
604  * lookup tables for those because they are huge, checking for these
605  * particular forms is less common, and running the slow path is faster
606  * for the "D" forms than the "C" forms because you don't need to
607  * recompose, which is slow.
608  */
609  if (form == UNICODE_NFD || form == UNICODE_NFKD)
610  return UNICODE_NORM_QC_MAYBE;
611 
612  for (const pg_wchar *p = input; *p; p++)
613  {
614  pg_wchar ch = *p;
615  uint8 canonicalClass;
616  UnicodeNormalizationQC check;
617 
618  canonicalClass = get_canonical_class(ch);
619  if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
620  return UNICODE_NORM_QC_NO;
621 
622  check = qc_is_allowed(form, ch);
623  if (check == UNICODE_NORM_QC_NO)
624  return UNICODE_NORM_QC_NO;
625  else if (check == UNICODE_NORM_QC_MAYBE)
626  result = UNICODE_NORM_QC_MAYBE;
627 
628  lastCanonicalClass = canonicalClass;
629  }
630  return result;
631 }
unsigned char uint8
Definition: c.h:427
static uint8 get_canonical_class(pg_wchar ch)
Definition: unicode_norm.c:531
UnicodeNormalizationQC
Definition: unicode_norm.h:28
unsigned int pg_wchar
Definition: mbprint.c:31
static UnicodeNormalizationQC qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
Definition: unicode_norm.c:573

◆ unicode_normalize()

pg_wchar* unicode_normalize ( UnicodeNormalizationForm  form,
const pg_wchar * input 
)

Definition at line 385 of file unicode_norm.c.

References ALLOC, Assert, pg_unicode_decomposition::comb_class, compat, current_size, decompose_code(), FREE, get_code_entry(), get_decomposed_size(), next, recompose_code(), UNICODE_NFC, UNICODE_NFKC, and UNICODE_NFKD.

Referenced by main(), pg_saslprep(), unicode_is_normalized(), and unicode_normalize_func().

386 {
387  bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
388  bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
389  pg_wchar *decomp_chars;
390  pg_wchar *recomp_chars;
391  int decomp_size,
392  current_size;
393  int count;
394  const pg_wchar *p;
395 
396  /* variables for recomposition */
397  int last_class;
398  int starter_pos;
399  int target_pos;
400  uint32 starter_ch;
401 
402  /* First, do character decomposition */
403 
404  /*
405  * Calculate how many characters long the decomposed version will be.
406  */
407  decomp_size = 0;
408  for (p = input; *p; p++)
409  decomp_size += get_decomposed_size(*p, compat);
410 
411  decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
412  if (decomp_chars == NULL)
413  return NULL;
414 
415  /*
416  * Now fill in each entry recursively. This needs a second pass on the
417  * decomposition table.
418  */
419  current_size = 0;
420  for (p = input; *p; p++)
421  decompose_code(*p, compat, &decomp_chars, &current_size);
422  decomp_chars[decomp_size] = '\0';
423  Assert(decomp_size == current_size);
424 
425  /*
426  * Now apply canonical ordering.
427  */
428  for (count = 1; count < decomp_size; count++)
429  {
430  pg_wchar prev = decomp_chars[count - 1];
431  pg_wchar next = decomp_chars[count];
432  pg_wchar tmp;
433  const pg_unicode_decomposition *prevEntry = get_code_entry(prev);
434  const pg_unicode_decomposition *nextEntry = get_code_entry(next);
435 
436  /*
437  * If no entries are found, the character used is either an Hangul
438  * character or a character with a class of 0 and no decompositions,
439  * so move to next result.
440  */
441  if (prevEntry == NULL || nextEntry == NULL)
442  continue;
443 
444  /*
445  * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
446  * annex 4, a sequence of two adjacent characters in a string is an
447  * exchangeable pair if the combining class (from the Unicode
448  * Character Database) for the first character is greater than the
449  * combining class for the second, and the second is not a starter. A
450  * character is a starter if its combining class is 0.
451  */
452  if (nextEntry->comb_class == 0x0 || prevEntry->comb_class == 0x0)
453  continue;
454 
455  if (prevEntry->comb_class <= nextEntry->comb_class)
456  continue;
457 
458  /* exchange can happen */
459  tmp = decomp_chars[count - 1];
460  decomp_chars[count - 1] = decomp_chars[count];
461  decomp_chars[count] = tmp;
462 
463  /* backtrack to check again */
464  if (count > 1)
465  count -= 2;
466  }
467 
468  if (!recompose)
469  return decomp_chars;
470 
471  /*
472  * The last phase of NFC and NFKC is the recomposition of the reordered
473  * Unicode string using combining classes. The recomposed string cannot be
474  * longer than the decomposed one, so make the allocation of the output
475  * string based on that assumption.
476  */
477  recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
478  if (!recomp_chars)
479  {
480  FREE(decomp_chars);
481  return NULL;
482  }
483 
484  last_class = -1; /* this eliminates a special check */
485  starter_pos = 0;
486  target_pos = 1;
487  starter_ch = recomp_chars[0] = decomp_chars[0];
488 
489  for (count = 1; count < decomp_size; count++)
490  {
491  pg_wchar ch = decomp_chars[count];
492  const pg_unicode_decomposition *ch_entry = get_code_entry(ch);
493  int ch_class = (ch_entry == NULL) ? 0 : ch_entry->comb_class;
494  pg_wchar composite;
495 
496  if (last_class < ch_class &&
497  recompose_code(starter_ch, ch, &composite))
498  {
499  recomp_chars[starter_pos] = composite;
500  starter_ch = composite;
501  }
502  else if (ch_class == 0)
503  {
504  starter_pos = target_pos;
505  starter_ch = ch;
506  last_class = -1;
507  recomp_chars[target_pos++] = ch;
508  }
509  else
510  {
511  last_class = ch_class;
512  recomp_chars[target_pos++] = ch;
513  }
514  }
515  recomp_chars[target_pos] = (pg_wchar) '\0';
516 
517  FREE(decomp_chars);
518 
519  return recomp_chars;
520 }
static int32 next
Definition: blutils.c:219
static const pg_unicode_decomposition * get_code_entry(pg_wchar code)
Definition: unicode_norm.c:72
#define ALLOC(size)
Definition: unicode_norm.c:31
static int get_decomposed_size(pg_wchar code, bool compat)
Definition: unicode_norm.c:142
unsigned int uint32
Definition: c.h:429
unsigned int pg_wchar
Definition: mbprint.c:31
enum COMPAT_MODE compat
Definition: ecpg.c:25
#define Assert(condition)
Definition: c.h:800
#define FREE(size)
Definition: unicode_norm.c:32
static void decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
Definition: unicode_norm.c:304
int64 current_size
Definition: pg_checksums.c:69
static bool recompose_code(uint32 start, uint32 code, uint32 *result)
Definition: unicode_norm.c:201