PostgreSQL Source Code  git master
unicode_norm.h File Reference
#include "mb/pg_wchar.h"
Include dependency graph for unicode_norm.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Enumerations

enum  UnicodeNormalizationForm { UNICODE_NFC = 0 , UNICODE_NFD = 1 , UNICODE_NFKC = 2 , UNICODE_NFKD = 3 }
 
enum  UnicodeNormalizationQC { UNICODE_NORM_QC_NO = 0 , UNICODE_NORM_QC_YES = 1 , UNICODE_NORM_QC_MAYBE = -1 }
 

Functions

pg_wchar * unicode_normalize (UnicodeNormalizationForm form, const pg_wchar *input)
 
UnicodeNormalizationQC unicode_is_normalized_quickcheck (UnicodeNormalizationForm form, const pg_wchar *input)
 

Enumeration Type Documentation

◆ UnicodeNormalizationForm

Enumerator
UNICODE_NFC 
UNICODE_NFD 
UNICODE_NFKC 
UNICODE_NFKD 

Definition at line 19 of file unicode_norm.h.

20 {
21  UNICODE_NFC = 0,
22  UNICODE_NFD = 1,
23  UNICODE_NFKC = 2,
24  UNICODE_NFKD = 3,
25 } UnicodeNormalizationForm;
UnicodeNormalizationForm
Definition: unicode_norm.h:20
@ UNICODE_NFKD
Definition: unicode_norm.h:24
@ UNICODE_NFD
Definition: unicode_norm.h:22
@ UNICODE_NFC
Definition: unicode_norm.h:21
@ UNICODE_NFKC
Definition: unicode_norm.h:23

◆ UnicodeNormalizationQC

Enumerator
UNICODE_NORM_QC_NO 
UNICODE_NORM_QC_YES 
UNICODE_NORM_QC_MAYBE 

Definition at line 28 of file unicode_norm.h.

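29 {
30  UNICODE_NORM_QC_NO = 0,
31  UNICODE_NORM_QC_YES = 1,
32  UNICODE_NORM_QC_MAYBE = -1,
33 } UnicodeNormalizationQC;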
UnicodeNormalizationQC
Definition: unicode_norm.h:29
@ UNICODE_NORM_QC_YES
Definition: unicode_norm.h:31
@ UNICODE_NORM_QC_NO
Definition: unicode_norm.h:30
@ UNICODE_NORM_QC_MAYBE
Definition: unicode_norm.h:32

Function Documentation

◆ unicode_is_normalized_quickcheck()

UnicodeNormalizationQC unicode_is_normalized_quickcheck ( UnicodeNormalizationForm  form,
const pg_wchar * input 
)

Definition at line 598 of file unicode_norm.c.

599 {
600  uint8 lastCanonicalClass = 0;
601  UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
602 
603  /*
604  * For the "D" forms, we don't run the quickcheck. We don't include the
605  * lookup tables for those because they are huge, checking for these
606  * particular forms is less common, and running the slow path is faster
607  * for the "D" forms than the "C" forms because you don't need to
608  * recompose, which is slow.
609  */
610  if (form == UNICODE_NFD || form == UNICODE_NFKD)
611  return UNICODE_NORM_QC_MAYBE;
612 
613  for (const pg_wchar *p = input; *p; p++)
614  {
615  pg_wchar ch = *p;
616  uint8 canonicalClass;
617  UnicodeNormalizationQC check;
618 
619  canonicalClass = get_canonical_class(ch);
620  if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
621  return UNICODE_NORM_QC_NO;
622 
623  check = qc_is_allowed(form, ch);
624  if (check == UNICODE_NORM_QC_NO)
625  return UNICODE_NORM_QC_NO;
626  else if (check == UNICODE_NORM_QC_MAYBE)
627  result = UNICODE_NORM_QC_MAYBE;
628 
629  lastCanonicalClass = canonicalClass;
630  }
631  return result;
632 }
unsigned char uint8
Definition: c.h:507
FILE * input
unsigned int pg_wchar
Definition: mbprint.c:31
static uint8 get_canonical_class(pg_wchar code)
Definition: unicode_norm.c:112
static UnicodeNormalizationQC qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
Definition: unicode_norm.c:574

References get_canonical_class(), input, qc_is_allowed(), UNICODE_NFD, UNICODE_NFKD, UNICODE_NORM_QC_MAYBE, UNICODE_NORM_QC_NO, and UNICODE_NORM_QC_YES.

Referenced by unicode_is_normalized().

◆ unicode_normalize()

pg_wchar* unicode_normalize ( UnicodeNormalizationForm  form,
const pg_wchar * input 
)

Definition at line 402 of file unicode_norm.c.

403 {
404  bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
405  bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
406  pg_wchar *decomp_chars;
407  pg_wchar *recomp_chars;
408  int decomp_size,
409  current_size;
410  int count;
411  const pg_wchar *p;
412 
413  /* variables for recomposition */
414  int last_class;
415  int starter_pos;
416  int target_pos;
417  uint32 starter_ch;
418 
419  /* First, do character decomposition */
420 
421  /*
422  * Calculate how many characters long the decomposed version will be.
423  */
424  decomp_size = 0;
425  for (p = input; *p; p++)
426  decomp_size += get_decomposed_size(*p, compat);
427 
428  decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
429  if (decomp_chars == NULL)
430  return NULL;
431 
432  /*
433  * Now fill in each entry recursively. This needs a second pass on the
434  * decomposition table.
435  */
436  current_size = 0;
437  for (p = input; *p; p++)
438  decompose_code(*p, compat, &decomp_chars, &current_size);
439  decomp_chars[decomp_size] = '\0';
440  Assert(decomp_size == current_size);
441 
442  /* Leave if there is nothing to decompose */
443  if (decomp_size == 0)
444  return decomp_chars;
445 
446  /*
447  * Now apply canonical ordering.
448  */
449  for (count = 1; count < decomp_size; count++)
450  {
451  pg_wchar prev = decomp_chars[count - 1];
452  pg_wchar next = decomp_chars[count];
453  pg_wchar tmp;
454  const uint8 prevClass = get_canonical_class(prev);
455  const uint8 nextClass = get_canonical_class(next);
456 
457  /*
458  * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
459  * annex 4, a sequence of two adjacent characters in a string is an
460  * exchangeable pair if the combining class (from the Unicode
461  * Character Database) for the first character is greater than the
462  * combining class for the second, and the second is not a starter. A
463  * character is a starter if its combining class is 0.
464  */
465  if (prevClass == 0 || nextClass == 0)
466  continue;
467 
468  if (prevClass <= nextClass)
469  continue;
470 
471  /* exchange can happen */
472  tmp = decomp_chars[count - 1];
473  decomp_chars[count - 1] = decomp_chars[count];
474  decomp_chars[count] = tmp;
475 
476  /* backtrack to check again */
477  if (count > 1)
478  count -= 2;
479  }
480 
481  if (!recompose)
482  return decomp_chars;
483 
484  /*
485  * The last phase of NFC and NFKC is the recomposition of the reordered
486  * Unicode string using combining classes. The recomposed string cannot be
487  * longer than the decomposed one, so make the allocation of the output
488  * string based on that assumption.
489  */
490  recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
491  if (!recomp_chars)
492  {
493  FREE(decomp_chars);
494  return NULL;
495  }
496 
497  last_class = -1; /* this eliminates a special check */
498  starter_pos = 0;
499  target_pos = 1;
500  starter_ch = recomp_chars[0] = decomp_chars[0];
501 
502  for (count = 1; count < decomp_size; count++)
503  {
504  pg_wchar ch = decomp_chars[count];
505  int ch_class = get_canonical_class(ch);
506  pg_wchar composite;
507 
508  if (last_class < ch_class &&
509  recompose_code(starter_ch, ch, &composite))
510  {
511  recomp_chars[starter_pos] = composite;
512  starter_ch = composite;
513  }
514  else if (ch_class == 0)
515  {
516  starter_pos = target_pos;
517  starter_ch = ch;
518  last_class = -1;
519  recomp_chars[target_pos++] = ch;
520  }
521  else
522  {
523  last_class = ch_class;
524  recomp_chars[target_pos++] = ch;
525  }
526  }
527  recomp_chars[target_pos] = (pg_wchar) '\0';
528 
529  FREE(decomp_chars);
530 
531  return recomp_chars;
532 }
static int32 next
Definition: blutils.c:222
unsigned int uint32
Definition: c.h:509
#define Assert(condition)
Definition: c.h:861
enum COMPAT_MODE compat
Definition: ecpg.c:25
static int64 current_size
Definition: pg_checksums.c:64
static void decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
Definition: unicode_norm.c:321
#define ALLOC(size)
Definition: unicode_norm.c:31
#define FREE(size)
Definition: unicode_norm.c:32
static bool recompose_code(uint32 start, uint32 code, uint32 *result)
Definition: unicode_norm.c:218
static int get_decomposed_size(pg_wchar code, bool compat)
Definition: unicode_norm.c:159

References ALLOC, Assert, compat, current_size, decompose_code(), FREE, get_canonical_class(), get_decomposed_size(), input, next, recompose_code(), UNICODE_NFC, UNICODE_NFKC, and UNICODE_NFKD.

Referenced by main(), pg_saslprep(), unicode_is_normalized(), and unicode_normalize_func().