PostgreSQL Source Code  git master
unicode_norm.h File Reference
#include "mb/pg_wchar.h"
Include dependency graph for unicode_norm.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Enumerations

enum  UnicodeNormalizationForm { UNICODE_NFC = 0, UNICODE_NFD = 1, UNICODE_NFKC = 2, UNICODE_NFKD = 3 }
 
enum  UnicodeNormalizationQC { UNICODE_NORM_QC_NO = 0, UNICODE_NORM_QC_YES = 1, UNICODE_NORM_QC_MAYBE = -1 }
 

Functions

pg_wchar * unicode_normalize (UnicodeNormalizationForm form, const pg_wchar *input)
 
UnicodeNormalizationQC unicode_is_normalized_quickcheck (UnicodeNormalizationForm form, const pg_wchar *input)
 

Enumeration Type Documentation

◆ UnicodeNormalizationForm

Enumerator
UNICODE_NFC 
UNICODE_NFD 
UNICODE_NFKC 
UNICODE_NFKD 

Definition at line 19 of file unicode_norm.h.

◆ UnicodeNormalizationQC

Enumerator
UNICODE_NORM_QC_NO 
UNICODE_NORM_QC_YES 
UNICODE_NORM_QC_MAYBE 

Definition at line 28 of file unicode_norm.h.

Function Documentation

◆ unicode_is_normalized_quickcheck()

UnicodeNormalizationQC unicode_is_normalized_quickcheck ( UnicodeNormalizationForm  form,
const pg_wchar * input 
)

Definition at line 594 of file unicode_norm.c.

References get_canonical_class(), qc_is_allowed(), UNICODE_NFD, UNICODE_NFKD, UNICODE_NORM_QC_MAYBE, UNICODE_NORM_QC_NO, and UNICODE_NORM_QC_YES.

Referenced by unicode_is_normalized().

595 {
596  uint8 lastCanonicalClass = 0;
597  UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
598 
599  /*
600  * For the "D" forms, we don't run the quickcheck. We don't include the
601  * lookup tables for those because they are huge, checking for these
602  * particular forms is less common, and running the slow path is faster
603  * for the "D" forms than the "C" forms because you don't need to
604  * recompose, which is slow.
605  */
606  if (form == UNICODE_NFD || form == UNICODE_NFKD)
607  return UNICODE_NORM_QC_MAYBE;
608 
609  for (const pg_wchar *p = input; *p; p++)
610  {
611  pg_wchar ch = *p;
612  uint8 canonicalClass;
613  UnicodeNormalizationQC check;
614 
615  canonicalClass = get_canonical_class(ch);
616  if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
617  return UNICODE_NORM_QC_NO;
618 
619  check = qc_is_allowed(form, ch);
620  if (check == UNICODE_NORM_QC_NO)
621  return UNICODE_NORM_QC_NO;
622  else if (check == UNICODE_NORM_QC_MAYBE)
623  result = UNICODE_NORM_QC_MAYBE;
624 
625  lastCanonicalClass = canonicalClass;
626  }
627  return result;
628 }
unsigned char uint8
Definition: c.h:439
UnicodeNormalizationQC
Definition: unicode_norm.h:28
unsigned int pg_wchar
Definition: mbprint.c:31
static uint8 get_canonical_class(pg_wchar code)
Definition: unicode_norm.c:112
static UnicodeNormalizationQC qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
Definition: unicode_norm.c:570

◆ unicode_normalize()

pg_wchar* unicode_normalize ( UnicodeNormalizationForm  form,
const pg_wchar * input 
)

Definition at line 402 of file unicode_norm.c.

References ALLOC, Assert, compat, current_size, decompose_code(), FREE, get_canonical_class(), get_decomposed_size(), next, recompose_code(), UNICODE_NFC, UNICODE_NFKC, and UNICODE_NFKD.

Referenced by main(), pg_saslprep(), unicode_is_normalized(), and unicode_normalize_func().

403 {
404  bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
405  bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
406  pg_wchar *decomp_chars;
407  pg_wchar *recomp_chars;
408  int decomp_size,
409  current_size;
410  int count;
411  const pg_wchar *p;
412 
413  /* variables for recomposition */
414  int last_class;
415  int starter_pos;
416  int target_pos;
417  uint32 starter_ch;
418 
419  /* First, do character decomposition */
420 
421  /*
422  * Calculate how many characters long the decomposed version will be.
423  */
424  decomp_size = 0;
425  for (p = input; *p; p++)
426  decomp_size += get_decomposed_size(*p, compat);
427 
428  decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
429  if (decomp_chars == NULL)
430  return NULL;
431 
432  /*
433  * Now fill in each entry recursively. This needs a second pass on the
434  * decomposition table.
435  */
436  current_size = 0;
437  for (p = input; *p; p++)
438  decompose_code(*p, compat, &decomp_chars, &current_size);
439  decomp_chars[decomp_size] = '\0';
440  Assert(decomp_size == current_size);
441 
442  /*
443  * Now apply canonical ordering.
444  */
445  for (count = 1; count < decomp_size; count++)
446  {
447  pg_wchar prev = decomp_chars[count - 1];
448  pg_wchar next = decomp_chars[count];
449  pg_wchar tmp;
450  const uint8 prevClass = get_canonical_class(prev);
451  const uint8 nextClass = get_canonical_class(next);
452 
453  /*
454  * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
455  * annex 4, a sequence of two adjacent characters in a string is an
456  * exchangeable pair if the combining class (from the Unicode
457  * Character Database) for the first character is greater than the
458  * combining class for the second, and the second is not a starter. A
459  * character is a starter if its combining class is 0.
460  */
461  if (prevClass == 0 || nextClass == 0)
462  continue;
463 
464  if (prevClass <= nextClass)
465  continue;
466 
467  /* exchange can happen */
468  tmp = decomp_chars[count - 1];
469  decomp_chars[count - 1] = decomp_chars[count];
470  decomp_chars[count] = tmp;
471 
472  /* backtrack to check again */
473  if (count > 1)
474  count -= 2;
475  }
476 
477  if (!recompose)
478  return decomp_chars;
479 
480  /*
481  * The last phase of NFC and NFKC is the recomposition of the reordered
482  * Unicode string using combining classes. The recomposed string cannot be
483  * longer than the decomposed one, so make the allocation of the output
484  * string based on that assumption.
485  */
486  recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
487  if (!recomp_chars)
488  {
489  FREE(decomp_chars);
490  return NULL;
491  }
492 
493  last_class = -1; /* this eliminates a special check */
494  starter_pos = 0;
495  target_pos = 1;
496  starter_ch = recomp_chars[0] = decomp_chars[0];
497 
498  for (count = 1; count < decomp_size; count++)
499  {
500  pg_wchar ch = decomp_chars[count];
501  int ch_class = get_canonical_class(ch);
502  pg_wchar composite;
503 
504  if (last_class < ch_class &&
505  recompose_code(starter_ch, ch, &composite))
506  {
507  recomp_chars[starter_pos] = composite;
508  starter_ch = composite;
509  }
510  else if (ch_class == 0)
511  {
512  starter_pos = target_pos;
513  starter_ch = ch;
514  last_class = -1;
515  recomp_chars[target_pos++] = ch;
516  }
517  else
518  {
519  last_class = ch_class;
520  recomp_chars[target_pos++] = ch;
521  }
522  }
523  recomp_chars[target_pos] = (pg_wchar) '\0';
524 
525  FREE(decomp_chars);
526 
527  return recomp_chars;
528 }
static int32 next
Definition: blutils.c:219
unsigned char uint8
Definition: c.h:439
#define ALLOC(size)
Definition: unicode_norm.c:31
static int get_decomposed_size(pg_wchar code, bool compat)
Definition: unicode_norm.c:159
unsigned int uint32
Definition: c.h:441
unsigned int pg_wchar
Definition: mbprint.c:31
enum COMPAT_MODE compat
Definition: ecpg.c:25
static uint8 get_canonical_class(pg_wchar code)
Definition: unicode_norm.c:112
#define Assert(condition)
Definition: c.h:804
#define FREE(size)
Definition: unicode_norm.c:32
static void decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
Definition: unicode_norm.c:321
int64 current_size
Definition: pg_checksums.c:69
static bool recompose_code(uint32 start, uint32 code, uint32 *result)
Definition: unicode_norm.c:218