#include "postgres.h"
#include "common/unicode_norm.h"
#include "common/unicode_norm_hashfunc.h"
#include "common/unicode_normprops_table.h"
#include "port/pg_bswap.h"

Include dependency graph for unicode_norm.c:

Macros
#define	ALLOC(size) palloc(size)

#define	FREE(size) pfree(size)

#define	SBASE 0xAC00 /* U+AC00 */

#define	LBASE 0x1100 /* U+1100 */

#define	VBASE 0x1161 /* U+1161 */

#define	TBASE 0x11A7 /* U+11A7 */

#define	LCOUNT 19

#define	VCOUNT 21

#define	TCOUNT 28

#define	NCOUNT VCOUNT * TCOUNT

#define	SCOUNT LCOUNT * NCOUNT

Functions
static const pg_unicode_decomposition *	get_code_entry (pg_wchar code)

static uint8	get_canonical_class (pg_wchar code)

static const pg_wchar *	get_code_decomposition (const pg_unicode_decomposition *entry, int *dec_size)

static int	get_decomposed_size (pg_wchar code, bool compat)

static bool	recompose_code (uint32 start, uint32 code, uint32 *result)

static void	decompose_code (pg_wchar code, bool compat, pg_wchar **result, int *current)

pg_wchar *	unicode_normalize (UnicodeNormalizationForm form, const pg_wchar *input)

static const pg_unicode_normprops *	qc_hash_lookup (pg_wchar ch, const pg_unicode_norminfo *norminfo)

static UnicodeNormalizationQC	qc_is_allowed (UnicodeNormalizationForm form, pg_wchar ch)

UnicodeNormalizationQC	unicode_is_normalized_quickcheck (UnicodeNormalizationForm form, const pg_wchar *input)

Macro Definition Documentation

◆ ALLOC

#define ALLOC ( size ) palloc(size)

Definition at line 31 of file unicode_norm.c.

◆ FREE

#define FREE ( size ) pfree(size)

Definition at line 32 of file unicode_norm.c.

◆ LBASE

#define LBASE 0x1100 /* U+1100 */

Definition at line 40 of file unicode_norm.c.

◆ LCOUNT

#define LCOUNT 19

Definition at line 43 of file unicode_norm.c.

◆ NCOUNT

#define NCOUNT VCOUNT * TCOUNT

Definition at line 46 of file unicode_norm.c.

◆ SBASE

#define SBASE 0xAC00 /* U+AC00 */

Definition at line 39 of file unicode_norm.c.

◆ SCOUNT

#define SCOUNT LCOUNT * NCOUNT

Definition at line 47 of file unicode_norm.c.

◆ TBASE

#define TBASE 0x11A7 /* U+11A7 */

Definition at line 42 of file unicode_norm.c.

◆ TCOUNT

#define TCOUNT 28

Definition at line 45 of file unicode_norm.c.

◆ VBASE

#define VBASE 0x1161 /* U+1161 */

Definition at line 41 of file unicode_norm.c.

◆ VCOUNT

#define VCOUNT 21

Definition at line 44 of file unicode_norm.c.

Function Documentation

◆ decompose_code()

static void decompose_code	(	pg_wchar	code,
		bool	compat,
		pg_wchar **	result,
		int *	current
	)

static

Definition at line 321 of file unicode_norm.c.

 {
     const pg_unicode_decomposition *entry;
     int         i;
     const uint32 *decomp;
     int         dec_size;
  
     /*
      * Fast path for Hangul characters not stored in tables to save memory as
      * decomposition is algorithmic. See
      * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
      * on the matter.
      */
     if (code >= SBASE && code < SBASE + SCOUNT)
     {
         uint32      l,
                     v,
                     tindex,
                     sindex;
         pg_wchar   *res = *result;
  
         sindex = code - SBASE;
         l = LBASE + sindex / (VCOUNT * TCOUNT);
         v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
         tindex = sindex % TCOUNT;
  
         res[*current] = l;
         (*current)++;
         res[*current] = v;
         (*current)++;
  
         if (tindex != 0)
         {
             res[*current] = TBASE + tindex;
             (*current)++;
         }
  
         return;
     }
  
     entry = get_code_entry(code);
  
     /*
      * Just fill in with the current decomposition if there are no
      * decomposition codes to recurse to.  A NULL entry is equivalent to a
      * character with class 0 and no decompositions, so just leave also in
      * this case.
      */
     if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
         (!compat && DECOMPOSITION_IS_COMPAT(entry)))
     {
         pg_wchar   *res = *result;
  
         res[*current] = code;
         (*current)++;
         return;
     }
  
     /*
      * If this entry has other decomposition codes look at them as well.
      */
     decomp = get_code_decomposition(entry, &dec_size);
     for (i = 0; i < dec_size; i++)
     {
         pg_wchar    lcode = (pg_wchar) decomp[i];
  
         /* Leave if no more decompositions */
         decompose_code(lcode, compat, result, current);
     }
 }

References compat, DECOMPOSITION_IS_COMPAT, DECOMPOSITION_SIZE, get_code_decomposition(), get_code_entry(), i, LBASE, res, SBASE, SCOUNT, TBASE, TCOUNT, VBASE, and VCOUNT.

Referenced by unicode_normalize().

◆ get_canonical_class()

static uint8 get_canonical_class ( pg_wchar code )

static

Definition at line 112 of file unicode_norm.c.

 {
     const pg_unicode_decomposition *entry = get_code_entry(code);
  
     /*
      * If no entries are found, the character used is either an Hangul
      * character or a character with a class of 0 and no decompositions.
      */
     if (!entry)
         return 0;
     else
         return entry->comb_class;
 }

References pg_unicode_decomposition::comb_class, and get_code_entry().

Referenced by unicode_is_normalized_quickcheck(), and unicode_normalize().

◆ get_code_decomposition()

static const pg_wchar* get_code_decomposition	(	const pg_unicode_decomposition *	entry,
		int *	dec_size
	)

static

Definition at line 134 of file unicode_norm.c.

 {
     static pg_wchar x;
  
     if (DECOMPOSITION_IS_INLINE(entry))
     {
         Assert(DECOMPOSITION_SIZE(entry) == 1);
         x = (pg_wchar) entry->dec_index;
         *dec_size = 1;
         return &x;
     }
     else
     {
         *dec_size = DECOMPOSITION_SIZE(entry);
         return &UnicodeDecomp_codepoints[entry->dec_index];
     }
 }

References Assert, pg_unicode_decomposition::dec_index, DECOMPOSITION_IS_INLINE, DECOMPOSITION_SIZE, UnicodeDecomp_codepoints, and x.

Referenced by decompose_code(), and get_decomposed_size().

◆ get_code_entry()

static const pg_unicode_decomposition* get_code_entry ( pg_wchar code )

static

Definition at line 72 of file unicode_norm.c.

 {
 #ifndef FRONTEND
     int         h;
     uint32      hashkey;
     pg_unicode_decompinfo decompinfo = UnicodeDecompInfo;
  
     /*
      * Compute the hash function. The hash key is the codepoint with the bytes
      * in network order.
      */
     hashkey = pg_hton32(code);
     h = decompinfo.hash(&hashkey);
  
     /* An out-of-range result implies no match */
     if (h < 0 || h >= decompinfo.num_decomps)
         return NULL;
  
     /*
      * Since it's a perfect hash, we need only match to the specific codepoint
      * it identifies.
      */
     if (code != decompinfo.decomps[h].codepoint)
         return NULL;
  
     /* Success! */
     return &decompinfo.decomps[h];
 #else
     return bsearch(&(code),
                    UnicodeDecompMain,
                    lengthof(UnicodeDecompMain),
                    sizeof(pg_unicode_decomposition),
                    conv_compare);
 #endif
 }

References pg_unicode_decomposition::codepoint, pg_unicode_decompinfo::decomps, pg_unicode_decompinfo::hash, lengthof, pg_unicode_decompinfo::num_decomps, pg_hton32, UnicodeDecompInfo, and UnicodeDecompMain.

Referenced by decompose_code(), get_canonical_class(), and get_decomposed_size().

◆ get_decomposed_size()

static int get_decomposed_size	(	pg_wchar	code,
		bool	compat
	)

static

Definition at line 159 of file unicode_norm.c.

 {
     const pg_unicode_decomposition *entry;
     int         size = 0;
     int         i;
     const uint32 *decomp;
     int         dec_size;
  
     /*
      * Fast path for Hangul characters not stored in tables to save memory as
      * decomposition is algorithmic. See
      * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
      * on the matter.
      */
     if (code >= SBASE && code < SBASE + SCOUNT)
     {
         uint32      tindex,
                     sindex;
  
         sindex = code - SBASE;
         tindex = sindex % TCOUNT;
  
         if (tindex != 0)
             return 3;
         return 2;
     }
  
     entry = get_code_entry(code);
  
     /*
      * Just count current code if no other decompositions.  A NULL entry is
      * equivalent to a character with class 0 and no decompositions.
      */
     if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
         (!compat && DECOMPOSITION_IS_COMPAT(entry)))
         return 1;
  
     /*
      * If this entry has other decomposition codes look at them as well. First
      * get its decomposition in the list of tables available.
      */
     decomp = get_code_decomposition(entry, &dec_size);
     for (i = 0; i < dec_size; i++)
     {
         uint32      lcode = decomp[i];
  
         size += get_decomposed_size(lcode, compat);
     }
  
     return size;
 }

References compat, DECOMPOSITION_IS_COMPAT, DECOMPOSITION_SIZE, get_code_decomposition(), get_code_entry(), i, SBASE, SCOUNT, size, and TCOUNT.

Referenced by unicode_normalize().

◆ qc_hash_lookup()

static const pg_unicode_normprops* qc_hash_lookup	(	pg_wchar	ch,
		const pg_unicode_norminfo *	norminfo
	)

static

Definition at line 543 of file unicode_norm.c.

 {
     int         h;
     uint32      hashkey;
  
     /*
      * Compute the hash function. The hash key is the codepoint with the bytes
      * in network order.
      */
     hashkey = pg_hton32(ch);
     h = norminfo->hash(&hashkey);
  
     /* An out-of-range result implies no match */
     if (h < 0 || h >= norminfo->num_normprops)
         return NULL;
  
     /*
      * Since it's a perfect hash, we need only match to the specific codepoint
      * it identifies.
      */
     if (ch != norminfo->normprops[h].codepoint)
         return NULL;
  
     /* Success! */
     return &norminfo->normprops[h];
 }

References pg_unicode_normprops::codepoint, pg_unicode_norminfo::hash, pg_unicode_norminfo::normprops, pg_unicode_norminfo::num_normprops, and pg_hton32.

Referenced by qc_is_allowed().

◆ qc_is_allowed()

static UnicodeNormalizationQC qc_is_allowed	(	UnicodeNormalizationForm	form,
		pg_wchar	ch
	)

static

Definition at line 574 of file unicode_norm.c.

 {
     const pg_unicode_normprops *found = NULL;
  
     switch (form)
     {
         case UNICODE_NFC:
             found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
             break;
         case UNICODE_NFKC:
             found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
             break;
         default:
             Assert(false);
             break;
     }
  
     if (found)
         return found->quickcheck;
     else
         return UNICODE_NORM_QC_YES;
 }

References Assert, qc_hash_lookup(), pg_unicode_normprops::quickcheck, UNICODE_NFC, UNICODE_NFKC, UNICODE_NORM_QC_YES, UnicodeNormInfo_NFC_QC, and UnicodeNormInfo_NFKC_QC.

Referenced by unicode_is_normalized_quickcheck().

◆ recompose_code()

static bool recompose_code	(	uint32	start,
		uint32	code,
		uint32 *	result
	)

static

Definition at line 218 of file unicode_norm.c.

 {
     /*
      * Handle Hangul characters algorithmically, per the Unicode spec.
      *
      * Check if two current characters are L and V.
      */
     if (start >= LBASE && start < LBASE + LCOUNT &&
         code >= VBASE && code < VBASE + VCOUNT)
     {
         /* make syllable of form LV */
         uint32      lindex = start - LBASE;
         uint32      vindex = code - VBASE;
  
         *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
         return true;
     }
     /* Check if two current characters are LV and T */
     else if (start >= SBASE && start < (SBASE + SCOUNT) &&
              ((start - SBASE) % TCOUNT) == 0 &&
              code >= TBASE && code < (TBASE + TCOUNT))
     {
         /* make syllable of form LVT */
         uint32      tindex = code - TBASE;
  
         *result = start + tindex;
         return true;
     }
     else
     {
         const pg_unicode_decomposition *entry;
  
         /*
          * Do an inverse lookup of the decomposition tables to see if anything
          * matches. The comparison just needs to be a perfect match on the
          * sub-table of size two, because the start character has already been
          * recomposed partially.  This lookup uses a perfect hash function for
          * the backend code.
          */
 #ifndef FRONTEND
  
         int         h,
                     inv_lookup_index;
         uint64      hashkey;
         pg_unicode_recompinfo recompinfo = UnicodeRecompInfo;
  
         /*
          * Compute the hash function. The hash key is formed by concatenating
          * bytes of the two codepoints in network order. See also
          * src/common/unicode/generate-unicode_norm_table.pl.
          */
         hashkey = pg_hton64(((uint64) start << 32) | (uint64) code);
         h = recompinfo.hash(&hashkey);
  
         /* An out-of-range result implies no match */
         if (h < 0 || h >= recompinfo.num_recomps)
             return false;
  
         inv_lookup_index = recompinfo.inverse_lookup[h];
         entry = &UnicodeDecompMain[inv_lookup_index];
  
         if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
             code == UnicodeDecomp_codepoints[entry->dec_index + 1])
         {
             *result = entry->codepoint;
             return true;
         }
  
 #else
  
         int         i;
  
         for (i = 0; i < lengthof(UnicodeDecompMain); i++)
         {
             entry = &UnicodeDecompMain[i];
  
             if (DECOMPOSITION_SIZE(entry) != 2)
                 continue;
  
             if (DECOMPOSITION_NO_COMPOSE(entry))
                 continue;
  
             if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
                 code == UnicodeDecomp_codepoints[entry->dec_index + 1])
             {
                 *result = entry->codepoint;
                 return true;
             }
         }
 #endif                          /* !FRONTEND */
     }
  
     return false;
 }

References pg_unicode_decomposition::codepoint, pg_unicode_decomposition::dec_index, DECOMPOSITION_NO_COMPOSE, DECOMPOSITION_SIZE, pg_unicode_recompinfo::hash, i, pg_unicode_recompinfo::inverse_lookup, LBASE, LCOUNT, lengthof, pg_unicode_recompinfo::num_recomps, pg_hton64, SBASE, SCOUNT, start, TBASE, TCOUNT, UnicodeDecomp_codepoints, UnicodeDecompMain, UnicodeRecompInfo, VBASE, and VCOUNT.

Referenced by unicode_normalize().

◆ unicode_is_normalized_quickcheck()

UnicodeNormalizationQC unicode_is_normalized_quickcheck	(	UnicodeNormalizationForm	form,
		const pg_wchar *	input
	)

Definition at line 598 of file unicode_norm.c.

 {
     uint8       lastCanonicalClass = 0;
     UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
  
     /*
      * For the "D" forms, we don't run the quickcheck.  We don't include the
      * lookup tables for those because they are huge, checking for these
      * particular forms is less common, and running the slow path is faster
      * for the "D" forms than the "C" forms because you don't need to
      * recompose, which is slow.
      */
     if (form == UNICODE_NFD || form == UNICODE_NFKD)
         return UNICODE_NORM_QC_MAYBE;
  
     for (const pg_wchar *p = input; *p; p++)
     {
         pg_wchar    ch = *p;
         uint8       canonicalClass;
         UnicodeNormalizationQC check;
  
         canonicalClass = get_canonical_class(ch);
         if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
             return UNICODE_NORM_QC_NO;
  
         check = qc_is_allowed(form, ch);
         if (check == UNICODE_NORM_QC_NO)
             return UNICODE_NORM_QC_NO;
         else if (check == UNICODE_NORM_QC_MAYBE)
             result = UNICODE_NORM_QC_MAYBE;
  
         lastCanonicalClass = canonicalClass;
     }
     return result;
 }

References get_canonical_class(), input, qc_is_allowed(), UNICODE_NFD, UNICODE_NFKD, UNICODE_NORM_QC_MAYBE, UNICODE_NORM_QC_NO, and UNICODE_NORM_QC_YES.

Referenced by unicode_is_normalized().

◆ unicode_normalize()

pg_wchar* unicode_normalize	(	UnicodeNormalizationForm	form,
		const pg_wchar *	input
	)

Definition at line 402 of file unicode_norm.c.

 {
     /*
      * Normalize the zero-terminated code point array "input" to the
      * requested form and return the result as a freshly ALLOC'd,
      * zero-terminated array.  Returns NULL if ALLOC returns NULL.
      *
      * The work is done in up to three passes: full decomposition
      * (compatibility decomposition for NFKC/NFKD, canonical otherwise),
      * canonical reordering of combining characters, and, for NFC/NFKC only,
      * recomposition into a second buffer.
      */
     bool        compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
     bool        recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
     pg_wchar   *decomp_chars;
     pg_wchar   *recomp_chars;
     int         decomp_size,
                 current_size;
     int         count;
     const pg_wchar *p;
  
     /* variables for recomposition */
     int         last_class;
     int         starter_pos;
     int         target_pos;
     uint32      starter_ch;
  
     /* First, do character decomposition */
  
     /*
      * Calculate how many characters long the decomposed version will be.
      */
     decomp_size = 0;
     for (p = input; *p; p++)
         decomp_size += get_decomposed_size(*p, compat);
  
     /* One extra slot is reserved for the terminating zero */
     decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
     if (decomp_chars == NULL)
         return NULL;
  
     /*
      * Now fill in each entry recursively. This needs a second pass on the
      * decomposition table.
      */
     current_size = 0;
     for (p = input; *p; p++)
         decompose_code(*p, compat, &decomp_chars, &current_size);
     decomp_chars[decomp_size] = '\0';
     /* Both passes must agree on the number of code points produced */
     Assert(decomp_size == current_size);
  
     /* Leave if there is nothing to decompose */
     if (decomp_size == 0)
         return decomp_chars;
  
     /*
      * Now apply canonical ordering.
      */
     for (count = 1; count < decomp_size; count++)
     {
         pg_wchar    prev = decomp_chars[count - 1];
         pg_wchar    next = decomp_chars[count];
         pg_wchar    tmp;
         const uint8 prevClass = get_canonical_class(prev);
         const uint8 nextClass = get_canonical_class(next);
  
         /*
          * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
          * annex 4, a sequence of two adjacent characters in a string is an
          * exchangeable pair if the combining class (from the Unicode
          * Character Database) for the first character is greater than the
          * combining class for the second, and the second is not a starter.  A
          * character is a starter if its combining class is 0.
          */
         if (prevClass == 0 || nextClass == 0)
             continue;
  
         if (prevClass <= nextClass)
             continue;
  
         /* exchange can happen */
         tmp = decomp_chars[count - 1];
         decomp_chars[count - 1] = decomp_chars[count];
         decomp_chars[count] = tmp;
  
         /*
          * backtrack to check again: together with the loop's count++ this
          * steps back by one, so the pair just before the swapped one is
          * re-examined.  At count == 1 there is no earlier pair to revisit.
          */
         if (count > 1)
             count -= 2;
     }
  
     /* The "D" forms are finished once decomposed and reordered */
     if (!recompose)
         return decomp_chars;
  
     /*
      * The last phase of NFC and NFKC is the recomposition of the reordered
      * Unicode string using combining classes. The recomposed string cannot be
      * longer than the decomposed one, so make the allocation of the output
      * string based on that assumption.
      */
     recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
     if (!recomp_chars)
     {
         FREE(decomp_chars);
         return NULL;
     }
  
     /*
      * State for the recomposition loop: starter_ch is the character that
      * following characters may combine with and starter_pos is its slot in
      * the output, target_pos is the next free output slot, and last_class is
      * the class of the last non-combined character copied after the starter
      * (-1 when there is none yet).
      */
     last_class = -1;            /* this eliminates a special check */
     starter_pos = 0;
     target_pos = 1;
     starter_ch = recomp_chars[0] = decomp_chars[0];
  
     for (count = 1; count < decomp_size; count++)
     {
         pg_wchar    ch = decomp_chars[count];
         int         ch_class = get_canonical_class(ch);
         pg_wchar    composite;
  
         if (last_class < ch_class &&
             recompose_code(starter_ch, ch, &composite))
         {
             /*
              * ch combines with the starter: overwrite the starter in place
              * and do not copy ch itself to the output.
              */
             recomp_chars[starter_pos] = composite;
             starter_ch = composite;
         }
         else if (ch_class == 0)
         {
             /* ch is itself a starter: it becomes the new combining base */
             starter_pos = target_pos;
             starter_ch = ch;
             last_class = -1;
             recomp_chars[target_pos++] = ch;
         }
         else
         {
             /*
              * ch is a non-starter that did not combine: copy it and remember
              * its class, which the test above compares against the class of
              * later characters before trying to combine them.
              */
             last_class = ch_class;
             recomp_chars[target_pos++] = ch;
         }
     }
     recomp_chars[target_pos] = (pg_wchar) '\0';
  
     /* The decomposed buffer was only an intermediate result */
     FREE(decomp_chars);
  
     return recomp_chars;
 }

References ALLOC, Assert, compat, current_size, decompose_code(), FREE, get_canonical_class(), get_decomposed_size(), input, next, recompose_code(), UNICODE_NFC, UNICODE_NFKC, and UNICODE_NFKD.

Referenced by main(), pg_saslprep(), unicode_is_normalized(), and unicode_normalize_func().

Macros

Functions

Macro Definition Documentation

◆ ALLOC

◆ FREE

◆ LBASE

◆ LCOUNT

◆ NCOUNT

◆ SBASE

◆ SCOUNT

◆ TBASE

◆ TCOUNT

◆ VBASE

◆ VCOUNT

Function Documentation

◆ decompose_code()

◆ get_canonical_class()

◆ get_code_decomposition()

◆ get_code_entry()

◆ get_decomposed_size()

◆ qc_hash_lookup()

◆ qc_is_allowed()

◆ recompose_code()

◆ unicode_is_normalized_quickcheck()

◆ unicode_normalize()