#include "mb/pg_wchar.h"

Include dependency graph for unicode_norm.h:

This graph shows which files directly or indirectly include this file:

Enumerations
enum	UnicodeNormalizationForm { UNICODE_NFC = 0 , UNICODE_NFD = 1 , UNICODE_NFKC = 2 , UNICODE_NFKD = 3 }

enum	UnicodeNormalizationQC { UNICODE_NORM_QC_NO = 0 , UNICODE_NORM_QC_YES = 1 , UNICODE_NORM_QC_MAYBE = -1 }

Functions
pg_wchar *	unicode_normalize (UnicodeNormalizationForm form, const pg_wchar *input)

UnicodeNormalizationQC	unicode_is_normalized_quickcheck (UnicodeNormalizationForm form, const pg_wchar *input)

Enumeration Type Documentation

◆ UnicodeNormalizationForm

enum UnicodeNormalizationForm

Enumerator
UNICODE_NFC
UNICODE_NFD
UNICODE_NFKC
UNICODE_NFKD

Definition at line 19 of file unicode_norm.h.

{
    /*
     * Selects the target normalization form.  unicode_normalize() derives two
     * flags from this value: whether to pass the "compat" flag to the
     * decomposition step (the two "K" forms) and whether to run the
     * recomposition step afterwards (the two "C" forms).
     */
    UNICODE_NFC = 0,            /* decompose (no compat), then recompose */
    UNICODE_NFD = 1,            /* decompose (no compat) only */
    UNICODE_NFKC = 2,           /* decompose with compat, then recompose */
    UNICODE_NFKD = 3,           /* decompose with compat only */
} UnicodeNormalizationForm;

◆ UnicodeNormalizationQC

enum UnicodeNormalizationQC

Enumerator
UNICODE_NORM_QC_NO
UNICODE_NORM_QC_YES
UNICODE_NORM_QC_MAYBE

Definition at line 28 of file unicode_norm.h.

{
    /*
     * Result of unicode_is_normalized_quickcheck().  Note that MAYBE is -1,
     * so the values must not be treated as a plain boolean.
     */
    UNICODE_NORM_QC_NO = 0,     /* quick check rejected the input */
    UNICODE_NORM_QC_YES = 1,    /* quick check found nothing objectionable */
    UNICODE_NORM_QC_MAYBE = -1, /* inconclusive; always returned for the
                                 * NFD/NFKD forms, which have no quick check */
} UnicodeNormalizationQC;

Function Documentation

◆ unicode_is_normalized_quickcheck()

UnicodeNormalizationQC unicode_is_normalized_quickcheck(UnicodeNormalizationForm form,
                                                        const pg_wchar *input)

Definition at line 598 of file unicode_norm.c.

{
    /*
     * Scan the zero-terminated code point array "input" and report whether
     * it can be cheaply classified as normalized in "form".
     *
     * Returns UNICODE_NORM_QC_NO as soon as a character is rejected,
     * UNICODE_NORM_QC_MAYBE if at least one character was inconclusive (or
     * if the form has no quick check at all), and UNICODE_NORM_QC_YES
     * otherwise.
     */
    UnicodeNormalizationQC verdict = UNICODE_NORM_QC_YES;
    uint8       prev_class = 0;
    const pg_wchar *cursor;

    /*
     * The decomposed ("D") forms have no quick check.  Their lookup tables
     * are huge and are therefore not included, checks for these forms are
     * less common, and the slow path is cheaper for them than for the "C"
     * forms since no (slow) recomposition is needed.
     */
    if (form == UNICODE_NFD || form == UNICODE_NFKD)
        return UNICODE_NORM_QC_MAYBE;

    cursor = input;
    while (*cursor)
    {
        const pg_wchar code = *cursor;
        const uint8 cur_class = get_canonical_class(code);

        /*
         * A non-starter (class != 0) whose class is lower than that of the
         * character right before it is out of order.
         */
        if (cur_class != 0 && cur_class < prev_class)
            return UNICODE_NORM_QC_NO;

        switch (qc_is_allowed(form, code))
        {
            case UNICODE_NORM_QC_NO:
                return UNICODE_NORM_QC_NO;
            case UNICODE_NORM_QC_MAYBE:
                /* remember the doubt, but keep scanning for a hard "no" */
                verdict = UNICODE_NORM_QC_MAYBE;
                break;
            default:
                break;
        }

        prev_class = cur_class;
        cursor++;
    }

    return verdict;
}

References get_canonical_class(), input, qc_is_allowed(), UNICODE_NFD, UNICODE_NFKD, UNICODE_NORM_QC_MAYBE, UNICODE_NORM_QC_NO, and UNICODE_NORM_QC_YES.

Referenced by unicode_is_normalized().

◆ unicode_normalize()

pg_wchar *unicode_normalize(UnicodeNormalizationForm form,
                            const pg_wchar *input)

Definition at line 402 of file unicode_norm.c.

/*
 * Normalize the zero-terminated code point array "input" into the form
 * selected by "form".
 *
 * Works in up to three phases: decomposition (always), canonical ordering of
 * the decomposed characters (always), and recomposition (NFC and NFKC only).
 *
 * Returns a zero-terminated array obtained from ALLOC, or NULL if an
 * allocation fails.  "input" is only read.  The result is a fresh allocation
 * even for empty input.
 * NOTE(review): presumably the caller is responsible for releasing the result
 * with the counterpart of ALLOC -- confirm against the callers.
 */
{
    /* "K" forms pass compat = true to the decomposition helpers */
    bool        compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
    /* "C" forms run the recomposition phase after ordering */
    bool        recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
    pg_wchar   *decomp_chars;
    pg_wchar   *recomp_chars;
    int         decomp_size,
                current_size;
    int         count;
    const pg_wchar *p;
 
    /* variables for recomposition */
    int         last_class;
    int         starter_pos;
    int         target_pos;
    uint32      starter_ch;
 
    /* First, do character decomposition */
 
    /*
     * Calculate how many characters long the decomposed version will be.
     */
    decomp_size = 0;
    for (p = input; *p; p++)
        decomp_size += get_decomposed_size(*p, compat);
 
    /* one extra slot for the zero terminator */
    decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
    if (decomp_chars == NULL)
        return NULL;
 
    /*
     * Now fill in each entry recursively. This needs a second pass on the
     * decomposition table.
     */
    current_size = 0;
    for (p = input; *p; p++)
        decompose_code(*p, compat, &decomp_chars, &current_size);
    decomp_chars[decomp_size] = '\0';
    /* the sizing pass and the filling pass must agree */
    Assert(decomp_size == current_size);
 
    /* Leave if there is nothing to decompose */
    if (decomp_size == 0)
        return decomp_chars;
 
    /*
     * Now apply canonical ordering.
     */
    for (count = 1; count < decomp_size; count++)
    {
        pg_wchar    prev = decomp_chars[count - 1];
        pg_wchar    next = decomp_chars[count];
        pg_wchar    tmp;
        const uint8 prevClass = get_canonical_class(prev);
        const uint8 nextClass = get_canonical_class(next);
 
        /*
         * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
         * annex 4, a sequence of two adjacent characters in a string is an
         * exchangeable pair if the combining class (from the Unicode
         * Character Database) for the first character is greater than the
         * combining class for the second, and the second is not a starter.  A
         * character is a starter if its combining class is 0.
         */
        if (prevClass == 0 || nextClass == 0)
            continue;
 
        if (prevClass <= nextClass)
            continue;
 
        /* exchange can happen */
        tmp = decomp_chars[count - 1];
        decomp_chars[count - 1] = decomp_chars[count];
        decomp_chars[count] = tmp;
 
        /*
         * backtrack to check again: together with the loop's count++ this
         * moves one position back, so the character just moved down is
         * compared with its new predecessor.  At count == 1 there is no
         * predecessor, so no backtracking is needed.
         */
        if (count > 1)
            count -= 2;
    }
 
    /* NFD and NFKD are done once the decomposed string is ordered */
    if (!recompose)
        return decomp_chars;
 
    /*
     * The last phase of NFC and NFKC is the recomposition of the reordered
     * Unicode string using combining classes. The recomposed string cannot be
     * longer than the decomposed one, so make the allocation of the output
     * string based on that assumption.
     */
    recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
    if (!recomp_chars)
    {
        /* don't leak the intermediate buffer on failure */
        FREE(decomp_chars);
        return NULL;
    }
 
    /*
     * The first character is copied as-is and serves as the initial
     * composition target ("starter"); decomp_size > 0 is guaranteed here by
     * the early return above.
     */
    last_class = -1;            /* this eliminates a special check */
    starter_pos = 0;
    target_pos = 1;
    starter_ch = recomp_chars[0] = decomp_chars[0];
 
    for (count = 1; count < decomp_size; count++)
    {
        pg_wchar    ch = decomp_chars[count];
        int         ch_class = get_canonical_class(ch);
        pg_wchar    composite;
 
        /*
         * Try to merge ch into the current starter, but only if the last
         * character emitted since that starter has a lower class than ch.
         * On success ch is absorbed: the starter slot is overwritten in
         * place and nothing is appended to the output.
         */
        if (last_class < ch_class &&
            recompose_code(starter_ch, ch, &composite))
        {
            recomp_chars[starter_pos] = composite;
            starter_ch = composite;
        }
        else if (ch_class == 0)
        {
            /* ch is a starter: it becomes the new composition target */
            starter_pos = target_pos;
            starter_ch = ch;
            last_class = -1;
            recomp_chars[target_pos++] = ch;
        }
        else
        {
            /* uncombined non-starter: emit it and remember its class */
            last_class = ch_class;
            recomp_chars[target_pos++] = ch;
        }
    }
    recomp_chars[target_pos] = (pg_wchar) '\0';
 
    /* the decomposed buffer was only an intermediate result */
    FREE(decomp_chars);
 
    return recomp_chars;
}

References ALLOC, Assert(), compat, current_size, decompose_code(), FREE, get_canonical_class(), get_decomposed_size(), input, next, recompose_code(), UNICODE_NFC, UNICODE_NFKC, and UNICODE_NFKD.

Referenced by main(), pg_saslprep(), unicode_is_normalized(), and unicode_normalize_func().