#include "postgres.h"
#include "common/unicode_case.h"
#include "common/unicode_case_table.h"
#include "common/unicode_category.h"
#include "mb/pg_wchar.h"

Include dependency graph for unicode_case.c:

Enumerations
enum	CaseMapResult { CASEMAP_SELF , CASEMAP_SIMPLE , CASEMAP_SPECIAL }

Functions
static pg_wchar	find_case_map (pg_wchar ucs, const pg_wchar *map)

static size_t	convert_case (char *dst, size_t dstsize, const char *src, ssize_t srclen, CaseKind str_casekind, bool full, WordBoundaryNext wbnext, void *wbstate)

static enum CaseMapResult	casemap (pg_wchar u1, CaseKind casekind, bool full, const char *src, size_t srclen, size_t srcoff, pg_wchar *simple, const pg_wchar **special)

pg_wchar	unicode_lowercase_simple (pg_wchar code)

pg_wchar	unicode_titlecase_simple (pg_wchar code)

pg_wchar	unicode_uppercase_simple (pg_wchar code)

pg_wchar	unicode_casefold_simple (pg_wchar code)

size_t	unicode_strlower (char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)

size_t	unicode_strtitle (char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full, WordBoundaryNext wbnext, void *wbstate)

size_t	unicode_strupper (char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)

size_t	unicode_strfold (char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)

static bool	check_final_sigma (const unsigned char *str, size_t len, size_t offset)

static bool	check_special_conditions (int conditions, const char *str, size_t len, size_t offset)

Variables
static const pg_wchar *const	casekind_map [NCaseKind]

Enumeration Type Documentation

◆ CaseMapResult

enum CaseMapResult

Enumerator
CASEMAP_SELF
CASEMAP_SIMPLE
CASEMAP_SPECIAL

Definition at line 23 of file unicode_case.c.

{
    /* casemap() found no table entry for the code point (case_index() == 0) */
    CASEMAP_SELF,
    /* casemap() stored a single replacement code point through *simple */
    CASEMAP_SIMPLE,
    /* casemap() stored a pointer to a multi-code-point mapping through *special */
    CASEMAP_SPECIAL,
};

Function Documentation

◆ casemap()

static enum CaseMapResult casemap	(	pg_wchar	u1,
		CaseKind	casekind,
		bool	full,
		const char *	src,
		size_t	srclen,
		size_t	srcoff,
		pg_wchar *	simple,
		const pg_wchar **	special
	)

static

Definition at line 397 of file unicode_case.c.

{
    uint16      idx;

    /*
     * Classify how code point u1 maps under 'casekind'.  Exactly one of the
     * output arguments is written: *simple for CASEMAP_SIMPLE, *special for
     * CASEMAP_SPECIAL, and neither for CASEMAP_SELF.
     */
    if (u1 >= 0x80)
    {
        idx = case_index(u1);

        /* index 0 is the reserved slot: no mapping for this code point */
        if (idx == 0)
            return CASEMAP_SELF;

        /*
         * Special (possibly multi-character) mappings are only considered
         * when full case mapping was requested, and only apply when their
         * context conditions hold at this position in the source string.
         */
        if (full && case_map_special[idx])
        {
            if (check_special_conditions(special_case[case_map_special[idx]].conditions,
                                         src, srclen, srcoff))
            {
                *special = special_case[case_map_special[idx]].map[casekind];
                return CASEMAP_SPECIAL;
            }
        }

        *simple = casekind_map[casekind][idx];
        return CASEMAP_SIMPLE;
    }

    /*
     * Code points below 0x80 bypass case_index().  The first element of
     * every table is reserved as 0, so their data starts at index 1.
     */
    *simple = casekind_map[casekind][u1 + 1];
    return CASEMAP_SIMPLE;
}

References case_index(), case_map_special, casekind_map, CASEMAP_SELF, CASEMAP_SIMPLE, CASEMAP_SPECIAL, check_special_conditions(), idx(), pg_special_case::map, and special_case.

Referenced by convert_case(), and main().

◆ check_final_sigma()

static bool check_final_sigma	(	const unsigned char *	str,
		size_t	len,
		size_t	offset
	)

static

Definition at line 312 of file unicode_case.c.

{
    bool        preceded_by_cased = false;

    /*
     * Decide whether the character starting at byte 'offset' of the UTF-8
     * string 'str' is in Final_Sigma context: it must be preceded by a Cased
     * character (optionally followed by Case_Ignorable characters), and must
     * not be followed by Case_Ignorable characters and then a Cased
     * character.  Scanning forward stops at 'len' or at a NUL byte, whichever
     * comes first.
     */

    /* the start of the string is not preceded by a Cased character */
    if (offset == 0)
        return false;

    /*
     * Iterate backwards, looking for a Cased character.  The index is a
     * size_t (counting down with "i-- > 0") so that it cannot be truncated
     * the way an int initialized from a size_t could be.
     */
    for (size_t i = offset; i-- > 0;)
    {
        pg_wchar    curr;

        /* continuation byte: keep going until the sequence's first byte */
        if ((str[i] & 0xC0) == 0x80)
            continue;

        /* otherwise the byte is 0xxxxxxx or 11xxxxxx: start of a sequence */
        curr = utf8_to_unicode(str + i);

        if (pg_u_prop_case_ignorable(curr))
            continue;

        if (!pg_u_prop_cased(curr))
            return false;

        preceded_by_cased = true;
        break;
    }

    /*
     * If the scan ran off the start of the string, only Case_Ignorable
     * characters precede this position, so the required preceding Cased
     * character is missing.
     */
    if (!preceded_by_cased)
        return false;

    /* end of string is not followed by a Cased character */
    if (offset == len)
        return true;

    /*
     * Iterate forwards, looking for a Cased character.  Starting at
     * offset + 1 is safe even if the current character is multi-byte,
     * because its continuation bytes are skipped below.
     */
    for (size_t i = offset + 1; i < len && str[i] != '\0'; i++)
    {
        pg_wchar    curr;

        if ((str[i] & 0xC0) == 0x80)
            continue;

        curr = utf8_to_unicode(str + i);

        if (pg_u_prop_case_ignorable(curr))
            continue;

        /* a following Cased character means this is not the final position */
        if (pg_u_prop_cased(curr))
            return false;

        break;
    }

    return true;
}

References Assert(), i, len, pg_u_prop_case_ignorable(), pg_u_prop_cased(), str, and utf8_to_unicode().

Referenced by check_special_conditions().

◆ check_special_conditions()

static bool check_special_conditions	(	int	conditions,
		const char *	str,
		size_t	len,
		size_t	offset
	)

static

Definition at line 370 of file unicode_case.c.

{
    /*
     * Evaluate the context conditions attached to a special case mapping at
     * byte 'offset' of 'str'.  An empty condition set always holds.
     */
    if (conditions != 0)
    {
        if (conditions == PG_U_FINAL_SIGMA)
            return check_final_sigma((const unsigned char *) str, len, offset);

        /* no other conditions supported */
        Assert(false);
        return false;
    }

    return true;
}

References Assert(), check_final_sigma(), len, PG_U_FINAL_SIGMA, and str.

Referenced by casemap().

◆ convert_case()

static size_t convert_case	(	char *	dst,
		size_t	dstsize,
		const char *	src,
		ssize_t	srclen,
		CaseKind	str_casekind,
		bool	full,
		WordBoundaryNext	wbnext,
		void *	wbstate
	)

static

Definition at line 213 of file unicode_case.c.

{
    const bool  titlecasing = (str_casekind == CaseTitle);

    /* the per-character CaseKind only differs from str_casekind when titlecasing */
    CaseKind    chr_casekind = str_casekind;
    size_t      boundary = 0;
    size_t      srcoff = 0;
    size_t      result_len = 0;

    /* word-boundary callbacks are required for titlecasing, forbidden otherwise */
    Assert(titlecasing ? (wbnext && wbstate) : (!wbnext && !wbstate));

    if (titlecasing)
    {
        boundary = wbnext(wbstate);
        Assert(boundary == 0);  /* start of text is always a boundary */
    }

    for (;;)
    {
        pg_wchar    u1;
        int         u1len;
        pg_wchar    simple = 0;
        const pg_wchar *special = NULL;
        enum CaseMapResult outcome;

        /* a negative srclen means "stop only at the NUL terminator" */
        if (srclen >= 0 && srcoff >= (size_t) srclen)
            break;
        if (src[srcoff] == '\0')
            break;

        u1 = utf8_to_unicode((const unsigned char *) src + srcoff);
        u1len = unicode_utf8len(u1);

        if (titlecasing)
        {
            if (srcoff != boundary)
                chr_casekind = CaseLower;
            else
            {
                /* first character of a word; then fetch the next boundary */
                chr_casekind = full ? CaseTitle : CaseUpper;
                boundary = wbnext(wbstate);
            }
        }

        outcome = casemap(u1, chr_casekind, full, src, srclen, srcoff,
                          &simple, &special);

        /*
         * In every branch result_len advances even when the output does not
         * fit in dst; only the write itself is skipped.
         */
        if (outcome == CASEMAP_SELF)
        {
            /* no mapping; copy bytes from src */
            Assert(simple == 0);
            Assert(special == NULL);

            if (result_len + u1len <= dstsize)
                memcpy(dst + result_len, src + srcoff, u1len);

            result_len += u1len;
        }
        else if (outcome == CASEMAP_SIMPLE)
        {
            /* replace with single character */
            size_t      outlen = unicode_utf8len(simple);

            Assert(special == NULL);

            if (result_len + outlen <= dstsize)
                unicode_to_utf8(simple, (unsigned char *) dst + result_len);

            result_len += outlen;
        }
        else if (outcome == CASEMAP_SPECIAL)
        {
            /* replace with up to MAX_CASE_EXPANSION characters */
            Assert(simple == 0);

            for (int k = 0; k < MAX_CASE_EXPANSION; k++)
            {
                pg_wchar    mapped = special[k];
                size_t      outlen;

                /* a zero entry ends a mapping shorter than the maximum */
                if (mapped == 0)
                    break;

                outlen = unicode_utf8len(mapped);

                if (result_len + outlen <= dstsize)
                    unicode_to_utf8(mapped, (unsigned char *) dst + result_len);

                result_len += outlen;
            }
        }

        srcoff += u1len;
    }

    /* NUL-terminate only if there is room for the terminator */
    if (result_len < dstsize)
        dst[result_len] = '\0';

    return result_len;
}

References Assert(), CaseLower, casemap(), CASEMAP_SELF, CASEMAP_SIMPLE, CASEMAP_SPECIAL, CaseTitle, CaseUpper, i, MAX_CASE_EXPANSION, unicode_to_utf8(), unicode_utf8len(), and utf8_to_unicode().

Referenced by unicode_strfold(), unicode_strlower(), unicode_strtitle(), and unicode_strupper().

◆ find_case_map()

static pg_wchar find_case_map	(	pg_wchar	ucs,
		const pg_wchar *	map
	)

static

Definition at line 438 of file unicode_case.c.

{
    /* Code points of 0x80 and above are located through case_index(). */
    if (ucs >= 0x80)
        return map[case_index(ucs)];

    /*
     * Code points below 0x80 are looked up directly.  The first element of
     * every table is reserved as 0 (as NULL), hence the +1.
     */
    return map[ucs + 1];
}

References case_index().

Referenced by unicode_casefold_simple(), unicode_lowercase_simple(), unicode_titlecase_simple(), and unicode_uppercase_simple().

◆ unicode_casefold_simple()

pg_wchar unicode_casefold_simple ( pg_wchar code )

Definition at line 74 of file unicode_case.c.

{
    pg_wchar    folded = find_case_map(code, case_map_fold);

    /* a zero table entry means "no mapping": return the input unchanged */
    if (folded == 0)
        return code;

    return folded;
}

References case_map_fold, and find_case_map().

◆ unicode_lowercase_simple()

pg_wchar unicode_lowercase_simple ( pg_wchar code )

Definition at line 50 of file unicode_case.c.

{
    pg_wchar    lowered = find_case_map(code, case_map_lower);

    /* a zero table entry means "no mapping": return the input unchanged */
    if (lowered == 0)
        return code;

    return lowered;
}

References case_map_lower, and find_case_map().

Referenced by pg_wc_tolower().

◆ unicode_strfold()

size_t unicode_strfold	(	char *	dst,
		size_t	dstsize,
		const char *	src,
		ssize_t	srclen,
		bool	full
	)

Definition at line 189 of file unicode_case.c.

{
    size_t      result_len;

    /* word-boundary callbacks are only passed when titlecasing */
    result_len = convert_case(dst, dstsize, src, srclen, CaseFold, full,
                              NULL, NULL);

    return result_len;
}

References CaseFold, and convert_case().

Referenced by strfold_builtin(), and tfunc_fold().

◆ unicode_strlower()

size_t unicode_strlower	(	char *	dst,
		size_t	dstsize,
		const char *	src,
		ssize_t	srclen,
		bool	full
	)

Definition at line 101 of file unicode_case.c.

{
    size_t      result_len;

    /* word-boundary callbacks are only passed when titlecasing */
    result_len = convert_case(dst, dstsize, src, srclen, CaseLower, full,
                              NULL, NULL);

    return result_len;
}

References CaseLower, and convert_case().

Referenced by strlower_builtin(), and tfunc_lower().

◆ unicode_strtitle()

size_t unicode_strtitle	(	char *	dst,
		size_t	dstsize,
		const char *	src,
		ssize_t	srclen,
		bool	full,
		WordBoundaryNext	wbnext,
		void *	wbstate
	)

Definition at line 138 of file unicode_case.c.

{
    size_t      result_len;

    /* pass the caller's word-boundary iterator through to convert_case() */
    result_len = convert_case(dst, dstsize, src, srclen, CaseTitle, full,
                              wbnext, wbstate);

    return result_len;
}

References CaseTitle, and convert_case().

Referenced by strtitle_builtin(), and tfunc_title().

◆ unicode_strupper()

size_t unicode_strupper	(	char *	dst,
		size_t	dstsize,
		const char *	src,
		ssize_t	srclen,
		bool	full
	)

Definition at line 165 of file unicode_case.c.

{
    size_t      result_len;

    /* word-boundary callbacks are only passed when titlecasing */
    result_len = convert_case(dst, dstsize, src, srclen, CaseUpper, full,
                              NULL, NULL);

    return result_len;
}

References CaseUpper, and convert_case().

Referenced by strupper_builtin(), and tfunc_upper().

◆ unicode_titlecase_simple()

pg_wchar unicode_titlecase_simple ( pg_wchar code )

Definition at line 58 of file unicode_case.c.

{
    pg_wchar    titled = find_case_map(code, case_map_title);

    /* a zero table entry means "no mapping": return the input unchanged */
    if (titled == 0)
        return code;

    return titled;
}

References case_map_title, and find_case_map().

◆ unicode_uppercase_simple()

pg_wchar unicode_uppercase_simple ( pg_wchar code )

Definition at line 66 of file unicode_case.c.

{
    pg_wchar    uppered = find_case_map(code, case_map_upper);

    /* a zero table entry means "no mapping": return the input unchanged */
    if (uppered == 0)
        return code;

    return uppered;
}

References case_map_upper, and find_case_map().

Referenced by pg_wc_toupper().

Variable Documentation

◆ casekind_map

const pg_wchar* const casekind_map[NCaseKind]

static

Initial value:

=
{
    /*
     * One simple-mapping table per CaseKind; casemap() selects the table
     * with casekind_map[casekind] and then indexes into it.
     */
    [CaseLower] = case_map_lower,
    [CaseTitle] = case_map_title,
    [CaseUpper] = case_map_upper,
    [CaseFold] = case_map_fold,
}

Definition at line 33 of file unicode_case.c.

Referenced by casemap().

Enumerations

Functions

Variables

Enumeration Type Documentation

◆ CaseMapResult

Function Documentation

◆ casemap()

◆ check_final_sigma()

◆ check_special_conditions()

◆ convert_case()

◆ find_case_map()

◆ unicode_casefold_simple()

◆ unicode_lowercase_simple()

◆ unicode_strfold()

◆ unicode_strlower()

◆ unicode_strtitle()

◆ unicode_strupper()

◆ unicode_titlecase_simple()

◆ unicode_uppercase_simple()

Variable Documentation

◆ casekind_map