category__test_8c_source.html

/*-------------------------------------------------------------------------

 * category_test.c

 *      Program to test Unicode general category and character properties.

 *

 * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group

 *

 * IDENTIFICATION

 *    src/common/unicode/category_test.c

 *

 *-------------------------------------------------------------------------

 */

#include "postgres_fe.h"


#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <wctype.h>


#ifdef USE_ICU

#include <unicode/uchar.h>

#endif


#include "common/unicode_category.h"

#include "common/unicode_version.h"


/*
 * Unicode version of the Postgres tables, as an integer produced by
 * parse_unicode_version() (major * 100 + minor). Assigned in main() from
 * PG_UNICODE_VERSION.
 */
static int  pg_unicode_version = 0;

#ifdef USE_ICU

/*
 * Unicode version reported by ICU's U_UNICODE_VERSION, in the same integer
 * encoding. Assigned in main(); only present when building with ICU.
 */
static int  icu_unicode_version = 0;

#endif


/*
 * Parse a "major.minor" Unicode version string into an integer for easy
 * comparison, encoded as major * 100 + minor (e.g. "15.1" becomes 1501).
 *
 * Anything following the minor number is ignored.
 *
 * If the string does not begin with two dot-separated integers, or either
 * component is negative, or the minor number does not fit in two decimal
 * digits, a message is printed and the program exits with status 1. The
 * check is made unconditionally (rather than with Assert()) so that builds
 * without assertions never compute a result from unset variables.
 */
static int
parse_unicode_version(const char *version)
{
    int         major = 0;
    int         minor = 0;

    if (sscanf(version, "%d.%d", &major, &minor) != 2 ||
        major < 0 || minor < 0 || minor >= 100)
    {
        printf("category_test: could not parse Unicode version \"%s\"\n",
               version);
        exit(1);
    }

    return major * 100 + minor;
}


#ifdef USE_ICU

/*
 * Test Postgres Unicode tables by comparing with ICU, for every codepoint
 * from 0 through 0x10ffff. For each codepoint the following are compared,
 * in this order:
 *
 *  - the General Category;
 *  - the properties Alphabetic, Lowercase, Uppercase, Cased, Case_Ignorable,
 *    White_Space, Hex_Digit, and Join_Control;
 *  - the character classes alpha, lower, upper, punct, digit, xdigit, alnum,
 *    space, blank, cntrl, graph, and print.
 *
 * On the first mismatch the Postgres and ICU results are printed and the
 * program exits with status 1. If no mismatch is found, the number of
 * skipped codepoints (if any) and of successfully compared assigned
 * codepoints is printed.
 */
static void
icu_test(void)
{
    int         successful = 0;
    int         pg_skipped_codepoints = 0;
    int         icu_skipped_codepoints = 0;

    for (pg_wchar code = 0; code <= 0x10ffff; code++)
    {
        /*
         * The ICU category value is compared directly against, and printed
         * with, the pg_unicode_category values and helper functions.
         */
        uint8_t     pg_category = unicode_category(code);
        uint8_t     icu_category = u_charType(code);

        /* Property tests: Postgres side */
        bool        pg_prop_alphabetic = pg_u_prop_alphabetic(code);
        bool        pg_prop_lowercase = pg_u_prop_lowercase(code);
        bool        pg_prop_uppercase = pg_u_prop_uppercase(code);
        bool        pg_prop_cased = pg_u_prop_cased(code);
        bool        pg_prop_case_ignorable = pg_u_prop_case_ignorable(code);
        bool        pg_prop_white_space = pg_u_prop_white_space(code);
        bool        pg_prop_hex_digit = pg_u_prop_hex_digit(code);
        bool        pg_prop_join_control = pg_u_prop_join_control(code);

        /* Property tests: ICU side */
        bool        icu_prop_alphabetic = u_hasBinaryProperty(code, UCHAR_ALPHABETIC);
        bool        icu_prop_lowercase = u_hasBinaryProperty(code, UCHAR_LOWERCASE);
        bool        icu_prop_uppercase = u_hasBinaryProperty(code, UCHAR_UPPERCASE);
        bool        icu_prop_cased = u_hasBinaryProperty(code, UCHAR_CASED);
        bool        icu_prop_case_ignorable = u_hasBinaryProperty(code, UCHAR_CASE_IGNORABLE);
        bool        icu_prop_white_space = u_hasBinaryProperty(code, UCHAR_WHITE_SPACE);
        bool        icu_prop_hex_digit = u_hasBinaryProperty(code, UCHAR_HEX_DIGIT);
        bool        icu_prop_join_control = u_hasBinaryProperty(code, UCHAR_JOIN_CONTROL);

        /*
         * Compare with ICU for character classes using:
         *
         * https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/uchar_8h.html#details
         *
         * which describes how to use ICU to test for membership in regex
         * character classes.
         *
         * NB: the document suggests testing for some properties such as
         * UCHAR_POSIX_ALNUM, but that doesn't mean that we're testing for the
         * "POSIX Compatible" character classes. Accordingly, the Postgres
         * functions that take a "posix" argument are called with false.
         *
         * The Postgres-side locals are prefixed pg_class_ so that they do
         * not reuse the names of the standard <ctype.h> functions.
         */
        bool        pg_class_alpha = pg_u_isalpha(code);
        bool        pg_class_lower = pg_u_islower(code);
        bool        pg_class_upper = pg_u_isupper(code);
        bool        pg_class_punct = pg_u_ispunct(code, false);
        bool        pg_class_digit = pg_u_isdigit(code, false);
        bool        pg_class_xdigit = pg_u_isxdigit(code, false);
        bool        pg_class_alnum = pg_u_isalnum(code, false);
        bool        pg_class_space = pg_u_isspace(code);
        bool        pg_class_blank = pg_u_isblank(code);
        bool        pg_class_cntrl = pg_u_iscntrl(code);
        bool        pg_class_graph = pg_u_isgraph(code);
        bool        pg_class_print = pg_u_isprint(code);

        bool        icu_isalpha = u_isUAlphabetic(code);
        bool        icu_islower = u_isULowercase(code);
        bool        icu_isupper = u_isUUppercase(code);
        bool        icu_ispunct = u_ispunct(code);
        bool        icu_isdigit = u_isdigit(code);
        bool        icu_isxdigit = u_hasBinaryProperty(code,
                                                       UCHAR_POSIX_XDIGIT);
        bool        icu_isalnum = u_hasBinaryProperty(code,
                                                      UCHAR_POSIX_ALNUM);
        bool        icu_isspace = u_isUWhiteSpace(code);
        bool        icu_isblank = u_isblank(code);

        /* cntrl on the ICU side is derived from the general category */
        bool        icu_iscntrl = icu_category == PG_U_CONTROL;
        bool        icu_isgraph = u_hasBinaryProperty(code,
                                                      UCHAR_POSIX_GRAPH);
        bool        icu_isprint = u_hasBinaryProperty(code,
                                                      UCHAR_POSIX_PRINT);

        /*
         * A version mismatch means that some assigned codepoints in the newer
         * version may be unassigned in the older version. That's OK, though
         * the test will not cover those codepoints marked unassigned in the
         * older version (that is, it will no longer be an exhaustive test).
         */
        if (pg_category == PG_U_UNASSIGNED &&
            icu_category != PG_U_UNASSIGNED &&
            pg_unicode_version < icu_unicode_version)
        {
            pg_skipped_codepoints++;
            continue;
        }

        if (icu_category == PG_U_UNASSIGNED &&
            pg_category != PG_U_UNASSIGNED &&
            icu_unicode_version < pg_unicode_version)
        {
            icu_skipped_codepoints++;
            continue;
        }

        /* General Category */
        if (pg_category != icu_category)
        {
            printf("category_test: FAILURE for codepoint 0x%06x\n", code);
            printf("category_test: Postgres category:   %02d %s %s\n", pg_category,
                   unicode_category_abbrev(pg_category),
                   unicode_category_string(pg_category));
            printf("category_test: ICU category:        %02d %s %s\n", icu_category,
                   unicode_category_abbrev(icu_category),
                   unicode_category_string(icu_category));
            printf("\n");
            exit(1);
        }

        /* Binary properties */
        if (pg_prop_alphabetic != icu_prop_alphabetic ||
            pg_prop_lowercase != icu_prop_lowercase ||
            pg_prop_uppercase != icu_prop_uppercase ||
            pg_prop_cased != icu_prop_cased ||
            pg_prop_case_ignorable != icu_prop_case_ignorable ||
            pg_prop_white_space != icu_prop_white_space ||
            pg_prop_hex_digit != icu_prop_hex_digit ||
            pg_prop_join_control != icu_prop_join_control)
        {
            printf("category_test: FAILURE for codepoint 0x%06x\n", code);
            printf("category_test: Postgres property    alphabetic/lowercase/uppercase/cased/case_ignorable/white_space/hex_digit/join_control: %d/%d/%d/%d/%d/%d/%d/%d\n",
                   pg_prop_alphabetic, pg_prop_lowercase, pg_prop_uppercase,
                   pg_prop_cased, pg_prop_case_ignorable,
                   pg_prop_white_space, pg_prop_hex_digit, pg_prop_join_control);
            printf("category_test: ICU  property    alphabetic/lowercase/uppercase/cased/case_ignorable/white_space/hex_digit/join_control: %d/%d/%d/%d/%d/%d/%d/%d\n",
                   icu_prop_alphabetic, icu_prop_lowercase, icu_prop_uppercase,
                   icu_prop_cased, icu_prop_case_ignorable,
                   icu_prop_white_space, icu_prop_hex_digit, icu_prop_join_control);
            printf("\n");
            exit(1);
        }

        /* Character classes */
        if (pg_class_alpha != icu_isalpha ||
            pg_class_lower != icu_islower ||
            pg_class_upper != icu_isupper ||
            pg_class_punct != icu_ispunct ||
            pg_class_digit != icu_isdigit ||
            pg_class_xdigit != icu_isxdigit ||
            pg_class_alnum != icu_isalnum ||
            pg_class_space != icu_isspace ||
            pg_class_blank != icu_isblank ||
            pg_class_cntrl != icu_iscntrl ||
            pg_class_graph != icu_isgraph ||
            pg_class_print != icu_isprint)
        {
            printf("category_test: FAILURE for codepoint 0x%06x\n", code);
            printf("category_test: Postgres class   alpha/lower/upper/punct/digit/xdigit/alnum/space/blank/cntrl/graph/print: %d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d\n",
                   pg_class_alpha, pg_class_lower, pg_class_upper, pg_class_punct, pg_class_digit, pg_class_xdigit, pg_class_alnum, pg_class_space, pg_class_blank, pg_class_cntrl, pg_class_graph, pg_class_print);
            printf("category_test: ICU class    alpha/lower/upper/punct/digit/xdigit/alnum/space/blank/cntrl/graph/print: %d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d\n",
                   icu_isalpha, icu_islower, icu_isupper, icu_ispunct, icu_isdigit, icu_isxdigit, icu_isalnum, icu_isspace, icu_isblank, icu_iscntrl, icu_isgraph, icu_isprint);
            printf("\n");
            exit(1);
        }

        /* Only assigned codepoints count toward the success total */
        if (pg_category != PG_U_UNASSIGNED)
            successful++;
    }

    if (pg_skipped_codepoints > 0)
        printf("category_test: skipped %d codepoints unassigned in Postgres due to Unicode version mismatch\n",
               pg_skipped_codepoints);
    if (icu_skipped_codepoints > 0)
        printf("category_test: skipped %d codepoints unassigned in ICU due to Unicode version mismatch\n",
               icu_skipped_codepoints);

    printf("category_test: ICU test: %d codepoints successful\n", successful);
}

#endif


/*
 * Entry point. Records and prints the Unicode version of the Postgres
 * tables; when built with ICU, does the same for the version ICU reports
 * and then runs the comparison against ICU. Without ICU, only a notice that
 * the test is skipped is printed.
 *
 * The command-line arguments are not used.
 */
int
main(int argc, char **argv)
{
    const char *pg_version_str = PG_UNICODE_VERSION;
#ifdef USE_ICU
    const char *icu_version_str = U_UNICODE_VERSION;
#endif

    pg_unicode_version = parse_unicode_version(pg_version_str);
    printf("category_test: Postgres Unicode version:\t%s\n", pg_version_str);

#ifdef USE_ICU
    icu_unicode_version = parse_unicode_version(icu_version_str);
    printf("category_test: ICU Unicode version:\t\t%s\n", icu_version_str);

    icu_test();
#else
    printf("category_test: ICU not available; skipping\n");
#endif

    return 0;
}

PG_USED_FOR_ASSERTS_ONLY
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:224

pg_unicode_version
static int pg_unicode_version
Definition: category_test.c:26

main
int main(int argc, char **argv)
Definition: category_test.c:223

parse_unicode_version
static int parse_unicode_version(const char *version)
Definition: category_test.c:35

Assert
Assert(PointerIsAligned(start, uint64))

pg_wchar
unsigned int pg_wchar
Definition: mbprint.c:31

printf
#define printf(...)
Definition: port.h:245

postgres_fe.h

string.h

pg_u_prop_uppercase
bool pg_u_prop_uppercase(pg_wchar code)
Definition: unicode_category.c:133

pg_u_isspace
bool pg_u_isspace(pg_wchar code)
Definition: unicode_category.c:311

pg_u_isxdigit
bool pg_u_isxdigit(pg_wchar code, bool posix)
Definition: unicode_category.c:317

pg_u_ispunct
bool pg_u_ispunct(pg_wchar code, bool posix)
Definition: unicode_category.c:290

unicode_category_string
const char * unicode_category_string(pg_unicode_category category)
Definition: unicode_category.c:332

pg_u_isprint
bool pg_u_isprint(pg_wchar code)
Definition: unicode_category.c:279

pg_u_islower
bool pg_u_islower(pg_wchar code)
Definition: unicode_category.c:249

unicode_category_abbrev
const char * unicode_category_abbrev(pg_unicode_category category)
Definition: unicode_category.c:406

pg_u_prop_white_space
bool pg_u_prop_white_space(pg_wchar code)
Definition: unicode_category.c:170

pg_u_isblank
bool pg_u_isblank(pg_wchar code)
Definition: unicode_category.c:255

pg_u_prop_cased
bool pg_u_prop_cased(pg_wchar code)
Definition: unicode_category.c:144

pg_u_isalpha
bool pg_u_isalpha(pg_wchar code)
Definition: unicode_category.c:220

pg_u_prop_lowercase
bool pg_u_prop_lowercase(pg_wchar code)
Definition: unicode_category.c:122

pg_u_isalnum
bool pg_u_isalnum(pg_wchar code, bool posix)
Definition: unicode_category.c:226

pg_u_isupper
bool pg_u_isupper(pg_wchar code)
Definition: unicode_category.c:243

pg_u_prop_alphabetic
bool pg_u_prop_alphabetic(pg_wchar code)
Definition: unicode_category.c:111

pg_u_isdigit
bool pg_u_isdigit(pg_wchar code, bool posix)
Definition: unicode_category.c:211

pg_u_iscntrl
bool pg_u_iscntrl(pg_wchar code)
Definition: unicode_category.c:262

pg_u_prop_join_control
bool pg_u_prop_join_control(pg_wchar code)
Definition: unicode_category.c:192

pg_u_isgraph
bool pg_u_isgraph(pg_wchar code)
Definition: unicode_category.c:268

pg_u_prop_case_ignorable
bool pg_u_prop_case_ignorable(pg_wchar code)
Definition: unicode_category.c:159

pg_u_prop_hex_digit
bool pg_u_prop_hex_digit(pg_wchar code)
Definition: unicode_category.c:181

unicode_category
pg_unicode_category unicode_category(pg_wchar code)
Definition: unicode_category.c:85

unicode_category.h

PG_U_CONTROL
@ PG_U_CONTROL
Definition: unicode_category.h:47

PG_U_UNASSIGNED
@ PG_U_UNASSIGNED
Definition: unicode_category.h:32

unicode_version.h

PG_UNICODE_VERSION
#define PG_UNICODE_VERSION
Definition: unicode_version.h:14

icu_unicode_version
Datum icu_unicode_version(PG_FUNCTION_ARGS)
Definition: varlena.c:6504