chklocale_8c_source.html

/*-------------------------------------------------------------------------

 *

 * chklocale.c

 *      Functions for handling locale-related info

 *

 *

 * Copyright (c) 1996-2025, PostgreSQL Global Development Group

 *

 *

 * IDENTIFICATION

 *    src/port/chklocale.c

 *

 *-------------------------------------------------------------------------

 */


#ifndef FRONTEND

#include "postgres.h"

#else

#include "postgres_fe.h"

#endif


#ifndef WIN32

#include <langinfo.h>

#endif


#include "mb/pg_wchar.h"


/*

 * This table needs to recognize all the CODESET spellings for supported

 * backend encodings, as well as frontend-only encodings where possible

 * (the latter case is currently only needed for initdb to recognize

 * error situations).  On Windows, we rely on entries for codepage

 * numbers (CPnnn).

 *

 * Note that we search the table with pg_strcasecmp(), so variant

 * capitalizations don't need their own entries.

 */

/*
 * One row of the codeset lookup table: associates a codeset name as spelled
 * by the operating system with the PostgreSQL encoding ID it stands for.
 */
struct encoding_match

{

    enum pg_enc pg_enc_code;    /* PostgreSQL encoding ID for this name */

    const char *system_enc_name;    /* system's spelling; NULL ends the table */

};


/*
 * Lookup table from system codeset spellings to PostgreSQL encoding IDs.
 *
 * Entries are grouped by encoding.  Within a group we list the spellings
 * reported by the various platforms, plus the Windows "CPnnn" code page
 * spelling where one exists.  Lookups use pg_strcasecmp(), so only one
 * capitalization of each spelling is needed.  The table is terminated by
 * an entry whose system_enc_name is NULL.
 */
static const struct encoding_match encoding_match_list[] = {
    {PG_EUC_JP, "EUC-JP"},
    {PG_EUC_JP, "eucJP"},
    {PG_EUC_JP, "IBM-eucJP"},
    {PG_EUC_JP, "sdeckanji"},
    {PG_EUC_JP, "CP20932"},

    {PG_EUC_CN, "EUC-CN"},
    {PG_EUC_CN, "eucCN"},
    {PG_EUC_CN, "IBM-eucCN"},
    {PG_EUC_CN, "GB2312"},
    {PG_EUC_CN, "dechanzi"},
    {PG_EUC_CN, "CP20936"},

    {PG_EUC_KR, "EUC-KR"},
    {PG_EUC_KR, "eucKR"},
    {PG_EUC_KR, "IBM-eucKR"},
    {PG_EUC_KR, "deckorean"},
    {PG_EUC_KR, "5601"},
    {PG_EUC_KR, "CP51949"},

    {PG_EUC_TW, "EUC-TW"},
    {PG_EUC_TW, "eucTW"},
    {PG_EUC_TW, "IBM-eucTW"},
    {PG_EUC_TW, "cns11643"},
    /* No codepage for EUC-TW ? */

    {PG_UTF8, "UTF-8"},
    {PG_UTF8, "utf8"},
    {PG_UTF8, "CP65001"},

    {PG_LATIN1, "ISO-8859-1"},
    {PG_LATIN1, "ISO8859-1"},
    {PG_LATIN1, "iso88591"},
    {PG_LATIN1, "CP28591"},

    {PG_LATIN2, "ISO-8859-2"},
    {PG_LATIN2, "ISO8859-2"},
    {PG_LATIN2, "iso88592"},
    {PG_LATIN2, "CP28592"},

    {PG_LATIN3, "ISO-8859-3"},
    {PG_LATIN3, "ISO8859-3"},
    {PG_LATIN3, "iso88593"},
    {PG_LATIN3, "CP28593"},

    {PG_LATIN4, "ISO-8859-4"},
    {PG_LATIN4, "ISO8859-4"},
    {PG_LATIN4, "iso88594"},
    {PG_LATIN4, "CP28594"},

    {PG_LATIN5, "ISO-8859-9"},
    {PG_LATIN5, "ISO8859-9"},
    {PG_LATIN5, "iso88599"},
    {PG_LATIN5, "CP28599"},

    {PG_LATIN6, "ISO-8859-10"},
    {PG_LATIN6, "ISO8859-10"},
    {PG_LATIN6, "iso885910"},

    {PG_LATIN7, "ISO-8859-13"},
    {PG_LATIN7, "ISO8859-13"},
    {PG_LATIN7, "iso885913"},

    {PG_LATIN8, "ISO-8859-14"},
    {PG_LATIN8, "ISO8859-14"},
    {PG_LATIN8, "iso885914"},

    {PG_LATIN9, "ISO-8859-15"},
    {PG_LATIN9, "ISO8859-15"},
    {PG_LATIN9, "iso885915"},
    {PG_LATIN9, "CP28605"},

    {PG_LATIN10, "ISO-8859-16"},
    {PG_LATIN10, "ISO8859-16"},
    {PG_LATIN10, "iso885916"},

    {PG_KOI8R, "KOI8-R"},
    {PG_KOI8R, "CP20866"},

    {PG_KOI8U, "KOI8-U"},
    {PG_KOI8U, "CP21866"},

    {PG_WIN866, "CP866"},
    {PG_WIN874, "CP874"},
    {PG_WIN1250, "CP1250"},
    {PG_WIN1251, "CP1251"},
    {PG_WIN1251, "ansi-1251"},
    {PG_WIN1252, "CP1252"},
    {PG_WIN1253, "CP1253"},
    {PG_WIN1254, "CP1254"},
    {PG_WIN1255, "CP1255"},
    {PG_WIN1256, "CP1256"},
    {PG_WIN1257, "CP1257"},
    {PG_WIN1258, "CP1258"},

    {PG_ISO_8859_5, "ISO-8859-5"},
    {PG_ISO_8859_5, "ISO8859-5"},
    {PG_ISO_8859_5, "iso88595"},
    {PG_ISO_8859_5, "CP28595"},

    {PG_ISO_8859_6, "ISO-8859-6"},
    {PG_ISO_8859_6, "ISO8859-6"},
    {PG_ISO_8859_6, "iso88596"},
    {PG_ISO_8859_6, "CP28596"},

    {PG_ISO_8859_7, "ISO-8859-7"},
    {PG_ISO_8859_7, "ISO8859-7"},
    {PG_ISO_8859_7, "iso88597"},
    {PG_ISO_8859_7, "CP28597"},

    {PG_ISO_8859_8, "ISO-8859-8"},
    {PG_ISO_8859_8, "ISO8859-8"},
    {PG_ISO_8859_8, "iso88598"},
    {PG_ISO_8859_8, "CP28598"},

    {PG_SJIS, "SJIS"},
    {PG_SJIS, "PCK"},
    {PG_SJIS, "CP932"},
    {PG_SJIS, "SHIFT_JIS"},

    {PG_BIG5, "BIG5"},
    {PG_BIG5, "BIG5HKSCS"},
    {PG_BIG5, "Big5-HKSCS"},
    {PG_BIG5, "CP950"},

    {PG_GBK, "GBK"},
    {PG_GBK, "CP936"},

    {PG_UHC, "UHC"},
    {PG_UHC, "CP949"},

    {PG_JOHAB, "JOHAB"},
    {PG_JOHAB, "CP1361"},

    {PG_GB18030, "GB18030"},
    {PG_GB18030, "CP54936"},

    /*
     * "SHIFT_JISX0213" is the charmap name used by glibc and iconv for this
     * encoding; "SJIS_2004" is retained for backward compatibility.
     */
    {PG_SHIFT_JIS_2004, "SHIFT_JISX0213"},
    {PG_SHIFT_JIS_2004, "SJIS_2004"},

    {PG_SQL_ASCII, "US-ASCII"},

    {PG_SQL_ASCII, NULL}        /* end marker */
};


#ifdef WIN32

/*
 * On Windows, use CP<code page number> instead of CODESET.
 *
 * The locale name is first handed to GetLocaleInfoEx(), which understands
 * short locale names such as "de-DE" or "fr-FR".  If that lookup fails, or
 * if the name cannot even be converted to a wide-character string, we fall
 * back to the pre-VS-2010 approach of parsing the name manually, assuming
 * the <Language>_<Country>.<CodePage> format.
 *
 * Returns a malloc()'d string for the caller to free, or NULL if no codeset
 * could be extracted from the name or if memory allocation failed.
 */
static char *
win32_get_codeset(const char *ctype)
{
    char       *r = NULL;
    const char *codepage;
    uint32      cp;
    WCHAR       wctype[LOCALE_NAME_MAX_LENGTH];
    int         wlen;

    memset(wctype, 0, sizeof(wctype));

    /*
     * MultiByteToWideChar() returns 0 on failure, for instance when ctype
     * does not fit into wctype.  In that case the buffer contents are not
     * trustworthy (and not necessarily null-terminated), so we must not pass
     * them on to GetLocaleInfoEx().
     */
    wlen = MultiByteToWideChar(CP_ACP, 0, ctype, -1,
                               wctype, LOCALE_NAME_MAX_LENGTH);

    if (wlen > 0 &&
        GetLocaleInfoEx(wctype,
                        LOCALE_IDEFAULTANSICODEPAGE | LOCALE_RETURN_NUMBER,
                        (LPWSTR) &cp, sizeof(cp) / sizeof(WCHAR)) > 0)
    {
        r = malloc(16);         /* excess */
        if (r != NULL)
        {
            /*
             * If the return value is CP_ACP that means no ANSI code page is
             * available, so only Unicode can be used for the locale.
             */
            if (cp == CP_ACP)
                strcpy(r, "utf8");
            else
                sprintf(r, "CP%u", cp);
        }
    }
    else
    {
        /*
         * Locale format on Win32 is <Language>_<Country>.<CodePage>.  For
         * example, English_United States.1252.  If we see digits after the
         * last dot, assume it's a codepage number.  Otherwise, we might be
         * dealing with a Unix-style locale string; Windows' setlocale() will
         * take those even though GetLocaleInfoEx() won't, so we end up here.
         * In that case, just return what's after the last dot and hope we can
         * find it in our table.
         */
        codepage = strrchr(ctype, '.');
        if (codepage != NULL)
        {
            size_t      ln;

            codepage++;         /* skip the dot itself */
            ln = strlen(codepage);

            /* room for an optional "CP" prefix plus the terminating null */
            r = malloc(ln + 3);
            if (r != NULL)
            {
                if (strspn(codepage, "0123456789") == ln)
                    sprintf(r, "CP%s", codepage);
                else
                    strcpy(r, codepage);
            }
        }
    }

    return r;
}


#ifndef FRONTEND

/*
 * Map a Windows code page number to a PostgreSQL encoding ID.
 *
 * The number is rendered in "CPnnn" form and looked up case-insensitively in
 * encoding_match_list.  If there is no matching entry, a WARNING is emitted
 * and -1 is returned.
 */
int
pg_codepage_to_encoding(UINT cp)
{
    const struct encoding_match *entry;
    char        codeset[16];

    /* Build the table spelling for this code page */
    sprintf(codeset, "CP%u", cp);

    /* Walk the table up to its NULL-name end marker */
    for (entry = encoding_match_list; entry->system_enc_name != NULL; entry++)
    {
        if (pg_strcasecmp(codeset, entry->system_enc_name) == 0)
            return entry->pg_enc_code;
    }

    ereport(WARNING,
            (errmsg("could not determine encoding for codeset \"%s\"", codeset)));

    return -1;
}

#endif

#endif                          /* WIN32 */


/*
 * Given a setting for LC_CTYPE, return the Postgres ID of the associated
 * encoding, if we can determine it.  Return -1 if we can't determine it.
 *
 * Pass in NULL to get the encoding for the current locale setting.
 * Pass "" to get the encoding selected by the server's environment.
 *
 * If the result is PG_SQL_ASCII, callers should treat it as being compatible
 * with any desired encoding.
 *
 * If running in the backend and write_message is false, this function must
 * cope with the possibility that elog() and palloc() are not yet usable.
 * Accordingly, the codeset string is managed with strdup()/malloc() and
 * free(), and nothing is reported unless write_message is true.
 */
int
pg_get_encoding_from_locale(const char *ctype, bool write_message)
{
    char       *sys;
    int         i;

#ifndef WIN32
    locale_t    loc;
#endif

    /* Get the CODESET property, and also LC_CTYPE if not passed in */
    if (!ctype)
    {
        ctype = setlocale(LC_CTYPE, NULL);
        if (!ctype)
            return -1;          /* setlocale() broken? */
    }

    /* If locale is C or POSIX, we can allow all encodings */
    if (pg_strcasecmp(ctype, "C") == 0 ||
        pg_strcasecmp(ctype, "POSIX") == 0)
        return PG_SQL_ASCII;

#ifndef WIN32
    loc = newlocale(LC_CTYPE_MASK, ctype, (locale_t) 0);
    if (loc == (locale_t) 0)
        return -1;              /* bogus ctype passed in? */

    /*
     * Take a private copy of the codeset name before releasing the locale
     * object it was obtained from.
     */
    sys = nl_langinfo_l(CODESET, loc);
    if (sys)
        sys = strdup(sys);

    freelocale(loc);
#else
    sys = win32_get_codeset(ctype);
#endif

    if (!sys)
        return -1;              /* no codeset name available, or out of
                                 * memory */

    /* Check the table */
    for (i = 0; encoding_match_list[i].system_enc_name; i++)
    {
        if (pg_strcasecmp(sys, encoding_match_list[i].system_enc_name) == 0)
        {
            free(sys);
            return encoding_match_list[i].pg_enc_code;
        }
    }

    /* Special-case kluges for particular platforms go here */

#ifdef __darwin__

    /*
     * Current macOS has many locales that report an empty string for CODESET,
     * but they all seem to actually use UTF-8.
     */
    if (strlen(sys) == 0)
    {
        free(sys);
        return PG_UTF8;
    }
#endif

    /*
     * We print a warning if we got a CODESET string but couldn't recognize
     * it.  This means we need another entry in the table.
     */
    if (write_message)
    {
#ifdef FRONTEND
        fprintf(stderr, _("could not determine encoding for locale \"%s\": codeset is \"%s\""),
                ctype, sys);
        /* keep newline separate so there's only one translatable string */
        fputc('\n', stderr);
#else
        ereport(WARNING,
                (errmsg("could not determine encoding for locale \"%s\": codeset is \"%s\"",
                        ctype, sys)));
#endif
    }

    free(sys);
    return -1;
}

uint32
uint32_t uint32
Definition: c.h:502

encoding_match_list
static const struct encoding_match encoding_match_list[]
Definition: chklocale.c:45

pg_get_encoding_from_locale
int pg_get_encoding_from_locale(const char *ctype, bool write_message)
Definition: chklocale.c:301

fprintf
#define fprintf(file, fmt, msg)
Definition: cubescan.l:21

errmsg
int errmsg(const char *fmt,...)
Definition: elog.c:1071

_
#define _(x)
Definition: elog.c:91

WARNING
#define WARNING
Definition: elog.h:36

ereport
#define ereport(elevel,...)
Definition: elog.h:149

free
#define free(a)
Definition: header.h:65

malloc
#define malloc(a)
Definition: header.h:50

i
int i
Definition: isn.c:77

pg_wchar.h

pg_enc
pg_enc
Definition: pg_wchar.h:225

PG_WIN1254
@ PG_WIN1254
Definition: pg_wchar.h:257

PG_LATIN4
@ PG_LATIN4
Definition: pg_wchar.h:237

PG_LATIN9
@ PG_LATIN9
Definition: pg_wchar.h:242

PG_JOHAB
@ PG_JOHAB
Definition: pg_wchar.h:269

PG_GB18030
@ PG_GB18030
Definition: pg_wchar.h:268

PG_SQL_ASCII
@ PG_SQL_ASCII
Definition: pg_wchar.h:226

PG_KOI8R
@ PG_KOI8R
Definition: pg_wchar.h:248

PG_ISO_8859_6
@ PG_ISO_8859_6
Definition: pg_wchar.h:252

PG_WIN1253
@ PG_WIN1253
Definition: pg_wchar.h:256

PG_KOI8U
@ PG_KOI8U
Definition: pg_wchar.h:260

PG_LATIN6
@ PG_LATIN6
Definition: pg_wchar.h:239

PG_LATIN5
@ PG_LATIN5
Definition: pg_wchar.h:238

PG_EUC_CN
@ PG_EUC_CN
Definition: pg_wchar.h:228

PG_UHC
@ PG_UHC
Definition: pg_wchar.h:267

PG_LATIN2
@ PG_LATIN2
Definition: pg_wchar.h:235

PG_ISO_8859_5
@ PG_ISO_8859_5
Definition: pg_wchar.h:251

PG_LATIN10
@ PG_LATIN10
Definition: pg_wchar.h:243

PG_WIN1250
@ PG_WIN1250
Definition: pg_wchar.h:255

PG_ISO_8859_7
@ PG_ISO_8859_7
Definition: pg_wchar.h:253

PG_SJIS
@ PG_SJIS
Definition: pg_wchar.h:264

PG_LATIN8
@ PG_LATIN8
Definition: pg_wchar.h:241

PG_EUC_JP
@ PG_EUC_JP
Definition: pg_wchar.h:227

PG_GBK
@ PG_GBK
Definition: pg_wchar.h:266

PG_LATIN3
@ PG_LATIN3
Definition: pg_wchar.h:236

PG_WIN1256
@ PG_WIN1256
Definition: pg_wchar.h:244

PG_LATIN1
@ PG_LATIN1
Definition: pg_wchar.h:234

PG_EUC_TW
@ PG_EUC_TW
Definition: pg_wchar.h:230

PG_WIN1258
@ PG_WIN1258
Definition: pg_wchar.h:245

PG_SHIFT_JIS_2004
@ PG_SHIFT_JIS_2004
Definition: pg_wchar.h:270

PG_WIN1252
@ PG_WIN1252
Definition: pg_wchar.h:250

PG_LATIN7
@ PG_LATIN7
Definition: pg_wchar.h:240

PG_UTF8
@ PG_UTF8
Definition: pg_wchar.h:232

PG_WIN1255
@ PG_WIN1255
Definition: pg_wchar.h:258

PG_WIN1257
@ PG_WIN1257
Definition: pg_wchar.h:259

PG_WIN1251
@ PG_WIN1251
Definition: pg_wchar.h:249

PG_EUC_KR
@ PG_EUC_KR
Definition: pg_wchar.h:229

PG_WIN866
@ PG_WIN866
Definition: pg_wchar.h:246

PG_ISO_8859_8
@ PG_ISO_8859_8
Definition: pg_wchar.h:254

PG_WIN874
@ PG_WIN874
Definition: pg_wchar.h:247

PG_BIG5
@ PG_BIG5
Definition: pg_wchar.h:265

pg_strcasecmp
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36

sprintf
#define sprintf
Definition: port.h:241

postgres.h

postgres_fe.h

encoding_match
Definition: chklocale.c:40

encoding_match::pg_enc_code
enum pg_enc pg_enc_code
Definition: chklocale.c:41

encoding_match::system_enc_name
const char * system_enc_name
Definition: chklocale.c:42

locale_t
#define locale_t
Definition: win32_port.h:432

setlocale
#define setlocale(a, b)
Definition: win32_port.h:475