PostgreSQL Source Code  git master
chklocale.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * chklocale.c
4  * Functions for handling locale-related info
5  *
6  *
7  * Copyright (c) 1996-2024, PostgreSQL Global Development Group
8  *
9  *
10  * IDENTIFICATION
11  * src/port/chklocale.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 #ifndef FRONTEND
17 #include "postgres.h"
18 #else
19 #include "postgres_fe.h"
20 #endif
21 
22 #ifndef WIN32
23 #include <langinfo.h>
24 #endif
25 
26 #ifdef LOCALE_T_IN_XLOCALE
27 #include <xlocale.h>
28 #endif
29 
30 #include "mb/pg_wchar.h"
31 
32 
33 /*
34  * This table needs to recognize all the CODESET spellings for supported
35  * backend encodings, as well as frontend-only encodings where possible
36  * (the latter case is currently only needed for initdb to recognize
37  * error situations). On Windows, we rely on entries for codepage
38  * numbers (CPnnn).
39  *
40  * Note that we search the table with pg_strcasecmp(), so variant
41  * capitalizations don't need their own entries.
42  */
44 {
45  enum pg_enc pg_enc_code;
46  const char *system_enc_name;
47 };
48 
49 static const struct encoding_match encoding_match_list[] = {
50  {PG_EUC_JP, "EUC-JP"},
51  {PG_EUC_JP, "eucJP"},
52  {PG_EUC_JP, "IBM-eucJP"},
53  {PG_EUC_JP, "sdeckanji"},
54  {PG_EUC_JP, "CP20932"},
55 
56  {PG_EUC_CN, "EUC-CN"},
57  {PG_EUC_CN, "eucCN"},
58  {PG_EUC_CN, "IBM-eucCN"},
59  {PG_EUC_CN, "GB2312"},
60  {PG_EUC_CN, "dechanzi"},
61  {PG_EUC_CN, "CP20936"},
62 
63  {PG_EUC_KR, "EUC-KR"},
64  {PG_EUC_KR, "eucKR"},
65  {PG_EUC_KR, "IBM-eucKR"},
66  {PG_EUC_KR, "deckorean"},
67  {PG_EUC_KR, "5601"},
68  {PG_EUC_KR, "CP51949"},
69 
70  {PG_EUC_TW, "EUC-TW"},
71  {PG_EUC_TW, "eucTW"},
72  {PG_EUC_TW, "IBM-eucTW"},
73  {PG_EUC_TW, "cns11643"},
74  /* No codepage for EUC-TW ? */
75 
76  {PG_UTF8, "UTF-8"},
77  {PG_UTF8, "utf8"},
78  {PG_UTF8, "CP65001"},
79 
80  {PG_LATIN1, "ISO-8859-1"},
81  {PG_LATIN1, "ISO8859-1"},
82  {PG_LATIN1, "iso88591"},
83  {PG_LATIN1, "CP28591"},
84 
85  {PG_LATIN2, "ISO-8859-2"},
86  {PG_LATIN2, "ISO8859-2"},
87  {PG_LATIN2, "iso88592"},
88  {PG_LATIN2, "CP28592"},
89 
90  {PG_LATIN3, "ISO-8859-3"},
91  {PG_LATIN3, "ISO8859-3"},
92  {PG_LATIN3, "iso88593"},
93  {PG_LATIN3, "CP28593"},
94 
95  {PG_LATIN4, "ISO-8859-4"},
96  {PG_LATIN4, "ISO8859-4"},
97  {PG_LATIN4, "iso88594"},
98  {PG_LATIN4, "CP28594"},
99 
100  {PG_LATIN5, "ISO-8859-9"},
101  {PG_LATIN5, "ISO8859-9"},
102  {PG_LATIN5, "iso88599"},
103  {PG_LATIN5, "CP28599"},
104 
105  {PG_LATIN6, "ISO-8859-10"},
106  {PG_LATIN6, "ISO8859-10"},
107  {PG_LATIN6, "iso885910"},
108 
109  {PG_LATIN7, "ISO-8859-13"},
110  {PG_LATIN7, "ISO8859-13"},
111  {PG_LATIN7, "iso885913"},
112 
113  {PG_LATIN8, "ISO-8859-14"},
114  {PG_LATIN8, "ISO8859-14"},
115  {PG_LATIN8, "iso885914"},
116 
117  {PG_LATIN9, "ISO-8859-15"},
118  {PG_LATIN9, "ISO8859-15"},
119  {PG_LATIN9, "iso885915"},
120  {PG_LATIN9, "CP28605"},
121 
122  {PG_LATIN10, "ISO-8859-16"},
123  {PG_LATIN10, "ISO8859-16"},
124  {PG_LATIN10, "iso885916"},
125 
126  {PG_KOI8R, "KOI8-R"},
127  {PG_KOI8R, "CP20866"},
128 
129  {PG_KOI8U, "KOI8-U"},
130  {PG_KOI8U, "CP21866"},
131 
132  {PG_WIN866, "CP866"},
133  {PG_WIN874, "CP874"},
134  {PG_WIN1250, "CP1250"},
135  {PG_WIN1251, "CP1251"},
136  {PG_WIN1251, "ansi-1251"},
137  {PG_WIN1252, "CP1252"},
138  {PG_WIN1253, "CP1253"},
139  {PG_WIN1254, "CP1254"},
140  {PG_WIN1255, "CP1255"},
141  {PG_WIN1256, "CP1256"},
142  {PG_WIN1257, "CP1257"},
143  {PG_WIN1258, "CP1258"},
144 
145  {PG_ISO_8859_5, "ISO-8859-5"},
146  {PG_ISO_8859_5, "ISO8859-5"},
147  {PG_ISO_8859_5, "iso88595"},
148  {PG_ISO_8859_5, "CP28595"},
149 
150  {PG_ISO_8859_6, "ISO-8859-6"},
151  {PG_ISO_8859_6, "ISO8859-6"},
152  {PG_ISO_8859_6, "iso88596"},
153  {PG_ISO_8859_6, "CP28596"},
154 
155  {PG_ISO_8859_7, "ISO-8859-7"},
156  {PG_ISO_8859_7, "ISO8859-7"},
157  {PG_ISO_8859_7, "iso88597"},
158  {PG_ISO_8859_7, "CP28597"},
159 
160  {PG_ISO_8859_8, "ISO-8859-8"},
161  {PG_ISO_8859_8, "ISO8859-8"},
162  {PG_ISO_8859_8, "iso88598"},
163  {PG_ISO_8859_8, "CP28598"},
164 
165  {PG_SJIS, "SJIS"},
166  {PG_SJIS, "PCK"},
167  {PG_SJIS, "CP932"},
168  {PG_SJIS, "SHIFT_JIS"},
169 
170  {PG_BIG5, "BIG5"},
171  {PG_BIG5, "BIG5HKSCS"},
172  {PG_BIG5, "Big5-HKSCS"},
173  {PG_BIG5, "CP950"},
174 
175  {PG_GBK, "GBK"},
176  {PG_GBK, "CP936"},
177 
178  {PG_UHC, "UHC"},
179  {PG_UHC, "CP949"},
180 
181  {PG_JOHAB, "JOHAB"},
182  {PG_JOHAB, "CP1361"},
183 
184  {PG_GB18030, "GB18030"},
185  {PG_GB18030, "CP54936"},
186 
187  {PG_SHIFT_JIS_2004, "SJIS_2004"},
188 
189  {PG_SQL_ASCII, "US-ASCII"},
190 
191  {PG_SQL_ASCII, NULL} /* end marker */
192 };
193 
194 #ifdef WIN32
195 /*
196  * On Windows, use CP<code page number> instead of CODESET.
197  *
198  * This routine uses GetLocaleInfoEx() to parse short locale names like
199  * "de-DE", "fr-FR", etc. If those cannot be parsed correctly process falls
200  * back to the pre-VS-2010 manual parsing done with using
201  * <Language>_<Country>.<CodePage> as a base.
202  *
203  * Returns a malloc()'d string for the caller to free.
204  */
205 static char *
206 win32_get_codeset(const char *ctype)
207 {
208  char *r = NULL;
209  char *codepage;
210  uint32 cp;
211  WCHAR wctype[LOCALE_NAME_MAX_LENGTH];
212 
213  memset(wctype, 0, sizeof(wctype));
214  MultiByteToWideChar(CP_ACP, 0, ctype, -1, wctype, LOCALE_NAME_MAX_LENGTH);
215 
216  if (GetLocaleInfoEx(wctype,
217  LOCALE_IDEFAULTANSICODEPAGE | LOCALE_RETURN_NUMBER,
218  (LPWSTR) &cp, sizeof(cp) / sizeof(WCHAR)) > 0)
219  {
220  r = malloc(16); /* excess */
221  if (r != NULL)
222  {
223  /*
224  * If the return value is CP_ACP that means no ANSI code page is
225  * available, so only Unicode can be used for the locale.
226  */
227  if (cp == CP_ACP)
228  strcpy(r, "utf8");
229  else
230  sprintf(r, "CP%u", cp);
231  }
232  }
233  else
234  {
235  /*
236  * Locale format on Win32 is <Language>_<Country>.<CodePage>. For
237  * example, English_United States.1252. If we see digits after the
238  * last dot, assume it's a codepage number. Otherwise, we might be
239  * dealing with a Unix-style locale string; Windows' setlocale() will
240  * take those even though GetLocaleInfoEx() won't, so we end up here.
241  * In that case, just return what's after the last dot and hope we can
242  * find it in our table.
243  */
244  codepage = strrchr(ctype, '.');
245  if (codepage != NULL)
246  {
247  size_t ln;
248 
249  codepage++;
250  ln = strlen(codepage);
251  r = malloc(ln + 3);
252  if (r != NULL)
253  {
254  if (strspn(codepage, "0123456789") == ln)
255  sprintf(r, "CP%s", codepage);
256  else
257  strcpy(r, codepage);
258  }
259  }
260  }
261 
262  return r;
263 }
264 
265 #ifndef FRONTEND
266 /*
267  * Given a Windows code page identifier, find the corresponding PostgreSQL
268  * encoding. Issue a warning and return -1 if none found.
269  */
270 int
271 pg_codepage_to_encoding(UINT cp)
272 {
273  char sys[16];
274  int i;
275 
276  sprintf(sys, "CP%u", cp);
277 
278  /* Check the table */
279  for (i = 0; encoding_match_list[i].system_enc_name; i++)
282 
284  (errmsg("could not determine encoding for codeset \"%s\"", sys)));
285 
286  return -1;
287 }
288 #endif
289 #endif /* WIN32 */
290 
291 /*
292  * Given a setting for LC_CTYPE, return the Postgres ID of the associated
293  * encoding, if we can determine it. Return -1 if we can't determine it.
294  *
295  * Pass in NULL to get the encoding for the current locale setting.
296  * Pass "" to get the encoding selected by the server's environment.
297  *
298  * If the result is PG_SQL_ASCII, callers should treat it as being compatible
299  * with any desired encoding.
300  *
301  * If running in the backend and write_message is false, this function must
302  * cope with the possibility that elog() and palloc() are not yet usable.
303  */
304 int
305 pg_get_encoding_from_locale(const char *ctype, bool write_message)
306 {
307  char *sys;
308  int i;
309 
310 #ifndef WIN32
311  locale_t loc;
312 #endif
313 
314  /* Get the CODESET property, and also LC_CTYPE if not passed in */
315  if (!ctype)
316  ctype = setlocale(LC_CTYPE, NULL);
317 
318 
319  /* If locale is C or POSIX, we can allow all encodings */
320  if (pg_strcasecmp(ctype, "C") == 0 ||
321  pg_strcasecmp(ctype, "POSIX") == 0)
322  return PG_SQL_ASCII;
323 
324 
325 #ifndef WIN32
326  loc = newlocale(LC_CTYPE_MASK, ctype, (locale_t) 0);
327  if (loc == (locale_t) 0)
328  return -1; /* bogus ctype passed in? */
329 
330  sys = nl_langinfo_l(CODESET, loc);
331  if (sys)
332  sys = strdup(sys);
333 
334  freelocale(loc);
335 #else
336  sys = win32_get_codeset(ctype);
337 #endif
338 
339  if (!sys)
340  return -1; /* out of memory; unlikely */
341 
342  /* Check the table */
343  for (i = 0; encoding_match_list[i].system_enc_name; i++)
344  {
346  {
347  free(sys);
349  }
350  }
351 
352  /* Special-case kluges for particular platforms go here */
353 
354 #ifdef __darwin__
355 
356  /*
357  * Current macOS has many locales that report an empty string for CODESET,
358  * but they all seem to actually use UTF-8.
359  */
360  if (strlen(sys) == 0)
361  {
362  free(sys);
363  return PG_UTF8;
364  }
365 #endif
366 
367  /*
368  * We print a warning if we got a CODESET string but couldn't recognize
369  * it. This means we need another entry in the table.
370  */
371  if (write_message)
372  {
373 #ifdef FRONTEND
374  fprintf(stderr, _("could not determine encoding for locale \"%s\": codeset is \"%s\""),
375  ctype, sys);
376  /* keep newline separate so there's only one translatable string */
377  fputc('\n', stderr);
378 #else
380  (errmsg("could not determine encoding for locale \"%s\": codeset is \"%s\"",
381  ctype, sys)));
382 #endif
383  }
384 
385  free(sys);
386  return -1;
387 }
unsigned int uint32
Definition: c.h:506
static const struct encoding_match encoding_match_list[]
Definition: chklocale.c:49
int pg_get_encoding_from_locale(const char *ctype, bool write_message)
Definition: chklocale.c:305
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define _(x)
Definition: elog.c:90
#define WARNING
Definition: elog.h:36
#define ereport(elevel,...)
Definition: elog.h:149
#define free(a)
Definition: header.h:65
#define malloc(a)
Definition: header.h:50
int i
Definition: isn.c:73
pg_enc
Definition: pg_wchar.h:225
@ PG_WIN1254
Definition: pg_wchar.h:257
@ PG_LATIN4
Definition: pg_wchar.h:237
@ PG_LATIN9
Definition: pg_wchar.h:242
@ PG_JOHAB
Definition: pg_wchar.h:269
@ PG_GB18030
Definition: pg_wchar.h:268
@ PG_SQL_ASCII
Definition: pg_wchar.h:226
@ PG_KOI8R
Definition: pg_wchar.h:248
@ PG_ISO_8859_6
Definition: pg_wchar.h:252
@ PG_WIN1253
Definition: pg_wchar.h:256
@ PG_KOI8U
Definition: pg_wchar.h:260
@ PG_LATIN6
Definition: pg_wchar.h:239
@ PG_LATIN5
Definition: pg_wchar.h:238
@ PG_EUC_CN
Definition: pg_wchar.h:228
@ PG_UHC
Definition: pg_wchar.h:267
@ PG_LATIN2
Definition: pg_wchar.h:235
@ PG_ISO_8859_5
Definition: pg_wchar.h:251
@ PG_LATIN10
Definition: pg_wchar.h:243
@ PG_WIN1250
Definition: pg_wchar.h:255
@ PG_ISO_8859_7
Definition: pg_wchar.h:253
@ PG_SJIS
Definition: pg_wchar.h:264
@ PG_LATIN8
Definition: pg_wchar.h:241
@ PG_EUC_JP
Definition: pg_wchar.h:227
@ PG_GBK
Definition: pg_wchar.h:266
@ PG_LATIN3
Definition: pg_wchar.h:236
@ PG_WIN1256
Definition: pg_wchar.h:244
@ PG_LATIN1
Definition: pg_wchar.h:234
@ PG_EUC_TW
Definition: pg_wchar.h:230
@ PG_WIN1258
Definition: pg_wchar.h:245
@ PG_SHIFT_JIS_2004
Definition: pg_wchar.h:270
@ PG_WIN1252
Definition: pg_wchar.h:250
@ PG_LATIN7
Definition: pg_wchar.h:240
@ PG_UTF8
Definition: pg_wchar.h:232
@ PG_WIN1255
Definition: pg_wchar.h:258
@ PG_WIN1257
Definition: pg_wchar.h:259
@ PG_WIN1251
Definition: pg_wchar.h:249
@ PG_EUC_KR
Definition: pg_wchar.h:229
@ PG_WIN866
Definition: pg_wchar.h:246
@ PG_ISO_8859_8
Definition: pg_wchar.h:254
@ PG_WIN874
Definition: pg_wchar.h:247
@ PG_BIG5
Definition: pg_wchar.h:265
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
#define sprintf
Definition: port.h:240
#define fprintf
Definition: port.h:242
enum pg_enc pg_enc_code
Definition: chklocale.c:45
const char * system_enc_name
Definition: chklocale.c:46
#define locale_t
Definition: win32_port.h:442
#define setlocale(a, b)
Definition: win32_port.h:485