PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
chklocale.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * chklocale.c
4  * Functions for handling locale-related info
5  *
6  *
7  * Copyright (c) 1996-2017, PostgreSQL Global Development Group
8  *
9  *
10  * IDENTIFICATION
11  * src/port/chklocale.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 #ifndef FRONTEND
17 #include "postgres.h"
18 #else
19 #include "postgres_fe.h"
20 #endif
21 
22 #if defined(WIN32) && (_MSC_VER >= 1900)
23 #include <windows.h>
24 #endif
25 
26 #include <locale.h>
27 #ifdef HAVE_LANGINFO_H
28 #include <langinfo.h>
29 #endif
30 
31 #include "mb/pg_wchar.h"
32 
33 
34 /*
35  * This table needs to recognize all the CODESET spellings for supported
36  * backend encodings, as well as frontend-only encodings where possible
37  * (the latter case is currently only needed for initdb to recognize
38  * error situations). On Windows, we rely on entries for codepage
39  * numbers (CPnnn).
40  *
41  * Note that we search the table with pg_strcasecmp(), so variant
42  * capitalizations don't need their own entries.
43  */
45 {
47  const char *system_enc_name;
48 };
49 
50 static const struct encoding_match encoding_match_list[] = {
51  {PG_EUC_JP, "EUC-JP"},
52  {PG_EUC_JP, "eucJP"},
53  {PG_EUC_JP, "IBM-eucJP"},
54  {PG_EUC_JP, "sdeckanji"},
55  {PG_EUC_JP, "CP20932"},
56 
57  {PG_EUC_CN, "EUC-CN"},
58  {PG_EUC_CN, "eucCN"},
59  {PG_EUC_CN, "IBM-eucCN"},
60  {PG_EUC_CN, "GB2312"},
61  {PG_EUC_CN, "dechanzi"},
62  {PG_EUC_CN, "CP20936"},
63 
64  {PG_EUC_KR, "EUC-KR"},
65  {PG_EUC_KR, "eucKR"},
66  {PG_EUC_KR, "IBM-eucKR"},
67  {PG_EUC_KR, "deckorean"},
68  {PG_EUC_KR, "5601"},
69  {PG_EUC_KR, "CP51949"},
70 
71  {PG_EUC_TW, "EUC-TW"},
72  {PG_EUC_TW, "eucTW"},
73  {PG_EUC_TW, "IBM-eucTW"},
74  {PG_EUC_TW, "cns11643"},
75  /* No codepage for EUC-TW ? */
76 
77  {PG_UTF8, "UTF-8"},
78  {PG_UTF8, "utf8"},
79  {PG_UTF8, "CP65001"},
80 
81  {PG_LATIN1, "ISO-8859-1"},
82  {PG_LATIN1, "ISO8859-1"},
83  {PG_LATIN1, "iso88591"},
84  {PG_LATIN1, "CP28591"},
85 
86  {PG_LATIN2, "ISO-8859-2"},
87  {PG_LATIN2, "ISO8859-2"},
88  {PG_LATIN2, "iso88592"},
89  {PG_LATIN2, "CP28592"},
90 
91  {PG_LATIN3, "ISO-8859-3"},
92  {PG_LATIN3, "ISO8859-3"},
93  {PG_LATIN3, "iso88593"},
94  {PG_LATIN3, "CP28593"},
95 
96  {PG_LATIN4, "ISO-8859-4"},
97  {PG_LATIN4, "ISO8859-4"},
98  {PG_LATIN4, "iso88594"},
99  {PG_LATIN4, "CP28594"},
100 
101  {PG_LATIN5, "ISO-8859-9"},
102  {PG_LATIN5, "ISO8859-9"},
103  {PG_LATIN5, "iso88599"},
104  {PG_LATIN5, "CP28599"},
105 
106  {PG_LATIN6, "ISO-8859-10"},
107  {PG_LATIN6, "ISO8859-10"},
108  {PG_LATIN6, "iso885910"},
109 
110  {PG_LATIN7, "ISO-8859-13"},
111  {PG_LATIN7, "ISO8859-13"},
112  {PG_LATIN7, "iso885913"},
113 
114  {PG_LATIN8, "ISO-8859-14"},
115  {PG_LATIN8, "ISO8859-14"},
116  {PG_LATIN8, "iso885914"},
117 
118  {PG_LATIN9, "ISO-8859-15"},
119  {PG_LATIN9, "ISO8859-15"},
120  {PG_LATIN9, "iso885915"},
121  {PG_LATIN9, "CP28605"},
122 
123  {PG_LATIN10, "ISO-8859-16"},
124  {PG_LATIN10, "ISO8859-16"},
125  {PG_LATIN10, "iso885916"},
126 
127  {PG_KOI8R, "KOI8-R"},
128  {PG_KOI8R, "CP20866"},
129 
130  {PG_KOI8U, "KOI8-U"},
131  {PG_KOI8U, "CP21866"},
132 
133  {PG_WIN866, "CP866"},
134  {PG_WIN874, "CP874"},
135  {PG_WIN1250, "CP1250"},
136  {PG_WIN1251, "CP1251"},
137  {PG_WIN1251, "ansi-1251"},
138  {PG_WIN1252, "CP1252"},
139  {PG_WIN1253, "CP1253"},
140  {PG_WIN1254, "CP1254"},
141  {PG_WIN1255, "CP1255"},
142  {PG_WIN1256, "CP1256"},
143  {PG_WIN1257, "CP1257"},
144  {PG_WIN1258, "CP1258"},
145 
146  {PG_ISO_8859_5, "ISO-8859-5"},
147  {PG_ISO_8859_5, "ISO8859-5"},
148  {PG_ISO_8859_5, "iso88595"},
149  {PG_ISO_8859_5, "CP28595"},
150 
151  {PG_ISO_8859_6, "ISO-8859-6"},
152  {PG_ISO_8859_6, "ISO8859-6"},
153  {PG_ISO_8859_6, "iso88596"},
154  {PG_ISO_8859_6, "CP28596"},
155 
156  {PG_ISO_8859_7, "ISO-8859-7"},
157  {PG_ISO_8859_7, "ISO8859-7"},
158  {PG_ISO_8859_7, "iso88597"},
159  {PG_ISO_8859_7, "CP28597"},
160 
161  {PG_ISO_8859_8, "ISO-8859-8"},
162  {PG_ISO_8859_8, "ISO8859-8"},
163  {PG_ISO_8859_8, "iso88598"},
164  {PG_ISO_8859_8, "CP28598"},
165 
166  {PG_SJIS, "SJIS"},
167  {PG_SJIS, "PCK"},
168  {PG_SJIS, "CP932"},
169  {PG_SJIS, "SHIFT_JIS"},
170 
171  {PG_BIG5, "BIG5"},
172  {PG_BIG5, "BIG5HKSCS"},
173  {PG_BIG5, "Big5-HKSCS"},
174  {PG_BIG5, "CP950"},
175 
176  {PG_GBK, "GBK"},
177  {PG_GBK, "CP936"},
178 
179  {PG_UHC, "UHC"},
180  {PG_UHC, "CP949"},
181 
182  {PG_JOHAB, "JOHAB"},
183  {PG_JOHAB, "CP1361"},
184 
185  {PG_GB18030, "GB18030"},
186  {PG_GB18030, "CP54936"},
187 
188  {PG_SHIFT_JIS_2004, "SJIS_2004"},
189 
190  {PG_SQL_ASCII, "US-ASCII"},
191 
192  {PG_SQL_ASCII, NULL} /* end marker */
193 };
194 
195 #ifdef WIN32
/*
 * Size of the buffer used for "CP<number>" names built from a numeric code
 * page: "CP" + at most 10 decimal digits + terminating NUL, rounded up.
 */
#define WIN32_CODEPAGE_NAME_LEN 16

/*
 * On Windows, use CP<code page number> instead of the nl_langinfo() result.
 *
 * How the code page is obtained depends on the compiler:
 *
 * - Visual Studio 2012 and 2013: these expanded the set of valid LC_CTYPE
 *	 values, so let the locale machinery determine the code page through
 *	 _create_locale().  See comments at IsoLocaleName().
 *
 * - Visual Studio 2015 and later: the declaration of lc_codepage is missing
 *	 from _locale_t, so the approach above does not compile.  Use
 *	 GetLocaleInfoEx() instead, which understands short locale names like
 *	 "de-DE" or "fr-FR".  If the name cannot be converted to wide characters
 *	 or GetLocaleInfoEx() rejects it, fall back to manual parsing.
 *
 * - Anything else: parse the name manually, relying on the predictable
 *	 <Language>_<Country>.<CodePage> format.
 *
 * Returns a malloc()'d string for the caller to free, or NULL if no code page
 * could be determined or memory could not be allocated.
 */
static char *
win32_langinfo(const char *ctype)
{
	char	   *r = NULL;

#if (_MSC_VER >= 1700) && (_MSC_VER < 1900)
	_locale_t	loct = NULL;

	loct = _create_locale(LC_CTYPE, ctype);
	if (loct != NULL)
	{
		r = malloc(WIN32_CODEPAGE_NAME_LEN);
		if (r != NULL)
			snprintf(r, WIN32_CODEPAGE_NAME_LEN, "CP%u",
					 loct->locinfo->lc_codepage);
		_free_locale(loct);
	}
#else
	char	   *codepage;

#if (_MSC_VER >= 1900)
	uint32		cp;
	WCHAR		wctype[LOCALE_NAME_MAX_LENGTH];

	/*
	 * Only consult GetLocaleInfoEx() if the conversion succeeded; a failed
	 * conversion (e.g. name too long for the buffer) does not guarantee a
	 * NUL-terminated wctype.  On success the terminator is included because
	 * the input length is passed as -1.
	 */
	if (MultiByteToWideChar(CP_ACP, 0, ctype, -1,
							wctype, LOCALE_NAME_MAX_LENGTH) > 0 &&
		GetLocaleInfoEx(wctype,
						LOCALE_IDEFAULTANSICODEPAGE | LOCALE_RETURN_NUMBER,
						(LPWSTR) &cp, sizeof(cp) / sizeof(WCHAR)) > 0)
	{
		r = malloc(WIN32_CODEPAGE_NAME_LEN);
		if (r != NULL)
			snprintf(r, WIN32_CODEPAGE_NAME_LEN, "CP%u", cp);
	}
	else
#endif
	{
		/*
		 * Locale format on Win32 is <Language>_<Country>.<CodePage> . For
		 * example, English_United States.1252.
		 */
		codepage = strrchr(ctype, '.');
		if (codepage != NULL)
		{
			size_t		len;

			codepage++;			/* skip the dot */
			len = strlen(codepage) + 3; /* "CP" + code page + NUL */
			r = malloc(len);
			if (r != NULL)
				snprintf(r, len, "CP%s", codepage);
		}
	}
#endif

	return r;
}
273 
274 #ifndef FRONTEND
275 /*
276  * Given a Windows code page identifier, find the corresponding PostgreSQL
277  * encoding. Issue a warning and return -1 if none found.
278  */
279 int
280 pg_codepage_to_encoding(UINT cp)
281 {
282  char sys[16];
283  int i;
284 
285  sprintf(sys, "CP%u", cp);
286 
287  /* Check the table */
288  for (i = 0; encoding_match_list[i].system_enc_name; i++)
289  if (pg_strcasecmp(sys, encoding_match_list[i].system_enc_name) == 0)
290  return encoding_match_list[i].pg_enc_code;
291 
293  (errmsg("could not determine encoding for codeset \"%s\"", sys)));
294 
295  return -1;
296 }
297 #endif
298 #endif /* WIN32 */
299 
300 #if (defined(HAVE_LANGINFO_H) && defined(CODESET)) || defined(WIN32)
301 
302 /*
303  * Given a setting for LC_CTYPE, return the Postgres ID of the associated
304  * encoding, if we can determine it. Return -1 if we can't determine it.
305  *
306  * Pass in NULL to get the encoding for the current locale setting.
307  * Pass "" to get the encoding selected by the server's environment.
308  *
309  * If the result is PG_SQL_ASCII, callers should treat it as being compatible
310  * with any desired encoding.
311  *
312  * If running in the backend and write_message is false, this function must
313  * cope with the possibility that elog() and palloc() are not yet usable.
314  */
315 int
316 pg_get_encoding_from_locale(const char *ctype, bool write_message)
317 {
318  char *sys;
319  int i;
320 
321  /* Get the CODESET property, and also LC_CTYPE if not passed in */
322  if (ctype)
323  {
324  char *save;
325  char *name;
326 
327  /* If locale is C or POSIX, we can allow all encodings */
328  if (pg_strcasecmp(ctype, "C") == 0 ||
329  pg_strcasecmp(ctype, "POSIX") == 0)
330  return PG_SQL_ASCII;
331 
332  save = setlocale(LC_CTYPE, NULL);
333  if (!save)
334  return -1; /* setlocale() broken? */
335  /* must copy result, or it might change after setlocale */
336  save = strdup(save);
337  if (!save)
338  return -1; /* out of memory; unlikely */
339 
340  name = setlocale(LC_CTYPE, ctype);
341  if (!name)
342  {
343  free(save);
344  return -1; /* bogus ctype passed in? */
345  }
346 
347 #ifndef WIN32
348  sys = nl_langinfo(CODESET);
349  if (sys)
350  sys = strdup(sys);
351 #else
352  sys = win32_langinfo(name);
353 #endif
354 
355  setlocale(LC_CTYPE, save);
356  free(save);
357  }
358  else
359  {
360  /* much easier... */
361  ctype = setlocale(LC_CTYPE, NULL);
362  if (!ctype)
363  return -1; /* setlocale() broken? */
364 
365  /* If locale is C or POSIX, we can allow all encodings */
366  if (pg_strcasecmp(ctype, "C") == 0 ||
367  pg_strcasecmp(ctype, "POSIX") == 0)
368  return PG_SQL_ASCII;
369 
370 #ifndef WIN32
371  sys = nl_langinfo(CODESET);
372  if (sys)
373  sys = strdup(sys);
374 #else
375  sys = win32_langinfo(ctype);
376 #endif
377  }
378 
379  if (!sys)
380  return -1; /* out of memory; unlikely */
381 
382  /* Check the table */
383  for (i = 0; encoding_match_list[i].system_enc_name; i++)
384  {
385  if (pg_strcasecmp(sys, encoding_match_list[i].system_enc_name) == 0)
386  {
387  free(sys);
388  return encoding_match_list[i].pg_enc_code;
389  }
390  }
391 
392  /* Special-case kluges for particular platforms go here */
393 
394 #ifdef __darwin__
395 
396  /*
397  * Current macOS has many locales that report an empty string for CODESET,
398  * but they all seem to actually use UTF-8.
399  */
400  if (strlen(sys) == 0)
401  {
402  free(sys);
403  return PG_UTF8;
404  }
405 #endif
406 
407  /*
408  * We print a warning if we got a CODESET string but couldn't recognize
409  * it. This means we need another entry in the table.
410  */
411  if (write_message)
412  {
413 #ifdef FRONTEND
414  fprintf(stderr, _("could not determine encoding for locale \"%s\": codeset is \"%s\""),
415  ctype, sys);
416  /* keep newline separate so there's only one translatable string */
417  fputc('\n', stderr);
418 #else
420  (errmsg("could not determine encoding for locale \"%s\": codeset is \"%s\"",
421  ctype, sys)));
422 #endif
423  }
424 
425  free(sys);
426  return -1;
427 }
428 #else /* (HAVE_LANGINFO_H && CODESET) || WIN32 */
429 
430 /*
431  * stub if no multi-language platform support
432  *
433  * Note: we could return -1 here, but that would have the effect of
434  * forcing users to specify an encoding to initdb on such platforms.
435  * It seems better to silently default to SQL_ASCII.
436  */
437 int
438 pg_get_encoding_from_locale(const char *ctype, bool write_message)
439 {
440  return PG_SQL_ASCII;
441 }
442 
443 #endif /* (HAVE_LANGINFO_H && CODESET) || WIN32 */
enum pg_enc pg_enc_code
Definition: chklocale.c:46
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
#define malloc(a)
Definition: header.h:45
const char * system_enc_name
Definition: chklocale.c:47
unsigned int uint32
Definition: c.h:265
#define ereport(elevel, rest)
Definition: elog.h:122
static const struct encoding_match encoding_match_list[]
Definition: chklocale.c:50
#define WARNING
Definition: elog.h:40
#define free(a)
Definition: header.h:60
#define NULL
Definition: c.h:226
pg_enc
Definition: pg_wchar.h:236
int pg_get_encoding_from_locale(const char *ctype, bool write_message)
Definition: chklocale.c:438
const char * name
Definition: encode.c:521
int errmsg(const char *fmt,...)
Definition: elog.c:797
int i
#define _(x)
Definition: elog.c:84