PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
encnames.c
Go to the documentation of this file.
/*
 * Encoding names and routines for working with them.  Everything
 * in this file is shared between FE and BE.
 *
 * src/backend/utils/mb/encnames.c
 */
7 #ifdef FRONTEND
8 #include "postgres_fe.h"
9 #else
10 #include "postgres.h"
11 #include "utils/builtins.h"
12 #endif
13 
14 #include <ctype.h>
15 #include <unistd.h>
16 
17 #include "mb/pg_wchar.h"
18 
19 
20 /* ----------
21  * All encoding names, sorted: *** A L P H A B E T I C ***
22  *
23  * All names must be without irrelevant chars, search routines use
24  * isalnum() chars only. It means ISO-8859-1, iso_8859-1 and Iso8859_1
25  * are always converted to 'iso88591'. All must be lower case.
26  *
27  * The table doesn't contain 'cs' aliases (like csISOLatin1). It's needed?
28  *
29  * Karel Zak, Aug 2001
30  * ----------
31  */
32 typedef struct pg_encname
33 {
34  const char *name;
36 } pg_encname;
37 
38 static const pg_encname pg_encname_tbl[] =
39 {
40  {
41  "abc", PG_WIN1258
42  }, /* alias for WIN1258 */
43  {
44  "alt", PG_WIN866
45  }, /* IBM866 */
46  {
47  "big5", PG_BIG5
48  }, /* Big5; Chinese for Taiwan multibyte set */
49  {
50  "euccn", PG_EUC_CN
51  }, /* EUC-CN; Extended Unix Code for simplified
52  * Chinese */
53  {
54  "eucjis2004", PG_EUC_JIS_2004
55  }, /* EUC-JIS-2004; Extended UNIX Code fixed
56  * Width for Japanese, standard JIS X 0213 */
57  {
58  "eucjp", PG_EUC_JP
59  }, /* EUC-JP; Extended UNIX Code fixed Width for
60  * Japanese, standard OSF */
61  {
62  "euckr", PG_EUC_KR
63  }, /* EUC-KR; Extended Unix Code for Korean , KS
64  * X 1001 standard */
65  {
66  "euctw", PG_EUC_TW
67  }, /* EUC-TW; Extended Unix Code for
68  *
69  * traditional Chinese */
70  {
71  "gb18030", PG_GB18030
72  }, /* GB18030;GB18030 */
73  {
74  "gbk", PG_GBK
75  }, /* GBK; Chinese Windows CodePage 936
76  * simplified Chinese */
77  {
78  "iso88591", PG_LATIN1
79  }, /* ISO-8859-1; RFC1345,KXS2 */
80  {
81  "iso885910", PG_LATIN6
82  }, /* ISO-8859-10; RFC1345,KXS2 */
83  {
84  "iso885913", PG_LATIN7
85  }, /* ISO-8859-13; RFC1345,KXS2 */
86  {
87  "iso885914", PG_LATIN8
88  }, /* ISO-8859-14; RFC1345,KXS2 */
89  {
90  "iso885915", PG_LATIN9
91  }, /* ISO-8859-15; RFC1345,KXS2 */
92  {
93  "iso885916", PG_LATIN10
94  }, /* ISO-8859-16; RFC1345,KXS2 */
95  {
96  "iso88592", PG_LATIN2
97  }, /* ISO-8859-2; RFC1345,KXS2 */
98  {
99  "iso88593", PG_LATIN3
100  }, /* ISO-8859-3; RFC1345,KXS2 */
101  {
102  "iso88594", PG_LATIN4
103  }, /* ISO-8859-4; RFC1345,KXS2 */
104  {
105  "iso88595", PG_ISO_8859_5
106  }, /* ISO-8859-5; RFC1345,KXS2 */
107  {
108  "iso88596", PG_ISO_8859_6
109  }, /* ISO-8859-6; RFC1345,KXS2 */
110  {
111  "iso88597", PG_ISO_8859_7
112  }, /* ISO-8859-7; RFC1345,KXS2 */
113  {
114  "iso88598", PG_ISO_8859_8
115  }, /* ISO-8859-8; RFC1345,KXS2 */
116  {
117  "iso88599", PG_LATIN5
118  }, /* ISO-8859-9; RFC1345,KXS2 */
119  {
120  "johab", PG_JOHAB
121  }, /* JOHAB; Extended Unix Code for simplified
122  * Chinese */
123  {
124  "koi8", PG_KOI8R
125  }, /* _dirty_ alias for KOI8-R (backward
126  * compatibility) */
127  {
128  "koi8r", PG_KOI8R
129  }, /* KOI8-R; RFC1489 */
130  {
131  "koi8u", PG_KOI8U
132  }, /* KOI8-U; RFC2319 */
133  {
134  "latin1", PG_LATIN1
135  }, /* alias for ISO-8859-1 */
136  {
137  "latin10", PG_LATIN10
138  }, /* alias for ISO-8859-16 */
139  {
140  "latin2", PG_LATIN2
141  }, /* alias for ISO-8859-2 */
142  {
143  "latin3", PG_LATIN3
144  }, /* alias for ISO-8859-3 */
145  {
146  "latin4", PG_LATIN4
147  }, /* alias for ISO-8859-4 */
148  {
149  "latin5", PG_LATIN5
150  }, /* alias for ISO-8859-9 */
151  {
152  "latin6", PG_LATIN6
153  }, /* alias for ISO-8859-10 */
154  {
155  "latin7", PG_LATIN7
156  }, /* alias for ISO-8859-13 */
157  {
158  "latin8", PG_LATIN8
159  }, /* alias for ISO-8859-14 */
160  {
161  "latin9", PG_LATIN9
162  }, /* alias for ISO-8859-15 */
163  {
164  "mskanji", PG_SJIS
165  }, /* alias for Shift_JIS */
166  {
167  "muleinternal", PG_MULE_INTERNAL
168  },
169  {
170  "shiftjis", PG_SJIS
171  }, /* Shift_JIS; JIS X 0202-1991 */
172 
173  {
174  "shiftjis2004", PG_SHIFT_JIS_2004
175  }, /* SHIFT-JIS-2004; Shift JIS for Japanese,
176  * standard JIS X 0213 */
177  {
178  "sjis", PG_SJIS
179  }, /* alias for Shift_JIS */
180  {
181  "sqlascii", PG_SQL_ASCII
182  },
183  {
184  "tcvn", PG_WIN1258
185  }, /* alias for WIN1258 */
186  {
187  "tcvn5712", PG_WIN1258
188  }, /* alias for WIN1258 */
189  {
190  "uhc", PG_UHC
191  }, /* UHC; Korean Windows CodePage 949 */
192  {
193  "unicode", PG_UTF8
194  }, /* alias for UTF8 */
195  {
196  "utf8", PG_UTF8
197  }, /* alias for UTF8 */
198  {
199  "vscii", PG_WIN1258
200  }, /* alias for WIN1258 */
201  {
202  "win", PG_WIN1251
203  }, /* _dirty_ alias for windows-1251 (backward
204  * compatibility) */
205  {
206  "win1250", PG_WIN1250
207  }, /* alias for Windows-1250 */
208  {
209  "win1251", PG_WIN1251
210  }, /* alias for Windows-1251 */
211  {
212  "win1252", PG_WIN1252
213  }, /* alias for Windows-1252 */
214  {
215  "win1253", PG_WIN1253
216  }, /* alias for Windows-1253 */
217  {
218  "win1254", PG_WIN1254
219  }, /* alias for Windows-1254 */
220  {
221  "win1255", PG_WIN1255
222  }, /* alias for Windows-1255 */
223  {
224  "win1256", PG_WIN1256
225  }, /* alias for Windows-1256 */
226  {
227  "win1257", PG_WIN1257
228  }, /* alias for Windows-1257 */
229  {
230  "win1258", PG_WIN1258
231  }, /* alias for Windows-1258 */
232  {
233  "win866", PG_WIN866
234  }, /* IBM866 */
235  {
236  "win874", PG_WIN874
237  }, /* alias for Windows-874 */
238  {
239  "win932", PG_SJIS
240  }, /* alias for Shift_JIS */
241  {
242  "win936", PG_GBK
243  }, /* alias for GBK */
244  {
245  "win949", PG_UHC
246  }, /* alias for UHC */
247  {
248  "win950", PG_BIG5
249  }, /* alias for BIG5 */
250  {
251  "windows1250", PG_WIN1250
252  }, /* Windows-1251; Microsoft */
253  {
254  "windows1251", PG_WIN1251
255  }, /* Windows-1251; Microsoft */
256  {
257  "windows1252", PG_WIN1252
258  }, /* Windows-1252; Microsoft */
259  {
260  "windows1253", PG_WIN1253
261  }, /* Windows-1253; Microsoft */
262  {
263  "windows1254", PG_WIN1254
264  }, /* Windows-1254; Microsoft */
265  {
266  "windows1255", PG_WIN1255
267  }, /* Windows-1255; Microsoft */
268  {
269  "windows1256", PG_WIN1256
270  }, /* Windows-1256; Microsoft */
271  {
272  "windows1257", PG_WIN1257
273  }, /* Windows-1257; Microsoft */
274  {
275  "windows1258", PG_WIN1258
276  }, /* Windows-1258; Microsoft */
277  {
278  "windows866", PG_WIN866
279  }, /* IBM866 */
280  {
281  "windows874", PG_WIN874
282  }, /* Windows-874; Microsoft */
283  {
284  "windows932", PG_SJIS
285  }, /* alias for Shift_JIS */
286  {
287  "windows936", PG_GBK
288  }, /* alias for GBK */
289  {
290  "windows949", PG_UHC
291  }, /* alias for UHC */
292  {
293  "windows950", PG_BIG5
294  } /* alias for BIG5 */
295 };
296 
297 /* ----------
298  * These are "official" encoding names.
299  * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
300  * ----------
301  */
302 #ifndef WIN32
303 #define DEF_ENC2NAME(name, codepage) { #name, PG_##name }
304 #else
305 #define DEF_ENC2NAME(name, codepage) { #name, PG_##name, codepage }
306 #endif
308 {
309  DEF_ENC2NAME(SQL_ASCII, 0),
310  DEF_ENC2NAME(EUC_JP, 20932),
311  DEF_ENC2NAME(EUC_CN, 20936),
312  DEF_ENC2NAME(EUC_KR, 51949),
313  DEF_ENC2NAME(EUC_TW, 0),
314  DEF_ENC2NAME(EUC_JIS_2004, 20932),
315  DEF_ENC2NAME(UTF8, 65001),
316  DEF_ENC2NAME(MULE_INTERNAL, 0),
317  DEF_ENC2NAME(LATIN1, 28591),
318  DEF_ENC2NAME(LATIN2, 28592),
319  DEF_ENC2NAME(LATIN3, 28593),
320  DEF_ENC2NAME(LATIN4, 28594),
321  DEF_ENC2NAME(LATIN5, 28599),
322  DEF_ENC2NAME(LATIN6, 0),
323  DEF_ENC2NAME(LATIN7, 0),
324  DEF_ENC2NAME(LATIN8, 0),
325  DEF_ENC2NAME(LATIN9, 28605),
326  DEF_ENC2NAME(LATIN10, 0),
327  DEF_ENC2NAME(WIN1256, 1256),
328  DEF_ENC2NAME(WIN1258, 1258),
329  DEF_ENC2NAME(WIN866, 866),
330  DEF_ENC2NAME(WIN874, 874),
331  DEF_ENC2NAME(KOI8R, 20866),
332  DEF_ENC2NAME(WIN1251, 1251),
333  DEF_ENC2NAME(WIN1252, 1252),
334  DEF_ENC2NAME(ISO_8859_5, 28595),
335  DEF_ENC2NAME(ISO_8859_6, 28596),
336  DEF_ENC2NAME(ISO_8859_7, 28597),
337  DEF_ENC2NAME(ISO_8859_8, 28598),
338  DEF_ENC2NAME(WIN1250, 1250),
339  DEF_ENC2NAME(WIN1253, 1253),
340  DEF_ENC2NAME(WIN1254, 1254),
341  DEF_ENC2NAME(WIN1255, 1255),
342  DEF_ENC2NAME(WIN1257, 1257),
343  DEF_ENC2NAME(KOI8U, 21866),
344  DEF_ENC2NAME(SJIS, 932),
345  DEF_ENC2NAME(BIG5, 950),
346  DEF_ENC2NAME(GBK, 936),
347  DEF_ENC2NAME(UHC, 949),
348  DEF_ENC2NAME(GB18030, 54936),
349  DEF_ENC2NAME(JOHAB, 0),
350  DEF_ENC2NAME(SHIFT_JIS_2004, 932)
351 };
352 
353 /* ----------
354  * These are encoding names for gettext.
355  *
356  * This covers all encodings except MULE_INTERNAL, which is alien to gettext.
357  * ----------
358  */
360 {
361  {PG_SQL_ASCII, "US-ASCII"},
362  {PG_UTF8, "UTF-8"},
363  {PG_LATIN1, "LATIN1"},
364  {PG_LATIN2, "LATIN2"},
365  {PG_LATIN3, "LATIN3"},
366  {PG_LATIN4, "LATIN4"},
367  {PG_ISO_8859_5, "ISO-8859-5"},
368  {PG_ISO_8859_6, "ISO_8859-6"},
369  {PG_ISO_8859_7, "ISO-8859-7"},
370  {PG_ISO_8859_8, "ISO-8859-8"},
371  {PG_LATIN5, "LATIN5"},
372  {PG_LATIN6, "LATIN6"},
373  {PG_LATIN7, "LATIN7"},
374  {PG_LATIN8, "LATIN8"},
375  {PG_LATIN9, "LATIN-9"},
376  {PG_LATIN10, "LATIN10"},
377  {PG_KOI8R, "KOI8-R"},
378  {PG_KOI8U, "KOI8-U"},
379  {PG_WIN1250, "CP1250"},
380  {PG_WIN1251, "CP1251"},
381  {PG_WIN1252, "CP1252"},
382  {PG_WIN1253, "CP1253"},
383  {PG_WIN1254, "CP1254"},
384  {PG_WIN1255, "CP1255"},
385  {PG_WIN1256, "CP1256"},
386  {PG_WIN1257, "CP1257"},
387  {PG_WIN1258, "CP1258"},
388  {PG_WIN866, "CP866"},
389  {PG_WIN874, "CP874"},
390  {PG_EUC_CN, "EUC-CN"},
391  {PG_EUC_JP, "EUC-JP"},
392  {PG_EUC_KR, "EUC-KR"},
393  {PG_EUC_TW, "EUC-TW"},
394  {PG_EUC_JIS_2004, "EUC-JP"},
395  {PG_SJIS, "SHIFT-JIS"},
396  {PG_BIG5, "BIG5"},
397  {PG_GBK, "GBK"},
398  {PG_UHC, "UHC"},
399  {PG_GB18030, "GB18030"},
400  {PG_JOHAB, "JOHAB"},
401  {PG_SHIFT_JIS_2004, "SHIFT_JISX0213"},
402  {0, NULL}
403 };
404 
405 
406 #ifndef FRONTEND
407 
408 /*
409  * Table of encoding names for ICU
410  *
411  * Reference: <https://ssl.icu-project.org/icu-bin/convexp>
412  *
413  * NULL entries are not supported by ICU, or their mapping is unclear.
414  */
415 static const char *const pg_enc2icu_tbl[] =
416 {
417  NULL, /* PG_SQL_ASCII */
418  "EUC-JP", /* PG_EUC_JP */
419  "EUC-CN", /* PG_EUC_CN */
420  "EUC-KR", /* PG_EUC_KR */
421  "EUC-TW", /* PG_EUC_TW */
422  NULL, /* PG_EUC_JIS_2004 */
423  "UTF-8", /* PG_UTF8 */
424  NULL, /* PG_MULE_INTERNAL */
425  "ISO-8859-1", /* PG_LATIN1 */
426  "ISO-8859-2", /* PG_LATIN2 */
427  "ISO-8859-3", /* PG_LATIN3 */
428  "ISO-8859-4", /* PG_LATIN4 */
429  "ISO-8859-9", /* PG_LATIN5 */
430  "ISO-8859-10", /* PG_LATIN6 */
431  "ISO-8859-13", /* PG_LATIN7 */
432  "ISO-8859-14", /* PG_LATIN8 */
433  "ISO-8859-15", /* PG_LATIN9 */
434  NULL, /* PG_LATIN10 */
435  "CP1256", /* PG_WIN1256 */
436  "CP1258", /* PG_WIN1258 */
437  "CP866", /* PG_WIN866 */
438  NULL, /* PG_WIN874 */
439  "KOI8-R", /* PG_KOI8R */
440  "CP1251", /* PG_WIN1251 */
441  "CP1252", /* PG_WIN1252 */
442  "ISO-8859-5", /* PG_ISO_8859_5 */
443  "ISO-8859-6", /* PG_ISO_8859_6 */
444  "ISO-8859-7", /* PG_ISO_8859_7 */
445  "ISO-8859-8", /* PG_ISO_8859_8 */
446  "CP1250", /* PG_WIN1250 */
447  "CP1253", /* PG_WIN1253 */
448  "CP1254", /* PG_WIN1254 */
449  "CP1255", /* PG_WIN1255 */
450  "CP1257", /* PG_WIN1257 */
451  "KOI8-U", /* PG_KOI8U */
452 };
453 
454 bool
456 {
457  return (pg_enc2icu_tbl[encoding] != NULL);
458 }
459 
460 const char *
462 {
463  const char *icu_encoding_name;
464 
466  "pg_enc2icu_tbl incomplete");
467 
468  icu_encoding_name = pg_enc2icu_tbl[encoding];
469 
470  if (!icu_encoding_name)
471  ereport(ERROR,
472  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
473  errmsg("encoding \"%s\" not supported by ICU",
474  pg_encoding_to_char(encoding))));
475 
476  return icu_encoding_name;
477 }
478 
479 #endif /* not FRONTEND */
480 
481 
/* ----------
 * Encoding checks, for error returns -1 else encoding id
 * ----------
 */

/*
 * Look up an encoding name and accept it only if PG_VALID_FE_ENCODING()
 * holds for the resulting ID.
 *
 * Returns the encoding ID, or -1 if the name is unknown or not acceptable.
 */
int
pg_valid_client_encoding(const char *name)
{
	int			enc;

	enc = pg_char_to_encoding(name);
	if (enc < 0)
		return -1;

	if (!PG_VALID_FE_ENCODING(enc))
		return -1;

	return enc;
}
499 
/*
 * Look up an encoding name and accept it only if PG_VALID_BE_ENCODING()
 * holds for the resulting ID.
 *
 * Returns the encoding ID, or -1 if the name is unknown or not acceptable.
 */
int
pg_valid_server_encoding(const char *name)
{
	int			enc;

	enc = pg_char_to_encoding(name);
	if (enc < 0)
		return -1;

	if (!PG_VALID_BE_ENCODING(enc))
		return -1;

	return enc;
}
513 
/*
 * Test an encoding ID (not a name) with PG_VALID_BE_ENCODING().
 *
 * Returns the value of that macro: nonzero if the ID passes, zero if not.
 */
int
pg_valid_server_encoding_id(int encoding)
{
	return PG_VALID_BE_ENCODING(encoding);
}
519 
/* ----------
 * Remove irrelevant chars from encoding name
 *
 * Copies key into newkey, keeping only characters for which isalnum() is
 * true and folding ASCII 'A'..'Z' to lower case.  The result is always
 * NUL-terminated and is never longer than key, so newkey must have room
 * for at least strlen(key) + 1 bytes.
 *
 * Returns newkey.
 * ----------
 */
static char *
clean_encoding_name(const char *key, char *newkey)
{
	const char *p;
	char	   *np;

	for (p = key, np = newkey; *p != '\0'; p++)
	{
		/* cast: isalnum() must not be handed a negative char value */
		if (isalnum((unsigned char) *p))
		{
			/* fold by arithmetic on the ASCII range only, not tolower() */
			if (*p >= 'A' && *p <= 'Z')
				*np++ = *p + 'a' - 'A';
			else
				*np++ = *p;
		}
	}
	*np = '\0';
	return newkey;
}
543 
544 /* ----------
545  * Search encoding by encoding name
546  *
547  * Returns encoding ID, or -1 for error
548  * ----------
549  */
550 int
552 {
553  unsigned int nel = lengthof(pg_encname_tbl);
554  const pg_encname *base = pg_encname_tbl,
555  *last = base + nel - 1,
556  *position;
557  int result;
558  char buff[NAMEDATALEN],
559  *key;
560 
561  if (name == NULL || *name == '\0')
562  return -1;
563 
564  if (strlen(name) >= NAMEDATALEN)
565  {
566 #ifdef FRONTEND
567  fprintf(stderr, "encoding name too long\n");
568  return -1;
569 #else
570  ereport(ERROR,
571  (errcode(ERRCODE_NAME_TOO_LONG),
572  errmsg("encoding name too long")));
573 #endif
574  }
575  key = clean_encoding_name(name, buff);
576 
577  while (last >= base)
578  {
579  position = base + ((last - base) >> 1);
580  result = key[0] - position->name[0];
581 
582  if (result == 0)
583  {
584  result = strcmp(key, position->name);
585  if (result == 0)
586  return position->encoding;
587  }
588  if (result < 0)
589  last = position - 1;
590  else
591  base = position + 1;
592  }
593  return -1;
594 }
595 
596 #ifndef FRONTEND
597 Datum
599 {
600  Name s = PG_GETARG_NAME(0);
601 
603 }
604 #endif
605 
606 const char *
608 {
609  if (PG_VALID_ENCODING(encoding))
610  {
611  const pg_enc2name *p = &pg_enc2name_tbl[encoding];
612 
613  Assert(encoding == p->encoding);
614  return p->name;
615  }
616  return "";
617 }
618 
619 #ifndef FRONTEND
620 Datum
622 {
624  const char *encoding_name = pg_encoding_to_char(encoding);
625 
626  return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
627 }
628 
629 #endif
#define PG_GETARG_INT32(n)
Definition: fmgr.h:234
Datum namein(PG_FUNCTION_ARGS)
Definition: name.c:46
int pg_char_to_encoding(const char *name)
Definition: encnames.c:551
int pg_valid_server_encoding(const char *name)
Definition: encnames.c:501
bool is_encoding_supported_by_icu(int encoding)
Definition: encnames.c:455
#define PG_RETURN_INT32(x)
Definition: fmgr.h:314
int pg_valid_client_encoding(const char *name)
Definition: encnames.c:487
int pg_valid_server_encoding_id(int encoding)
Definition: encnames.c:515
int errcode(int sqlerrcode)
Definition: elog.c:575
Datum PG_encoding_to_char(PG_FUNCTION_ARGS)
Definition: encnames.c:621
#define DirectFunctionCall1(func, arg1)
Definition: fmgr.h:585
#define lengthof(array)
Definition: c.h:556
signed int int32
Definition: c.h:246
Datum PG_char_to_encoding(PG_FUNCTION_ARGS)
Definition: encnames.c:598
const pg_enc2gettext pg_enc2gettext_tbl[]
Definition: encnames.c:359
const char * name
Definition: encnames.c:34
#define NAMEDATALEN
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:763
#define ERROR
Definition: elog.h:43
struct pg_encoding enc
Definition: encode.c:522
pg_enc encoding
Definition: encnames.c:35
Definition: c.h:487
const char * get_encoding_name_for_icu(int encoding)
Definition: encnames.c:461
#define PG_VALID_ENCODING(_enc)
Definition: pg_wchar.h:301
const pg_enc2name pg_enc2name_tbl[]
Definition: encnames.c:307
#define CStringGetDatum(X)
Definition: postgres.h:584
const char * name
Definition: pg_wchar.h:328
#define PG_VALID_FE_ENCODING(_enc)
Definition: pg_wchar.h:305
static const pg_encname pg_encname_tbl[]
Definition: encnames.c:38
#define ereport(elevel, rest)
Definition: elog.h:122
#define PG_ENCODING_BE_LAST
Definition: pg_wchar.h:289
uintptr_t Datum
Definition: postgres.h:372
static const char *const pg_enc2icu_tbl[]
Definition: encnames.c:415
pg_enc encoding
Definition: pg_wchar.h:329
static char * encoding
Definition: initdb.c:123
const char * pg_encoding_to_char(int encoding)
Definition: encnames.c:607
#define PG_VALID_BE_ENCODING(_enc)
Definition: pg_wchar.h:295
#define Assert(condition)
Definition: c.h:681
pg_enc
Definition: pg_wchar.h:238
const char * name
Definition: encode.c:521
int errmsg(const char *fmt,...)
Definition: elog.c:797
#define DEF_ENC2NAME(name, codepage)
Definition: encnames.c:303
#define NameStr(name)
Definition: c.h:493
static char * clean_encoding_name(const char *key, char *newkey)
Definition: encnames.c:525
#define PG_FUNCTION_ARGS
Definition: fmgr.h:158
#define PG_GETARG_NAME(n)
Definition: fmgr.h:243
struct pg_encname pg_encname