PostgreSQL Source Code  git master
encnames.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * encnames.c
4  * Encoding names and routines for working with them.
5  *
6  * Portions Copyright (c) 2001-2020, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  * src/common/encnames.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "c.h"
14 
15 #include <ctype.h>
16 #include <unistd.h>
17 
18 #include "mb/pg_wchar.h"
19 
20 
21 /* ----------
22  * All encoding names, sorted: *** A L P H A B E T I C ***
23  *
24  * All names must be without irrelevant chars, search routines use
25  * isalnum() chars only. It means ISO-8859-1, iso_8859-1 and Iso8859_1
26  * are always converted to 'iso88591'. All must be lower case.
27  *
28  * The table doesn't contain 'cs' aliases (like csISOLatin1). Are they needed?
29  *
30  * Karel Zak, Aug 2001
31  * ----------
32  */
33 typedef struct pg_encname
34 {
35  const char *name;
37 } pg_encname;
38 
39 static const pg_encname pg_encname_tbl[] =
40 {
41  {
42  "abc", PG_WIN1258
43  }, /* alias for WIN1258 */
44  {
45  "alt", PG_WIN866
46  }, /* IBM866 */
47  {
48  "big5", PG_BIG5
49  }, /* Big5; Chinese for Taiwan multibyte set */
50  {
51  "euccn", PG_EUC_CN
52  }, /* EUC-CN; Extended Unix Code for simplified
53  * Chinese */
54  {
55  "eucjis2004", PG_EUC_JIS_2004
56  }, /* EUC-JIS-2004; Extended UNIX Code fixed
57  * Width for Japanese, standard JIS X 0213 */
58  {
59  "eucjp", PG_EUC_JP
60  }, /* EUC-JP; Extended UNIX Code fixed Width for
61  * Japanese, standard OSF */
62  {
63  "euckr", PG_EUC_KR
64  }, /* EUC-KR; Extended Unix Code for Korean , KS
65  * X 1001 standard */
66  {
67  "euctw", PG_EUC_TW
68  }, /* EUC-TW; Extended Unix Code for
69  *
70  * traditional Chinese */
71  {
72  "gb18030", PG_GB18030
73  }, /* GB18030;GB18030 */
74  {
75  "gbk", PG_GBK
76  }, /* GBK; Chinese Windows CodePage 936
77  * simplified Chinese */
78  {
79  "iso88591", PG_LATIN1
80  }, /* ISO-8859-1; RFC1345,KXS2 */
81  {
82  "iso885910", PG_LATIN6
83  }, /* ISO-8859-10; RFC1345,KXS2 */
84  {
85  "iso885913", PG_LATIN7
86  }, /* ISO-8859-13; RFC1345,KXS2 */
87  {
88  "iso885914", PG_LATIN8
89  }, /* ISO-8859-14; RFC1345,KXS2 */
90  {
91  "iso885915", PG_LATIN9
92  }, /* ISO-8859-15; RFC1345,KXS2 */
93  {
94  "iso885916", PG_LATIN10
95  }, /* ISO-8859-16; RFC1345,KXS2 */
96  {
97  "iso88592", PG_LATIN2
98  }, /* ISO-8859-2; RFC1345,KXS2 */
99  {
100  "iso88593", PG_LATIN3
101  }, /* ISO-8859-3; RFC1345,KXS2 */
102  {
103  "iso88594", PG_LATIN4
104  }, /* ISO-8859-4; RFC1345,KXS2 */
105  {
106  "iso88595", PG_ISO_8859_5
107  }, /* ISO-8859-5; RFC1345,KXS2 */
108  {
109  "iso88596", PG_ISO_8859_6
110  }, /* ISO-8859-6; RFC1345,KXS2 */
111  {
112  "iso88597", PG_ISO_8859_7
113  }, /* ISO-8859-7; RFC1345,KXS2 */
114  {
115  "iso88598", PG_ISO_8859_8
116  }, /* ISO-8859-8; RFC1345,KXS2 */
117  {
118  "iso88599", PG_LATIN5
119  }, /* ISO-8859-9; RFC1345,KXS2 */
120  {
121  "johab", PG_JOHAB
122  }, /* JOHAB; Extended Unix Code for simplified
123  * Chinese */
124  {
125  "koi8", PG_KOI8R
126  }, /* _dirty_ alias for KOI8-R (backward
127  * compatibility) */
128  {
129  "koi8r", PG_KOI8R
130  }, /* KOI8-R; RFC1489 */
131  {
132  "koi8u", PG_KOI8U
133  }, /* KOI8-U; RFC2319 */
134  {
135  "latin1", PG_LATIN1
136  }, /* alias for ISO-8859-1 */
137  {
138  "latin10", PG_LATIN10
139  }, /* alias for ISO-8859-16 */
140  {
141  "latin2", PG_LATIN2
142  }, /* alias for ISO-8859-2 */
143  {
144  "latin3", PG_LATIN3
145  }, /* alias for ISO-8859-3 */
146  {
147  "latin4", PG_LATIN4
148  }, /* alias for ISO-8859-4 */
149  {
150  "latin5", PG_LATIN5
151  }, /* alias for ISO-8859-9 */
152  {
153  "latin6", PG_LATIN6
154  }, /* alias for ISO-8859-10 */
155  {
156  "latin7", PG_LATIN7
157  }, /* alias for ISO-8859-13 */
158  {
159  "latin8", PG_LATIN8
160  }, /* alias for ISO-8859-14 */
161  {
162  "latin9", PG_LATIN9
163  }, /* alias for ISO-8859-15 */
164  {
165  "mskanji", PG_SJIS
166  }, /* alias for Shift_JIS */
167  {
168  "muleinternal", PG_MULE_INTERNAL
169  },
170  {
171  "shiftjis", PG_SJIS
172  }, /* Shift_JIS; JIS X 0202-1991 */
173 
174  {
175  "shiftjis2004", PG_SHIFT_JIS_2004
176  }, /* SHIFT-JIS-2004; Shift JIS for Japanese,
177  * standard JIS X 0213 */
178  {
179  "sjis", PG_SJIS
180  }, /* alias for Shift_JIS */
181  {
182  "sqlascii", PG_SQL_ASCII
183  },
184  {
185  "tcvn", PG_WIN1258
186  }, /* alias for WIN1258 */
187  {
188  "tcvn5712", PG_WIN1258
189  }, /* alias for WIN1258 */
190  {
191  "uhc", PG_UHC
192  }, /* UHC; Korean Windows CodePage 949 */
193  {
194  "unicode", PG_UTF8
195  }, /* alias for UTF8 */
196  {
197  "utf8", PG_UTF8
198  }, /* alias for UTF8 */
199  {
200  "vscii", PG_WIN1258
201  }, /* alias for WIN1258 */
202  {
203  "win", PG_WIN1251
204  }, /* _dirty_ alias for windows-1251 (backward
205  * compatibility) */
206  {
207  "win1250", PG_WIN1250
208  }, /* alias for Windows-1250 */
209  {
210  "win1251", PG_WIN1251
211  }, /* alias for Windows-1251 */
212  {
213  "win1252", PG_WIN1252
214  }, /* alias for Windows-1252 */
215  {
216  "win1253", PG_WIN1253
217  }, /* alias for Windows-1253 */
218  {
219  "win1254", PG_WIN1254
220  }, /* alias for Windows-1254 */
221  {
222  "win1255", PG_WIN1255
223  }, /* alias for Windows-1255 */
224  {
225  "win1256", PG_WIN1256
226  }, /* alias for Windows-1256 */
227  {
228  "win1257", PG_WIN1257
229  }, /* alias for Windows-1257 */
230  {
231  "win1258", PG_WIN1258
232  }, /* alias for Windows-1258 */
233  {
234  "win866", PG_WIN866
235  }, /* IBM866 */
236  {
237  "win874", PG_WIN874
238  }, /* alias for Windows-874 */
239  {
240  "win932", PG_SJIS
241  }, /* alias for Shift_JIS */
242  {
243  "win936", PG_GBK
244  }, /* alias for GBK */
245  {
246  "win949", PG_UHC
247  }, /* alias for UHC */
248  {
249  "win950", PG_BIG5
250  }, /* alias for BIG5 */
251  {
252  "windows1250", PG_WIN1250
253  }, /* Windows-1251; Microsoft */
254  {
255  "windows1251", PG_WIN1251
256  }, /* Windows-1251; Microsoft */
257  {
258  "windows1252", PG_WIN1252
259  }, /* Windows-1252; Microsoft */
260  {
261  "windows1253", PG_WIN1253
262  }, /* Windows-1253; Microsoft */
263  {
264  "windows1254", PG_WIN1254
265  }, /* Windows-1254; Microsoft */
266  {
267  "windows1255", PG_WIN1255
268  }, /* Windows-1255; Microsoft */
269  {
270  "windows1256", PG_WIN1256
271  }, /* Windows-1256; Microsoft */
272  {
273  "windows1257", PG_WIN1257
274  }, /* Windows-1257; Microsoft */
275  {
276  "windows1258", PG_WIN1258
277  }, /* Windows-1258; Microsoft */
278  {
279  "windows866", PG_WIN866
280  }, /* IBM866 */
281  {
282  "windows874", PG_WIN874
283  }, /* Windows-874; Microsoft */
284  {
285  "windows932", PG_SJIS
286  }, /* alias for Shift_JIS */
287  {
288  "windows936", PG_GBK
289  }, /* alias for GBK */
290  {
291  "windows949", PG_UHC
292  }, /* alias for UHC */
293  {
294  "windows950", PG_BIG5
295  } /* alias for BIG5 */
296 };
297 
298 /* ----------
299  * These are "official" encoding names.
300  * XXX must be kept in the same order as enum pg_enc (in mb/pg_wchar.h)
301  * ----------
302  */
303 #ifndef WIN32
304 #define DEF_ENC2NAME(name, codepage) { #name, PG_##name }
305 #else
306 #define DEF_ENC2NAME(name, codepage) { #name, PG_##name, codepage }
307 #endif
308 
310 {
311  DEF_ENC2NAME(SQL_ASCII, 0),
312  DEF_ENC2NAME(EUC_JP, 20932),
313  DEF_ENC2NAME(EUC_CN, 20936),
314  DEF_ENC2NAME(EUC_KR, 51949),
315  DEF_ENC2NAME(EUC_TW, 0),
316  DEF_ENC2NAME(EUC_JIS_2004, 20932),
317  DEF_ENC2NAME(UTF8, 65001),
318  DEF_ENC2NAME(MULE_INTERNAL, 0),
319  DEF_ENC2NAME(LATIN1, 28591),
320  DEF_ENC2NAME(LATIN2, 28592),
321  DEF_ENC2NAME(LATIN3, 28593),
322  DEF_ENC2NAME(LATIN4, 28594),
323  DEF_ENC2NAME(LATIN5, 28599),
324  DEF_ENC2NAME(LATIN6, 0),
325  DEF_ENC2NAME(LATIN7, 0),
326  DEF_ENC2NAME(LATIN8, 0),
327  DEF_ENC2NAME(LATIN9, 28605),
328  DEF_ENC2NAME(LATIN10, 0),
329  DEF_ENC2NAME(WIN1256, 1256),
330  DEF_ENC2NAME(WIN1258, 1258),
331  DEF_ENC2NAME(WIN866, 866),
332  DEF_ENC2NAME(WIN874, 874),
333  DEF_ENC2NAME(KOI8R, 20866),
334  DEF_ENC2NAME(WIN1251, 1251),
335  DEF_ENC2NAME(WIN1252, 1252),
336  DEF_ENC2NAME(ISO_8859_5, 28595),
337  DEF_ENC2NAME(ISO_8859_6, 28596),
338  DEF_ENC2NAME(ISO_8859_7, 28597),
339  DEF_ENC2NAME(ISO_8859_8, 28598),
340  DEF_ENC2NAME(WIN1250, 1250),
341  DEF_ENC2NAME(WIN1253, 1253),
342  DEF_ENC2NAME(WIN1254, 1254),
343  DEF_ENC2NAME(WIN1255, 1255),
344  DEF_ENC2NAME(WIN1257, 1257),
345  DEF_ENC2NAME(KOI8U, 21866),
346  DEF_ENC2NAME(SJIS, 932),
347  DEF_ENC2NAME(BIG5, 950),
348  DEF_ENC2NAME(GBK, 936),
349  DEF_ENC2NAME(UHC, 949),
350  DEF_ENC2NAME(GB18030, 54936),
351  DEF_ENC2NAME(JOHAB, 0),
352  DEF_ENC2NAME(SHIFT_JIS_2004, 932)
353 };
354 
355 /* ----------
356  * These are encoding names for gettext.
357  *
358  * This covers all encodings except MULE_INTERNAL, which is alien to gettext.
359  * ----------
360  */
362 {
363  {PG_SQL_ASCII, "US-ASCII"},
364  {PG_UTF8, "UTF-8"},
365  {PG_LATIN1, "LATIN1"},
366  {PG_LATIN2, "LATIN2"},
367  {PG_LATIN3, "LATIN3"},
368  {PG_LATIN4, "LATIN4"},
369  {PG_ISO_8859_5, "ISO-8859-5"},
370  {PG_ISO_8859_6, "ISO_8859-6"},
371  {PG_ISO_8859_7, "ISO-8859-7"},
372  {PG_ISO_8859_8, "ISO-8859-8"},
373  {PG_LATIN5, "LATIN5"},
374  {PG_LATIN6, "LATIN6"},
375  {PG_LATIN7, "LATIN7"},
376  {PG_LATIN8, "LATIN8"},
377  {PG_LATIN9, "LATIN-9"},
378  {PG_LATIN10, "LATIN10"},
379  {PG_KOI8R, "KOI8-R"},
380  {PG_KOI8U, "KOI8-U"},
381  {PG_WIN1250, "CP1250"},
382  {PG_WIN1251, "CP1251"},
383  {PG_WIN1252, "CP1252"},
384  {PG_WIN1253, "CP1253"},
385  {PG_WIN1254, "CP1254"},
386  {PG_WIN1255, "CP1255"},
387  {PG_WIN1256, "CP1256"},
388  {PG_WIN1257, "CP1257"},
389  {PG_WIN1258, "CP1258"},
390  {PG_WIN866, "CP866"},
391  {PG_WIN874, "CP874"},
392  {PG_EUC_CN, "EUC-CN"},
393  {PG_EUC_JP, "EUC-JP"},
394  {PG_EUC_KR, "EUC-KR"},
395  {PG_EUC_TW, "EUC-TW"},
396  {PG_EUC_JIS_2004, "EUC-JP"},
397  {PG_SJIS, "SHIFT-JIS"},
398  {PG_BIG5, "BIG5"},
399  {PG_GBK, "GBK"},
400  {PG_UHC, "UHC"},
401  {PG_GB18030, "GB18030"},
402  {PG_JOHAB, "JOHAB"},
403  {PG_SHIFT_JIS_2004, "SHIFT_JISX0213"},
404  {0, NULL}
405 };
406 
407 
408 /*
409  * Table of encoding names for ICU (currently covers backend encodings only)
410  *
411  * Reference: <https://ssl.icu-project.org/icu-bin/convexp>
412  *
413  * NULL entries are not supported by ICU, or their mapping is unclear.
414  */
/*
 * ICU converter name for each backend encoding, indexed by encoding ID.
 * A NULL slot means no ICU name is available for that encoding.
 */
static const char *const pg_enc2icu_tbl[] =
{
	NULL,						/* PG_SQL_ASCII */
	"EUC-JP",					/* PG_EUC_JP */
	"EUC-CN",					/* PG_EUC_CN */
	"EUC-KR",					/* PG_EUC_KR */
	"EUC-TW",					/* PG_EUC_TW */
	NULL,						/* PG_EUC_JIS_2004 */
	"UTF-8",					/* PG_UTF8 */
	NULL,						/* PG_MULE_INTERNAL */
	"ISO-8859-1",				/* PG_LATIN1 */
	"ISO-8859-2",				/* PG_LATIN2 */
	"ISO-8859-3",				/* PG_LATIN3 */
	"ISO-8859-4",				/* PG_LATIN4 */
	"ISO-8859-9",				/* PG_LATIN5 */
	"ISO-8859-10",				/* PG_LATIN6 */
	"ISO-8859-13",				/* PG_LATIN7 */
	"ISO-8859-14",				/* PG_LATIN8 */
	"ISO-8859-15",				/* PG_LATIN9 */
	NULL,						/* PG_LATIN10 */
	"CP1256",					/* PG_WIN1256 */
	"CP1258",					/* PG_WIN1258 */
	"CP866",					/* PG_WIN866 */
	NULL,						/* PG_WIN874 */
	"KOI8-R",					/* PG_KOI8R */
	"CP1251",					/* PG_WIN1251 */
	"CP1252",					/* PG_WIN1252 */
	"ISO-8859-5",				/* PG_ISO_8859_5 */
	"ISO-8859-6",				/* PG_ISO_8859_6 */
	"ISO-8859-7",				/* PG_ISO_8859_7 */
	"ISO-8859-8",				/* PG_ISO_8859_8 */
	"CP1250",					/* PG_WIN1250 */
	"CP1253",					/* PG_WIN1253 */
	"CP1254",					/* PG_WIN1254 */
	"CP1255",					/* PG_WIN1255 */
	"CP1257",					/* PG_WIN1257 */
	"KOI8-U",					/* PG_KOI8U */
};
453 
454 
455 /*
456  * Is this encoding supported by ICU?
457  */
458 bool
460 {
461  if (!PG_VALID_BE_ENCODING(encoding))
462  return false;
463  return (pg_enc2icu_tbl[encoding] != NULL);
464 }
465 
466 /*
467  * Returns ICU's name for encoding, or NULL if not supported
468  */
469 const char *
471 {
473  "pg_enc2icu_tbl incomplete");
474 
475  if (!PG_VALID_BE_ENCODING(encoding))
476  return NULL;
477  return pg_enc2icu_tbl[encoding];
478 }
479 
480 
/* ----------
 * Encoding checks, for error returns -1 else encoding id
 * ----------
 */

/*
 * Look up an encoding by name and accept it only if it passes
 * PG_VALID_FE_ENCODING.
 *
 * Returns the encoding ID, or -1 if the name is unknown or the encoding
 * fails that check.
 */
int
pg_valid_client_encoding(const char *name)
{
	int			enc;

	if ((enc = pg_char_to_encoding(name)) < 0)
		return -1;

	if (!PG_VALID_FE_ENCODING(enc))
		return -1;

	return enc;
}
498 
/*
 * Look up an encoding by name and accept it only if it passes
 * PG_VALID_BE_ENCODING.
 *
 * Returns the encoding ID, or -1 if the name is unknown or the encoding
 * fails that check.
 */
int
pg_valid_server_encoding(const char *name)
{
	int			enc;

	if ((enc = pg_char_to_encoding(name)) < 0)
		return -1;

	if (!PG_VALID_BE_ENCODING(enc))
		return -1;

	return enc;
}
512 
/*
 * Returns the value of PG_VALID_BE_ENCODING() for the given encoding ID:
 * nonzero if it is a valid backend encoding, zero otherwise.
 */
int
pg_valid_server_encoding_id(int encoding)
{
	return PG_VALID_BE_ENCODING(encoding);
}
518 
/*
 * Remove irrelevant chars from encoding name, store at *newkey
 *
 * Only characters for which isalnum() is true are kept; ASCII upper-case
 * letters are folded to lower case with plain arithmetic rather than
 * tolower().  The result is NUL-terminated.
 *
 * key:		NUL-terminated input name.
 * newkey:	output buffer; never needs more than strlen(key) + 1 bytes.
 *
 * Returns newkey.
 *
 * (Caller's responsibility to provide a large enough buffer)
 */
static char *
clean_encoding_name(const char *key, char *newkey)
{
	const char *p;
	char	   *np;

	for (p = key, np = newkey; *p != '\0'; p++)
	{
		/* cast keeps isalnum() defined for bytes with the high bit set */
		if (isalnum((unsigned char) *p))
		{
			if (*p >= 'A' && *p <= 'Z')
				*np++ = *p + 'a' - 'A';
			else
				*np++ = *p;
		}
	}
	*np = '\0';
	return newkey;
}
543 
544 /*
545  * Search encoding by encoding name
546  *
547  * Returns encoding ID, or -1 if not recognized
548  */
549 int
551 {
552  unsigned int nel = lengthof(pg_encname_tbl);
553  const pg_encname *base = pg_encname_tbl,
554  *last = base + nel - 1,
555  *position;
556  int result;
557  char buff[NAMEDATALEN],
558  *key;
559 
560  if (name == NULL || *name == '\0')
561  return -1;
562 
563  if (strlen(name) >= NAMEDATALEN)
564  return -1; /* it's certainly not in the table */
565 
566  key = clean_encoding_name(name, buff);
567 
568  while (last >= base)
569  {
570  position = base + ((last - base) >> 1);
571  result = key[0] - position->name[0];
572 
573  if (result == 0)
574  {
575  result = strcmp(key, position->name);
576  if (result == 0)
577  return position->encoding;
578  }
579  if (result < 0)
580  last = position - 1;
581  else
582  base = position + 1;
583  }
584  return -1;
585 }
586 
587 const char *
589 {
590  if (PG_VALID_ENCODING(encoding))
591  {
592  const pg_enc2name *p = &pg_enc2name_tbl[encoding];
593 
594  Assert(encoding == p->encoding);
595  return p->name;
596  }
597  return "";
598 }
int pg_char_to_encoding(const char *name)
Definition: encnames.c:550
int pg_valid_server_encoding(const char *name)
Definition: encnames.c:500
bool is_encoding_supported_by_icu(int encoding)
Definition: encnames.c:459
int pg_valid_client_encoding(const char *name)
Definition: encnames.c:486
int pg_valid_server_encoding_id(int encoding)
Definition: encnames.c:514
#define lengthof(array)
Definition: c.h:675
const pg_enc2gettext pg_enc2gettext_tbl[]
Definition: encnames.c:361
const char * name
Definition: encnames.c:35
#define NAMEDATALEN
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:859
struct pg_encoding enc
Definition: encode.c:562
pg_enc encoding
Definition: encnames.c:36
const char * get_encoding_name_for_icu(int encoding)
Definition: encnames.c:470
#define PG_VALID_ENCODING(_enc)
Definition: pg_wchar.h:301
const pg_enc2name pg_enc2name_tbl[]
Definition: encnames.c:309
const char * name
Definition: pg_wchar.h:337
#define PG_VALID_FE_ENCODING(_enc)
Definition: pg_wchar.h:305
static const pg_encname pg_encname_tbl[]
Definition: encnames.c:39
#define PG_ENCODING_BE_LAST
Definition: pg_wchar.h:289
static const char *const pg_enc2icu_tbl[]
Definition: encnames.c:415
pg_enc encoding
Definition: pg_wchar.h:338
const char * pg_encoding_to_char(int encoding)
Definition: encnames.c:588
#define PG_VALID_BE_ENCODING(_enc)
Definition: pg_wchar.h:295
#define Assert(condition)
Definition: c.h:745
pg_enc
Definition: pg_wchar.h:238
#define DEF_ENC2NAME(name, codepage)
Definition: encnames.c:304
static char * clean_encoding_name(const char *key, char *newkey)
Definition: encnames.c:525
struct pg_encname pg_encname