PostgreSQL Source Code git master
Loading...
Searching...
No Matches
encnames.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * encnames.c
4 * Encoding names and routines for working with them.
5 *
6 * Portions Copyright (c) 2001-2026, PostgreSQL Global Development Group
7 *
8 * IDENTIFICATION
9 * src/common/encnames.c
10 *
11 *-------------------------------------------------------------------------
12 */
13#include "c.h"
14
15#include <ctype.h>
16#include <unistd.h>
17
18#include "mb/pg_wchar.h"
19
20
21/* ----------
22 * All encoding names, sorted: *** A L P H A B E T I C ***
23 *
24 * All names must be without irrelevant chars, search routines use
25 * isalnum() chars only. It means ISO-8859-1, iso_8859-1 and Iso8859_1
26 * are always converted to 'iso88591'. All must be lower case.
27 *
28 * The table doesn't contain 'cs' aliases (like csISOLatin1). Are they needed?
29 *
30 * Karel Zak, Aug 2001
31 * ----------
32 */
33typedef struct pg_encname
34{
35 const char *name;
38
39static const pg_encname pg_encname_tbl[] =
40{
41 {
42 "abc", PG_WIN1258
43 }, /* alias for WIN1258 */
44 {
45 "alt", PG_WIN866
46 }, /* IBM866 */
47 {
48 "big5", PG_BIG5
49 }, /* Big5; Chinese for Taiwan multibyte set */
50 {
51 "euccn", PG_EUC_CN
52 }, /* EUC-CN; Extended Unix Code for simplified
53 * Chinese */
54 {
55 "eucjis2004", PG_EUC_JIS_2004
56 }, /* EUC-JIS-2004; Extended UNIX Code fixed
57 * Width for Japanese, standard JIS X 0213 */
58 {
59 "eucjp", PG_EUC_JP
60 }, /* EUC-JP; Extended UNIX Code fixed Width for
61 * Japanese, standard OSF */
62 {
63 "euckr", PG_EUC_KR
64 }, /* EUC-KR; Extended Unix Code for Korean
65 * precomposed (Wansung) encoding, standard KS
66 * X 1001 */
67 {
68 "euctw", PG_EUC_TW
69 }, /* EUC-TW; Extended Unix Code for
70 *
71 * traditional Chinese */
72 {
73 "gb18030", PG_GB18030
74 }, /* GB18030;GB18030 */
75 {
76 "gbk", PG_GBK
77 }, /* GBK; Chinese Windows CodePage 936
78 * simplified Chinese */
79 {
80 "iso88591", PG_LATIN1
81 }, /* ISO-8859-1; RFC1345,KXS2 */
82 {
83 "iso885910", PG_LATIN6
84 }, /* ISO-8859-10; RFC1345,KXS2 */
85 {
86 "iso885913", PG_LATIN7
87 }, /* ISO-8859-13; RFC1345,KXS2 */
88 {
89 "iso885914", PG_LATIN8
90 }, /* ISO-8859-14; RFC1345,KXS2 */
91 {
92 "iso885915", PG_LATIN9
93 }, /* ISO-8859-15; RFC1345,KXS2 */
94 {
95 "iso885916", PG_LATIN10
96 }, /* ISO-8859-16; RFC1345,KXS2 */
97 {
98 "iso88592", PG_LATIN2
99 }, /* ISO-8859-2; RFC1345,KXS2 */
100 {
101 "iso88593", PG_LATIN3
102 }, /* ISO-8859-3; RFC1345,KXS2 */
103 {
104 "iso88594", PG_LATIN4
105 }, /* ISO-8859-4; RFC1345,KXS2 */
106 {
107 "iso88595", PG_ISO_8859_5
108 }, /* ISO-8859-5; RFC1345,KXS2 */
109 {
110 "iso88596", PG_ISO_8859_6
111 }, /* ISO-8859-6; RFC1345,KXS2 */
112 {
113 "iso88597", PG_ISO_8859_7
114 }, /* ISO-8859-7; RFC1345,KXS2 */
115 {
116 "iso88598", PG_ISO_8859_8
117 }, /* ISO-8859-8; RFC1345,KXS2 */
118 {
119 "iso88599", PG_LATIN5
120 }, /* ISO-8859-9; RFC1345,KXS2 */
121 {
122 "johab", PG_JOHAB
123 }, /* JOHAB; Korean combining (Johab) encoding,
124 * standard KS X 1001 annex 3 */
125 {
126 "koi8", PG_KOI8R
127 }, /* _dirty_ alias for KOI8-R (backward
128 * compatibility) */
129 {
130 "koi8r", PG_KOI8R
131 }, /* KOI8-R; RFC1489 */
132 {
133 "koi8u", PG_KOI8U
134 }, /* KOI8-U; RFC2319 */
135 {
136 "latin1", PG_LATIN1
137 }, /* alias for ISO-8859-1 */
138 {
139 "latin10", PG_LATIN10
140 }, /* alias for ISO-8859-16 */
141 {
142 "latin2", PG_LATIN2
143 }, /* alias for ISO-8859-2 */
144 {
145 "latin3", PG_LATIN3
146 }, /* alias for ISO-8859-3 */
147 {
148 "latin4", PG_LATIN4
149 }, /* alias for ISO-8859-4 */
150 {
151 "latin5", PG_LATIN5
152 }, /* alias for ISO-8859-9 */
153 {
154 "latin6", PG_LATIN6
155 }, /* alias for ISO-8859-10 */
156 {
157 "latin7", PG_LATIN7
158 }, /* alias for ISO-8859-13 */
159 {
160 "latin8", PG_LATIN8
161 }, /* alias for ISO-8859-14 */
162 {
163 "latin9", PG_LATIN9
164 }, /* alias for ISO-8859-15 */
165 {
166 "mskanji", PG_SJIS
167 }, /* alias for Shift_JIS */
168 {
169 "shiftjis", PG_SJIS
170 }, /* Shift_JIS; JIS X 0202-1991 */
171
172 {
173 "shiftjis2004", PG_SHIFT_JIS_2004
174 }, /* SHIFT-JIS-2004; Shift JIS for Japanese,
175 * standard JIS X 0213 */
176 {
177 "sjis", PG_SJIS
178 }, /* alias for Shift_JIS */
179 {
180 "sqlascii", PG_SQL_ASCII
181 },
182 {
183 "tcvn", PG_WIN1258
184 }, /* alias for WIN1258 */
185 {
186 "tcvn5712", PG_WIN1258
187 }, /* alias for WIN1258 */
188 {
189 "uhc", PG_UHC
190 }, /* UHC; Unified Hangul Code, Microsoft Windows
191 * CodePage 949; superset of EUC-KR covering
192 * all 11,172 precomposed Hangul syllables */
193 {
194 "unicode", PG_UTF8
195 }, /* alias for UTF8 */
196 {
197 "utf8", PG_UTF8
198 }, /* alias for UTF8 */
199 {
200 "vscii", PG_WIN1258
201 }, /* alias for WIN1258 */
202 {
203 "win", PG_WIN1251
204 }, /* _dirty_ alias for windows-1251 (backward
205 * compatibility) */
206 {
207 "win1250", PG_WIN1250
208 }, /* alias for Windows-1250 */
209 {
210 "win1251", PG_WIN1251
211 }, /* alias for Windows-1251 */
212 {
213 "win1252", PG_WIN1252
214 }, /* alias for Windows-1252 */
215 {
216 "win1253", PG_WIN1253
217 }, /* alias for Windows-1253 */
218 {
219 "win1254", PG_WIN1254
220 }, /* alias for Windows-1254 */
221 {
222 "win1255", PG_WIN1255
223 }, /* alias for Windows-1255 */
224 {
225 "win1256", PG_WIN1256
226 }, /* alias for Windows-1256 */
227 {
228 "win1257", PG_WIN1257
229 }, /* alias for Windows-1257 */
230 {
231 "win1258", PG_WIN1258
232 }, /* alias for Windows-1258 */
233 {
234 "win866", PG_WIN866
235 }, /* IBM866 */
236 {
237 "win874", PG_WIN874
238 }, /* alias for Windows-874 */
239 {
240 "win932", PG_SJIS
241 }, /* alias for Shift_JIS */
242 {
243 "win936", PG_GBK
244 }, /* alias for GBK */
245 {
246 "win949", PG_UHC
247 }, /* alias for UHC */
248 {
249 "win950", PG_BIG5
250 }, /* alias for BIG5 */
251 {
252 "windows1250", PG_WIN1250
253 }, /* Windows-1251; Microsoft */
254 {
255 "windows1251", PG_WIN1251
256 }, /* Windows-1251; Microsoft */
257 {
258 "windows1252", PG_WIN1252
259 }, /* Windows-1252; Microsoft */
260 {
261 "windows1253", PG_WIN1253
262 }, /* Windows-1253; Microsoft */
263 {
264 "windows1254", PG_WIN1254
265 }, /* Windows-1254; Microsoft */
266 {
267 "windows1255", PG_WIN1255
268 }, /* Windows-1255; Microsoft */
269 {
270 "windows1256", PG_WIN1256
271 }, /* Windows-1256; Microsoft */
272 {
273 "windows1257", PG_WIN1257
274 }, /* Windows-1257; Microsoft */
275 {
276 "windows1258", PG_WIN1258
277 }, /* Windows-1258; Microsoft */
278 {
279 "windows866", PG_WIN866
280 }, /* IBM866 */
281 {
282 "windows874", PG_WIN874
283 }, /* Windows-874; Microsoft */
284 {
285 "windows932", PG_SJIS
286 }, /* alias for Shift_JIS */
287 {
288 "windows936", PG_GBK
289 }, /* alias for GBK */
290 {
291 "windows949", PG_UHC
292 }, /* alias for UHC */
293 {
294 "windows950", PG_BIG5
295 } /* alias for BIG5 */
296};
297
298/* ----------
299 * These are "official" encoding names.
300 * ----------
301 */
/*
 * DEF_ENC2NAME(name, codepage) expands to one brace-enclosed initializer:
 * the stringified name, the matching PG_<name> constant and, on WIN32
 * builds only, the code page number.  On other platforms the codepage
 * argument is accepted but discarded.
 */
#ifdef WIN32
#define DEF_ENC2NAME(name, codepage) { #name, PG_##name, codepage }
#else
#define DEF_ENC2NAME(name, codepage) { #name, PG_##name }
#endif
307
309{
311 [PG_EUC_JP] = DEF_ENC2NAME(EUC_JP, 20932),
312 [PG_EUC_CN] = DEF_ENC2NAME(EUC_CN, 20936),
313 [PG_EUC_KR] = DEF_ENC2NAME(EUC_KR, 51949),
316 [PG_UTF8] = DEF_ENC2NAME(UTF8, 65001),
317 [PG_LATIN1] = DEF_ENC2NAME(LATIN1, 28591),
318 [PG_LATIN2] = DEF_ENC2NAME(LATIN2, 28592),
319 [PG_LATIN3] = DEF_ENC2NAME(LATIN3, 28593),
320 [PG_LATIN4] = DEF_ENC2NAME(LATIN4, 28594),
321 [PG_LATIN5] = DEF_ENC2NAME(LATIN5, 28599),
325 [PG_LATIN9] = DEF_ENC2NAME(LATIN9, 28605),
329 [PG_WIN866] = DEF_ENC2NAME(WIN866, 866),
330 [PG_WIN874] = DEF_ENC2NAME(WIN874, 874),
331 [PG_KOI8R] = DEF_ENC2NAME(KOI8R, 20866),
343 [PG_KOI8U] = DEF_ENC2NAME(KOI8U, 21866),
344 [PG_SJIS] = DEF_ENC2NAME(SJIS, 932),
345 [PG_BIG5] = DEF_ENC2NAME(BIG5, 950),
346 [PG_GBK] = DEF_ENC2NAME(GBK, 936),
347 [PG_UHC] = DEF_ENC2NAME(UHC, 949),
348 [PG_GB18030] = DEF_ENC2NAME(GB18030, 54936),
351};
352
353/* ----------
354 * These are encoding names for gettext.
355 * ----------
356 */
357const char *pg_enc2gettext_tbl[] =
358{
359 [PG_SQL_ASCII] = "US-ASCII",
360 [PG_UTF8] = "UTF-8",
361 [PG_LATIN1] = "LATIN1",
362 [PG_LATIN2] = "LATIN2",
363 [PG_LATIN3] = "LATIN3",
364 [PG_LATIN4] = "LATIN4",
365 [PG_ISO_8859_5] = "ISO-8859-5",
366 [PG_ISO_8859_6] = "ISO_8859-6",
367 [PG_ISO_8859_7] = "ISO-8859-7",
368 [PG_ISO_8859_8] = "ISO-8859-8",
369 [PG_LATIN5] = "LATIN5",
370 [PG_LATIN6] = "LATIN6",
371 [PG_LATIN7] = "LATIN7",
372 [PG_LATIN8] = "LATIN8",
373 [PG_LATIN9] = "LATIN-9",
374 [PG_LATIN10] = "LATIN10",
375 [PG_KOI8R] = "KOI8-R",
376 [PG_KOI8U] = "KOI8-U",
377 [PG_WIN1250] = "CP1250",
378 [PG_WIN1251] = "CP1251",
379 [PG_WIN1252] = "CP1252",
380 [PG_WIN1253] = "CP1253",
381 [PG_WIN1254] = "CP1254",
382 [PG_WIN1255] = "CP1255",
383 [PG_WIN1256] = "CP1256",
384 [PG_WIN1257] = "CP1257",
385 [PG_WIN1258] = "CP1258",
386 [PG_WIN866] = "CP866",
387 [PG_WIN874] = "CP874",
388 [PG_EUC_CN] = "EUC-CN",
389 [PG_EUC_JP] = "EUC-JP",
390 [PG_EUC_KR] = "EUC-KR",
391 [PG_EUC_TW] = "EUC-TW",
392 [PG_EUC_JIS_2004] = "EUC-JP",
393 [PG_SJIS] = "SHIFT-JIS",
394 [PG_BIG5] = "BIG5",
395 [PG_GBK] = "GBK",
396 [PG_UHC] = "UHC",
397 [PG_GB18030] = "GB18030",
398 [PG_JOHAB] = "JOHAB",
399 [PG_SHIFT_JIS_2004] = "SHIFT_JISX0213",
400};
401
402
403/*
404 * Table of encoding names for ICU (currently covers backend encodings only)
405 *
406 * Reference: <https://ssl.icu-project.org/icu-bin/convexp>
407 *
408 * NULL entries are not supported by ICU, or their mapping is unclear.
409 */
410static const char *const pg_enc2icu_tbl[] =
411{
412 [PG_SQL_ASCII] = NULL,
413 [PG_EUC_JP] = "EUC-JP",
414 [PG_EUC_CN] = "EUC-CN",
415 [PG_EUC_KR] = "EUC-KR",
416 [PG_EUC_TW] = "EUC-TW",
418 [PG_UTF8] = "UTF-8",
419 [PG_LATIN1] = "ISO-8859-1",
420 [PG_LATIN2] = "ISO-8859-2",
421 [PG_LATIN3] = "ISO-8859-3",
422 [PG_LATIN4] = "ISO-8859-4",
423 [PG_LATIN5] = "ISO-8859-9",
424 [PG_LATIN6] = "ISO-8859-10",
425 [PG_LATIN7] = "ISO-8859-13",
426 [PG_LATIN8] = "ISO-8859-14",
427 [PG_LATIN9] = "ISO-8859-15",
428 [PG_LATIN10] = NULL,
429 [PG_WIN1256] = "CP1256",
430 [PG_WIN1258] = "CP1258",
431 [PG_WIN866] = "CP866",
432 [PG_WIN874] = NULL,
433 [PG_KOI8R] = "KOI8-R",
434 [PG_WIN1251] = "CP1251",
435 [PG_WIN1252] = "CP1252",
436 [PG_ISO_8859_5] = "ISO-8859-5",
437 [PG_ISO_8859_6] = "ISO-8859-6",
438 [PG_ISO_8859_7] = "ISO-8859-7",
439 [PG_ISO_8859_8] = "ISO-8859-8",
440 [PG_WIN1250] = "CP1250",
441 [PG_WIN1253] = "CP1253",
442 [PG_WIN1254] = "CP1254",
443 [PG_WIN1255] = "CP1255",
444 [PG_WIN1257] = "CP1257",
445 [PG_KOI8U] = "KOI8-U",
446};
447
449 "pg_enc2icu_tbl incomplete");
450
451
452/*
453 * Is this encoding supported by ICU?
454 */
455bool
457{
459 return false;
460 return (pg_enc2icu_tbl[encoding] != NULL);
461}
462
463/*
464 * Returns ICU's name for encoding, or NULL if not supported
465 */
466const char *
473
474
475/* ----------
476 * Encoding checks, for error returns -1 else encoding id
477 * ----------
478 */
479int
481{
482 int enc;
483
484 if ((enc = pg_char_to_encoding(name)) < 0)
485 return -1;
486
488 return -1;
489
490 return enc;
491}
492
493int
495{
496 int enc;
497
498 if ((enc = pg_char_to_encoding(name)) < 0)
499 return -1;
500
502 return -1;
503
504 return enc;
505}
506
507int
512
/*
 * Reduce an encoding name to the form used for table lookups.
 *
 * Copies the alphanumeric characters of "key" into "newkey", skipping
 * everything else (such as '-', '_' or blanks), and NUL-terminates the
 * result.  ASCII letters A-Z are folded to lower case; all other
 * alphanumeric characters are copied unchanged.
 *
 * Returns newkey.  No bounds checking is done: it is the caller's
 * responsibility to provide a large enough buffer.
 */
static char *
clean_encoding_name(const char *key, char *newkey)
{
	const char *src = key;
	char	   *dst = newkey;

	while (*src != '\0')
	{
		char		ch = *src++;

		/* characters that are not alphanumeric never reach the output */
		if (!isalnum((unsigned char) ch))
			continue;

		/* only ASCII upper-case letters are folded */
		*dst++ = (ch >= 'A' && ch <= 'Z') ? (char) (ch + 'a' - 'A') : ch;
	}
	*dst = '\0';

	return newkey;
}
537
538/*
539 * Search encoding by encoding name
540 *
541 * Returns encoding ID, or -1 if not recognized
542 */
543int
545{
546 unsigned int nel = lengthof(pg_encname_tbl);
547 const pg_encname *base = pg_encname_tbl,
548 *last = base + nel - 1,
549 *position;
550 int result;
551 char buff[NAMEDATALEN],
552 *key;
553
554 if (name == NULL || *name == '\0')
555 return -1;
556
557 if (strlen(name) >= NAMEDATALEN)
558 return -1; /* it's certainly not in the table */
559
561
562 while (last >= base)
563 {
564 position = base + ((last - base) >> 1);
565 result = key[0] - position->name[0];
566
567 if (result == 0)
568 {
569 result = strcmp(key, position->name);
570 if (result == 0)
571 return position->encoding;
572 }
573 if (result < 0)
574 last = position - 1;
575 else
576 base = position + 1;
577 }
578 return -1;
579}
580
581const char *
583{
585 {
587
588 Assert(encoding == p->encoding);
589 return p->name;
590 }
591 return "";
592}
#define Assert(condition)
Definition c.h:943
#define lengthof(array)
Definition c.h:873
#define StaticAssertDecl(condition, errmessage)
Definition c.h:1008
uint32 result
const char * pg_enc2gettext_tbl[]
Definition encnames.c:357
const pg_enc2name pg_enc2name_tbl[]
Definition encnames.c:308
static char * clean_encoding_name(const char *key, char *newkey)
Definition encnames.c:519
const char * get_encoding_name_for_icu(int encoding)
Definition encnames.c:467
static const char *const pg_enc2icu_tbl[]
Definition encnames.c:410
#define DEF_ENC2NAME(name, codepage)
Definition encnames.c:303
static const pg_encname pg_encname_tbl[]
Definition encnames.c:39
int pg_valid_client_encoding(const char *name)
Definition encnames.c:480
bool is_encoding_supported_by_icu(int encoding)
Definition encnames.c:456
static char * encoding
Definition initdb.c:139
#define PG_UTF8
Definition mbprint.c:43
#define NAMEDATALEN
pg_enc
Definition pg_wchar.h:75
@ PG_WIN1254
Definition pg_wchar.h:107
@ PG_LATIN4
Definition pg_wchar.h:87
@ PG_LATIN9
Definition pg_wchar.h:92
@ PG_JOHAB
Definition pg_wchar.h:119
@ PG_GB18030
Definition pg_wchar.h:118
@ PG_SQL_ASCII
Definition pg_wchar.h:76
@ PG_KOI8R
Definition pg_wchar.h:98
@ PG_ISO_8859_6
Definition pg_wchar.h:102
@ PG_WIN1253
Definition pg_wchar.h:106
@ PG_KOI8U
Definition pg_wchar.h:110
@ PG_LATIN6
Definition pg_wchar.h:89
@ PG_LATIN5
Definition pg_wchar.h:88
@ PG_EUC_CN
Definition pg_wchar.h:78
@ PG_UHC
Definition pg_wchar.h:117
@ PG_LATIN2
Definition pg_wchar.h:85
@ PG_ISO_8859_5
Definition pg_wchar.h:101
@ PG_LATIN10
Definition pg_wchar.h:93
@ PG_WIN1250
Definition pg_wchar.h:105
@ PG_ISO_8859_7
Definition pg_wchar.h:103
@ PG_SJIS
Definition pg_wchar.h:114
@ PG_LATIN8
Definition pg_wchar.h:91
@ PG_EUC_JP
Definition pg_wchar.h:77
@ PG_GBK
Definition pg_wchar.h:116
@ PG_LATIN3
Definition pg_wchar.h:86
@ PG_WIN1256
Definition pg_wchar.h:94
@ PG_LATIN1
Definition pg_wchar.h:84
@ PG_EUC_TW
Definition pg_wchar.h:80
@ PG_WIN1258
Definition pg_wchar.h:95
@ PG_SHIFT_JIS_2004
Definition pg_wchar.h:120
@ PG_WIN1252
Definition pg_wchar.h:100
@ PG_LATIN7
Definition pg_wchar.h:90
@ PG_WIN1255
Definition pg_wchar.h:108
@ PG_WIN1257
Definition pg_wchar.h:109
@ PG_WIN1251
Definition pg_wchar.h:99
@ PG_EUC_KR
Definition pg_wchar.h:79
@ PG_WIN866
Definition pg_wchar.h:96
@ PG_ISO_8859_8
Definition pg_wchar.h:104
@ PG_WIN874
Definition pg_wchar.h:97
@ PG_EUC_JIS_2004
Definition pg_wchar.h:81
@ PG_BIG5
Definition pg_wchar.h:115
#define PG_VALID_ENCODING(_enc)
Definition pg_wchar.h:140
#define PG_VALID_FE_ENCODING(_enc)
Definition pg_wchar.h:144
#define PG_VALID_BE_ENCODING(_enc)
Definition pg_wchar.h:134
#define pg_encoding_to_char
Definition pg_wchar.h:483
#define pg_valid_server_encoding_id
Definition pg_wchar.h:485
#define pg_valid_server_encoding
Definition pg_wchar.h:484
#define PG_ENCODING_BE_LAST
Definition pg_wchar.h:125
#define pg_char_to_encoding
Definition pg_wchar.h:482
static int fb(int x)
pg_enc encoding
Definition pg_wchar.h:195
const char * name
Definition pg_wchar.h:194
const char * name
Definition encnames.c:35
pg_enc encoding
Definition encnames.c:36
const char * name