PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
conv.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * Utility functions for conversion procs.
4  *
5  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
6  * Portions Copyright (c) 1994, Regents of the University of California
7  *
8  * IDENTIFICATION
9  * src/backend/utils/mb/conv.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres.h"
14 #include "mb/pg_wchar.h"
15 
16 
17 /*
18  * local2local: a generic single byte charset encoding
19  * conversion between two ASCII-superset encodings.
20  *
21  * l points to the source string of length len
22  * p is the output area (must be large enough!)
23  * src_encoding is the PG identifier for the source encoding
24  * dest_encoding is the PG identifier for the target encoding
25  * tab holds conversion entries for the source charset
26  * starting from 128 (0x80). each entry in the table holds the corresponding
27  * code point for the target charset, or 0 if there is no equivalent code.
28  */
29 void
30 local2local(const unsigned char *l,
31  unsigned char *p,
32  int len,
33  int src_encoding,
34  int dest_encoding,
35  const unsigned char *tab)
36 {
37  unsigned char c1,
38  c2;
39 
40  while (len > 0)
41  {
42  c1 = *l;
43  if (c1 == 0)
44  report_invalid_encoding(src_encoding, (const char *) l, len);
45  if (!IS_HIGHBIT_SET(c1))
46  *p++ = c1;
47  else
48  {
49  c2 = tab[c1 - HIGHBIT];
50  if (c2)
51  *p++ = c2;
52  else
53  report_untranslatable_char(src_encoding, dest_encoding,
54  (const char *) l, len);
55  }
56  l++;
57  len--;
58  }
59  *p = '\0';
60 }
61 
/*
 * LATINn ---> MIC, for charsets whose local codes map directly to MIC
 *
 * l			source string, len bytes long
 * p			output area (the caller must make it large enough); it is
 *				NUL-terminated on return
 * len			number of source bytes to convert
 * lc			mule character set id for the local encoding; it is emitted
 *				as a prefix byte before every high-bit source byte
 * encoding		PG identifier of the local encoding (used for error reports)
 *
 * ASCII bytes are copied unchanged; each high-bit byte becomes the two-byte
 * sequence (lc, byte).  An embedded zero byte is passed to
 * report_invalid_encoding().
 */
void
latin2mic(const unsigned char *l, unsigned char *p, int len,
		  int lc, int encoding)
{
	for (; len > 0; l++, len--)
	{
		int			ch = *l;

		if (ch == 0)
			report_invalid_encoding(encoding, (const char *) l, len);

		/* non-ASCII bytes get the charset id in front of them */
		if (IS_HIGHBIT_SET(ch))
			*p++ = lc;
		*p++ = ch;
	}

	*p = '\0';
}
89 
90 /*
91  * MIC ---> LATINn when the charset's local codes map directly to MIC
92  *
93  * mic points to the source string of length len
94  * p is the output area (must be large enough!)
95  * lc is the mule character set id for the local encoding
96  * encoding is the PG identifier for the local encoding
97  */
98 void
99 mic2latin(const unsigned char *mic, unsigned char *p, int len,
100  int lc, int encoding)
101 {
102  int c1;
103 
104  while (len > 0)
105  {
106  c1 = *mic;
107  if (c1 == 0)
108  report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
109  if (!IS_HIGHBIT_SET(c1))
110  {
111  /* easy for ASCII */
112  *p++ = c1;
113  mic++;
114  len--;
115  }
116  else
117  {
118  int l = pg_mic_mblen(mic);
119 
120  if (len < l)
121  report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
122  len);
123  if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
125  (const char *) mic, len);
126  *p++ = mic[1];
127  mic += 2;
128  len -= 2;
129  }
130  }
131  *p = '\0';
132 }
133 
134 
135 /*
136  * ASCII ---> MIC
137  *
138  * While ordinarily SQL_ASCII encoding is forgiving of high-bit-set
139  * characters, here we must take a hard line because we don't know
140  * the appropriate MIC equivalent.
141  */
142 void
143 pg_ascii2mic(const unsigned char *l, unsigned char *p, int len)
144 {
145  int c1;
146 
147  while (len > 0)
148  {
149  c1 = *l;
150  if (c1 == 0 || IS_HIGHBIT_SET(c1))
151  report_invalid_encoding(PG_SQL_ASCII, (const char *) l, len);
152  *p++ = c1;
153  l++;
154  len--;
155  }
156  *p = '\0';
157 }
158 
159 /*
160  * MIC ---> ASCII
161  */
162 void
163 pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len)
164 {
165  int c1;
166 
167  while (len > 0)
168  {
169  c1 = *mic;
170  if (c1 == 0 || IS_HIGHBIT_SET(c1))
172  (const char *) mic, len);
173  *p++ = c1;
174  mic++;
175  len--;
176  }
177  *p = '\0';
178 }
179 
180 /*
181  * latin2mic_with_table: a generic single byte charset encoding
182  * conversion from a local charset to the mule internal code.
183  *
184  * l points to the source string of length len
185  * p is the output area (must be large enough!)
186  * lc is the mule character set id for the local encoding
187  * encoding is the PG identifier for the local encoding
188  * tab holds conversion entries for the local charset
189  * starting from 128 (0x80). each entry in the table holds the corresponding
190  * code point for the mule encoding, or 0 if there is no equivalent code.
191  */
192 void
193 latin2mic_with_table(const unsigned char *l,
194  unsigned char *p,
195  int len,
196  int lc,
197  int encoding,
198  const unsigned char *tab)
199 {
200  unsigned char c1,
201  c2;
202 
203  while (len > 0)
204  {
205  c1 = *l;
206  if (c1 == 0)
207  report_invalid_encoding(encoding, (const char *) l, len);
208  if (!IS_HIGHBIT_SET(c1))
209  *p++ = c1;
210  else
211  {
212  c2 = tab[c1 - HIGHBIT];
213  if (c2)
214  {
215  *p++ = lc;
216  *p++ = c2;
217  }
218  else
220  (const char *) l, len);
221  }
222  l++;
223  len--;
224  }
225  *p = '\0';
226 }
227 
228 /*
229  * mic2latin_with_table: a generic single byte charset encoding
230  * conversion from the mule internal code to a local charset.
231  *
232  * mic points to the source string of length len
233  * p is the output area (must be large enough!)
234  * lc is the mule character set id for the local encoding
235  * encoding is the PG identifier for the local encoding
236  * tab holds conversion entries for the mule internal code's second byte,
237  * starting from 128 (0x80). each entry in the table holds the corresponding
238  * code point for the local charset, or 0 if there is no equivalent code.
239  */
240 void
241 mic2latin_with_table(const unsigned char *mic,
242  unsigned char *p,
243  int len,
244  int lc,
245  int encoding,
246  const unsigned char *tab)
247 {
248  unsigned char c1,
249  c2;
250 
251  while (len > 0)
252  {
253  c1 = *mic;
254  if (c1 == 0)
255  report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
256  if (!IS_HIGHBIT_SET(c1))
257  {
258  /* easy for ASCII */
259  *p++ = c1;
260  mic++;
261  len--;
262  }
263  else
264  {
265  int l = pg_mic_mblen(mic);
266 
267  if (len < l)
268  report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
269  len);
270  if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
271  (c2 = tab[mic[1] - HIGHBIT]) == 0)
272  {
274  (const char *) mic, len);
275  break; /* keep compiler quiet */
276  }
277  *p++ = c2;
278  mic += 2;
279  len -= 2;
280  }
281  }
282  *p = '\0';
283 }
284 
285 /*
286  * comparison routine for bsearch()
287  * this routine is intended for UTF8 -> local code
288  */
289 static int
290 compare1(const void *p1, const void *p2)
291 {
292  uint32 v1,
293  v2;
294 
295  v1 = *(const uint32 *) p1;
296  v2 = ((const pg_utf_to_local *) p2)->utf;
297  return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
298 }
299 
300 /*
301  * comparison routine for bsearch()
302  * this routine is intended for local code -> UTF8
303  */
304 static int
305 compare2(const void *p1, const void *p2)
306 {
307  uint32 v1,
308  v2;
309 
310  v1 = *(const uint32 *) p1;
311  v2 = ((const pg_local_to_utf *) p2)->code;
312  return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
313 }
314 
315 /*
316  * comparison routine for bsearch()
317  * this routine is intended for combined UTF8 -> local code
318  */
319 static int
320 compare3(const void *p1, const void *p2)
321 {
322  uint32 s1,
323  s2,
324  d1,
325  d2;
326 
327  s1 = *(const uint32 *) p1;
328  s2 = *((const uint32 *) p1 + 1);
329  d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
330  d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
331  return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
332 }
333 
334 /*
335  * comparison routine for bsearch()
336  * this routine is intended for local code -> combined UTF8
337  */
338 static int
339 compare4(const void *p1, const void *p2)
340 {
341  uint32 v1,
342  v2;
343 
344  v1 = *(const uint32 *) p1;
345  v2 = ((const pg_local_to_utf_combined *) p2)->code;
346  return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
347 }
348 
349 /*
350  * store 32bit character representation into multibyte stream
351  */
352 static inline unsigned char *
353 store_coded_char(unsigned char *dest, uint32 code)
354 {
355  if (code & 0xff000000)
356  *dest++ = code >> 24;
357  if (code & 0x00ff0000)
358  *dest++ = code >> 16;
359  if (code & 0x0000ff00)
360  *dest++ = code >> 8;
361  if (code & 0x000000ff)
362  *dest++ = code;
363  return dest;
364 }
365 
366 /*
367  * UTF8 ---> local code
368  *
369  * utf: input string in UTF8 encoding (need not be null-terminated)
370  * len: length of input string (in bytes)
371  * iso: pointer to the output area (must be large enough!)
372  (output string will be null-terminated)
373  * map: conversion map for single characters
374  * mapsize: number of entries in the conversion map
375  * cmap: conversion map for combined characters
376  * (optional, pass NULL if none)
377  * cmapsize: number of entries in the conversion map for combined characters
378  * (optional, pass 0 if none)
379  * conv_func: algorithmic encoding conversion function
380  * (optional, pass NULL if none)
381  * encoding: PG identifier for the local encoding
382  *
383  * For each character, the cmap (if provided) is consulted first; if no match,
384  * the map is consulted next; if still no match, the conv_func (if provided)
385  * is applied. An error is raised if no match is found.
386  *
387  * See pg_wchar.h for more details about the data structures used here.
388  */
389 void
390 UtfToLocal(const unsigned char *utf, int len,
391  unsigned char *iso,
392  const pg_utf_to_local *map, int mapsize,
393  const pg_utf_to_local_combined *cmap, int cmapsize,
394  utf_local_conversion_func conv_func,
395  int encoding)
396 {
397  uint32 iutf;
398  int l;
399  const pg_utf_to_local *p;
400  const pg_utf_to_local_combined *cp;
401 
402  if (!PG_VALID_ENCODING(encoding))
403  ereport(ERROR,
404  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
405  errmsg("invalid encoding number: %d", encoding)));
406 
407  for (; len > 0; len -= l)
408  {
409  /* "break" cases all represent errors */
410  if (*utf == '\0')
411  break;
412 
413  l = pg_utf_mblen(utf);
414  if (len < l)
415  break;
416 
417  if (!pg_utf8_islegal(utf, l))
418  break;
419 
420  if (l == 1)
421  {
422  /* ASCII case is easy, assume it's one-to-one conversion */
423  *iso++ = *utf++;
424  continue;
425  }
426 
427  /* collect coded char of length l */
428  if (l == 2)
429  {
430  iutf = *utf++ << 8;
431  iutf |= *utf++;
432  }
433  else if (l == 3)
434  {
435  iutf = *utf++ << 16;
436  iutf |= *utf++ << 8;
437  iutf |= *utf++;
438  }
439  else if (l == 4)
440  {
441  iutf = *utf++ << 24;
442  iutf |= *utf++ << 16;
443  iutf |= *utf++ << 8;
444  iutf |= *utf++;
445  }
446  else
447  {
448  elog(ERROR, "unsupported character length %d", l);
449  iutf = 0; /* keep compiler quiet */
450  }
451 
452  /* First, try with combined map if possible */
453  if (cmap && len > l)
454  {
455  const unsigned char *utf_save = utf;
456  int len_save = len;
457  int l_save = l;
458 
459  /* collect next character, same as above */
460  len -= l;
461 
462  l = pg_utf_mblen(utf);
463  if (len < l)
464  break;
465 
466  if (!pg_utf8_islegal(utf, l))
467  break;
468 
469  /* We assume ASCII character cannot be in combined map */
470  if (l > 1)
471  {
472  uint32 iutf2;
473  uint32 cutf[2];
474 
475  if (l == 2)
476  {
477  iutf2 = *utf++ << 8;
478  iutf2 |= *utf++;
479  }
480  else if (l == 3)
481  {
482  iutf2 = *utf++ << 16;
483  iutf2 |= *utf++ << 8;
484  iutf2 |= *utf++;
485  }
486  else if (l == 4)
487  {
488  iutf2 = *utf++ << 24;
489  iutf2 |= *utf++ << 16;
490  iutf2 |= *utf++ << 8;
491  iutf2 |= *utf++;
492  }
493  else
494  {
495  elog(ERROR, "unsupported character length %d", l);
496  iutf2 = 0; /* keep compiler quiet */
497  }
498 
499  cutf[0] = iutf;
500  cutf[1] = iutf2;
501 
502  cp = bsearch(cutf, cmap, cmapsize,
504 
505  if (cp)
506  {
507  iso = store_coded_char(iso, cp->code);
508  continue;
509  }
510  }
511 
512  /* fail, so back up to reprocess second character next time */
513  utf = utf_save;
514  len = len_save;
515  l = l_save;
516  }
517 
518  /* Now check ordinary map */
519  p = bsearch(&iutf, map, mapsize,
520  sizeof(pg_utf_to_local), compare1);
521 
522  if (p)
523  {
524  iso = store_coded_char(iso, p->code);
525  continue;
526  }
527 
528  /* if there's a conversion function, try that */
529  if (conv_func)
530  {
531  uint32 converted = (*conv_func) (iutf);
532 
533  if (converted)
534  {
535  iso = store_coded_char(iso, converted);
536  continue;
537  }
538  }
539 
540  /* failed to translate this character */
542  (const char *) (utf - l), len);
543  }
544 
545  /* if we broke out of loop early, must be invalid input */
546  if (len > 0)
547  report_invalid_encoding(PG_UTF8, (const char *) utf, len);
548 
549  *iso = '\0';
550 }
551 
552 /*
553  * local code ---> UTF8
554  *
555  * iso: input string in local encoding (need not be null-terminated)
556  * len: length of input string (in bytes)
557  * utf: pointer to the output area (must be large enough!)
558  (output string will be null-terminated)
559  * map: conversion map for single characters
560  * mapsize: number of entries in the conversion map
561  * cmap: conversion map for combined characters
562  * (optional, pass NULL if none)
563  * cmapsize: number of entries in the conversion map for combined characters
564  * (optional, pass 0 if none)
565  * conv_func: algorithmic encoding conversion function
566  * (optional, pass NULL if none)
567  * encoding: PG identifier for the local encoding
568  *
569  * For each character, the map is consulted first; if no match, the cmap
570  * (if provided) is consulted next; if still no match, the conv_func
571  * (if provided) is applied. An error is raised if no match is found.
572  *
573  * See pg_wchar.h for more details about the data structures used here.
574  */
575 void
576 LocalToUtf(const unsigned char *iso, int len,
577  unsigned char *utf,
578  const pg_local_to_utf *map, int mapsize,
579  const pg_local_to_utf_combined *cmap, int cmapsize,
580  utf_local_conversion_func conv_func,
581  int encoding)
582 {
583  uint32 iiso;
584  int l;
585  const pg_local_to_utf *p;
586  const pg_local_to_utf_combined *cp;
587 
588  if (!PG_VALID_ENCODING(encoding))
589  ereport(ERROR,
590  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
591  errmsg("invalid encoding number: %d", encoding)));
592 
593  for (; len > 0; len -= l)
594  {
595  /* "break" cases all represent errors */
596  if (*iso == '\0')
597  break;
598 
599  if (!IS_HIGHBIT_SET(*iso))
600  {
601  /* ASCII case is easy, assume it's one-to-one conversion */
602  *utf++ = *iso++;
603  l = 1;
604  continue;
605  }
606 
607  l = pg_encoding_verifymb(encoding, (const char *) iso, len);
608  if (l < 0)
609  break;
610 
611  /* collect coded char of length l */
612  if (l == 1)
613  iiso = *iso++;
614  else if (l == 2)
615  {
616  iiso = *iso++ << 8;
617  iiso |= *iso++;
618  }
619  else if (l == 3)
620  {
621  iiso = *iso++ << 16;
622  iiso |= *iso++ << 8;
623  iiso |= *iso++;
624  }
625  else if (l == 4)
626  {
627  iiso = *iso++ << 24;
628  iiso |= *iso++ << 16;
629  iiso |= *iso++ << 8;
630  iiso |= *iso++;
631  }
632  else
633  {
634  elog(ERROR, "unsupported character length %d", l);
635  iiso = 0; /* keep compiler quiet */
636  }
637 
638  /* First check ordinary map */
639  p = bsearch(&iiso, map, mapsize,
640  sizeof(pg_local_to_utf), compare2);
641 
642  if (p)
643  {
644  utf = store_coded_char(utf, p->utf);
645  continue;
646  }
647 
648  /* If there's a combined character map, try that */
649  if (cmap)
650  {
651  cp = bsearch(&iiso, cmap, cmapsize,
653 
654  if (cp)
655  {
656  utf = store_coded_char(utf, cp->utf1);
657  utf = store_coded_char(utf, cp->utf2);
658  continue;
659  }
660  }
661 
662  /* if there's a conversion function, try that */
663  if (conv_func)
664  {
665  uint32 converted = (*conv_func) (iiso);
666 
667  if (converted)
668  {
669  utf = store_coded_char(utf, converted);
670  continue;
671  }
672  }
673 
674  /* failed to translate this character */
676  (const char *) (iso - l), len);
677  }
678 
679  /* if we broke out of loop early, must be invalid input */
680  if (len > 0)
681  report_invalid_encoding(encoding, (const char *) iso, len);
682 
683  *utf = '\0';
684 }
bool pg_utf8_islegal(const unsigned char *source, int length)
Definition: wchar.c:1458
static int compare4(const void *p1, const void *p2)
Definition: conv.c:339
int errcode(int sqlerrcode)
Definition: elog.c:575
void pg_ascii2mic(const unsigned char *l, unsigned char *p, int len)
Definition: conv.c:143
static unsigned char * store_coded_char(unsigned char *dest, uint32 code)
Definition: conv.c:353
int pg_mic_mblen(const unsigned char *mbstr)
Definition: wchar.c:1776
int pg_encoding_verifymb(int encoding, const char *mbstr, int len)
Definition: wchar.c:1809
void UtfToLocal(const unsigned char *utf, int len, unsigned char *iso, const pg_utf_to_local *map, int mapsize, const pg_utf_to_local_combined *cmap, int cmapsize, utf_local_conversion_func conv_func, int encoding)
Definition: conv.c:390
void latin2mic_with_table(const unsigned char *l, unsigned char *p, int len, int lc, int encoding, const unsigned char *tab)
Definition: conv.c:193
uint32(* utf_local_conversion_func)(uint32 code)
Definition: pg_wchar.h:420
#define IS_HIGHBIT_SET(ch)
Definition: c.h:969
#define ERROR
Definition: elog.h:43
char * s1
void latin2mic(const unsigned char *l, unsigned char *p, int len, int lc, int encoding)
Definition: conv.c:71
void report_untranslatable_char(int src_encoding, int dest_encoding, const char *mbstr, int len)
Definition: wchar.c:2027
void LocalToUtf(const unsigned char *iso, int len, unsigned char *utf, const pg_local_to_utf *map, int mapsize, const pg_local_to_utf_combined *cmap, int cmapsize, utf_local_conversion_func conv_func, int encoding)
Definition: conv.c:576
#define PG_VALID_ENCODING(_enc)
Definition: pg_wchar.h:299
unsigned int uint32
Definition: c.h:265
#define ereport(elevel, rest)
Definition: elog.h:122
#define HIGHBIT
Definition: c.h:968
void mic2latin(const unsigned char *mic, unsigned char *p, int len, int lc, int encoding)
Definition: conv.c:99
char * s2
void mic2latin_with_table(const unsigned char *mic, unsigned char *p, int len, int lc, int encoding, const unsigned char *tab)
Definition: conv.c:241
static int compare3(const void *p1, const void *p2)
Definition: conv.c:320
static char * encoding
Definition: initdb.c:121
static int compare1(const void *p1, const void *p2)
Definition: conv.c:290
void local2local(const unsigned char *l, unsigned char *p, int len, int src_encoding, int dest_encoding, const unsigned char *tab)
Definition: conv.c:30
void report_invalid_encoding(int encoding, const char *mbstr, int len)
Definition: wchar.c:1995
int pg_utf_mblen(const unsigned char *s)
Definition: wchar.c:541
void pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len)
Definition: conv.c:163
int errmsg(const char *fmt,...)
Definition: elog.c:797
#define elog
Definition: elog.h:219
static int compare2(const void *p1, const void *p2)
Definition: conv.c:305