PostgreSQL Source Code  git master
conv.c File Reference
#include "postgres.h"
#include "mb/pg_wchar.h"
Include dependency graph for conv.c:

Go to the source code of this file.

Functions

int local2local (const unsigned char *l, unsigned char *p, int len, int src_encoding, int dest_encoding, const unsigned char *tab, bool noError)
 
int latin2mic (const unsigned char *l, unsigned char *p, int len, int lc, int encoding, bool noError)
 
int mic2latin (const unsigned char *mic, unsigned char *p, int len, int lc, int encoding, bool noError)
 
int latin2mic_with_table (const unsigned char *l, unsigned char *p, int len, int lc, int encoding, const unsigned char *tab, bool noError)
 
int mic2latin_with_table (const unsigned char *mic, unsigned char *p, int len, int lc, int encoding, const unsigned char *tab, bool noError)
 
static int compare3 (const void *p1, const void *p2)
 
static int compare4 (const void *p1, const void *p2)
 
static unsigned char * store_coded_char (unsigned char *dest, uint32 code)
 
static uint32 pg_mb_radix_conv (const pg_mb_radix_tree *rt, int l, unsigned char b1, unsigned char b2, unsigned char b3, unsigned char b4)
 
int UtfToLocal (const unsigned char *utf, int len, unsigned char *iso, const pg_mb_radix_tree *map, const pg_utf_to_local_combined *cmap, int cmapsize, utf_local_conversion_func conv_func, int encoding, bool noError)
 
int LocalToUtf (const unsigned char *iso, int len, unsigned char *utf, const pg_mb_radix_tree *map, const pg_local_to_utf_combined *cmap, int cmapsize, utf_local_conversion_func conv_func, int encoding, bool noError)
 

Function Documentation

◆ compare3()

static int compare3 ( const void *  p1,
const void *  p2 
)
static

Definition at line 320 of file conv.c.

References s1, and s2.

Referenced by UtfToLocal().

321 {
322  uint32 s1,
323  s2,
324  d1,
325  d2;
326 
327  s1 = *(const uint32 *) p1;
328  s2 = *((const uint32 *) p1 + 1);
329  d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
330  d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
331  return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
332 }
char * s1
unsigned int uint32
Definition: c.h:441
char * s2

◆ compare4()

static int compare4 ( const void *  p1,
const void *  p2 
)
static

Definition at line 339 of file conv.c.

Referenced by LocalToUtf().

340 {
341  uint32 v1,
342  v2;
343 
344  v1 = *(const uint32 *) p1;
345  v2 = ((const pg_local_to_utf_combined *) p2)->code;
346  return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
347 }
unsigned int uint32
Definition: c.h:441

◆ latin2mic()

int latin2mic ( const unsigned char *  l,
unsigned char *  p,
int  len,
int  lc,
int  encoding,
bool  noError 
)

Definition at line 89 of file conv.c.

References IS_HIGHBIT_SET, and report_invalid_encoding().

Referenced by koi8r_to_mic(), latin1_to_mic(), latin2_to_mic(), latin3_to_mic(), latin4_to_mic(), and surrogate_pair_to_codepoint().

91 {
92  const unsigned char *start = l;
93  int c1;
94 
95  while (len > 0)
96  {
97  c1 = *l;
98  if (c1 == 0)
99  {
100  if (noError)
101  break;
102  report_invalid_encoding(encoding, (const char *) l, len);
103  }
104  if (IS_HIGHBIT_SET(c1))
105  *p++ = lc;
106  *p++ = c1;
107  l++;
108  len--;
109  }
110  *p = '\0';
111 
112  return l - start;
113 }
void report_invalid_encoding(int encoding, const char *mbstr, int len)
Definition: mbutils.c:1647
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1156
int32 encoding
Definition: pg_database.h:41

◆ latin2mic_with_table()

int latin2mic_with_table ( const unsigned char *  l,
unsigned char *  p,
int  len,
int  lc,
int  encoding,
const unsigned char *  tab,
bool  noError 
)

Definition at line 194 of file conv.c.

References HIGHBIT, IS_HIGHBIT_SET, PG_MULE_INTERNAL, report_invalid_encoding(), and report_untranslatable_char().

Referenced by iso_to_mic(), surrogate_pair_to_codepoint(), win1250_to_mic(), win1251_to_mic(), and win866_to_mic().

201 {
202  const unsigned char *start = l;
203  unsigned char c1,
204  c2;
205 
206  while (len > 0)
207  {
208  c1 = *l;
209  if (c1 == 0)
210  {
211  if (noError)
212  break;
213  report_invalid_encoding(encoding, (const char *) l, len);
214  }
215  if (!IS_HIGHBIT_SET(c1))
216  *p++ = c1;
217  else
218  {
219  c2 = tab[c1 - HIGHBIT];
220  if (c2)
221  {
222  *p++ = lc;
223  *p++ = c2;
224  }
225  else
226  {
227  if (noError)
228  break;
229  report_untranslatable_char(encoding, PG_MULE_INTERNAL,
230  (const char *) l, len);
231  }
232  }
233  l++;
234  len--;
235  }
236  *p = '\0';
237 
238  return l - start;
239 }
void report_invalid_encoding(int encoding, const char *mbstr, int len)
Definition: mbutils.c:1647
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1156
#define HIGHBIT
Definition: c.h:1155
int32 encoding
Definition: pg_database.h:41
void report_untranslatable_char(int src_encoding, int dest_encoding, const char *mbstr, int len)
Definition: mbutils.c:1679

◆ local2local()

int local2local ( const unsigned char *  l,
unsigned char *  p,
int  len,
int  src_encoding,
int  dest_encoding,
const unsigned char *  tab,
bool  noError 
)

Definition at line 33 of file conv.c.

References HIGHBIT, IS_HIGHBIT_SET, report_invalid_encoding(), and report_untranslatable_char().

Referenced by iso_to_koi8r(), iso_to_win1251(), iso_to_win866(), koi8r_to_iso(), koi8r_to_win1251(), koi8r_to_win866(), latin2_to_win1250(), surrogate_pair_to_codepoint(), win1250_to_latin2(), win1251_to_iso(), win1251_to_koi8r(), win1251_to_win866(), win866_to_iso(), win866_to_koi8r(), and win866_to_win1251().

40 {
41  const unsigned char *start = l;
42  unsigned char c1,
43  c2;
44 
45  while (len > 0)
46  {
47  c1 = *l;
48  if (c1 == 0)
49  {
50  if (noError)
51  break;
52  report_invalid_encoding(src_encoding, (const char *) l, len);
53  }
54  if (!IS_HIGHBIT_SET(c1))
55  *p++ = c1;
56  else
57  {
58  c2 = tab[c1 - HIGHBIT];
59  if (c2)
60  *p++ = c2;
61  else
62  {
63  if (noError)
64  break;
65  report_untranslatable_char(src_encoding, dest_encoding,
66  (const char *) l, len);
67  }
68  }
69  l++;
70  len--;
71  }
72  *p = '\0';
73 
74  return l - start;
75 }
void report_invalid_encoding(int encoding, const char *mbstr, int len)
Definition: mbutils.c:1647
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1156
#define HIGHBIT
Definition: c.h:1155
void report_untranslatable_char(int src_encoding, int dest_encoding, const char *mbstr, int len)
Definition: mbutils.c:1679

◆ LocalToUtf()

int LocalToUtf ( const unsigned char *  iso,
int  len,
unsigned char *  utf,
const pg_mb_radix_tree *  map,
const pg_local_to_utf_combined *  cmap,
int  cmapsize,
utf_local_conversion_func  conv_func,
int  encoding,
bool  noError 
)

Definition at line 717 of file conv.c.

References compare4(), elog, ereport, errcode(), errmsg(), ERROR, IS_HIGHBIT_SET, pg_encoding_verifymbchar(), pg_mb_radix_conv(), PG_UTF8, PG_VALID_ENCODING, report_invalid_encoding(), report_untranslatable_char(), store_coded_char(), pg_local_to_utf_combined::utf1, and pg_local_to_utf_combined::utf2.

Referenced by big5_to_utf8(), euc_cn_to_utf8(), euc_jis_2004_to_utf8(), euc_jp_to_utf8(), euc_kr_to_utf8(), euc_tw_to_utf8(), gb18030_to_utf8(), gbk_to_utf8(), iso8859_to_utf8(), johab_to_utf8(), koi8r_to_utf8(), koi8u_to_utf8(), shift_jis_2004_to_utf8(), sjis_to_utf8(), surrogate_pair_to_codepoint(), uhc_to_utf8(), and win_to_utf8().

724 {
725  uint32 iiso;
726  int l;
727  const pg_local_to_utf_combined *cp;
728  const unsigned char *start = iso;
729 
730  if (!PG_VALID_ENCODING(encoding))
731  ereport(ERROR,
732  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
733  errmsg("invalid encoding number: %d", encoding)));
734 
735  for (; len > 0; len -= l)
736  {
737  unsigned char b1 = 0;
738  unsigned char b2 = 0;
739  unsigned char b3 = 0;
740  unsigned char b4 = 0;
741 
742  /* "break" cases all represent errors */
743  if (*iso == '\0')
744  break;
745 
746  if (!IS_HIGHBIT_SET(*iso))
747  {
748  /* ASCII case is easy, assume it's one-to-one conversion */
749  *utf++ = *iso++;
750  l = 1;
751  continue;
752  }
753 
754  l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
755  if (l < 0)
756  break;
757 
758  /* collect coded char of length l */
759  if (l == 1)
760  b4 = *iso++;
761  else if (l == 2)
762  {
763  b3 = *iso++;
764  b4 = *iso++;
765  }
766  else if (l == 3)
767  {
768  b2 = *iso++;
769  b3 = *iso++;
770  b4 = *iso++;
771  }
772  else if (l == 4)
773  {
774  b1 = *iso++;
775  b2 = *iso++;
776  b3 = *iso++;
777  b4 = *iso++;
778  }
779  else
780  {
781  elog(ERROR, "unsupported character length %d", l);
782  iiso = 0; /* keep compiler quiet */
783  }
784  iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
785 
786  if (map)
787  {
788  uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
789 
790  if (converted)
791  {
792  utf = store_coded_char(utf, converted);
793  continue;
794  }
795 
796  /* If there's a combined character map, try that */
797  if (cmap)
798  {
799  cp = bsearch(&iiso, cmap, cmapsize,
800  sizeof(pg_local_to_utf_combined), compare4);
801 
802  if (cp)
803  {
804  utf = store_coded_char(utf, cp->utf1);
805  utf = store_coded_char(utf, cp->utf2);
806  continue;
807  }
808  }
809  }
810 
811  /* if there's a conversion function, try that */
812  if (conv_func)
813  {
814  uint32 converted = (*conv_func) (iiso);
815 
816  if (converted)
817  {
818  utf = store_coded_char(utf, converted);
819  continue;
820  }
821  }
822 
823  /* failed to translate this character */
824  iso -= l;
825  if (noError)
826  break;
827  report_untranslatable_char(encoding, PG_UTF8,
828  (const char *) iso, len);
829  }
830 
831  /* if we broke out of loop early, must be invalid input */
832  if (len > 0 && !noError)
833  report_invalid_encoding(encoding, (const char *) iso, len);
834 
835  *utf = '\0';
836 
837  return iso - start;
838 }
static int compare4(const void *p1, const void *p2)
Definition: conv.c:339
void report_invalid_encoding(int encoding, const char *mbstr, int len)
Definition: mbutils.c:1647
int errcode(int sqlerrcode)
Definition: elog.c:698
static unsigned char * store_coded_char(unsigned char *dest, uint32 code)
Definition: conv.c:353
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1156
#define ERROR
Definition: elog.h:46
#define PG_VALID_ENCODING(_enc)
Definition: pg_wchar.h:301
static uint32 pg_mb_radix_conv(const pg_mb_radix_tree *rt, int l, unsigned char b1, unsigned char b2, unsigned char b3, unsigned char b4)
Definition: conv.c:373
unsigned int uint32
Definition: c.h:441
#define ereport(elevel,...)
Definition: elog.h:157
int32 encoding
Definition: pg_database.h:41
void report_untranslatable_char(int src_encoding, int dest_encoding, const char *mbstr, int len)
Definition: mbutils.c:1679
int pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
Definition: wchar.c:1955
int errmsg(const char *fmt,...)
Definition: elog.c:909
#define elog(elevel,...)
Definition: elog.h:232

◆ mic2latin()

int mic2latin ( const unsigned char *  mic,
unsigned char *  p,
int  len,
int  lc,
int  encoding,
bool  noError 
)

Definition at line 127 of file conv.c.

References IS_HIGHBIT_SET, PG_MULE_INTERNAL, pg_mule_mblen(), report_invalid_encoding(), and report_untranslatable_char().

Referenced by mic_to_koi8r(), mic_to_latin1(), mic_to_latin2(), mic_to_latin3(), mic_to_latin4(), and surrogate_pair_to_codepoint().

129 {
130  const unsigned char *start = mic;
131  int c1;
132 
133  while (len > 0)
134  {
135  c1 = *mic;
136  if (c1 == 0)
137  {
138  if (noError)
139  break;
140  report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
141  }
142  if (!IS_HIGHBIT_SET(c1))
143  {
144  /* easy for ASCII */
145  *p++ = c1;
146  mic++;
147  len--;
148  }
149  else
150  {
151  int l = pg_mule_mblen(mic);
152 
153  if (len < l)
154  {
155  if (noError)
156  break;
157  report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
158  len);
159  }
160  if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
161  {
162  if (noError)
163  break;
164  report_untranslatable_char(PG_MULE_INTERNAL, encoding,
165  (const char *) mic, len);
166  }
167  *p++ = mic[1];
168  mic += 2;
169  len -= 2;
170  }
171  }
172  *p = '\0';
173 
174  return mic - start;
175 }
int pg_mule_mblen(const unsigned char *s)
Definition: wchar.c:839
void report_invalid_encoding(int encoding, const char *mbstr, int len)
Definition: mbutils.c:1647
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1156
int32 encoding
Definition: pg_database.h:41
void report_untranslatable_char(int src_encoding, int dest_encoding, const char *mbstr, int len)
Definition: mbutils.c:1679

◆ mic2latin_with_table()

int mic2latin_with_table ( const unsigned char *  mic,
unsigned char *  p,
int  len,
int  lc,
int  encoding,
const unsigned char *  tab,
bool  noError 
)

Definition at line 257 of file conv.c.

References HIGHBIT, IS_HIGHBIT_SET, PG_MULE_INTERNAL, pg_mule_mblen(), report_invalid_encoding(), and report_untranslatable_char().

Referenced by mic_to_iso(), mic_to_win1250(), mic_to_win1251(), mic_to_win866(), and surrogate_pair_to_codepoint().

264 {
265  const unsigned char *start = mic;
266  unsigned char c1,
267  c2;
268 
269  while (len > 0)
270  {
271  c1 = *mic;
272  if (c1 == 0)
273  {
274  if (noError)
275  break;
276  report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
277  }
278  if (!IS_HIGHBIT_SET(c1))
279  {
280  /* easy for ASCII */
281  *p++ = c1;
282  mic++;
283  len--;
284  }
285  else
286  {
287  int l = pg_mule_mblen(mic);
288 
289  if (len < l)
290  {
291  if (noError)
292  break;
293  report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
294  len);
295  }
296  if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
297  (c2 = tab[mic[1] - HIGHBIT]) == 0)
298  {
299  if (noError)
300  break;
301  report_untranslatable_char(PG_MULE_INTERNAL, encoding,
302  (const char *) mic, len);
303  break; /* keep compiler quiet */
304  }
305  *p++ = c2;
306  mic += 2;
307  len -= 2;
308  }
309  }
310  *p = '\0';
311 
312  return mic - start;
313 }
int pg_mule_mblen(const unsigned char *s)
Definition: wchar.c:839
void report_invalid_encoding(int encoding, const char *mbstr, int len)
Definition: mbutils.c:1647
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1156
#define HIGHBIT
Definition: c.h:1155
int32 encoding
Definition: pg_database.h:41
void report_untranslatable_char(int src_encoding, int dest_encoding, const char *mbstr, int len)
Definition: mbutils.c:1679

◆ pg_mb_radix_conv()

static uint32 pg_mb_radix_conv ( const pg_mb_radix_tree *  rt,
int  l,
unsigned char  b1,
unsigned char  b2,
unsigned char  b3,
unsigned char  b4 
)
inline static

Definition at line 373 of file conv.c.

References pg_mb_radix_tree::b1_lower, pg_mb_radix_tree::b1_upper, pg_mb_radix_tree::b1root, pg_mb_radix_tree::b2_1_lower, pg_mb_radix_tree::b2_1_upper, pg_mb_radix_tree::b2_2_lower, pg_mb_radix_tree::b2_2_upper, pg_mb_radix_tree::b2root, pg_mb_radix_tree::b3_1_lower, pg_mb_radix_tree::b3_1_upper, pg_mb_radix_tree::b3_2_lower, pg_mb_radix_tree::b3_2_upper, pg_mb_radix_tree::b3_3_lower, pg_mb_radix_tree::b3_3_upper, pg_mb_radix_tree::b3root, pg_mb_radix_tree::b4_1_lower, pg_mb_radix_tree::b4_1_upper, pg_mb_radix_tree::b4_2_lower, pg_mb_radix_tree::b4_2_upper, pg_mb_radix_tree::b4_3_lower, pg_mb_radix_tree::b4_3_upper, pg_mb_radix_tree::b4_4_lower, pg_mb_radix_tree::b4_4_upper, pg_mb_radix_tree::b4root, pg_mb_radix_tree::chars16, pg_mb_radix_tree::chars32, and idx().

Referenced by LocalToUtf(), and UtfToLocal().

379 {
380  if (l == 4)
381  {
382  /* 4-byte code */
383 
384  /* check code validity */
385  if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
386  b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
387  b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
388  b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
389  return 0;
390 
391  /* perform lookup */
392  if (rt->chars32)
393  {
394  uint32 idx = rt->b4root;
395 
396  idx = rt->chars32[b1 + idx - rt->b4_1_lower];
397  idx = rt->chars32[b2 + idx - rt->b4_2_lower];
398  idx = rt->chars32[b3 + idx - rt->b4_3_lower];
399  return rt->chars32[b4 + idx - rt->b4_4_lower];
400  }
401  else
402  {
403  uint16 idx = rt->b4root;
404 
405  idx = rt->chars16[b1 + idx - rt->b4_1_lower];
406  idx = rt->chars16[b2 + idx - rt->b4_2_lower];
407  idx = rt->chars16[b3 + idx - rt->b4_3_lower];
408  return rt->chars16[b4 + idx - rt->b4_4_lower];
409  }
410  }
411  else if (l == 3)
412  {
413  /* 3-byte code */
414 
415  /* check code validity */
416  if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
417  b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
418  b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
419  return 0;
420 
421  /* perform lookup */
422  if (rt->chars32)
423  {
424  uint32 idx = rt->b3root;
425 
426  idx = rt->chars32[b2 + idx - rt->b3_1_lower];
427  idx = rt->chars32[b3 + idx - rt->b3_2_lower];
428  return rt->chars32[b4 + idx - rt->b3_3_lower];
429  }
430  else
431  {
432  uint16 idx = rt->b3root;
433 
434  idx = rt->chars16[b2 + idx - rt->b3_1_lower];
435  idx = rt->chars16[b3 + idx - rt->b3_2_lower];
436  return rt->chars16[b4 + idx - rt->b3_3_lower];
437  }
438  }
439  else if (l == 2)
440  {
441  /* 2-byte code */
442 
443  /* check code validity - first byte */
444  if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
445  b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
446  return 0;
447 
448  /* perform lookup */
449  if (rt->chars32)
450  {
451  uint32 idx = rt->b2root;
452 
453  idx = rt->chars32[b3 + idx - rt->b2_1_lower];
454  return rt->chars32[b4 + idx - rt->b2_2_lower];
455  }
456  else
457  {
458  uint16 idx = rt->b2root;
459 
460  idx = rt->chars16[b3 + idx - rt->b2_1_lower];
461  return rt->chars16[b4 + idx - rt->b2_2_lower];
462  }
463  }
464  else if (l == 1)
465  {
466  /* 1-byte code */
467 
468  /* check code validity - first byte */
469  if (b4 < rt->b1_lower || b4 > rt->b1_upper)
470  return 0;
471 
472  /* perform lookup */
473  if (rt->chars32)
474  return rt->chars32[b4 + rt->b1root - rt->b1_lower];
475  else
476  return rt->chars16[b4 + rt->b1root - rt->b1_lower];
477  }
478  return 0; /* shouldn't happen */
479 }
uint8 b2_2_upper
Definition: pg_wchar.h:470
uint8 b4_2_lower
Definition: pg_wchar.h:485
const uint16 * chars16
Definition: pg_wchar.h:457
uint8 b2_1_lower
Definition: pg_wchar.h:467
uint8 b4_4_upper
Definition: pg_wchar.h:490
Datum idx(PG_FUNCTION_ARGS)
Definition: _int_op.c:259
const uint32 * chars32
Definition: pg_wchar.h:458
uint8 b4_3_upper
Definition: pg_wchar.h:488
uint8 b4_4_lower
Definition: pg_wchar.h:489
uint8 b3_1_lower
Definition: pg_wchar.h:474
unsigned short uint16
Definition: c.h:440
uint8 b3_1_upper
Definition: pg_wchar.h:475
uint8 b4_2_upper
Definition: pg_wchar.h:486
unsigned int uint32
Definition: c.h:441
uint8 b3_2_upper
Definition: pg_wchar.h:477
uint8 b3_2_lower
Definition: pg_wchar.h:476
uint8 b3_3_upper
Definition: pg_wchar.h:479
uint8 b2_2_lower
Definition: pg_wchar.h:469
uint8 b4_1_upper
Definition: pg_wchar.h:484
uint8 b4_3_lower
Definition: pg_wchar.h:487
uint8 b2_1_upper
Definition: pg_wchar.h:468
uint8 b3_3_lower
Definition: pg_wchar.h:478
uint8 b4_1_lower
Definition: pg_wchar.h:483

◆ store_coded_char()

static unsigned char* store_coded_char ( unsigned char *  dest,
uint32  code 
)
inline static

Definition at line 353 of file conv.c.

References generate_unaccent_rules::dest.

Referenced by LocalToUtf(), and UtfToLocal().

354 {
355  if (code & 0xff000000)
356  *dest++ = code >> 24;
357  if (code & 0x00ff0000)
358  *dest++ = code >> 16;
359  if (code & 0x0000ff00)
360  *dest++ = code >> 8;
361  if (code & 0x000000ff)
362  *dest++ = code;
363  return dest;
364 }

◆ UtfToLocal()

int UtfToLocal ( const unsigned char *  utf,
int  len,
unsigned char *  iso,
const pg_mb_radix_tree *  map,
const pg_utf_to_local_combined *  cmap,
int  cmapsize,
utf_local_conversion_func  conv_func,
int  encoding,
bool  noError 
)

Definition at line 507 of file conv.c.

References pg_utf_to_local_combined::code, compare3(), elog, ereport, errcode(), errmsg(), ERROR, pg_mb_radix_conv(), PG_UTF8, pg_utf8_islegal(), pg_utf_mblen(), PG_VALID_ENCODING, report_invalid_encoding(), report_untranslatable_char(), and store_coded_char().

Referenced by surrogate_pair_to_codepoint(), utf8_to_big5(), utf8_to_euc_cn(), utf8_to_euc_jis_2004(), utf8_to_euc_jp(), utf8_to_euc_kr(), utf8_to_euc_tw(), utf8_to_gb18030(), utf8_to_gbk(), utf8_to_iso8859(), utf8_to_johab(), utf8_to_koi8r(), utf8_to_koi8u(), utf8_to_shift_jis_2004(), utf8_to_sjis(), utf8_to_uhc(), and utf8_to_win().

513 {
514  uint32 iutf;
515  int l;
516  const pg_utf_to_local_combined *cp;
517  const unsigned char *start = utf;
518 
519  if (!PG_VALID_ENCODING(encoding))
520  ereport(ERROR,
521  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
522  errmsg("invalid encoding number: %d", encoding)));
523 
524  for (; len > 0; len -= l)
525  {
526  unsigned char b1 = 0;
527  unsigned char b2 = 0;
528  unsigned char b3 = 0;
529  unsigned char b4 = 0;
530 
531  /* "break" cases all represent errors */
532  if (*utf == '\0')
533  break;
534 
535  l = pg_utf_mblen(utf);
536  if (len < l)
537  break;
538 
539  if (!pg_utf8_islegal(utf, l))
540  break;
541 
542  if (l == 1)
543  {
544  /* ASCII case is easy, assume it's one-to-one conversion */
545  *iso++ = *utf++;
546  continue;
547  }
548 
549  /* collect coded char of length l */
550  if (l == 2)
551  {
552  b3 = *utf++;
553  b4 = *utf++;
554  }
555  else if (l == 3)
556  {
557  b2 = *utf++;
558  b3 = *utf++;
559  b4 = *utf++;
560  }
561  else if (l == 4)
562  {
563  b1 = *utf++;
564  b2 = *utf++;
565  b3 = *utf++;
566  b4 = *utf++;
567  }
568  else
569  {
570  elog(ERROR, "unsupported character length %d", l);
571  iutf = 0; /* keep compiler quiet */
572  }
573  iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
574 
575  /* First, try with combined map if possible */
576  if (cmap && len > l)
577  {
578  const unsigned char *utf_save = utf;
579  int len_save = len;
580  int l_save = l;
581 
582  /* collect next character, same as above */
583  len -= l;
584 
585  l = pg_utf_mblen(utf);
586  if (len < l)
587  {
588  /* need more data to decide if this is a combined char */
589  utf -= l_save;
590  break;
591  }
592 
593  if (!pg_utf8_islegal(utf, l))
594  {
595  if (!noError)
596  report_invalid_encoding(PG_UTF8, (const char *) utf, len);
597  utf -= l_save;
598  break;
599  }
600 
601  /* We assume ASCII character cannot be in combined map */
602  if (l > 1)
603  {
604  uint32 iutf2;
605  uint32 cutf[2];
606 
607  if (l == 2)
608  {
609  iutf2 = *utf++ << 8;
610  iutf2 |= *utf++;
611  }
612  else if (l == 3)
613  {
614  iutf2 = *utf++ << 16;
615  iutf2 |= *utf++ << 8;
616  iutf2 |= *utf++;
617  }
618  else if (l == 4)
619  {
620  iutf2 = *utf++ << 24;
621  iutf2 |= *utf++ << 16;
622  iutf2 |= *utf++ << 8;
623  iutf2 |= *utf++;
624  }
625  else
626  {
627  elog(ERROR, "unsupported character length %d", l);
628  iutf2 = 0; /* keep compiler quiet */
629  }
630 
631  cutf[0] = iutf;
632  cutf[1] = iutf2;
633 
634  cp = bsearch(cutf, cmap, cmapsize,
635  sizeof(pg_utf_to_local_combined), compare3);
636 
637  if (cp)
638  {
639  iso = store_coded_char(iso, cp->code);
640  continue;
641  }
642  }
643 
644  /* fail, so back up to reprocess second character next time */
645  utf = utf_save;
646  len = len_save;
647  l = l_save;
648  }
649 
650  /* Now check ordinary map */
651  if (map)
652  {
653  uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
654 
655  if (converted)
656  {
657  iso = store_coded_char(iso, converted);
658  continue;
659  }
660  }
661 
662  /* if there's a conversion function, try that */
663  if (conv_func)
664  {
665  uint32 converted = (*conv_func) (iutf);
666 
667  if (converted)
668  {
669  iso = store_coded_char(iso, converted);
670  continue;
671  }
672  }
673 
674  /* failed to translate this character */
675  utf -= l;
676  if (noError)
677  break;
678  report_untranslatable_char(PG_UTF8, encoding,
679  (const char *) utf, len);
680  }
681 
682  /* if we broke out of loop early, must be invalid input */
683  if (len > 0 && !noError)
684  report_invalid_encoding(PG_UTF8, (const char *) utf, len);
685 
686  *iso = '\0';
687 
688  return utf - start;
689 }
bool pg_utf8_islegal(const unsigned char *source, int length)
Definition: wchar.c:1804
void report_invalid_encoding(int encoding, const char *mbstr, int len)
Definition: mbutils.c:1647
int errcode(int sqlerrcode)
Definition: elog.c:698
static unsigned char * store_coded_char(unsigned char *dest, uint32 code)
Definition: conv.c:353
#define ERROR
Definition: elog.h:46
#define PG_VALID_ENCODING(_enc)
Definition: pg_wchar.h:301
static uint32 pg_mb_radix_conv(const pg_mb_radix_tree *rt, int l, unsigned char b1, unsigned char b2, unsigned char b3, unsigned char b4)
Definition: conv.c:373
unsigned int uint32
Definition: c.h:441
#define ereport(elevel,...)
Definition: elog.h:157
static int compare3(const void *p1, const void *p2)
Definition: conv.c:320
int32 encoding
Definition: pg_database.h:41
void report_untranslatable_char(int src_encoding, int dest_encoding, const char *mbstr, int len)
Definition: mbutils.c:1679
int pg_utf_mblen(const unsigned char *s)
Definition: wchar.c:549
int errmsg(const char *fmt,...)
Definition: elog.c:909
#define elog(elevel,...)
Definition: elog.h:232