PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
euc_jp_and_sjis.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * EUC_JP, SJIS and MULE_INTERNAL
4  *
5  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
6  * Portions Copyright (c) 1994, Regents of the University of California
7  *
8  * IDENTIFICATION
9  * src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 
14 #include "postgres.h"
15 #include "fmgr.h"
16 #include "mb/pg_wchar.h"
17 
18 /*
19  * SJIS alternative code.
20  * this code is used if a mapping EUC -> SJIS is not defined.
21  */
22 #define PGSJISALTCODE 0x81ac
23 #define PGEUCALTCODE 0xa2ae
24 
25 /*
26  * conversion table between SJIS UDC (IBM kanji) and EUC_JP
27  */
28 #include "sjis.map"
29 
31 
38 
39 /* ----------
40  * conv_proc(
41  * INTEGER, -- source encoding id
42  * INTEGER, -- destination encoding id
43  * CSTRING, -- source string (null terminated C string)
44  * CSTRING, -- destination string (null terminated C string)
45  * INTEGER -- source string length
46  * ) returns VOID;
47  * ----------
48  */
49 
/*
 * Forward declarations of the byte-level converters defined below.
 * Each takes the source bytes, the destination buffer and the source length.
 * Parameter names match the definitions (source encoding first).
 */
static void sjis2mic(const unsigned char *sjis, unsigned char *p, int len);
static void mic2sjis(const unsigned char *mic, unsigned char *p, int len);
static void euc_jp2mic(const unsigned char *euc, unsigned char *p, int len);
static void mic2euc_jp(const unsigned char *mic, unsigned char *p, int len);
static void euc_jp2sjis(const unsigned char *euc, unsigned char *p, int len);
static void sjis2euc_jp(const unsigned char *sjis, unsigned char *p, int len);
56 
57 Datum
59 {
60  unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
61  unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
62  int len = PG_GETARG_INT32(4);
63 
65 
66  euc_jp2sjis(src, dest, len);
67 
69 }
70 
71 Datum
73 {
74  unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
75  unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
76  int len = PG_GETARG_INT32(4);
77 
79 
80  sjis2euc_jp(src, dest, len);
81 
83 }
84 
85 Datum
87 {
88  unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
89  unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
90  int len = PG_GETARG_INT32(4);
91 
93 
94  euc_jp2mic(src, dest, len);
95 
97 }
98 
99 Datum
101 {
102  unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
103  unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
104  int len = PG_GETARG_INT32(4);
105 
107 
108  mic2euc_jp(src, dest, len);
109 
110  PG_RETURN_VOID();
111 }
112 
113 Datum
115 {
116  unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
117  unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
118  int len = PG_GETARG_INT32(4);
119 
121 
122  sjis2mic(src, dest, len);
123 
124  PG_RETURN_VOID();
125 }
126 
127 Datum
129 {
130  unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
131  unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
132  int len = PG_GETARG_INT32(4);
133 
135 
136  mic2sjis(src, dest, len);
137 
138  PG_RETURN_VOID();
139 }
140 
141 /*
142  * SJIS ---> MIC
143  */
144 static void
145 sjis2mic(const unsigned char *sjis, unsigned char *p, int len)
146 {
147  int c1,
148  c2,
149  i,
150  k,
151  k2;
152 
153  while (len > 0)
154  {
155  c1 = *sjis;
156  if (c1 >= 0xa1 && c1 <= 0xdf)
157  {
158  /* JIS X0201 (1 byte kana) */
159  *p++ = LC_JISX0201K;
160  *p++ = c1;
161  sjis++;
162  len--;
163  }
164  else if (IS_HIGHBIT_SET(c1))
165  {
166  /*
167  * JIS X0208, X0212, user defined extended characters
168  */
169  if (len < 2 || !ISSJISHEAD(c1) || !ISSJISTAIL(sjis[1]))
170  report_invalid_encoding(PG_SJIS, (const char *) sjis, len);
171  c2 = sjis[1];
172  k = (c1 << 8) + c2;
173  if (k >= 0xed40 && k < 0xf040)
174  {
175  /* NEC selection IBM kanji */
176  for (i = 0;; i++)
177  {
178  k2 = ibmkanji[i].nec;
179  if (k2 == 0xffff)
180  break;
181  if (k2 == k)
182  {
183  k = ibmkanji[i].sjis;
184  c1 = (k >> 8) & 0xff;
185  c2 = k & 0xff;
186  }
187  }
188  }
189 
190  if (k < 0xeb3f)
191  {
192  /* JIS X0208 */
193  *p++ = LC_JISX0208;
194  *p++ = ((c1 & 0x3f) << 1) + 0x9f + (c2 > 0x9e);
195  *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
196  }
197  else if ((k >= 0xeb40 && k < 0xf040) || (k >= 0xfc4c && k <= 0xfcfc))
198  {
199  /* NEC selection IBM kanji - Other undecided justice */
200  *p++ = LC_JISX0208;
201  *p++ = PGEUCALTCODE >> 8;
202  *p++ = PGEUCALTCODE & 0xff;
203  }
204  else if (k >= 0xf040 && k < 0xf540)
205  {
206  /*
207  * UDC1 mapping to X0208 85 ku - 94 ku JIS code 0x7521 -
208  * 0x7e7e EUC 0xf5a1 - 0xfefe
209  */
210  *p++ = LC_JISX0208;
211  c1 -= 0x6f;
212  *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e);
213  *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
214  }
215  else if (k >= 0xf540 && k < 0xfa40)
216  {
217  /*
218  * UDC2 mapping to X0212 85 ku - 94 ku JIS code 0x7521 -
219  * 0x7e7e EUC 0x8ff5a1 - 0x8ffefe
220  */
221  *p++ = LC_JISX0212;
222  c1 -= 0x74;
223  *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e);
224  *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
225  }
226  else if (k >= 0xfa40)
227  {
228  /*
229  * mapping IBM kanji to X0208 and X0212
230  */
231  for (i = 0;; i++)
232  {
233  k2 = ibmkanji[i].sjis;
234  if (k2 == 0xffff)
235  break;
236  if (k2 == k)
237  {
238  k = ibmkanji[i].euc;
239  if (k >= 0x8f0000)
240  {
241  *p++ = LC_JISX0212;
242  *p++ = 0x80 | ((k & 0xff00) >> 8);
243  *p++ = 0x80 | (k & 0xff);
244  }
245  else
246  {
247  *p++ = LC_JISX0208;
248  *p++ = 0x80 | (k >> 8);
249  *p++ = 0x80 | (k & 0xff);
250  }
251  }
252  }
253  }
254  sjis += 2;
255  len -= 2;
256  }
257  else
258  { /* should be ASCII */
259  if (c1 == 0)
260  report_invalid_encoding(PG_SJIS, (const char *) sjis, len);
261  *p++ = c1;
262  sjis++;
263  len--;
264  }
265  }
266  *p = '\0';
267 }
268 
269 /*
270  * MIC ---> SJIS
271  */
272 static void
273 mic2sjis(const unsigned char *mic, unsigned char *p, int len)
274 {
275  int c1,
276  c2,
277  k,
278  l;
279 
280  while (len > 0)
281  {
282  c1 = *mic;
283  if (!IS_HIGHBIT_SET(c1))
284  {
285  /* ASCII */
286  if (c1 == 0)
288  (const char *) mic, len);
289  *p++ = c1;
290  mic++;
291  len--;
292  continue;
293  }
294  l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
295  if (l < 0)
297  (const char *) mic, len);
298  if (c1 == LC_JISX0201K)
299  *p++ = mic[1];
300  else if (c1 == LC_JISX0208)
301  {
302  c1 = mic[1];
303  c2 = mic[2];
304  k = (c1 << 8) | (c2 & 0xff);
305  if (k >= 0xf5a1)
306  {
307  /* UDC1 */
308  c1 -= 0x54;
309  *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1) + 0x6f;
310  }
311  else
312  *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1);
313  *p++ = c2 - ((c1 & 1) ? ((c2 < 0xe0) ? 0x61 : 0x60) : 2);
314  }
315  else if (c1 == LC_JISX0212)
316  {
317  int i,
318  k2;
319 
320  c1 = mic[1];
321  c2 = mic[2];
322  k = c1 << 8 | c2;
323  if (k >= 0xf5a1)
324  {
325  /* UDC2 */
326  c1 -= 0x54;
327  *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1) + 0x74;
328  *p++ = c2 - ((c1 & 1) ? ((c2 < 0xe0) ? 0x61 : 0x60) : 2);
329  }
330  else
331  {
332  /* IBM kanji */
333  for (i = 0;; i++)
334  {
335  k2 = ibmkanji[i].euc & 0xffff;
336  if (k2 == 0xffff)
337  {
338  *p++ = PGSJISALTCODE >> 8;
339  *p++ = PGSJISALTCODE & 0xff;
340  break;
341  }
342  if (k2 == k)
343  {
344  k = ibmkanji[i].sjis;
345  *p++ = k >> 8;
346  *p++ = k & 0xff;
347  break;
348  }
349  }
350  }
351  }
352  else
354  (const char *) mic, len);
355  mic += l;
356  len -= l;
357  }
358  *p = '\0';
359 }
360 
361 /*
362  * EUC_JP ---> MIC
363  */
364 static void
365 euc_jp2mic(const unsigned char *euc, unsigned char *p, int len)
366 {
367  int c1;
368  int l;
369 
370  while (len > 0)
371  {
372  c1 = *euc;
373  if (!IS_HIGHBIT_SET(c1))
374  {
375  /* ASCII */
376  if (c1 == 0)
378  (const char *) euc, len);
379  *p++ = c1;
380  euc++;
381  len--;
382  continue;
383  }
384  l = pg_encoding_verifymb(PG_EUC_JP, (const char *) euc, len);
385  if (l < 0)
387  (const char *) euc, len);
388  if (c1 == SS2)
389  { /* 1 byte kana? */
390  *p++ = LC_JISX0201K;
391  *p++ = euc[1];
392  }
393  else if (c1 == SS3)
394  { /* JIS X0212 kanji? */
395  *p++ = LC_JISX0212;
396  *p++ = euc[1];
397  *p++ = euc[2];
398  }
399  else
400  { /* kanji? */
401  *p++ = LC_JISX0208;
402  *p++ = c1;
403  *p++ = euc[1];
404  }
405  euc += l;
406  len -= l;
407  }
408  *p = '\0';
409 }
410 
411 /*
412  * MIC ---> EUC_JP
413  */
414 static void
415 mic2euc_jp(const unsigned char *mic, unsigned char *p, int len)
416 {
417  int c1;
418  int l;
419 
420  while (len > 0)
421  {
422  c1 = *mic;
423  if (!IS_HIGHBIT_SET(c1))
424  {
425  /* ASCII */
426  if (c1 == 0)
428  (const char *) mic, len);
429  *p++ = c1;
430  mic++;
431  len--;
432  continue;
433  }
434  l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
435  if (l < 0)
437  (const char *) mic, len);
438  if (c1 == LC_JISX0201K)
439  {
440  *p++ = SS2;
441  *p++ = mic[1];
442  }
443  else if (c1 == LC_JISX0212)
444  {
445  *p++ = SS3;
446  *p++ = mic[1];
447  *p++ = mic[2];
448  }
449  else if (c1 == LC_JISX0208)
450  {
451  *p++ = mic[1];
452  *p++ = mic[2];
453  }
454  else
456  (const char *) mic, len);
457  mic += l;
458  len -= l;
459  }
460  *p = '\0';
461 }
462 
463 /*
464  * EUC_JP -> SJIS
465  */
466 static void
467 euc_jp2sjis(const unsigned char *euc, unsigned char *p, int len)
468 {
469  int c1,
470  c2,
471  k;
472  int l;
473 
474  while (len > 0)
475  {
476  c1 = *euc;
477  if (!IS_HIGHBIT_SET(c1))
478  {
479  /* ASCII */
480  if (c1 == 0)
482  (const char *) euc, len);
483  *p++ = c1;
484  euc++;
485  len--;
486  continue;
487  }
488  l = pg_encoding_verifymb(PG_EUC_JP, (const char *) euc, len);
489  if (l < 0)
491  (const char *) euc, len);
492  if (c1 == SS2)
493  {
494  /* hankaku kana? */
495  *p++ = euc[1];
496  }
497  else if (c1 == SS3)
498  {
499  /* JIS X0212 kanji? */
500  c1 = euc[1];
501  c2 = euc[2];
502  k = c1 << 8 | c2;
503  if (k >= 0xf5a1)
504  {
505  /* UDC2 */
506  c1 -= 0x54;
507  *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1) + 0x74;
508  *p++ = c2 - ((c1 & 1) ? ((c2 < 0xe0) ? 0x61 : 0x60) : 2);
509  }
510  else
511  {
512  int i,
513  k2;
514 
515  /* IBM kanji */
516  for (i = 0;; i++)
517  {
518  k2 = ibmkanji[i].euc & 0xffff;
519  if (k2 == 0xffff)
520  {
521  *p++ = PGSJISALTCODE >> 8;
522  *p++ = PGSJISALTCODE & 0xff;
523  break;
524  }
525  if (k2 == k)
526  {
527  k = ibmkanji[i].sjis;
528  *p++ = k >> 8;
529  *p++ = k & 0xff;
530  break;
531  }
532  }
533  }
534  }
535  else
536  {
537  /* JIS X0208 kanji? */
538  c2 = euc[1];
539  k = (c1 << 8) | (c2 & 0xff);
540  if (k >= 0xf5a1)
541  {
542  /* UDC1 */
543  c1 -= 0x54;
544  *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1) + 0x6f;
545  }
546  else
547  *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1);
548  *p++ = c2 - ((c1 & 1) ? ((c2 < 0xe0) ? 0x61 : 0x60) : 2);
549  }
550  euc += l;
551  len -= l;
552  }
553  *p = '\0';
554 }
555 
556 /*
557  * SJIS ---> EUC_JP
558  */
559 static void
560 sjis2euc_jp(const unsigned char *sjis, unsigned char *p, int len)
561 {
562  int c1,
563  c2,
564  i,
565  k,
566  k2;
567  int l;
568 
569  while (len > 0)
570  {
571  c1 = *sjis;
572  if (!IS_HIGHBIT_SET(c1))
573  {
574  /* ASCII */
575  if (c1 == 0)
577  (const char *) sjis, len);
578  *p++ = c1;
579  sjis++;
580  len--;
581  continue;
582  }
583  l = pg_encoding_verifymb(PG_SJIS, (const char *) sjis, len);
584  if (l < 0)
586  (const char *) sjis, len);
587  if (c1 >= 0xa1 && c1 <= 0xdf)
588  {
589  /* JIS X0201 (1 byte kana) */
590  *p++ = SS2;
591  *p++ = c1;
592  }
593  else
594  {
595  /*
596  * JIS X0208, X0212, user defined extended characters
597  */
598  c2 = sjis[1];
599  k = (c1 << 8) + c2;
600  if (k >= 0xed40 && k < 0xf040)
601  {
602  /* NEC selection IBM kanji */
603  for (i = 0;; i++)
604  {
605  k2 = ibmkanji[i].nec;
606  if (k2 == 0xffff)
607  break;
608  if (k2 == k)
609  {
610  k = ibmkanji[i].sjis;
611  c1 = (k >> 8) & 0xff;
612  c2 = k & 0xff;
613  }
614  }
615  }
616 
617  if (k < 0xeb3f)
618  {
619  /* JIS X0208 */
620  *p++ = ((c1 & 0x3f) << 1) + 0x9f + (c2 > 0x9e);
621  *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
622  }
623  else if ((k >= 0xeb40 && k < 0xf040) || (k >= 0xfc4c && k <= 0xfcfc))
624  {
625  /* NEC selection IBM kanji - Other undecided justice */
626  *p++ = PGEUCALTCODE >> 8;
627  *p++ = PGEUCALTCODE & 0xff;
628  }
629  else if (k >= 0xf040 && k < 0xf540)
630  {
631  /*
632  * UDC1 mapping to X0208 85 ku - 94 ku JIS code 0x7521 -
633  * 0x7e7e EUC 0xf5a1 - 0xfefe
634  */
635  c1 -= 0x6f;
636  *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e);
637  *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
638  }
639  else if (k >= 0xf540 && k < 0xfa40)
640  {
641  /*
642  * UDC2 mapping to X0212 85 ku - 94 ku JIS code 0x7521 -
643  * 0x7e7e EUC 0x8ff5a1 - 0x8ffefe
644  */
645  *p++ = SS3;
646  c1 -= 0x74;
647  *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e);
648  *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
649  }
650  else if (k >= 0xfa40)
651  {
652  /*
653  * mapping IBM kanji to X0208 and X0212
654  *
655  */
656  for (i = 0;; i++)
657  {
658  k2 = ibmkanji[i].sjis;
659  if (k2 == 0xffff)
660  break;
661  if (k2 == k)
662  {
663  k = ibmkanji[i].euc;
664  if (k >= 0x8f0000)
665  {
666  *p++ = SS3;
667  *p++ = 0x80 | ((k & 0xff00) >> 8);
668  *p++ = 0x80 | (k & 0xff);
669  }
670  else
671  {
672  *p++ = 0x80 | (k >> 8);
673  *p++ = 0x80 | (k & 0xff);
674  }
675  }
676  }
677  }
678  }
679  sjis += l;
680  len -= l;
681  }
682  *p = '\0';
683 }
static void sjis2euc_jp(const unsigned char *mic, unsigned char *p, int len)
#define PG_GETARG_INT32(n)
Definition: fmgr.h:234
Datum euc_jp_to_sjis(PG_FUNCTION_ARGS)
static void mic2sjis(const unsigned char *mic, unsigned char *p, int len)
static void euc_jp2sjis(const unsigned char *mic, unsigned char *p, int len)
Datum sjis_to_mic(PG_FUNCTION_ARGS)
#define ISSJISTAIL(c)
Definition: pg_wchar.h:42
static void mic2euc_jp(const unsigned char *mic, unsigned char *p, int len)
#define LC_JISX0212
Definition: pg_wchar.h:133
#define SS3
Definition: pg_wchar.h:36
Datum sjis_to_euc_jp(PG_FUNCTION_ARGS)
int pg_encoding_verifymb(int encoding, const char *mbstr, int len)
Definition: wchar.c:1809
#define PGSJISALTCODE
PG_FUNCTION_INFO_V1(euc_jp_to_sjis)
#define IS_HIGHBIT_SET(ch)
Definition: c.h:973
PG_MODULE_MAGIC
void report_untranslatable_char(int src_encoding, int dest_encoding, const char *mbstr, int len)
Definition: wchar.c:2027
#define PGEUCALTCODE
#define LC_JISX0208
Definition: pg_wchar.h:131
static void sjis2mic(const unsigned char *sjis, unsigned char *p, int len)
uintptr_t Datum
Definition: postgres.h:372
#define PG_RETURN_VOID()
Definition: fmgr.h:309
#define CHECK_ENCODING_CONVERSION_ARGS(srcencoding, destencoding)
Definition: pg_wchar.h:492
#define ISSJISHEAD(c)
Definition: pg_wchar.h:41
void report_invalid_encoding(int encoding, const char *mbstr, int len)
Definition: wchar.c:1995
static void euc_jp2mic(const unsigned char *euc, unsigned char *p, int len)
Datum mic_to_euc_jp(PG_FUNCTION_ARGS)
Datum mic_to_sjis(PG_FUNCTION_ARGS)
int i
#define PG_GETARG_CSTRING(n)
Definition: fmgr.h:242
#define PG_FUNCTION_ARGS
Definition: fmgr.h:158
#define SS2
Definition: pg_wchar.h:35
Datum euc_jp_to_mic(PG_FUNCTION_ARGS)
#define LC_JISX0201K
Definition: pg_wchar.h:110