PostgreSQL Source Code  git master
euc_jp_and_sjis.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * EUC_JP, SJIS and MULE_INTERNAL
4  *
5  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
6  * Portions Copyright (c) 1994, Regents of the University of California
7  *
8  * IDENTIFICATION
9  * src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 
14 #include "postgres.h"
15 #include "fmgr.h"
16 #include "mb/pg_wchar.h"
17 
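/*
 * Alternative (fallback) codes.
 * PGSJISALTCODE is emitted when no mapping to SJIS is defined for a
 * character; PGEUCALTCODE is emitted when no mapping from SJIS to EUC_JP
 * is defined for a character.
 */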
22 #define PGSJISALTCODE 0x81ac
23 #define PGEUCALTCODE 0xa2ae
24 
25 /*
26  * conversion table between SJIS UDC (IBM kanji) and EUC_JP
27  */
28 #include "sjis.map"
29 
/*
 * Loadable-module magic block and fmgr registration of the six SQL-callable
 * conversion procedures defined in this file.
 */
PG_MODULE_MAGIC;

PG_FUNCTION_INFO_V1(euc_jp_to_sjis);
PG_FUNCTION_INFO_V1(sjis_to_euc_jp);
PG_FUNCTION_INFO_V1(euc_jp_to_mic);
PG_FUNCTION_INFO_V1(mic_to_euc_jp);
PG_FUNCTION_INFO_V1(sjis_to_mic);
PG_FUNCTION_INFO_V1(mic_to_sjis);

39 /* ----------
40  * conv_proc(
41  * INTEGER, -- source encoding id
42  * INTEGER, -- destination encoding id
43  * CSTRING, -- source string (null terminated C string)
44  * CSTRING, -- destination string (null terminated C string)
45  * INTEGER, -- source string length
46  * BOOL -- if true, don't throw an error if conversion fails
47  * ) returns INTEGER;
48  *
49  * Returns the number of bytes successfully converted.
50  * ----------
51  */
52 
/*
 * Forward declarations of the byte-level converters.  Each one reads "len"
 * bytes from its first argument, writes the converted text to "p", and
 * returns an int (the wrappers pass it back as the converted-byte count).
 * Parameter names match the definitions.
 */
static int	sjis2mic(const unsigned char *sjis, unsigned char *p, int len, bool noError);
static int	mic2sjis(const unsigned char *mic, unsigned char *p, int len, bool noError);
static int	euc_jp2mic(const unsigned char *euc, unsigned char *p, int len, bool noError);
static int	mic2euc_jp(const unsigned char *mic, unsigned char *p, int len, bool noError);
static int	euc_jp2sjis(const unsigned char *euc, unsigned char *p, int len, bool noError);
static int	sjis2euc_jp(const unsigned char *sjis, unsigned char *p, int len, bool noError);
59 
60 Datum
62 {
63  unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
64  unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
65  int len = PG_GETARG_INT32(4);
66  bool noError = PG_GETARG_BOOL(5);
67  int converted;
68 
70 
71  converted = euc_jp2sjis(src, dest, len, noError);
72 
73  PG_RETURN_INT32(converted);
74 }
75 
76 Datum
78 {
79  unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
80  unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
81  int len = PG_GETARG_INT32(4);
82  bool noError = PG_GETARG_BOOL(5);
83  int converted;
84 
86 
87  converted = sjis2euc_jp(src, dest, len, noError);
88 
89  PG_RETURN_INT32(converted);
90 }
91 
92 Datum
94 {
95  unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
96  unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
97  int len = PG_GETARG_INT32(4);
98  bool noError = PG_GETARG_BOOL(5);
99  int converted;
100 
102 
103  converted = euc_jp2mic(src, dest, len, noError);
104 
105  PG_RETURN_INT32(converted);
106 }
107 
108 Datum
110 {
111  unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
112  unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
113  int len = PG_GETARG_INT32(4);
114  bool noError = PG_GETARG_BOOL(5);
115  int converted;
116 
118 
119  converted = mic2euc_jp(src, dest, len, noError);
120 
121  PG_RETURN_INT32(converted);
122 }
123 
124 Datum
126 {
127  unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
128  unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
129  int len = PG_GETARG_INT32(4);
130  bool noError = PG_GETARG_BOOL(5);
131  int converted;
132 
134 
135  converted = sjis2mic(src, dest, len, noError);
136 
137  PG_RETURN_INT32(converted);
138 }
139 
140 Datum
142 {
143  unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
144  unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
145  int len = PG_GETARG_INT32(4);
146  bool noError = PG_GETARG_BOOL(5);
147  int converted;
148 
150 
151  converted = mic2sjis(src, dest, len, noError);
152 
153  PG_RETURN_INT32(converted);
154 }
155 
156 /*
157  * SJIS ---> MIC
158  */
159 static int
160 sjis2mic(const unsigned char *sjis, unsigned char *p, int len, bool noError)
161 {
162  const unsigned char *start = sjis;
163  int c1,
164  c2,
165  i,
166  k,
167  k2;
168 
169  while (len > 0)
170  {
171  c1 = *sjis;
172  if (c1 >= 0xa1 && c1 <= 0xdf)
173  {
174  /* JIS X0201 (1 byte kana) */
175  *p++ = LC_JISX0201K;
176  *p++ = c1;
177  sjis++;
178  len--;
179  }
180  else if (IS_HIGHBIT_SET(c1))
181  {
182  /*
183  * JIS X0208, X0212, user defined extended characters
184  */
185  if (len < 2 || !ISSJISHEAD(c1) || !ISSJISTAIL(sjis[1]))
186  {
187  if (noError)
188  break;
189  report_invalid_encoding(PG_SJIS, (const char *) sjis, len);
190  }
191  c2 = sjis[1];
192  k = (c1 << 8) + c2;
193  if (k >= 0xed40 && k < 0xf040)
194  {
195  /* NEC selection IBM kanji */
196  for (i = 0;; i++)
197  {
198  k2 = ibmkanji[i].nec;
199  if (k2 == 0xffff)
200  break;
201  if (k2 == k)
202  {
203  k = ibmkanji[i].sjis;
204  c1 = (k >> 8) & 0xff;
205  c2 = k & 0xff;
206  }
207  }
208  }
209 
210  if (k < 0xeb3f)
211  {
212  /* JIS X0208 */
213  *p++ = LC_JISX0208;
214  *p++ = ((c1 & 0x3f) << 1) + 0x9f + (c2 > 0x9e);
215  *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
216  }
217  else if ((k >= 0xeb40 && k < 0xf040) || (k >= 0xfc4c && k <= 0xfcfc))
218  {
219  /* NEC selection IBM kanji - Other undecided justice */
220  *p++ = LC_JISX0208;
221  *p++ = PGEUCALTCODE >> 8;
222  *p++ = PGEUCALTCODE & 0xff;
223  }
224  else if (k >= 0xf040 && k < 0xf540)
225  {
226  /*
227  * UDC1 mapping to X0208 85 ku - 94 ku JIS code 0x7521 -
228  * 0x7e7e EUC 0xf5a1 - 0xfefe
229  */
230  *p++ = LC_JISX0208;
231  c1 -= 0x6f;
232  *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e);
233  *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
234  }
235  else if (k >= 0xf540 && k < 0xfa40)
236  {
237  /*
238  * UDC2 mapping to X0212 85 ku - 94 ku JIS code 0x7521 -
239  * 0x7e7e EUC 0x8ff5a1 - 0x8ffefe
240  */
241  *p++ = LC_JISX0212;
242  c1 -= 0x74;
243  *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e);
244  *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
245  }
246  else if (k >= 0xfa40)
247  {
248  /*
249  * mapping IBM kanji to X0208 and X0212
250  */
251  for (i = 0;; i++)
252  {
253  k2 = ibmkanji[i].sjis;
254  if (k2 == 0xffff)
255  break;
256  if (k2 == k)
257  {
258  k = ibmkanji[i].euc;
259  if (k >= 0x8f0000)
260  {
261  *p++ = LC_JISX0212;
262  *p++ = 0x80 | ((k & 0xff00) >> 8);
263  *p++ = 0x80 | (k & 0xff);
264  }
265  else
266  {
267  *p++ = LC_JISX0208;
268  *p++ = 0x80 | (k >> 8);
269  *p++ = 0x80 | (k & 0xff);
270  }
271  }
272  }
273  }
274  sjis += 2;
275  len -= 2;
276  }
277  else
278  { /* should be ASCII */
279  if (c1 == 0)
280  {
281  if (noError)
282  break;
283  report_invalid_encoding(PG_SJIS, (const char *) sjis, len);
284  }
285  *p++ = c1;
286  sjis++;
287  len--;
288  }
289  }
290  *p = '\0';
291 
292  return sjis - start;
293 }
294 
295 /*
296  * MIC ---> SJIS
297  */
298 static int
299 mic2sjis(const unsigned char *mic, unsigned char *p, int len, bool noError)
300 {
301  const unsigned char *start = mic;
302  int c1,
303  c2,
304  k,
305  l;
306 
307  while (len > 0)
308  {
309  c1 = *mic;
310  if (!IS_HIGHBIT_SET(c1))
311  {
312  /* ASCII */
313  if (c1 == 0)
314  {
315  if (noError)
316  break;
318  (const char *) mic, len);
319  }
320  *p++ = c1;
321  mic++;
322  len--;
323  continue;
324  }
325  l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len);
326  if (l < 0)
327  {
328  if (noError)
329  break;
331  (const char *) mic, len);
332  }
333  if (c1 == LC_JISX0201K)
334  *p++ = mic[1];
335  else if (c1 == LC_JISX0208)
336  {
337  c1 = mic[1];
338  c2 = mic[2];
339  k = (c1 << 8) | (c2 & 0xff);
340  if (k >= 0xf5a1)
341  {
342  /* UDC1 */
343  c1 -= 0x54;
344  *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1) + 0x6f;
345  }
346  else
347  *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1);
348  *p++ = c2 - ((c1 & 1) ? ((c2 < 0xe0) ? 0x61 : 0x60) : 2);
349  }
350  else if (c1 == LC_JISX0212)
351  {
352  int i,
353  k2;
354 
355  c1 = mic[1];
356  c2 = mic[2];
357  k = c1 << 8 | c2;
358  if (k >= 0xf5a1)
359  {
360  /* UDC2 */
361  c1 -= 0x54;
362  *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1) + 0x74;
363  *p++ = c2 - ((c1 & 1) ? ((c2 < 0xe0) ? 0x61 : 0x60) : 2);
364  }
365  else
366  {
367  /* IBM kanji */
368  for (i = 0;; i++)
369  {
370  k2 = ibmkanji[i].euc & 0xffff;
371  if (k2 == 0xffff)
372  {
373  *p++ = PGSJISALTCODE >> 8;
374  *p++ = PGSJISALTCODE & 0xff;
375  break;
376  }
377  if (k2 == k)
378  {
379  k = ibmkanji[i].sjis;
380  *p++ = k >> 8;
381  *p++ = k & 0xff;
382  break;
383  }
384  }
385  }
386  }
387  else
388  {
389  if (noError)
390  break;
392  (const char *) mic, len);
393  }
394  mic += l;
395  len -= l;
396  }
397  *p = '\0';
398 
399  return mic - start;
400 }
401 
402 /*
403  * EUC_JP ---> MIC
404  */
405 static int
406 euc_jp2mic(const unsigned char *euc, unsigned char *p, int len, bool noError)
407 {
408  const unsigned char *start = euc;
409  int c1;
410  int l;
411 
412  while (len > 0)
413  {
414  c1 = *euc;
415  if (!IS_HIGHBIT_SET(c1))
416  {
417  /* ASCII */
418  if (c1 == 0)
419  {
420  if (noError)
421  break;
423  (const char *) euc, len);
424  }
425  *p++ = c1;
426  euc++;
427  len--;
428  continue;
429  }
430  l = pg_encoding_verifymbchar(PG_EUC_JP, (const char *) euc, len);
431  if (l < 0)
432  {
433  if (noError)
434  break;
436  (const char *) euc, len);
437  }
438  if (c1 == SS2)
439  { /* 1 byte kana? */
440  *p++ = LC_JISX0201K;
441  *p++ = euc[1];
442  }
443  else if (c1 == SS3)
444  { /* JIS X0212 kanji? */
445  *p++ = LC_JISX0212;
446  *p++ = euc[1];
447  *p++ = euc[2];
448  }
449  else
450  { /* kanji? */
451  *p++ = LC_JISX0208;
452  *p++ = c1;
453  *p++ = euc[1];
454  }
455  euc += l;
456  len -= l;
457  }
458  *p = '\0';
459 
460  return euc - start;
461 }
462 
463 /*
464  * MIC ---> EUC_JP
465  */
466 static int
467 mic2euc_jp(const unsigned char *mic, unsigned char *p, int len, bool noError)
468 {
469  const unsigned char *start = mic;
470  int c1;
471  int l;
472 
473  while (len > 0)
474  {
475  c1 = *mic;
476  if (!IS_HIGHBIT_SET(c1))
477  {
478  /* ASCII */
479  if (c1 == 0)
480  {
481  if (noError)
482  break;
484  (const char *) mic, len);
485  }
486  *p++ = c1;
487  mic++;
488  len--;
489  continue;
490  }
491  l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len);
492  if (l < 0)
493  {
494  if (noError)
495  break;
497  (const char *) mic, len);
498  }
499  if (c1 == LC_JISX0201K)
500  {
501  *p++ = SS2;
502  *p++ = mic[1];
503  }
504  else if (c1 == LC_JISX0212)
505  {
506  *p++ = SS3;
507  *p++ = mic[1];
508  *p++ = mic[2];
509  }
510  else if (c1 == LC_JISX0208)
511  {
512  *p++ = mic[1];
513  *p++ = mic[2];
514  }
515  else
516  {
517  if (noError)
518  break;
520  (const char *) mic, len);
521  }
522  mic += l;
523  len -= l;
524  }
525  *p = '\0';
526 
527  return mic - start;
528 }
529 
530 /*
531  * EUC_JP -> SJIS
532  */
533 static int
534 euc_jp2sjis(const unsigned char *euc, unsigned char *p, int len, bool noError)
535 {
536  const unsigned char *start = euc;
537  int c1,
538  c2,
539  k;
540  int l;
541 
542  while (len > 0)
543  {
544  c1 = *euc;
545  if (!IS_HIGHBIT_SET(c1))
546  {
547  /* ASCII */
548  if (c1 == 0)
549  {
550  if (noError)
551  break;
553  (const char *) euc, len);
554  }
555  *p++ = c1;
556  euc++;
557  len--;
558  continue;
559  }
560  l = pg_encoding_verifymbchar(PG_EUC_JP, (const char *) euc, len);
561  if (l < 0)
562  {
563  if (noError)
564  break;
566  (const char *) euc, len);
567  }
568  if (c1 == SS2)
569  {
570  /* hankaku kana? */
571  *p++ = euc[1];
572  }
573  else if (c1 == SS3)
574  {
575  /* JIS X0212 kanji? */
576  c1 = euc[1];
577  c2 = euc[2];
578  k = c1 << 8 | c2;
579  if (k >= 0xf5a1)
580  {
581  /* UDC2 */
582  c1 -= 0x54;
583  *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1) + 0x74;
584  *p++ = c2 - ((c1 & 1) ? ((c2 < 0xe0) ? 0x61 : 0x60) : 2);
585  }
586  else
587  {
588  int i,
589  k2;
590 
591  /* IBM kanji */
592  for (i = 0;; i++)
593  {
594  k2 = ibmkanji[i].euc & 0xffff;
595  if (k2 == 0xffff)
596  {
597  *p++ = PGSJISALTCODE >> 8;
598  *p++ = PGSJISALTCODE & 0xff;
599  break;
600  }
601  if (k2 == k)
602  {
603  k = ibmkanji[i].sjis;
604  *p++ = k >> 8;
605  *p++ = k & 0xff;
606  break;
607  }
608  }
609  }
610  }
611  else
612  {
613  /* JIS X0208 kanji? */
614  c2 = euc[1];
615  k = (c1 << 8) | (c2 & 0xff);
616  if (k >= 0xf5a1)
617  {
618  /* UDC1 */
619  c1 -= 0x54;
620  *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1) + 0x6f;
621  }
622  else
623  *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1);
624  *p++ = c2 - ((c1 & 1) ? ((c2 < 0xe0) ? 0x61 : 0x60) : 2);
625  }
626  euc += l;
627  len -= l;
628  }
629  *p = '\0';
630 
631  return euc - start;
632 }
633 
634 /*
635  * SJIS ---> EUC_JP
636  */
637 static int
638 sjis2euc_jp(const unsigned char *sjis, unsigned char *p, int len, bool noError)
639 {
640  const unsigned char *start = sjis;
641  int c1,
642  c2,
643  i,
644  k,
645  k2;
646  int l;
647 
648  while (len > 0)
649  {
650  c1 = *sjis;
651  if (!IS_HIGHBIT_SET(c1))
652  {
653  /* ASCII */
654  if (c1 == 0)
655  {
656  if (noError)
657  break;
659  (const char *) sjis, len);
660  }
661  *p++ = c1;
662  sjis++;
663  len--;
664  continue;
665  }
666  l = pg_encoding_verifymbchar(PG_SJIS, (const char *) sjis, len);
667  if (l < 0)
668  {
669  if (noError)
670  break;
672  (const char *) sjis, len);
673  }
674  if (c1 >= 0xa1 && c1 <= 0xdf)
675  {
676  /* JIS X0201 (1 byte kana) */
677  *p++ = SS2;
678  *p++ = c1;
679  }
680  else
681  {
682  /*
683  * JIS X0208, X0212, user defined extended characters
684  */
685  c2 = sjis[1];
686  k = (c1 << 8) + c2;
687  if (k >= 0xed40 && k < 0xf040)
688  {
689  /* NEC selection IBM kanji */
690  for (i = 0;; i++)
691  {
692  k2 = ibmkanji[i].nec;
693  if (k2 == 0xffff)
694  break;
695  if (k2 == k)
696  {
697  k = ibmkanji[i].sjis;
698  c1 = (k >> 8) & 0xff;
699  c2 = k & 0xff;
700  }
701  }
702  }
703 
704  if (k < 0xeb3f)
705  {
706  /* JIS X0208 */
707  *p++ = ((c1 & 0x3f) << 1) + 0x9f + (c2 > 0x9e);
708  *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
709  }
710  else if ((k >= 0xeb40 && k < 0xf040) || (k >= 0xfc4c && k <= 0xfcfc))
711  {
712  /* NEC selection IBM kanji - Other undecided justice */
713  *p++ = PGEUCALTCODE >> 8;
714  *p++ = PGEUCALTCODE & 0xff;
715  }
716  else if (k >= 0xf040 && k < 0xf540)
717  {
718  /*
719  * UDC1 mapping to X0208 85 ku - 94 ku JIS code 0x7521 -
720  * 0x7e7e EUC 0xf5a1 - 0xfefe
721  */
722  c1 -= 0x6f;
723  *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e);
724  *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
725  }
726  else if (k >= 0xf540 && k < 0xfa40)
727  {
728  /*
729  * UDC2 mapping to X0212 85 ku - 94 ku JIS code 0x7521 -
730  * 0x7e7e EUC 0x8ff5a1 - 0x8ffefe
731  */
732  *p++ = SS3;
733  c1 -= 0x74;
734  *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e);
735  *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
736  }
737  else if (k >= 0xfa40)
738  {
739  /*
740  * mapping IBM kanji to X0208 and X0212
741  *
742  */
743  for (i = 0;; i++)
744  {
745  k2 = ibmkanji[i].sjis;
746  if (k2 == 0xffff)
747  break;
748  if (k2 == k)
749  {
750  k = ibmkanji[i].euc;
751  if (k >= 0x8f0000)
752  {
753  *p++ = SS3;
754  *p++ = 0x80 | ((k & 0xff00) >> 8);
755  *p++ = 0x80 | (k & 0xff);
756  }
757  else
758  {
759  *p++ = 0x80 | (k >> 8);
760  *p++ = 0x80 | (k & 0xff);
761  }
762  }
763  }
764  }
765  }
766  sjis += l;
767  len -= l;
768  }
769  *p = '\0';
770 
771  return sjis - start;
772 }
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
static int euc_jp2mic(const unsigned char *euc, unsigned char *p, int len, bool noError)
Datum euc_jp_to_sjis(PG_FUNCTION_ARGS)
static int mic2sjis(const unsigned char *mic, unsigned char *p, int len, bool noError)
Datum sjis_to_mic(PG_FUNCTION_ARGS)
void report_invalid_encoding(int encoding, const char *mbstr, int len)
Definition: mbutils.c:1647
#define ISSJISTAIL(c)
Definition: pg_wchar.h:42
#define PG_RETURN_INT32(x)
Definition: fmgr.h:354
#define LC_JISX0212
Definition: pg_wchar.h:133
#define SS3
Definition: pg_wchar.h:36
#define PG_GETARG_BOOL(n)
Definition: fmgr.h:274
Datum sjis_to_euc_jp(PG_FUNCTION_ARGS)
#define PGSJISALTCODE
static int euc_jp2sjis(const unsigned char *mic, unsigned char *p, int len, bool noError)
PG_FUNCTION_INFO_V1(euc_jp_to_sjis)
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1156
PG_MODULE_MAGIC
#define PGEUCALTCODE
static int sjis2mic(const unsigned char *sjis, unsigned char *p, int len, bool noError)
#define LC_JISX0208
Definition: pg_wchar.h:131
uintptr_t Datum
Definition: postgres.h:411
#define CHECK_ENCODING_CONVERSION_ARGS(srcencoding, destencoding)
Definition: pg_wchar.h:527
#define ISSJISHEAD(c)
Definition: pg_wchar.h:41
void report_untranslatable_char(int src_encoding, int dest_encoding, const char *mbstr, int len)
Definition: mbutils.c:1679
Datum mic_to_euc_jp(PG_FUNCTION_ARGS)
int pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
Definition: wchar.c:1948
Datum mic_to_sjis(PG_FUNCTION_ARGS)
int i
#define PG_GETARG_CSTRING(n)
Definition: fmgr.h:277
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
static int sjis2euc_jp(const unsigned char *mic, unsigned char *p, int len, bool noError)
static int mic2euc_jp(const unsigned char *mic, unsigned char *p, int len, bool noError)
#define SS2
Definition: pg_wchar.h:35
Datum euc_jp_to_mic(PG_FUNCTION_ARGS)
#define LC_JISX0201K
Definition: pg_wchar.h:110