PostgreSQL Source Code git master
euc_jp_and_sjis.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * EUC_JP, SJIS and MULE_INTERNAL
4 *
5 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
6 * Portions Copyright (c) 1994, Regents of the University of California
7 *
8 * IDENTIFICATION
9 * src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c
10 *
11 *-------------------------------------------------------------------------
12 */
13
14#include "postgres.h"
15#include "fmgr.h"
16#include "mb/pg_wchar.h"
17
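/*
 * Alternative (fallback) codes.
 * PGSJISALTCODE is the SJIS code emitted when an EUC_JP (or MULE internal)
 * character has no defined SJIS mapping; PGEUCALTCODE is the EUC_JP code
 * emitted when an SJIS character has no defined EUC_JP mapping.
 */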
22#define PGSJISALTCODE 0x81ac
23#define PGEUCALTCODE 0xa2ae
24
25/*
26 * conversion table between SJIS UDC (IBM kanji) and EUC_JP
27 */
28#include "sjis.map"
29
/* Mark this shared library as a loadable PostgreSQL module. */
PG_MODULE_MAGIC;

/* Register each SQL-callable conversion procedure (version-1 convention). */
PG_FUNCTION_INFO_V1(euc_jp_to_sjis);
PG_FUNCTION_INFO_V1(sjis_to_euc_jp);
PG_FUNCTION_INFO_V1(euc_jp_to_mic);
PG_FUNCTION_INFO_V1(mic_to_euc_jp);
PG_FUNCTION_INFO_V1(sjis_to_mic);
PG_FUNCTION_INFO_V1(mic_to_sjis);

39/* ----------
40 * conv_proc(
41 * INTEGER, -- source encoding id
42 * INTEGER, -- destination encoding id
43 * CSTRING, -- source string (null terminated C string)
44 * CSTRING, -- destination string (null terminated C string)
45 * INTEGER, -- source string length
46 * BOOL -- if true, don't throw an error if conversion fails
47 * ) returns INTEGER;
48 *
49 * Returns the number of bytes successfully converted.
50 * ----------
51 */
52
53static int sjis2mic(const unsigned char *sjis, unsigned char *p, int len, bool noError);
54static int mic2sjis(const unsigned char *mic, unsigned char *p, int len, bool noError);
55static int euc_jp2mic(const unsigned char *euc, unsigned char *p, int len, bool noError);
56static int mic2euc_jp(const unsigned char *mic, unsigned char *p, int len, bool noError);
57static int euc_jp2sjis(const unsigned char *euc, unsigned char *p, int len, bool noError);
58static int sjis2euc_jp(const unsigned char *sjis, unsigned char *p, int len, bool noError);
59
62{
63 unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
64 unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
65 int len = PG_GETARG_INT32(4);
66 bool noError = PG_GETARG_BOOL(5);
67 int converted;
68
70
71 converted = euc_jp2sjis(src, dest, len, noError);
72
73 PG_RETURN_INT32(converted);
74}
75
78{
79 unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
80 unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
81 int len = PG_GETARG_INT32(4);
82 bool noError = PG_GETARG_BOOL(5);
83 int converted;
84
86
87 converted = sjis2euc_jp(src, dest, len, noError);
88
89 PG_RETURN_INT32(converted);
90}
91
94{
95 unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
96 unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
97 int len = PG_GETARG_INT32(4);
98 bool noError = PG_GETARG_BOOL(5);
99 int converted;
100
102
103 converted = euc_jp2mic(src, dest, len, noError);
104
105 PG_RETURN_INT32(converted);
106}
107
108Datum
110{
111 unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
112 unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
113 int len = PG_GETARG_INT32(4);
114 bool noError = PG_GETARG_BOOL(5);
115 int converted;
116
118
119 converted = mic2euc_jp(src, dest, len, noError);
120
121 PG_RETURN_INT32(converted);
122}
123
124Datum
126{
127 unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
128 unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
129 int len = PG_GETARG_INT32(4);
130 bool noError = PG_GETARG_BOOL(5);
131 int converted;
132
134
135 converted = sjis2mic(src, dest, len, noError);
136
137 PG_RETURN_INT32(converted);
138}
139
140Datum
142{
143 unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
144 unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
145 int len = PG_GETARG_INT32(4);
146 bool noError = PG_GETARG_BOOL(5);
147 int converted;
148
150
151 converted = mic2sjis(src, dest, len, noError);
152
153 PG_RETURN_INT32(converted);
154}
155
156/*
157 * SJIS ---> MIC
158 */
159static int
160sjis2mic(const unsigned char *sjis, unsigned char *p, int len, bool noError)
161{
162 const unsigned char *start = sjis;
163 int c1,
164 c2,
165 i,
166 k,
167 k2;
168
169 while (len > 0)
170 {
171 c1 = *sjis;
172 if (c1 >= 0xa1 && c1 <= 0xdf)
173 {
174 /* JIS X0201 (1 byte kana) */
175 *p++ = LC_JISX0201K;
176 *p++ = c1;
177 sjis++;
178 len--;
179 }
180 else if (IS_HIGHBIT_SET(c1))
181 {
182 /*
183 * JIS X0208, X0212, user defined extended characters
184 */
185 if (len < 2 || !ISSJISHEAD(c1) || !ISSJISTAIL(sjis[1]))
186 {
187 if (noError)
188 break;
189 report_invalid_encoding(PG_SJIS, (const char *) sjis, len);
190 }
191 c2 = sjis[1];
192 k = (c1 << 8) + c2;
193 if (k >= 0xed40 && k < 0xf040)
194 {
195 /* NEC selection IBM kanji */
196 for (i = 0;; i++)
197 {
198 k2 = ibmkanji[i].nec;
199 if (k2 == 0xffff)
200 break;
201 if (k2 == k)
202 {
203 k = ibmkanji[i].sjis;
204 c1 = (k >> 8) & 0xff;
205 c2 = k & 0xff;
206 }
207 }
208 }
209
210 if (k < 0xeb3f)
211 {
212 /* JIS X0208 */
213 *p++ = LC_JISX0208;
214 *p++ = ((c1 & 0x3f) << 1) + 0x9f + (c2 > 0x9e);
215 *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
216 }
217 else if ((k >= 0xeb40 && k < 0xf040) || (k >= 0xfc4c && k <= 0xfcfc))
218 {
219 /* NEC selection IBM kanji - Other undecided justice */
220 *p++ = LC_JISX0208;
221 *p++ = PGEUCALTCODE >> 8;
222 *p++ = PGEUCALTCODE & 0xff;
223 }
224 else if (k >= 0xf040 && k < 0xf540)
225 {
226 /*
227 * UDC1 mapping to X0208 85 ku - 94 ku JIS code 0x7521 -
228 * 0x7e7e EUC 0xf5a1 - 0xfefe
229 */
230 *p++ = LC_JISX0208;
231 c1 -= 0x6f;
232 *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e);
233 *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
234 }
235 else if (k >= 0xf540 && k < 0xfa40)
236 {
237 /*
238 * UDC2 mapping to X0212 85 ku - 94 ku JIS code 0x7521 -
239 * 0x7e7e EUC 0x8ff5a1 - 0x8ffefe
240 */
241 *p++ = LC_JISX0212;
242 c1 -= 0x74;
243 *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e);
244 *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
245 }
246 else if (k >= 0xfa40)
247 {
248 /*
249 * mapping IBM kanji to X0208 and X0212
250 */
251 for (i = 0;; i++)
252 {
253 k2 = ibmkanji[i].sjis;
254 if (k2 == 0xffff)
255 break;
256 if (k2 == k)
257 {
258 k = ibmkanji[i].euc;
259 if (k >= 0x8f0000)
260 {
261 *p++ = LC_JISX0212;
262 *p++ = 0x80 | ((k & 0xff00) >> 8);
263 *p++ = 0x80 | (k & 0xff);
264 }
265 else
266 {
267 *p++ = LC_JISX0208;
268 *p++ = 0x80 | (k >> 8);
269 *p++ = 0x80 | (k & 0xff);
270 }
271 }
272 }
273 }
274 sjis += 2;
275 len -= 2;
276 }
277 else
278 { /* should be ASCII */
279 if (c1 == 0)
280 {
281 if (noError)
282 break;
283 report_invalid_encoding(PG_SJIS, (const char *) sjis, len);
284 }
285 *p++ = c1;
286 sjis++;
287 len--;
288 }
289 }
290 *p = '\0';
291
292 return sjis - start;
293}
294
295/*
296 * MIC ---> SJIS
297 */
298static int
299mic2sjis(const unsigned char *mic, unsigned char *p, int len, bool noError)
300{
301 const unsigned char *start = mic;
302 int c1,
303 c2,
304 k,
305 l;
306
307 while (len > 0)
308 {
309 c1 = *mic;
310 if (!IS_HIGHBIT_SET(c1))
311 {
312 /* ASCII */
313 if (c1 == 0)
314 {
315 if (noError)
316 break;
318 (const char *) mic, len);
319 }
320 *p++ = c1;
321 mic++;
322 len--;
323 continue;
324 }
325 l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len);
326 if (l < 0)
327 {
328 if (noError)
329 break;
331 (const char *) mic, len);
332 }
333 if (c1 == LC_JISX0201K)
334 *p++ = mic[1];
335 else if (c1 == LC_JISX0208)
336 {
337 c1 = mic[1];
338 c2 = mic[2];
339 k = (c1 << 8) | (c2 & 0xff);
340 if (k >= 0xf5a1)
341 {
342 /* UDC1 */
343 c1 -= 0x54;
344 *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1) + 0x6f;
345 }
346 else
347 *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1);
348 *p++ = c2 - ((c1 & 1) ? ((c2 < 0xe0) ? 0x61 : 0x60) : 2);
349 }
350 else if (c1 == LC_JISX0212)
351 {
352 int i,
353 k2;
354
355 c1 = mic[1];
356 c2 = mic[2];
357 k = c1 << 8 | c2;
358 if (k >= 0xf5a1)
359 {
360 /* UDC2 */
361 c1 -= 0x54;
362 *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1) + 0x74;
363 *p++ = c2 - ((c1 & 1) ? ((c2 < 0xe0) ? 0x61 : 0x60) : 2);
364 }
365 else
366 {
367 /* IBM kanji */
368 for (i = 0;; i++)
369 {
370 k2 = ibmkanji[i].euc & 0xffff;
371 if (k2 == 0xffff)
372 {
373 *p++ = PGSJISALTCODE >> 8;
374 *p++ = PGSJISALTCODE & 0xff;
375 break;
376 }
377 if (k2 == k)
378 {
379 k = ibmkanji[i].sjis;
380 *p++ = k >> 8;
381 *p++ = k & 0xff;
382 break;
383 }
384 }
385 }
386 }
387 else
388 {
389 if (noError)
390 break;
392 (const char *) mic, len);
393 }
394 mic += l;
395 len -= l;
396 }
397 *p = '\0';
398
399 return mic - start;
400}
401
402/*
403 * EUC_JP ---> MIC
404 */
405static int
406euc_jp2mic(const unsigned char *euc, unsigned char *p, int len, bool noError)
407{
408 const unsigned char *start = euc;
409 int c1;
410 int l;
411
412 while (len > 0)
413 {
414 c1 = *euc;
415 if (!IS_HIGHBIT_SET(c1))
416 {
417 /* ASCII */
418 if (c1 == 0)
419 {
420 if (noError)
421 break;
423 (const char *) euc, len);
424 }
425 *p++ = c1;
426 euc++;
427 len--;
428 continue;
429 }
430 l = pg_encoding_verifymbchar(PG_EUC_JP, (const char *) euc, len);
431 if (l < 0)
432 {
433 if (noError)
434 break;
436 (const char *) euc, len);
437 }
438 if (c1 == SS2)
439 { /* 1 byte kana? */
440 *p++ = LC_JISX0201K;
441 *p++ = euc[1];
442 }
443 else if (c1 == SS3)
444 { /* JIS X0212 kanji? */
445 *p++ = LC_JISX0212;
446 *p++ = euc[1];
447 *p++ = euc[2];
448 }
449 else
450 { /* kanji? */
451 *p++ = LC_JISX0208;
452 *p++ = c1;
453 *p++ = euc[1];
454 }
455 euc += l;
456 len -= l;
457 }
458 *p = '\0';
459
460 return euc - start;
461}
462
463/*
464 * MIC ---> EUC_JP
465 */
466static int
467mic2euc_jp(const unsigned char *mic, unsigned char *p, int len, bool noError)
468{
469 const unsigned char *start = mic;
470 int c1;
471 int l;
472
473 while (len > 0)
474 {
475 c1 = *mic;
476 if (!IS_HIGHBIT_SET(c1))
477 {
478 /* ASCII */
479 if (c1 == 0)
480 {
481 if (noError)
482 break;
484 (const char *) mic, len);
485 }
486 *p++ = c1;
487 mic++;
488 len--;
489 continue;
490 }
491 l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len);
492 if (l < 0)
493 {
494 if (noError)
495 break;
497 (const char *) mic, len);
498 }
499 if (c1 == LC_JISX0201K)
500 {
501 *p++ = SS2;
502 *p++ = mic[1];
503 }
504 else if (c1 == LC_JISX0212)
505 {
506 *p++ = SS3;
507 *p++ = mic[1];
508 *p++ = mic[2];
509 }
510 else if (c1 == LC_JISX0208)
511 {
512 *p++ = mic[1];
513 *p++ = mic[2];
514 }
515 else
516 {
517 if (noError)
518 break;
520 (const char *) mic, len);
521 }
522 mic += l;
523 len -= l;
524 }
525 *p = '\0';
526
527 return mic - start;
528}
529
530/*
531 * EUC_JP -> SJIS
532 */
533static int
534euc_jp2sjis(const unsigned char *euc, unsigned char *p, int len, bool noError)
535{
536 const unsigned char *start = euc;
537 int c1,
538 c2,
539 k;
540 int l;
541
542 while (len > 0)
543 {
544 c1 = *euc;
545 if (!IS_HIGHBIT_SET(c1))
546 {
547 /* ASCII */
548 if (c1 == 0)
549 {
550 if (noError)
551 break;
553 (const char *) euc, len);
554 }
555 *p++ = c1;
556 euc++;
557 len--;
558 continue;
559 }
560 l = pg_encoding_verifymbchar(PG_EUC_JP, (const char *) euc, len);
561 if (l < 0)
562 {
563 if (noError)
564 break;
566 (const char *) euc, len);
567 }
568 if (c1 == SS2)
569 {
570 /* hankaku kana? */
571 *p++ = euc[1];
572 }
573 else if (c1 == SS3)
574 {
575 /* JIS X0212 kanji? */
576 c1 = euc[1];
577 c2 = euc[2];
578 k = c1 << 8 | c2;
579 if (k >= 0xf5a1)
580 {
581 /* UDC2 */
582 c1 -= 0x54;
583 *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1) + 0x74;
584 *p++ = c2 - ((c1 & 1) ? ((c2 < 0xe0) ? 0x61 : 0x60) : 2);
585 }
586 else
587 {
588 int i,
589 k2;
590
591 /* IBM kanji */
592 for (i = 0;; i++)
593 {
594 k2 = ibmkanji[i].euc & 0xffff;
595 if (k2 == 0xffff)
596 {
597 *p++ = PGSJISALTCODE >> 8;
598 *p++ = PGSJISALTCODE & 0xff;
599 break;
600 }
601 if (k2 == k)
602 {
603 k = ibmkanji[i].sjis;
604 *p++ = k >> 8;
605 *p++ = k & 0xff;
606 break;
607 }
608 }
609 }
610 }
611 else
612 {
613 /* JIS X0208 kanji? */
614 c2 = euc[1];
615 k = (c1 << 8) | (c2 & 0xff);
616 if (k >= 0xf5a1)
617 {
618 /* UDC1 */
619 c1 -= 0x54;
620 *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1) + 0x6f;
621 }
622 else
623 *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1);
624 *p++ = c2 - ((c1 & 1) ? ((c2 < 0xe0) ? 0x61 : 0x60) : 2);
625 }
626 euc += l;
627 len -= l;
628 }
629 *p = '\0';
630
631 return euc - start;
632}
633
634/*
635 * SJIS ---> EUC_JP
636 */
637static int
638sjis2euc_jp(const unsigned char *sjis, unsigned char *p, int len, bool noError)
639{
640 const unsigned char *start = sjis;
641 int c1,
642 c2,
643 i,
644 k,
645 k2;
646 int l;
647
648 while (len > 0)
649 {
650 c1 = *sjis;
651 if (!IS_HIGHBIT_SET(c1))
652 {
653 /* ASCII */
654 if (c1 == 0)
655 {
656 if (noError)
657 break;
659 (const char *) sjis, len);
660 }
661 *p++ = c1;
662 sjis++;
663 len--;
664 continue;
665 }
666 l = pg_encoding_verifymbchar(PG_SJIS, (const char *) sjis, len);
667 if (l < 0)
668 {
669 if (noError)
670 break;
672 (const char *) sjis, len);
673 }
674 if (c1 >= 0xa1 && c1 <= 0xdf)
675 {
676 /* JIS X0201 (1 byte kana) */
677 *p++ = SS2;
678 *p++ = c1;
679 }
680 else
681 {
682 /*
683 * JIS X0208, X0212, user defined extended characters
684 */
685 c2 = sjis[1];
686 k = (c1 << 8) + c2;
687 if (k >= 0xed40 && k < 0xf040)
688 {
689 /* NEC selection IBM kanji */
690 for (i = 0;; i++)
691 {
692 k2 = ibmkanji[i].nec;
693 if (k2 == 0xffff)
694 break;
695 if (k2 == k)
696 {
697 k = ibmkanji[i].sjis;
698 c1 = (k >> 8) & 0xff;
699 c2 = k & 0xff;
700 }
701 }
702 }
703
704 if (k < 0xeb3f)
705 {
706 /* JIS X0208 */
707 *p++ = ((c1 & 0x3f) << 1) + 0x9f + (c2 > 0x9e);
708 *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
709 }
710 else if ((k >= 0xeb40 && k < 0xf040) || (k >= 0xfc4c && k <= 0xfcfc))
711 {
712 /* NEC selection IBM kanji - Other undecided justice */
713 *p++ = PGEUCALTCODE >> 8;
714 *p++ = PGEUCALTCODE & 0xff;
715 }
716 else if (k >= 0xf040 && k < 0xf540)
717 {
718 /*
719 * UDC1 mapping to X0208 85 ku - 94 ku JIS code 0x7521 -
720 * 0x7e7e EUC 0xf5a1 - 0xfefe
721 */
722 c1 -= 0x6f;
723 *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e);
724 *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
725 }
726 else if (k >= 0xf540 && k < 0xfa40)
727 {
728 /*
729 * UDC2 mapping to X0212 85 ku - 94 ku JIS code 0x7521 -
730 * 0x7e7e EUC 0x8ff5a1 - 0x8ffefe
731 */
732 *p++ = SS3;
733 c1 -= 0x74;
734 *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e);
735 *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
736 }
737 else if (k >= 0xfa40)
738 {
739 /*
740 * mapping IBM kanji to X0208 and X0212
741 *
742 */
743 for (i = 0;; i++)
744 {
745 k2 = ibmkanji[i].sjis;
746 if (k2 == 0xffff)
747 break;
748 if (k2 == k)
749 {
750 k = ibmkanji[i].euc;
751 if (k >= 0x8f0000)
752 {
753 *p++ = SS3;
754 *p++ = 0x80 | ((k & 0xff00) >> 8);
755 *p++ = 0x80 | (k & 0xff);
756 }
757 else
758 {
759 *p++ = 0x80 | (k >> 8);
760 *p++ = 0x80 | (k & 0xff);
761 }
762 }
763 }
764 }
765 }
766 sjis += l;
767 len -= l;
768 }
769 *p = '\0';
770
771 return sjis - start;
772}
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1112
static int mic2euc_jp(const unsigned char *mic, unsigned char *p, int len, bool noError)
#define PGSJISALTCODE
Datum mic_to_sjis(PG_FUNCTION_ARGS)
Datum euc_jp_to_sjis(PG_FUNCTION_ARGS)
PG_MODULE_MAGIC
static int sjis2euc_jp(const unsigned char *sjis, unsigned char *p, int len, bool noError)
PG_FUNCTION_INFO_V1(euc_jp_to_sjis)
Datum euc_jp_to_mic(PG_FUNCTION_ARGS)
static int sjis2mic(const unsigned char *sjis, unsigned char *p, int len, bool noError)
Datum sjis_to_euc_jp(PG_FUNCTION_ARGS)
Datum mic_to_euc_jp(PG_FUNCTION_ARGS)
Datum sjis_to_mic(PG_FUNCTION_ARGS)
static int euc_jp2mic(const unsigned char *euc, unsigned char *p, int len, bool noError)
static int euc_jp2sjis(const unsigned char *euc, unsigned char *p, int len, bool noError)
#define PGEUCALTCODE
static int mic2sjis(const unsigned char *mic, unsigned char *p, int len, bool noError)
#define PG_GETARG_CSTRING(n)
Definition: fmgr.h:277
#define PG_RETURN_INT32(x)
Definition: fmgr.h:354
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
#define PG_GETARG_BOOL(n)
Definition: fmgr.h:274
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
return str start
int i
Definition: isn.c:72
void report_untranslatable_char(int src_encoding, int dest_encoding, const char *mbstr, int len)
Definition: mbutils.c:1730
void report_invalid_encoding(int encoding, const char *mbstr, int len)
Definition: mbutils.c:1698
const void size_t len
#define ISSJISTAIL(c)
Definition: pg_wchar.h:45
@ PG_MULE_INTERNAL
Definition: pg_wchar.h:233
@ PG_SJIS
Definition: pg_wchar.h:264
@ PG_EUC_JP
Definition: pg_wchar.h:227
#define SS2
Definition: pg_wchar.h:38
#define LC_JISX0208
Definition: pg_wchar.h:134
#define CHECK_ENCODING_CONVERSION_ARGS(srcencoding, destencoding)
Definition: pg_wchar.h:507
#define ISSJISHEAD(c)
Definition: pg_wchar.h:44
#define LC_JISX0212
Definition: pg_wchar.h:136
#define LC_JISX0201K
Definition: pg_wchar.h:113
#define SS3
Definition: pg_wchar.h:39
uintptr_t Datum
Definition: postgres.h:69
int pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
Definition: wchar.c:2103