PostgreSQL Source Code git master
Loading...
Searching...
No Matches
wchar.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * wchar.c
4 * Functions for working with multibyte characters in various encodings.
5 *
6 * Portions Copyright (c) 1998-2026, PostgreSQL Global Development Group
7 *
8 * IDENTIFICATION
9 * src/common/wchar.c
10 *
11 *-------------------------------------------------------------------------
12 */
13#include "c.h"
14
15#include <limits.h>
16
17#include "mb/pg_wchar.h"
18#include "utils/ascii.h"
19
20
21/*
22 * In today's multibyte encodings other than UTF8, this two-byte sequence
23 * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
24 *
25 * For historical reasons, several verifychar implementations opt to reject
26 * this pair specifically. Byte pair range constraints, in encoding
27 * originator documentation, always excluded this pair. No core conversion
28 * could translate it. However, longstanding verifychar implementations
29 * accepted any non-NUL byte. big5_to_euc_tw and big5_to_mic even translate
30 * pairs not valid per encoding originator documentation. To avoid tightening
31 * core or non-core conversions in a security patch, we sought this one pair.
32 *
33 * PQescapeString() historically used spaces for BYTE1; many other values
34 * could suffice for BYTE1.
35 */
/* First and second byte of the deliberately-invalid two-byte sequence. */
#define NONUTF8_INVALID_BYTE0 (0x8d)
#define NONUTF8_INVALID_BYTE1 (' ')
38
39
40/*
41 * Operations on multi-byte encodings are driven by a table of helper
42 * functions.
43 *
44 * To add support for an encoding, define mblen(), dsplen(), verifychar() and
45 * verifystr() for the encoding. For server-encodings, also define mb2wchar()
46 * and wchar2mb() conversion functions.
47 *
48 * These functions generally assume that their input is validly formed.
49 * The "verifier" functions, further down in the file, have to be more
50 * paranoid.
51 *
52 * We expect that mblen() does not need to examine more than the first byte
53 * of the character to discover the correct length. GB18030 is an exception
54 * to that rule, though, as it also looks at second byte. But even that
55 * behaves in a predictable way, if you only pass the first byte: it will
56 * treat 4-byte encoded characters as two 2-byte encoded characters, which is
57 * good enough for all current uses.
58 *
59 * Note: for the display output of psql to work properly, the return values
60 * of the dsplen functions must conform to the Unicode standard. In particular
61 * the NUL character is zero width and control characters are generally
62 * width -1. It is recommended that non-ASCII encodings refer their ASCII
63 * subset to the ASCII routines to ensure consistency.
64 */
65
66/* No error-reporting facility. Ignore incomplete trailing byte sequence. */
/*
 * Leave the enclosing loop when fewer than "need" input bytes remain.
 * Must be used as a statement directly inside the conversion loop, since it
 * expands to a bare "break".
 */
#define MB2CHAR_NEED_AT_LEAST(len, need) if ((len) < (need)) break
68
69/*
70 * SQL/ASCII
71 */
72static int
73pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
74{
75 int cnt = 0;
76
77 while (len > 0 && *from)
78 {
79 *to++ = *from++;
80 len--;
81 cnt++;
82 }
83 *to = 0;
84 return cnt;
85}
86
/* Every SQL_ASCII character occupies exactly one byte; "s" is not examined. */
static int
pg_ascii_mblen(const unsigned char *s)
{
    (void) s;
    return 1;
}
92
/*
 * Display width of a single-byte character: 0 for NUL, -1 for bytes below
 * 0x20 and for 0x7f, and 1 for everything else.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
    const unsigned char ch = *s;

    if (ch == '\0')
        return 0;

    return (ch >= 0x20 && ch != 0x7f) ? 1 : -1;
}
103
104/*
105 * EUC
106 */
107static int
108pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
109{
110 int cnt = 0;
111
112 while (len > 0 && *from)
113 {
114 if (*from == SS2) /* JIS X 0201 (so called "1 byte KANA") */
115 {
117 from++;
118 *to = (SS2 << 8) | *from++;
119 len -= 2;
120 }
121 else if (*from == SS3) /* JIS X 0212 KANJI */
122 {
124 from++;
125 *to = (SS3 << 16) | (*from++ << 8);
126 *to |= *from++;
127 len -= 3;
128 }
129 else if (IS_HIGHBIT_SET(*from)) /* JIS X 0208 KANJI */
130 {
132 *to = *from++ << 8;
133 *to |= *from++;
134 len -= 2;
135 }
136 else /* must be ASCII */
137 {
138 *to = *from++;
139 len--;
140 }
141 to++;
142 cnt++;
143 }
144 *to = 0;
145 return cnt;
146}
147
148static inline int
149pg_euc_mblen(const unsigned char *s)
150{
151 int len;
152
153 if (*s == SS2)
154 len = 2;
155 else if (*s == SS3)
156 len = 3;
157 else if (IS_HIGHBIT_SET(*s))
158 len = 2;
159 else
160 len = 1;
161 return len;
162}
163
164static inline int
165pg_euc_dsplen(const unsigned char *s)
166{
167 int len;
168
169 if (*s == SS2)
170 len = 2;
171 else if (*s == SS3)
172 len = 2;
173 else if (IS_HIGHBIT_SET(*s))
174 len = 2;
175 else
176 len = pg_ascii_dsplen(s);
177 return len;
178}
179
180/*
181 * EUC_JP
182 */
183static int
184pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
185{
186 return pg_euc2wchar_with_len(from, to, len);
187}
188
/* EUC_JP character byte length: same rules as generic EUC. */
static int
pg_eucjp_mblen(const unsigned char *s)
{
    const int   nbytes = pg_euc_mblen(s);

    return nbytes;
}
194
195static int
196pg_eucjp_dsplen(const unsigned char *s)
197{
198 int len;
199
200 if (*s == SS2)
201 len = 1;
202 else if (*s == SS3)
203 len = 2;
204 else if (IS_HIGHBIT_SET(*s))
205 len = 2;
206 else
207 len = pg_ascii_dsplen(s);
208 return len;
209}
210
211/*
212 * EUC_KR
213 */
214static int
215pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
216{
217 return pg_euc2wchar_with_len(from, to, len);
218}
219
/* EUC_KR character byte length: same rules as generic EUC. */
static int
pg_euckr_mblen(const unsigned char *s)
{
    const int   nbytes = pg_euc_mblen(s);

    return nbytes;
}
225
/* EUC_KR display width: same rules as generic EUC. */
static int
pg_euckr_dsplen(const unsigned char *s)
{
    const int   width = pg_euc_dsplen(s);

    return width;
}
231
232/*
233 * EUC_CN
234 *
235 */
236static int
237pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
238{
239 int cnt = 0;
240
241 while (len > 0 && *from)
242 {
243 if (*from == SS2) /* code set 2 (unused?) */
244 {
246 from++;
247 *to = (SS2 << 16) | (*from++ << 8);
248 *to |= *from++;
249 len -= 3;
250 }
251 else if (*from == SS3) /* code set 3 (unused ?) */
252 {
254 from++;
255 *to = (SS3 << 16) | (*from++ << 8);
256 *to |= *from++;
257 len -= 3;
258 }
259 else if (IS_HIGHBIT_SET(*from)) /* code set 1 */
260 {
262 *to = *from++ << 8;
263 *to |= *from++;
264 len -= 2;
265 }
266 else
267 {
268 *to = *from++;
269 len--;
270 }
271 to++;
272 cnt++;
273 }
274 *to = 0;
275 return cnt;
276}
277
278/*
279 * mbverifychar does not accept SS2 or SS3 (CS2 and CS3 are not defined for
280 * EUC_CN), but mb2wchar_with_len does. Tell a coherent story for code that
281 * relies on agreement between mb2wchar_with_len and mblen. Invalid text
282 * datums (e.g. from shared catalogs) reach this.
283 */
284static int
285pg_euccn_mblen(const unsigned char *s)
286{
287 int len;
288
289 if (*s == SS2)
290 len = 3;
291 else if (*s == SS3)
292 len = 3;
293 else if (IS_HIGHBIT_SET(*s))
294 len = 2;
295 else
296 len = 1;
297 return len;
298}
299
/* EUC_CN display width: 2 columns for high-bit lead bytes, else ASCII rules. */
static int
pg_euccn_dsplen(const unsigned char *s)
{
    return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
311
312/*
313 * EUC_TW
314 *
315 */
316static int
317pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
318{
319 int cnt = 0;
320
321 while (len > 0 && *from)
322 {
323 if (*from == SS2) /* code set 2 */
324 {
326 from++;
327 *to = (((uint32) SS2) << 24) | (*from++ << 16);
328 *to |= *from++ << 8;
329 *to |= *from++;
330 len -= 4;
331 }
332 else if (*from == SS3) /* code set 3 (unused?) */
333 {
335 from++;
336 *to = (SS3 << 16) | (*from++ << 8);
337 *to |= *from++;
338 len -= 3;
339 }
340 else if (IS_HIGHBIT_SET(*from)) /* code set 2 */
341 {
343 *to = *from++ << 8;
344 *to |= *from++;
345 len -= 2;
346 }
347 else
348 {
349 *to = *from++;
350 len--;
351 }
352 to++;
353 cnt++;
354 }
355 *to = 0;
356 return cnt;
357}
358
359static int
360pg_euctw_mblen(const unsigned char *s)
361{
362 int len;
363
364 if (*s == SS2)
365 len = 4;
366 else if (*s == SS3)
367 len = 3;
368 else if (IS_HIGHBIT_SET(*s))
369 len = 2;
370 else
371 len = 1;
372 return len;
373}
374
375static int
376pg_euctw_dsplen(const unsigned char *s)
377{
378 int len;
379
380 if (*s == SS2)
381 len = 2;
382 else if (*s == SS3)
383 len = 2;
384 else if (IS_HIGHBIT_SET(*s))
385 len = 2;
386 else
387 len = pg_ascii_dsplen(s);
388 return len;
389}
390
391/*
392 * Convert pg_wchar to EUC_* encoding.
393 * caller must allocate enough space for "to", including a trailing zero!
394 * len: length of from.
395 * "from" not necessarily null terminated.
396 */
397static int
398pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
399{
400 int cnt = 0;
401
402 while (len > 0 && *from)
403 {
404 unsigned char c;
405
406 if ((c = (*from >> 24)))
407 {
408 *to++ = c;
409 *to++ = (*from >> 16) & 0xff;
410 *to++ = (*from >> 8) & 0xff;
411 *to++ = *from & 0xff;
412 cnt += 4;
413 }
414 else if ((c = (*from >> 16)))
415 {
416 *to++ = c;
417 *to++ = (*from >> 8) & 0xff;
418 *to++ = *from & 0xff;
419 cnt += 3;
420 }
421 else if ((c = (*from >> 8)))
422 {
423 *to++ = c;
424 *to++ = *from & 0xff;
425 cnt += 2;
426 }
427 else
428 {
429 *to++ = *from;
430 cnt++;
431 }
432 from++;
433 len--;
434 }
435 *to = 0;
436 return cnt;
437}
438
439
440/*
441 * JOHAB
442 */
/* JOHAB character byte length: uses the generic EUC lead-byte rules. */
static int
pg_johab_mblen(const unsigned char *s)
{
    const int   nbytes = pg_euc_mblen(s);

    return nbytes;
}
448
/* JOHAB display width: uses the generic EUC rules. */
static int
pg_johab_dsplen(const unsigned char *s)
{
    const int   width = pg_euc_dsplen(s);

    return width;
}
454
455/*
456 * convert UTF8 string to pg_wchar (UCS-4)
457 * caller must allocate enough space for "to", including a trailing zero!
458 * len: length of from.
459 * "from" not necessarily null terminated.
460 */
461static int
462pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
463{
464 int cnt = 0;
465 uint32 c1,
466 c2,
467 c3,
468 c4;
469
470 while (len > 0 && *from)
471 {
472 if ((*from & 0x80) == 0)
473 {
474 *to = *from++;
475 len--;
476 }
477 else if ((*from & 0xe0) == 0xc0)
478 {
480 c1 = *from++ & 0x1f;
481 c2 = *from++ & 0x3f;
482 *to = (c1 << 6) | c2;
483 len -= 2;
484 }
485 else if ((*from & 0xf0) == 0xe0)
486 {
488 c1 = *from++ & 0x0f;
489 c2 = *from++ & 0x3f;
490 c3 = *from++ & 0x3f;
491 *to = (c1 << 12) | (c2 << 6) | c3;
492 len -= 3;
493 }
494 else if ((*from & 0xf8) == 0xf0)
495 {
497 c1 = *from++ & 0x07;
498 c2 = *from++ & 0x3f;
499 c3 = *from++ & 0x3f;
500 c4 = *from++ & 0x3f;
501 *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
502 len -= 4;
503 }
504 else
505 {
506 /* treat a bogus char as length 1; not ours to raise error */
507 *to = *from++;
508 len--;
509 }
510 to++;
511 cnt++;
512 }
513 *to = 0;
514 return cnt;
515}
516
517
518/*
519 * Trivial conversion from pg_wchar to UTF-8.
520 * caller should allocate enough space for "to"
521 * len: length of from.
522 * "from" not necessarily null terminated.
523 */
524static int
525pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
526{
527 int cnt = 0;
528
529 while (len > 0 && *from)
530 {
531 int char_len;
532
533 unicode_to_utf8(*from, to);
535 cnt += char_len;
536 to += char_len;
537 from++;
538 len--;
539 }
540 *to = 0;
541 return cnt;
542}
543
544/*
545 * Return the byte length of a UTF8 character pointed to by s
546 *
547 * Note: in the current implementation we do not support UTF8 sequences
548 * of more than 4 bytes; hence do NOT return a value larger than 4.
549 * We return "1" for any leading byte that is either flat-out illegal or
550 * indicates a length larger than we support.
551 *
552 * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
553 * other places would need to be fixed to change this.
554 */
/*
 * Byte length of the UTF8 character starting at "s", judged from the lead
 * byte alone.  Sequences longer than 4 bytes are not supported (unless
 * NOT_USED is defined); such lead bytes, and any other illegal lead byte,
 * are reported as length 1.
 */
int
pg_utf_mblen(const unsigned char *s)
{
    const unsigned char lead = *s;

    if ((lead & 0x80) == 0)
        return 1;
    if ((lead & 0xe0) == 0xc0)
        return 2;
    if ((lead & 0xf0) == 0xe0)
        return 3;
    if ((lead & 0xf8) == 0xf0)
        return 4;
#ifdef NOT_USED
    if ((lead & 0xfc) == 0xf8)
        return 5;
    if ((lead & 0xfe) == 0xfc)
        return 6;
#endif
    return 1;
}
578
579/*
580 * This is an implementation of wcwidth() and wcswidth() as defined in
581 * "The Single UNIX Specification, Version 2, The Open Group, 1997"
582 * <http://www.unix.org/online.html>
583 *
584 * Markus Kuhn -- 2001-09-08 -- public domain
585 *
586 * customised for PostgreSQL
587 *
588 * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
589 */
590
/* Inclusive range [first, last] of code points, as searched by mbbisearch(). */
struct mbinterval
{
    unsigned int first;
    unsigned int last;
};
596
597/* auxiliary function for binary search in interval table */
598static int
599mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
600{
601 int min = 0;
602 int mid;
603
604 if (ucs < table[0].first || ucs > table[max].last)
605 return 0;
606 while (max >= min)
607 {
608 mid = (min + max) / 2;
609 if (ucs > table[mid].last)
610 min = mid + 1;
611 else if (ucs < table[mid].first)
612 max = mid - 1;
613 else
614 return 1;
615 }
616
617 return 0;
618}
619
620
621/* The following functions define the column width of an ISO 10646
622 * character as follows:
623 *
624 * - The null character (U+0000) has a column width of 0.
625 *
626 * - Other C0/C1 control characters and DEL will lead to a return
627 * value of -1.
628 *
629 * - Non-spacing and enclosing combining characters (general
630 * category code Mn, Me or Cf in the Unicode database) have a
631 * column width of 0.
632 *
633 * - Spacing characters in the East Asian Wide (W) or East Asian
634 * FullWidth (F) category as defined in Unicode Technical
635 * Report #11 have a column width of 2.
636 *
637 * - All remaining characters (including all printable
638 * ISO 8859-1 and WGL4 characters, Unicode control characters,
639 * etc.) have a column width of 1.
640 *
641 * This implementation assumes that wchar_t characters are encoded
642 * in ISO 10646.
643 */
644
645static int
647{
650
651 /* test for 8-bit control characters */
652 if (ucs == 0)
653 return 0;
654
655 if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
656 return -1;
657
658 /*
659 * binary search in table of non-spacing characters
660 *
661 * XXX: In the official Unicode sources, it is possible for a character to
662 * be described as both non-spacing and wide at the same time. As of
663 * Unicode 13.0, treating the non-spacing property as the determining
664 * factor for display width leads to the correct behavior, so do that
665 * search first.
666 */
668 sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
669 return 0;
670
671 /* binary search in table of wide characters */
673 sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
674 return 2;
675
676 return 1;
677}
678
679static int
680pg_utf_dsplen(const unsigned char *s)
681{
682 return ucs_wcwidth(utf8_to_unicode(s));
683}
684
685/*
686 * convert mule internal code to pg_wchar
687 * caller should allocate enough space for "to"
688 * len: length of from.
689 * "from" not necessarily null terminated.
690 */
691static int
692pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
693{
694 int cnt = 0;
695
696 while (len > 0 && *from)
697 {
698 if (IS_LC1(*from))
699 {
701 *to = *from++ << 16;
702 *to |= *from++;
703 len -= 2;
704 }
705 else if (IS_LCPRV1(*from))
706 {
708 from++;
709 *to = *from++ << 16;
710 *to |= *from++;
711 len -= 3;
712 }
713 else if (IS_LC2(*from))
714 {
716 *to = *from++ << 16;
717 *to |= *from++ << 8;
718 *to |= *from++;
719 len -= 3;
720 }
721 else if (IS_LCPRV2(*from))
722 {
724 from++;
725 *to = *from++ << 16;
726 *to |= *from++ << 8;
727 *to |= *from++;
728 len -= 4;
729 }
730 else
731 { /* assume ASCII */
732 *to = (unsigned char) *from++;
733 len--;
734 }
735 to++;
736 cnt++;
737 }
738 *to = 0;
739 return cnt;
740}
741
742/*
743 * convert pg_wchar to mule internal code
744 * caller should allocate enough space for "to"
745 * len: length of from.
746 * "from" not necessarily null terminated.
747 */
748static int
749pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
750{
751 int cnt = 0;
752
753 while (len > 0 && *from)
754 {
755 unsigned char lb;
756
757 lb = (*from >> 16) & 0xff;
758 if (IS_LC1(lb))
759 {
760 *to++ = lb;
761 *to++ = *from & 0xff;
762 cnt += 2;
763 }
764 else if (IS_LC2(lb))
765 {
766 *to++ = lb;
767 *to++ = (*from >> 8) & 0xff;
768 *to++ = *from & 0xff;
769 cnt += 3;
770 }
771 else if (IS_LCPRV1_A_RANGE(lb))
772 {
773 *to++ = LCPRV1_A;
774 *to++ = lb;
775 *to++ = *from & 0xff;
776 cnt += 3;
777 }
778 else if (IS_LCPRV1_B_RANGE(lb))
779 {
780 *to++ = LCPRV1_B;
781 *to++ = lb;
782 *to++ = *from & 0xff;
783 cnt += 3;
784 }
785 else if (IS_LCPRV2_A_RANGE(lb))
786 {
787 *to++ = LCPRV2_A;
788 *to++ = lb;
789 *to++ = (*from >> 8) & 0xff;
790 *to++ = *from & 0xff;
791 cnt += 4;
792 }
793 else if (IS_LCPRV2_B_RANGE(lb))
794 {
795 *to++ = LCPRV2_B;
796 *to++ = lb;
797 *to++ = (*from >> 8) & 0xff;
798 *to++ = *from & 0xff;
799 cnt += 4;
800 }
801 else
802 {
803 *to++ = *from & 0xff;
804 cnt += 1;
805 }
806 from++;
807 len--;
808 }
809 *to = 0;
810 return cnt;
811}
812
813/* exported for direct use by conv.c */
/*
 * Byte length of a mule-internal-code character from its first byte:
 * LC1 -> 2, LCPRV1 -> 3, LC2 -> 3, LCPRV2 -> 4, anything else -> 1.
 */
int
pg_mule_mblen(const unsigned char *s)
{
    if (IS_LC1(*s))
        return 2;
    if (IS_LCPRV1(*s))
        return 3;
    if (IS_LC2(*s))
        return 3;
    if (IS_LCPRV2(*s))
        return 4;

    return 1;                   /* assume ASCII */
}
831
/*
 * Display width of a mule-internal-code character: 1 column for LC1 and
 * LCPRV1 lead bytes, 2 columns for LC2 and LCPRV2, and 1 otherwise.
 *
 * Treating every two-byte charset as double-wide is only an approximation,
 * but it is considered acceptable for the charsets currently supported.
 */
static int
pg_mule_dsplen(const unsigned char *s)
{
    if (IS_LC1(*s) || IS_LCPRV1(*s))
        return 1;
    if (IS_LC2(*s) || IS_LCPRV2(*s))
        return 2;

    return 1;                   /* assume ASCII */
}
856
857/*
858 * ISO8859-1
859 */
860static int
861pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
862{
863 int cnt = 0;
864
865 while (len > 0 && *from)
866 {
867 *to++ = *from++;
868 len--;
869 cnt++;
870 }
871 *to = 0;
872 return cnt;
873}
874
875/*
876 * Trivial conversion from pg_wchar to single byte encoding. Just ignores
877 * high bits.
878 * caller should allocate enough space for "to"
879 * len: length of from.
880 * "from" not necessarily null terminated.
881 */
882static int
883pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
884{
885 int cnt = 0;
886
887 while (len > 0 && *from)
888 {
889 *to++ = *from++;
890 len--;
891 cnt++;
892 }
893 *to = 0;
894 return cnt;
895}
896
/* Single-byte encoding: every character is one byte; "s" is not examined. */
static int
pg_latin1_mblen(const unsigned char *s)
{
    (void) s;
    return 1;
}
902
/* Single-byte encoding display width: same rules as ASCII. */
static int
pg_latin1_dsplen(const unsigned char *s)
{
    const int   width = pg_ascii_dsplen(s);

    return width;
}
908
909/*
910 * SJIS
911 */
/*
 * Byte length of an SJIS character from its first byte: bytes 0xa1..0xdf
 * (1 byte kana?) and bytes without the high bit are 1 byte long; any other
 * high-bit byte starts a 2-byte character.
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
    const unsigned char lead = *s;

    if (lead >= 0xa1 && lead <= 0xdf)
        return 1;               /* 1 byte kana? */

    return IS_HIGHBIT_SET(lead) ? 2 : 1;
}
925
/*
 * Display width of an SJIS character: 1 column for lead bytes 0xa1..0xdf,
 * 2 columns for any other high-bit lead byte, otherwise the ASCII rules.
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
    if (*s >= 0xa1 && *s <= 0xdf)
        return 1;               /* 1 byte kana? */
    if (IS_HIGHBIT_SET(*s))
        return 2;               /* kanji? */

    return pg_ascii_dsplen(s);  /* should be ASCII */
}
939
940/*
941 * Big5
942 */
/* Big5 character byte length: 2 if the lead byte has its high bit set, else 1. */
static int
pg_big5_mblen(const unsigned char *s)
{
    return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
954
/* Big5 display width: 2 columns for high-bit lead bytes, else ASCII rules. */
static int
pg_big5_dsplen(const unsigned char *s)
{
    return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
966
967/*
968 * GBK
969 */
/* GBK character byte length: 2 if the lead byte has its high bit set, else 1. */
static int
pg_gbk_mblen(const unsigned char *s)
{
    return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
981
/* GBK display width: 2 columns for high-bit lead bytes, else ASCII rules. */
static int
pg_gbk_dsplen(const unsigned char *s)
{
    return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
993
994/*
995 * UHC
996 */
/* UHC character byte length: 2 if the lead byte has its high bit set, else 1. */
static int
pg_uhc_mblen(const unsigned char *s)
{
    return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
1008
/* UHC display width: 2 columns for high-bit lead bytes, else ASCII rules. */
static int
pg_uhc_dsplen(const unsigned char *s)
{
    return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
1020
1021/*
1022 * GB18030
1023 * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
1024 */
1025
1026/*
1027 * Unlike all other mblen() functions, this also looks at the second byte of
1028 * the input. However, if you only pass the first byte of a multi-byte
1029 * string, and \0 as the second byte, this still works in a predictable way:
1030 * a 4-byte character will be reported as two 2-byte characters. That's
1031 * enough for all current uses, as a client-only encoding. It works that
1032 * way, because in any valid 4-byte GB18030-encoded character, the third and
1033 * fourth byte look like a 2-byte encoded character, when looked at
1034 * separately.
1035 */
/*
 * Byte length of a GB18030 character.  A lead byte without the high bit is
 * 1 byte.  Otherwise the SECOND byte decides: 0x30..0x39 means a 4-byte
 * character, anything else a 2-byte character.  The second byte is only
 * read when the first has its high bit set.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
    if (!IS_HIGHBIT_SET(*s))
        return 1;               /* ASCII */

    return (s[1] >= 0x30 && s[1] <= 0x39) ? 4 : 2;
}
1049
/* GB18030 display width: 2 columns for high-bit lead bytes, else ASCII rules. */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
    return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
1061
1062/*
1063 *-------------------------------------------------------------------
1064 * multibyte sequence validators
1065 *
1066 * The verifychar functions accept "s", a pointer to the first byte of a
1067 * string, and "len", the remaining length of the string. If there is a
1068 * validly encoded character beginning at *s, return its length in bytes;
1069 * else return -1.
1070 *
1071 * The verifystr functions also accept "s", a pointer to a string and "len",
1072 * the length of the string. They verify the whole string, and return the
1073 * number of input bytes (<= len) that are valid. In other words, if the
1074 * whole string is valid, verifystr returns "len", otherwise it returns the
1075 * byte offset of the first invalid character. The verifystr functions must
1076 * test for and reject zeroes in the input.
1077 *
1078 * The verifychar functions can assume that len > 0 and that *s != '\0', but
1079 * they must test for and reject zeroes in any additional bytes of a
1080 * multibyte character. Note that this definition allows the function for a
1081 * single-byte encoding to be just "return 1".
1082 *-------------------------------------------------------------------
1083 */
/* Single-byte encoding: any character is accepted with length 1. */
static int
pg_ascii_verifychar(const unsigned char *s, int len)
{
    (void) s;
    (void) len;
    return 1;
}
1089
/*
 * Return the number of valid leading bytes in s[0..len): the whole length
 * if it contains no zero byte, otherwise the offset of the first zero byte.
 */
static int
pg_ascii_verifystr(const unsigned char *s, int len)
{
    const unsigned char *first_nul = memchr(s, 0, len);

    return (first_nul != NULL) ? (int) (first_nul - s) : len;
}
1100
/* True when byte "c" lies in the inclusive range 0xa1..0xfe. */
#define IS_EUC_RANGE_VALID(c) (!((c) < 0xa1 || (c) > 0xfe))
1102
1103static int
1104pg_eucjp_verifychar(const unsigned char *s, int len)
1105{
1106 int l;
1107 unsigned char c1,
1108 c2;
1109
1110 c1 = *s++;
1111
1112 switch (c1)
1113 {
1114 case SS2: /* JIS X 0201 */
1115 l = 2;
1116 if (l > len)
1117 return -1;
1118 c2 = *s++;
1119 if (c2 < 0xa1 || c2 > 0xdf)
1120 return -1;
1121 break;
1122
1123 case SS3: /* JIS X 0212 */
1124 l = 3;
1125 if (l > len)
1126 return -1;
1127 c2 = *s++;
1128 if (!IS_EUC_RANGE_VALID(c2))
1129 return -1;
1130 c2 = *s++;
1131 if (!IS_EUC_RANGE_VALID(c2))
1132 return -1;
1133 break;
1134
1135 default:
1136 if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1137 {
1138 l = 2;
1139 if (l > len)
1140 return -1;
1141 if (!IS_EUC_RANGE_VALID(c1))
1142 return -1;
1143 c2 = *s++;
1144 if (!IS_EUC_RANGE_VALID(c2))
1145 return -1;
1146 }
1147 else
1148 /* must be ASCII */
1149 {
1150 l = 1;
1151 }
1152 break;
1153 }
1154
1155 return l;
1156}
1157
/*
 * Return how many leading bytes of s[0..len) are valid EUC_JP text.
 * Scanning stops at the first zero byte or invalid character.
 */
static int
pg_eucjp_verifystr(const unsigned char *s, int len)
{
    const unsigned char *p = s;
    int         remaining = len;

    while (remaining > 0)
    {
        int         chlen = 1;

        if (IS_HIGHBIT_SET(*p))
        {
            chlen = pg_eucjp_verifychar(p, remaining);
            if (chlen == -1)
                break;
        }
        else if (*p == '\0')
            break;              /* ASCII fast path rejects embedded zero */

        p += chlen;
        remaining -= chlen;
    }

    return p - s;
}
1186
/*
 * Validate one EUC_KR character at "s" with "len" bytes remaining.
 * A lead byte without the high bit is a 1-byte character.  Otherwise the
 * character is 2 bytes and both bytes must lie in the EUC range.
 * Returns the byte length, or -1 if truncated or malformed.
 */
static int
pg_euckr_verifychar(const unsigned char *s, int len)
{
    const unsigned char lead = s[0];

    if (!IS_HIGHBIT_SET(lead))
        return 1;               /* must be ASCII */

    if (len < 2)
        return -1;
    if (!IS_EUC_RANGE_VALID(lead))
        return -1;
    if (!IS_EUC_RANGE_VALID(s[1]))
        return -1;

    return 2;
}
1215
/*
 * Return how many leading bytes of s[0..len) are valid EUC_KR text.
 * Scanning stops at the first zero byte or invalid character.
 */
static int
pg_euckr_verifystr(const unsigned char *s, int len)
{
    const unsigned char *p = s;
    int         remaining = len;

    while (remaining > 0)
    {
        int         chlen = 1;

        if (IS_HIGHBIT_SET(*p))
        {
            chlen = pg_euckr_verifychar(p, remaining);
            if (chlen == -1)
                break;
        }
        else if (*p == '\0')
            break;              /* ASCII fast path rejects embedded zero */

        p += chlen;
        remaining -= chlen;
    }

    return p - s;
}
1244
1245/* EUC-CN byte sequences are exactly same as EUC-KR */
/* EUC_CN validation reuses the EUC_KR verifiers unchanged. */
#define pg_euccn_verifychar pg_euckr_verifychar
#define pg_euccn_verifystr pg_euckr_verifystr
1248
1249static int
1250pg_euctw_verifychar(const unsigned char *s, int len)
1251{
1252 int l;
1253 unsigned char c1,
1254 c2;
1255
1256 c1 = *s++;
1257
1258 switch (c1)
1259 {
1260 case SS2: /* CNS 11643 Plane 1-7 */
1261 l = 4;
1262 if (l > len)
1263 return -1;
1264 c2 = *s++;
1265 if (c2 < 0xa1 || c2 > 0xa7)
1266 return -1;
1267 c2 = *s++;
1268 if (!IS_EUC_RANGE_VALID(c2))
1269 return -1;
1270 c2 = *s++;
1271 if (!IS_EUC_RANGE_VALID(c2))
1272 return -1;
1273 break;
1274
1275 case SS3: /* unused */
1276 return -1;
1277
1278 default:
1279 if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1280 {
1281 l = 2;
1282 if (l > len)
1283 return -1;
1284 /* no further range check on c1? */
1285 c2 = *s++;
1286 if (!IS_EUC_RANGE_VALID(c2))
1287 return -1;
1288 }
1289 else
1290 /* must be ASCII */
1291 {
1292 l = 1;
1293 }
1294 break;
1295 }
1296 return l;
1297}
1298
/*
 * Return how many leading bytes of s[0..len) are valid EUC_TW text.
 * Scanning stops at the first zero byte or invalid character.
 */
static int
pg_euctw_verifystr(const unsigned char *s, int len)
{
    const unsigned char *p = s;
    int         remaining = len;

    while (remaining > 0)
    {
        int         chlen = 1;

        if (IS_HIGHBIT_SET(*p))
        {
            chlen = pg_euctw_verifychar(p, remaining);
            if (chlen == -1)
                break;
        }
        else if (*p == '\0')
            break;              /* ASCII fast path rejects embedded zero */

        p += chlen;
        remaining -= chlen;
    }

    return p - s;
}
1327
/*
 * Validate one JOHAB character at "s" with "len" bytes remaining.
 * The length comes from pg_johab_mblen(); for a high-bit lead byte every
 * trailing byte must lie in the EUC range.  Returns the byte length, or -1
 * if the character is truncated or malformed.
 */
static int
pg_johab_verifychar(const unsigned char *s, int len)
{
    const int   mbl = pg_johab_mblen(s);
    int         i;

    if (len < mbl)
        return -1;

    if (!IS_HIGHBIT_SET(*s))
        return mbl;

    for (i = 1; i < mbl; i++)
    {
        if (!IS_EUC_RANGE_VALID(s[i]))
            return -1;
    }
    return mbl;
}
1351
/*
 * Return how many leading bytes of s[0..len) are valid JOHAB text.
 * Scanning stops at the first zero byte or invalid character.
 */
static int
pg_johab_verifystr(const unsigned char *s, int len)
{
    const unsigned char *p = s;
    int         remaining = len;

    while (remaining > 0)
    {
        int         chlen = 1;

        if (IS_HIGHBIT_SET(*p))
        {
            chlen = pg_johab_verifychar(p, remaining);
            if (chlen == -1)
                break;
        }
        else if (*p == '\0')
            break;              /* ASCII fast path rejects embedded zero */

        p += chlen;
        remaining -= chlen;
    }

    return p - s;
}
1380
/*
 * Validate one mule-internal-code character at "s" with "len" bytes
 * remaining.  The length comes from pg_mule_mblen(); every trailing byte
 * must have its high bit set.  Returns the byte length, or -1 if the
 * character is truncated or malformed.
 */
static int
pg_mule_verifychar(const unsigned char *s, int len)
{
    const int   mbl = pg_mule_mblen(s);
    int         i;

    if (len < mbl)
        return -1;

    for (i = 1; i < mbl; i++)
    {
        const unsigned char trail = s[i];

        if (!IS_HIGHBIT_SET(trail))
            return -1;
    }
    return mbl;
}
1401
/*
 * Return how many leading bytes of s[0..len) are valid mule internal code.
 * Scanning stops at the first zero byte or invalid character.
 */
static int
pg_mule_verifystr(const unsigned char *s, int len)
{
    const unsigned char *p = s;
    int         remaining = len;

    while (remaining > 0)
    {
        int         chlen = 1;

        if (IS_HIGHBIT_SET(*p))
        {
            chlen = pg_mule_verifychar(p, remaining);
            if (chlen == -1)
                break;
        }
        else if (*p == '\0')
            break;              /* ASCII fast path rejects embedded zero */

        p += chlen;
        remaining -= chlen;
    }

    return p - s;
}
1430
/* Single-byte encoding: any character is accepted with length 1. */
static int
pg_latin1_verifychar(const unsigned char *s, int len)
{
    (void) s;
    (void) len;
    return 1;
}
1436
/*
 * Return the number of valid leading bytes in s[0..len): the whole length
 * if it contains no zero byte, otherwise the offset of the first zero byte.
 */
static int
pg_latin1_verifystr(const unsigned char *s, int len)
{
    const unsigned char *first_nul = memchr(s, 0, len);

    return (first_nul != NULL) ? (int) (first_nul - s) : len;
}
1447
/*
 * Validate one SJIS character at "s" with "len" bytes remaining.
 * One-byte characters are accepted as classified by pg_sjis_mblen();
 * two-byte characters must pass ISSJISHEAD on the first byte and ISSJISTAIL
 * on the second.  Returns the byte length, or -1 if truncated or malformed.
 */
static int
pg_sjis_verifychar(const unsigned char *s, int len)
{
    const int   mbl = pg_sjis_mblen(s);
    unsigned char head,
                tail;

    if (len < mbl)
        return -1;

    if (mbl == 1)               /* pg_sjis_mblen already verified it */
        return mbl;

    head = s[0];
    tail = s[1];
    if (ISSJISHEAD(head) && ISSJISTAIL(tail))
        return mbl;

    return -1;
}
1470
/*
 * Return how many leading bytes of s[0..len) are valid SJIS text.
 * Scanning stops at the first zero byte or invalid character.
 */
static int
pg_sjis_verifystr(const unsigned char *s, int len)
{
    const unsigned char *p = s;
    int         remaining = len;

    while (remaining > 0)
    {
        int         chlen = 1;

        if (IS_HIGHBIT_SET(*p))
        {
            chlen = pg_sjis_verifychar(p, remaining);
            if (chlen == -1)
                break;
        }
        else if (*p == '\0')
            break;              /* ASCII fast path rejects embedded zero */

        p += chlen;
        remaining -= chlen;
    }

    return p - s;
}
1499
1500static int
1501pg_big5_verifychar(const unsigned char *s, int len)
1502{
1503 int l,
1504 mbl;
1505
1506 l = mbl = pg_big5_mblen(s);
1507
1508 if (len < l)
1509 return -1;
1510
1511 if (l == 2 &&
1512 s[0] == NONUTF8_INVALID_BYTE0 &&
1513 s[1] == NONUTF8_INVALID_BYTE1)
1514 return -1;
1515
1516 while (--l > 0)
1517 {
1518 if (*++s == '\0')
1519 return -1;
1520 }
1521
1522 return mbl;
1523}
1524
/*
 * Return how many leading bytes of s[0..len) are valid Big5 text.
 * Scanning stops at the first zero byte or invalid character.
 */
static int
pg_big5_verifystr(const unsigned char *s, int len)
{
    const unsigned char *p = s;
    int         remaining = len;

    while (remaining > 0)
    {
        int         chlen = 1;

        if (IS_HIGHBIT_SET(*p))
        {
            chlen = pg_big5_verifychar(p, remaining);
            if (chlen == -1)
                break;
        }
        else if (*p == '\0')
            break;              /* ASCII fast path rejects embedded zero */

        p += chlen;
        remaining -= chlen;
    }

    return p - s;
}
1553
1554static int
1555pg_gbk_verifychar(const unsigned char *s, int len)
1556{
1557 int l,
1558 mbl;
1559
1560 l = mbl = pg_gbk_mblen(s);
1561
1562 if (len < l)
1563 return -1;
1564
1565 if (l == 2 &&
1566 s[0] == NONUTF8_INVALID_BYTE0 &&
1567 s[1] == NONUTF8_INVALID_BYTE1)
1568 return -1;
1569
1570 while (--l > 0)
1571 {
1572 if (*++s == '\0')
1573 return -1;
1574 }
1575
1576 return mbl;
1577}
1578
/*
 * Return how many leading bytes of s[0..len) are valid GBK text.
 * Scanning stops at the first zero byte or invalid character.
 */
static int
pg_gbk_verifystr(const unsigned char *s, int len)
{
    const unsigned char *p = s;
    int         remaining = len;

    while (remaining > 0)
    {
        int         chlen = 1;

        if (IS_HIGHBIT_SET(*p))
        {
            chlen = pg_gbk_verifychar(p, remaining);
            if (chlen == -1)
                break;
        }
        else if (*p == '\0')
            break;              /* ASCII fast path rejects embedded zero */

        p += chlen;
        remaining -= chlen;
    }

    return p - s;
}
1607
1608static int
1609pg_uhc_verifychar(const unsigned char *s, int len)
1610{
1611 int l,
1612 mbl;
1613
1614 l = mbl = pg_uhc_mblen(s);
1615
1616 if (len < l)
1617 return -1;
1618
1619 if (l == 2 &&
1620 s[0] == NONUTF8_INVALID_BYTE0 &&
1621 s[1] == NONUTF8_INVALID_BYTE1)
1622 return -1;
1623
1624 while (--l > 0)
1625 {
1626 if (*++s == '\0')
1627 return -1;
1628 }
1629
1630 return mbl;
1631}
1632
/*
 * Count how many leading bytes of s (at most len) form valid UHC text.
 *
 * Scanning stops at the first NUL byte or at the first character rejected
 * by pg_uhc_verifychar(); the result is the number of bytes before that
 * point.
 */
static int
pg_uhc_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;

	while (len > 0)
	{
		int			step = 1;	/* ASCII-subset bytes advance by one */

		if (IS_HIGHBIT_SET(*p))
		{
			/* multibyte lead: defer to the per-character verifier */
			step = pg_uhc_verifychar(p, len);
			if (step == -1)
				break;
		}
		else if (*p == '\0')
			break;

		p += step;
		len -= step;
	}

	return p - s;
}
1661
/*
 * Verify a single GB18030 character starting at s, with len bytes available.
 *
 * Returns 1 for a byte without the high bit set, 4 or 2 for a well-formed
 * multibyte character of that length, and -1 otherwise (including when too
 * few bytes remain for the form the leading bytes call for).
 */
static int
pg_gb18030_verifychar(const unsigned char *s, int len)
{
	/* ASCII */
	if (!IS_HIGHBIT_SET(*s))
		return 1;

	/* a second byte in 0x30..0x39 marks the 4-byte form */
	if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
	{
		if (s[0] >= 0x81 && s[0] <= 0xfe &&
			s[2] >= 0x81 && s[2] <= 0xfe &&
			s[3] >= 0x30 && s[3] <= 0x39)
			return 4;
		return -1;
	}

	/* otherwise only the 2-byte form remains possible */
	if (len >= 2 && s[0] >= 0x81 && s[0] <= 0xfe)
	{
		if ((s[1] >= 0x40 && s[1] <= 0x7e) ||
			(s[1] >= 0x80 && s[1] <= 0xfe))
			return 2;
		return -1;
	}

	return -1;
}
1692
/*
 * Count how many leading bytes of s (at most len) form valid GB18030 text.
 *
 * Scanning stops at the first NUL byte or at the first character rejected
 * by pg_gb18030_verifychar(); the result is the number of bytes before that
 * point.
 */
static int
pg_gb18030_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;

	while (len > 0)
	{
		int			step = 1;	/* ASCII-subset bytes advance by one */

		if (IS_HIGHBIT_SET(*p))
		{
			/* multibyte lead: defer to the per-character verifier */
			step = pg_gb18030_verifychar(p, len);
			if (step == -1)
				break;
		}
		else if (*p == '\0')
			break;

		p += step;
		len -= step;
	}

	return p - s;
}
1721
/*
 * Verify a single UTF-8 character starting at s, with len bytes available.
 *
 * Returns the character's byte length (1 to 4), or -1 if the byte is NUL,
 * if the sequence would run past len, or if pg_utf8_islegal() rejects it.
 */
static int
pg_utf8_verifychar(const unsigned char *s, int len)
{
	const unsigned char lead = *s;
	int			nbytes;

	/* single-byte (ASCII) case; NUL is never accepted */
	if ((lead & 0x80) == 0)
		return (lead == '\0') ? -1 : 1;

	/* derive the expected sequence length from the lead byte's bit pattern */
	if ((lead & 0xe0) == 0xc0)
		nbytes = 2;
	else if ((lead & 0xf0) == 0xe0)
		nbytes = 3;
	else if ((lead & 0xf8) == 0xf0)
		nbytes = 4;
	else
		nbytes = 1;				/* left for pg_utf8_islegal() to judge */

	if (nbytes > len || !pg_utf8_islegal(s, nbytes))
		return -1;

	return nbytes;
}
1750
1751/*
1752 * The fast path of the UTF-8 verifier uses a deterministic finite automaton
1753 * (DFA) for multibyte characters. In a traditional table-driven DFA, the
1754 * input byte and current state are used to compute an index into an array of
1755 * state transitions. Since the address of the next transition is dependent
1756 * on this computation, there is latency in executing the load instruction,
1757 * and the CPU is not kept busy.
1758 *
1759 * Instead, we use a "shift-based" DFA as described by Per Vognsen:
1760 *
1761 * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
1762 *
 * In a shift-based DFA, the input byte is an index into an array of integers
1764 * whose bit pattern encodes the state transitions. To compute the next
1765 * state, we simply right-shift the integer by the current state and apply a
1766 * mask. In this scheme, the address of the transition only depends on the
1767 * input byte, so there is better pipelining.
1768 *
1769 * The naming convention for states and transitions was adopted from a UTF-8
1770 * to UTF-16/32 transcoder, whose table is reproduced below:
1771 *
1772 * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
1773 *
1774 * ILL ASC CR1 CR2 CR3 L2A L3A L3B L3C L4A L4B L4C CLASS / STATE
1775 * ==========================================================================
1776 * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B, | BGN/END
1777 * err, err, err, err, err, err, err, err, err, err, err, err, | ERR
1778 * |
1779 * err, err, END, END, END, err, err, err, err, err, err, err, | CS1
1780 * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err, | CS2
1781 * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err, | CS3
1782 * |
1783 * err, err, err, err, CS1, err, err, err, err, err, err, err, | P3A
1784 * err, err, CS1, CS1, err, err, err, err, err, err, err, err, | P3B
1785 * |
1786 * err, err, err, CS2, CS2, err, err, err, err, err, err, err, | P4A
1787 * err, err, CS2, err, err, err, err, err, err, err, err, err, | P4B
1788 *
1789 * In the most straightforward implementation, a shift-based DFA for UTF-8
1790 * requires 64-bit integers to encode the transitions, but with an SMT solver
1791 * it's possible to find state numbers such that the transitions fit within
1792 * 32-bit integers, as Dougall Johnson demonstrated:
1793 *
1794 * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
1795 *
1796 * This packed representation is the reason for the seemingly odd choice of
1797 * state values below.
1798 */
1799
/*
 * DFA state numbers.  A state's value is also the right-shift amount applied
 * to a transition word to bring the next state into the low five bits, so
 * these particular values must not be changed independently of one another.
 */

/* Error */
#define ERR  0
/* Begin */
#define BGN 11
/* Continuation states, expect 1/2/3 continuation bytes */
#define CS1 16
#define CS2  1
#define CS3  5
/* Partial states, where the first continuation byte has a restricted range */
#define P3A  6					/* Lead was E0, check for 3-byte overlong */
#define P3B 20					/* Lead was ED, check for surrogate */
#define P4A 25					/* Lead was F0, check for 4-byte overlong */
#define P4B 30					/* Lead was F4, check for too-large */
/* Begin and End are the same state */
#define END BGN

/*
 * The encoded state transitions for the lookup table.  Each term has the
 * form (next_state << current_state); transitions that are not listed are
 * left as zero bits, i.e. ERR.
 *
 * Every multi-term expansion is fully parenthesized so that the macro stays
 * a single operand wherever it is used (e.g. "CR1 >> CS1").
 */

/* ASCII */
#define ASC (END << BGN)
/* 2-byte lead */
#define L2A (CS1 << BGN)
/* 3-byte lead */
#define L3A (P3A << BGN)
#define L3B (CS2 << BGN)
#define L3C (P3B << BGN)
/* 4-byte lead */
#define L4A (P4A << BGN)
#define L4B (CS3 << BGN)
#define L4C (P4B << BGN)
/* continuation byte */
#define CR1 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B))
#define CR2 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A))
#define CR3 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A))
/* invalid byte */
#define ILL ERR
1836
1837static const uint32 Utf8Transition[256] =
1838{
1839 /* ASCII */
1840
1841 ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1842 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1843 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1844 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1845
1846 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1847 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1848 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1849 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1850
1851 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1852 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1853 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1854 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1855
1856 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1857 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1858 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1859 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1860
1861 /* continuation bytes */
1862
1863 /* 80..8F */
1864 CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1865 CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1866
1867 /* 90..9F */
1868 CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1869 CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1870
1871 /* A0..BF */
1872 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1873 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1874 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1875 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1876
1877 /* leading bytes */
1878
1879 /* C0..DF */
1880 ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
1881 L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1882 L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1883 L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1884
1885 /* E0..EF */
1886 L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
1887 L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
1888
1889 /* F0..FF */
1890 L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
1891 ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
1892};
1893
1894static void
1895utf8_advance(const unsigned char *s, uint32 *state, int len)
1896{
1897 /* Note: We deliberately don't check the state's value here. */
1898 while (len > 0)
1899 {
1900 /*
1901 * It's important that the mask value is 31: In most instruction sets,
1902 * a shift by a 32-bit operand is understood to be a shift by its mod
1903 * 32, so the compiler should elide the mask operation.
1904 */
1905 *state = Utf8Transition[*s++] >> (*state & 31);
1906 len--;
1907 }
1908
1909 *state &= 31;
1910}
1911
1912static int
1913pg_utf8_verifystr(const unsigned char *s, int len)
1914{
1915 const unsigned char *start = s;
1916 const int orig_len = len;
1917 uint32 state = BGN;
1918
1919/*
1920 * With a stride of two vector widths, gcc will unroll the loop. Even if
1921 * the compiler can unroll a longer loop, it's not worth it because we
1922 * must fall back to the byte-wise algorithm if we find any non-ASCII.
1923 */
1924#define STRIDE_LENGTH (2 * sizeof(Vector8))
1925
1926 if (len >= STRIDE_LENGTH)
1927 {
1928 while (len >= STRIDE_LENGTH)
1929 {
1930 /*
1931 * If the chunk is all ASCII, we can skip the full UTF-8 check,
1932 * but we must first check for a non-END state, which means the
1933 * previous chunk ended in the middle of a multibyte sequence.
1934 */
1935 if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
1937
1938 s += STRIDE_LENGTH;
1939 len -= STRIDE_LENGTH;
1940 }
1941
1942 /* The error state persists, so we only need to check for it here. */
1943 if (state == ERR)
1944 {
1945 /*
1946 * Start over from the beginning with the slow path so we can
1947 * count the valid bytes.
1948 */
1949 len = orig_len;
1950 s = start;
1951 }
1952 else if (state != END)
1953 {
1954 /*
1955 * The fast path exited in the middle of a multibyte sequence.
1956 * Walk backwards to find the leading byte so that the slow path
1957 * can resume checking from there. We must always backtrack at
1958 * least one byte, since the current byte could be e.g. an ASCII
1959 * byte after a 2-byte lead, which is invalid.
1960 */
1961 do
1962 {
1963 Assert(s > start);
1964 s--;
1965 len++;
1967 } while (pg_utf_mblen(s) <= 1);
1968 }
1969 }
1970
1971 /* check remaining bytes */
1972 while (len > 0)
1973 {
1974 int l;
1975
1976 /* fast path for ASCII-subset characters */
1977 if (!IS_HIGHBIT_SET(*s))
1978 {
1979 if (*s == '\0')
1980 break;
1981 l = 1;
1982 }
1983 else
1984 {
1985 l = pg_utf8_verifychar(s, len);
1986 if (l == -1)
1987 break;
1988 }
1989 s += l;
1990 len -= l;
1991 }
1992
1993 return s - start;
1994}
1995
/*
 * Check for validity of a single UTF-8 encoded character
 *
 * This follows the rules in RFC3629.  The allowed range of the second byte
 * is narrowed for certain lead bytes so that no code point has more than one
 * encoding: a longer-than-necessary sequence with high-order zero bits is
 * rejected, since accepting it would be a security hazard (e.g. an apparent
 * non-ASCII character that decodes to plain ASCII).
 *
 * length is assumed to have been obtained by pg_utf_mblen(), and the
 * caller must have checked that that many bytes are present in the buffer.
 * Any length outside 1..4 is rejected.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char lead;
	unsigned char second_min = 0x80;
	unsigned char second_max = 0xBF;
	int			i;

	/* reject lengths 5 and 6 for now, along with anything nonsensical */
	if (length < 1 || length > 4)
		return false;

	/* a first byte in 80..C1 or above F4 can never start a character */
	lead = source[0];
	if ((lead >= 0x80 && lead < 0xC2) || lead > 0xF4)
		return false;

	if (length == 1)
		return true;

	/* some lead bytes narrow the range allowed for the second byte */
	switch (lead)
	{
		case 0xE0:
			second_min = 0xA0;
			break;
		case 0xED:
			second_max = 0x9F;
			break;
		case 0xF0:
			second_min = 0x90;
			break;
		case 0xF4:
			second_max = 0x8F;
			break;
		default:
			break;
	}
	if (source[1] < second_min || source[1] > second_max)
		return false;

	/* any third and fourth bytes must be plain continuation bytes */
	for (i = 2; i < length; i++)
	{
		if (source[i] < 0x80 || source[i] > 0xBF)
			return false;
	}

	return true;
}
2066
2067
2068/*
2069 * Fills the provided buffer with two bytes such that:
2070 * pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
2071 */
2072void
2080
2081/*
2082 *-------------------------------------------------------------------
2083 * encoding info table
2084 *-------------------------------------------------------------------
2085 */
2129};
2130
2131/*
2132 * Returns the byte length of a multibyte character.
2133 *
2134 * Choose "mblen" functions based on the input string characteristics.
2135 * pg_encoding_mblen() can be used when ANY of these conditions are met:
2136 *
2137 * - The input string is zero-terminated
2138 *
2139 * - The input string is known to be valid in the encoding (e.g., string
2140 * converted from database encoding)
2141 *
2142 * - The encoding is not GB18030 (e.g., when only database encodings are
2143 * passed to 'encoding' parameter)
2144 *
2145 * encoding==GB18030 requires examining up to two bytes to determine character
2146 * length. Therefore, callers satisfying none of those conditions must use
2147 * pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be
2148 * guaranteed to be within allocation bounds.
2149 *
2150 * When dealing with text that is not certainly valid in the specified
2151 * encoding, the result may exceed the actual remaining string length.
2152 * Callers that are not prepared to deal with that should use Min(remaining,
2153 * pg_encoding_mblen_or_incomplete()). For zero-terminated strings, that and
2154 * pg_encoding_mblen_bounded() are interchangeable.
2155 */
2156int
2158{
2159 return (PG_VALID_ENCODING(encoding) ?
2160 pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
2161 pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
2162}
2163
2164/*
2165 * Returns the byte length of a multibyte character (possibly not
2166 * zero-terminated), or INT_MAX if too few bytes remain to determine a length.
2167 */
2168int
2170 size_t remaining)
2171{
2172 /*
2173 * Define zero remaining as too few, even for single-byte encodings.
2174 * pg_gb18030_mblen() reads one or two bytes; single-byte encodings read
2175 * zero; others read one.
2176 */
2177 if (remaining < 1 ||
2179 return INT_MAX;
2181}
2182
2183/*
2184 * Returns the byte length of a multibyte character; but not more than the
2185 * distance to the terminating zero byte. For input that might lack a
2186 * terminating zero, use Min(remaining, pg_encoding_mblen_or_incomplete()).
2187 */
2188int
2193
2194/*
2195 * Returns the display length of a multibyte character.
2196 */
2197int
2199{
2200 return (PG_VALID_ENCODING(encoding) ?
2201 pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
2202 pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
2203}
2204
2205/*
2206 * Verify the first multibyte character of the given string.
2207 * Return its byte length if good, -1 if bad. (See comments above for
2208 * full details of the mbverifychar API.)
2209 */
2210int
2212{
2213 return (PG_VALID_ENCODING(encoding) ?
2214 pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
2215 pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
2216}
2217
2218/*
2219 * Verify that a string is valid for the given encoding.
2220 * Returns the number of input bytes (<= len) that form a valid string.
2221 * (See comments above for full details of the mbverifystr API.)
2222 */
2223int
2225{
2226 return (PG_VALID_ENCODING(encoding) ?
2227 pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
2228 pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
2229}
2230
2231/*
2232 * fetch maximum length of a given encoding
2233 */
2234int
2236{
2238
2239 /*
2240 * Check for the encoding despite the assert, due to some mingw versions
2241 * otherwise issuing bogus warnings.
2242 */
2243 return PG_VALID_ENCODING(encoding) ?
2246}
static bool is_valid_ascii(const unsigned char *s, int len)
Definition ascii.h:25
#define IS_HIGHBIT_SET(ch)
Definition c.h:1150
#define Assert(condition)
Definition c.h:873
uint32_t uint32
Definition c.h:546
return str start
int remaining
Definition informix.c:692
static char * encoding
Definition initdb.c:139
int a
Definition isn.c:73
#define PG_UTF8
Definition mbprint.c:43
unsigned int pg_wchar
Definition mbprint.c:31
static char32_t utf8_to_unicode(const unsigned char *c)
Definition mbprint.c:53
const void size_t len
static const struct lconv_member_info table[]
static rewind_source * source
Definition pg_rewind.c:89
#define IS_LCPRV2(c)
Definition pg_wchar.h:164
#define ISSJISTAIL(c)
Definition pg_wchar.h:45
#define pg_utf_mblen
Definition pg_wchar.h:633
@ PG_WIN1254
Definition pg_wchar.h:257
@ PG_LATIN4
Definition pg_wchar.h:237
@ PG_LATIN9
Definition pg_wchar.h:242
@ PG_JOHAB
Definition pg_wchar.h:269
@ PG_GB18030
Definition pg_wchar.h:268
@ PG_SQL_ASCII
Definition pg_wchar.h:226
@ PG_KOI8R
Definition pg_wchar.h:248
@ PG_ISO_8859_6
Definition pg_wchar.h:252
@ PG_WIN1253
Definition pg_wchar.h:256
@ PG_KOI8U
Definition pg_wchar.h:260
@ PG_LATIN6
Definition pg_wchar.h:239
@ PG_MULE_INTERNAL
Definition pg_wchar.h:233
@ PG_LATIN5
Definition pg_wchar.h:238
@ PG_EUC_CN
Definition pg_wchar.h:228
@ PG_UHC
Definition pg_wchar.h:267
@ PG_LATIN2
Definition pg_wchar.h:235
@ PG_ISO_8859_5
Definition pg_wchar.h:251
@ PG_LATIN10
Definition pg_wchar.h:243
@ PG_WIN1250
Definition pg_wchar.h:255
@ PG_ISO_8859_7
Definition pg_wchar.h:253
@ PG_SJIS
Definition pg_wchar.h:264
@ PG_LATIN8
Definition pg_wchar.h:241
@ PG_EUC_JP
Definition pg_wchar.h:227
@ PG_GBK
Definition pg_wchar.h:266
@ PG_LATIN3
Definition pg_wchar.h:236
@ PG_WIN1256
Definition pg_wchar.h:244
@ PG_LATIN1
Definition pg_wchar.h:234
@ PG_EUC_TW
Definition pg_wchar.h:230
@ PG_WIN1258
Definition pg_wchar.h:245
@ PG_SHIFT_JIS_2004
Definition pg_wchar.h:270
@ PG_WIN1252
Definition pg_wchar.h:250
@ PG_LATIN7
Definition pg_wchar.h:240
@ PG_WIN1255
Definition pg_wchar.h:258
@ PG_WIN1257
Definition pg_wchar.h:259
@ PG_WIN1251
Definition pg_wchar.h:249
@ PG_EUC_KR
Definition pg_wchar.h:229
@ PG_WIN866
Definition pg_wchar.h:246
@ PG_ISO_8859_8
Definition pg_wchar.h:254
@ PG_WIN874
Definition pg_wchar.h:247
@ PG_EUC_JIS_2004
Definition pg_wchar.h:231
@ PG_BIG5
Definition pg_wchar.h:265
#define LCPRV1_A
Definition pg_wchar.h:150
#define LCPRV1_B
Definition pg_wchar.h:151
#define IS_LC2(c)
Definition pg_wchar.h:144
#define IS_LCPRV1(c)
Definition pg_wchar.h:152
static unsigned char * unicode_to_utf8(char32_t c, unsigned char *utf8string)
Definition pg_wchar.h:575
#define LCPRV2_A
Definition pg_wchar.h:162
#define IS_LCPRV2_B_RANGE(c)
Definition pg_wchar.h:167
#define SS2
Definition pg_wchar.h:38
#define IS_LCPRV1_A_RANGE(c)
Definition pg_wchar.h:153
#define PG_VALID_ENCODING(_enc)
Definition pg_wchar.h:287
#define IS_LCPRV1_B_RANGE(c)
Definition pg_wchar.h:155
#define ISSJISHEAD(c)
Definition pg_wchar.h:44
#define IS_LC1(c)
Definition pg_wchar.h:126
#define IS_LCPRV2_A_RANGE(c)
Definition pg_wchar.h:165
#define SS3
Definition pg_wchar.h:39
#define LCPRV2_B
Definition pg_wchar.h:163
char * c
static int fb(int x)
unsigned int first
Definition wchar.c:593
unsigned int last
Definition wchar.c:594
mbstr_verifier mbverifystr
Definition pg_wchar.h:385
mblen_converter mblen
Definition pg_wchar.h:382
mbdisplaylen_converter dsplen
Definition pg_wchar.h:383
mbchar_verifier mbverifychar
Definition pg_wchar.h:384
static const struct mbinterval east_asian_fw[]
static const struct mbinterval nonspacing[]
static int pg_uhc_verifystr(const unsigned char *s, int len)
Definition wchar.c:1634
static int pg_latin1_dsplen(const unsigned char *s)
Definition wchar.c:904
int pg_encoding_mblen_bounded(int encoding, const char *mbstr)
Definition wchar.c:2189
static int pg_euctw_mblen(const unsigned char *s)
Definition wchar.c:360
static int pg_euckr_dsplen(const unsigned char *s)
Definition wchar.c:227
static const uint32 Utf8Transition[256]
Definition wchar.c:1837
bool pg_utf8_islegal(const unsigned char *source, int length)
Definition wchar.c:2011
static int pg_ascii_verifystr(const unsigned char *s, int len)
Definition wchar.c:1091
static int pg_latin1_verifychar(const unsigned char *s, int len)
Definition wchar.c:1432
static int pg_sjis_dsplen(const unsigned char *s)
Definition wchar.c:927
#define CR3
Definition wchar.c:1833
static int pg_sjis_verifychar(const unsigned char *s, int len)
Definition wchar.c:1449
static int pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:108
static int pg_eucjp_dsplen(const unsigned char *s)
Definition wchar.c:196
static int pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:73
#define L3B
Definition wchar.c:1824
static int pg_uhc_verifychar(const unsigned char *s, int len)
Definition wchar.c:1609
#define L2A
Definition wchar.c:1821
static int pg_gbk_dsplen(const unsigned char *s)
Definition wchar.c:983
static int pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:215
static int pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:317
#define END
Definition wchar.c:1814
#define pg_euccn_verifychar
Definition wchar.c:1246
#define L4C
Definition wchar.c:1829
static int pg_sjis_verifystr(const unsigned char *s, int len)
Definition wchar.c:1472
static int pg_johab_mblen(const unsigned char *s)
Definition wchar.c:444
static int pg_johab_dsplen(const unsigned char *s)
Definition wchar.c:450
static int pg_big5_verifystr(const unsigned char *s, int len)
Definition wchar.c:1526
#define CR2
Definition wchar.c:1832
static int pg_mule_verifychar(const unsigned char *s, int len)
Definition wchar.c:1382
static int pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:184
static int pg_latin1_verifystr(const unsigned char *s, int len)
Definition wchar.c:1438
static int pg_latin1_mblen(const unsigned char *s)
Definition wchar.c:898
static int pg_ascii_verifychar(const unsigned char *s, int len)
Definition wchar.c:1085
static int pg_ascii_mblen(const unsigned char *s)
Definition wchar.c:88
void pg_encoding_set_invalid(int encoding, char *dst)
Definition wchar.c:2073
static int mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
Definition wchar.c:599
static int pg_big5_dsplen(const unsigned char *s)
Definition wchar.c:956
#define pg_euccn_verifystr
Definition wchar.c:1247
int pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr, size_t remaining)
Definition wchar.c:2169
#define NONUTF8_INVALID_BYTE0
Definition wchar.c:36
static int pg_eucjp_mblen(const unsigned char *s)
Definition wchar.c:190
static int pg_gbk_verifychar(const unsigned char *s, int len)
Definition wchar.c:1555
static int pg_big5_mblen(const unsigned char *s)
Definition wchar.c:944
static int pg_euccn_dsplen(const unsigned char *s)
Definition wchar.c:301
static int pg_euctw_verifychar(const unsigned char *s, int len)
Definition wchar.c:1250
static int pg_euckr_verifychar(const unsigned char *s, int len)
Definition wchar.c:1188
static int pg_euctw_verifystr(const unsigned char *s, int len)
Definition wchar.c:1300
static int pg_gbk_verifystr(const unsigned char *s, int len)
Definition wchar.c:1580
static int pg_gb18030_dsplen(const unsigned char *s)
Definition wchar.c:1051
#define ERR
Definition wchar.c:1801
static int pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:462
int pg_mule_mblen(const unsigned char *s)
Definition wchar.c:815
static int pg_euccn_mblen(const unsigned char *s)
Definition wchar.c:285
#define ASC
Definition wchar.c:1819
static int pg_gbk_mblen(const unsigned char *s)
Definition wchar.c:971
static int pg_eucjp_verifystr(const unsigned char *s, int len)
Definition wchar.c:1159
static int pg_johab_verifystr(const unsigned char *s, int len)
Definition wchar.c:1353
static int pg_euc_dsplen(const unsigned char *s)
Definition wchar.c:165
static int pg_gb18030_verifystr(const unsigned char *s, int len)
Definition wchar.c:1694
static int pg_euckr_verifystr(const unsigned char *s, int len)
Definition wchar.c:1217
static int pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition wchar.c:749
static int pg_sjis_mblen(const unsigned char *s)
Definition wchar.c:913
#define IS_EUC_RANGE_VALID(c)
Definition wchar.c:1101
static int pg_uhc_dsplen(const unsigned char *s)
Definition wchar.c:1010
static int pg_eucjp_verifychar(const unsigned char *s, int len)
Definition wchar.c:1104
static int pg_big5_verifychar(const unsigned char *s, int len)
Definition wchar.c:1501
static int pg_gb18030_verifychar(const unsigned char *s, int len)
Definition wchar.c:1663
static int pg_mule_verifystr(const unsigned char *s, int len)
Definition wchar.c:1403
static int pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition wchar.c:398
#define L3C
Definition wchar.c:1825
static int pg_utf8_verifychar(const unsigned char *s, int len)
Definition wchar.c:1723
#define MB2CHAR_NEED_AT_LEAST(len, need)
Definition wchar.c:67
#define CR1
Definition wchar.c:1831
static int pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition wchar.c:883
static int pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition wchar.c:525
static int pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:237
static int pg_gb18030_mblen(const unsigned char *s)
Definition wchar.c:1037
int pg_encoding_dsplen(int encoding, const char *mbstr)
Definition wchar.c:2198
static void utf8_advance(const unsigned char *s, uint32 *state, int len)
Definition wchar.c:1895
static int pg_euctw_dsplen(const unsigned char *s)
Definition wchar.c:376
static int pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:861
static int pg_uhc_mblen(const unsigned char *s)
Definition wchar.c:998
static int pg_euc_mblen(const unsigned char *s)
Definition wchar.c:149
static int pg_mule_dsplen(const unsigned char *s)
Definition wchar.c:833
#define L3A
Definition wchar.c:1823
#define L4B
Definition wchar.c:1828
int pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
Definition wchar.c:2224
#define NONUTF8_INVALID_BYTE1
Definition wchar.c:37
static int pg_utf8_verifystr(const unsigned char *s, int len)
Definition wchar.c:1913
static int pg_euckr_mblen(const unsigned char *s)
Definition wchar.c:221
const pg_wchar_tbl pg_wchar_table[]
Definition wchar.c:2086
static int pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:692
#define BGN
Definition wchar.c:1803
int pg_encoding_max_length(int encoding)
Definition wchar.c:2235
int pg_encoding_mblen(int encoding, const char *mbstr)
Definition wchar.c:2157
static int pg_johab_verifychar(const unsigned char *s, int len)
Definition wchar.c:1329
#define ILL
Definition wchar.c:1835
#define STRIDE_LENGTH
#define L4A
Definition wchar.c:1827
static int pg_ascii_dsplen(const unsigned char *s)
Definition wchar.c:94
int pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
Definition wchar.c:2211
static int ucs_wcwidth(pg_wchar ucs)
Definition wchar.c:646
static int pg_utf_dsplen(const unsigned char *s)
Definition wchar.c:680