PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
wchar.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * wchar.c
4 * Functions for working with multibyte characters in various encodings.
5 *
6 * Portions Copyright (c) 1998-2025, PostgreSQL Global Development Group
7 *
8 * IDENTIFICATION
9 * src/common/wchar.c
10 *
11 *-------------------------------------------------------------------------
12 */
13#include "c.h"
14
15#include "mb/pg_wchar.h"
16#include "utils/ascii.h"
17
18
19/*
20 * In today's multibyte encodings other than UTF8, this two-byte sequence
21 * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
22 *
23 * For historical reasons, several verifychar implementations opt to reject
24 * this pair specifically. Byte pair range constraints, in encoding
25 * originator documentation, always excluded this pair. No core conversion
26 * could translate it. However, longstanding verifychar implementations
27 * accepted any non-NUL byte. big5_to_euc_tw and big5_to_mic even translate
28 * pairs not valid per encoding originator documentation. To avoid tightening
29 * core or non-core conversions in a security patch, we sought this one pair.
30 *
31 * PQescapeString() historically used spaces for BYTE1; many other values
32 * could suffice for BYTE1.
33 */
34#define NONUTF8_INVALID_BYTE0 (0x8d)
35#define NONUTF8_INVALID_BYTE1 (' ')
36
37
38/*
39 * Operations on multi-byte encodings are driven by a table of helper
40 * functions.
41 *
42 * To add support for an encoding, define mblen(), dsplen(), verifychar() and
43 * verifystr() for the encoding. For server-encodings, also define mb2wchar()
44 * and wchar2mb() conversion functions.
45 *
46 * These functions generally assume that their input is validly formed.
47 * The "verifier" functions, further down in the file, have to be more
48 * paranoid.
49 *
50 * We expect that mblen() does not need to examine more than the first byte
51 * of the character to discover the correct length. GB18030 is an exception
52 * to that rule, though, as it also looks at second byte. But even that
53 * behaves in a predictable way, if you only pass the first byte: it will
54 * treat 4-byte encoded characters as two 2-byte encoded characters, which is
55 * good enough for all current uses.
56 *
57 * Note: for the display output of psql to work properly, the return values
58 * of the dsplen functions must conform to the Unicode standard. In particular
59 * the NUL character is zero width and control characters are generally
60 * width -1. It is recommended that non-ASCII encodings refer their ASCII
61 * subset to the ASCII routines to ensure consistency.
62 */
63
64/*
65 * SQL/ASCII
66 */
67static int
68pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
69{
70 int cnt = 0;
71
72 while (len > 0 && *from)
73 {
74 *to++ = *from++;
75 len--;
76 cnt++;
77 }
78 *to = 0;
79 return cnt;
80}
81
/* Every character occupies exactly one byte, whatever its value. */
static int
pg_ascii_mblen(const unsigned char *s)
{
    (void) s;                   /* the byte value is irrelevant */
    return 1;
}
87
/*
 * Display width of the single byte at *s: 0 for NUL, -1 for bytes below
 * 0x20 and for 0x7f, and 1 for everything else.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
    const unsigned char c = *s;

    if (c == '\0')
        return 0;

    return (c < 0x20 || c == 0x7f) ? -1 : 1;
}
98
99/*
100 * EUC
101 */
102static int
103pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
104{
105 int cnt = 0;
106
107 while (len > 0 && *from)
108 {
109 if (*from == SS2 && len >= 2) /* JIS X 0201 (so called "1 byte
110 * KANA") */
111 {
112 from++;
113 *to = (SS2 << 8) | *from++;
114 len -= 2;
115 }
116 else if (*from == SS3 && len >= 3) /* JIS X 0212 KANJI */
117 {
118 from++;
119 *to = (SS3 << 16) | (*from++ << 8);
120 *to |= *from++;
121 len -= 3;
122 }
123 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
124 {
125 *to = *from++ << 8;
126 *to |= *from++;
127 len -= 2;
128 }
129 else /* must be ASCII */
130 {
131 *to = *from++;
132 len--;
133 }
134 to++;
135 cnt++;
136 }
137 *to = 0;
138 return cnt;
139}
140
141static inline int
142pg_euc_mblen(const unsigned char *s)
143{
144 int len;
145
146 if (*s == SS2)
147 len = 2;
148 else if (*s == SS3)
149 len = 3;
150 else if (IS_HIGHBIT_SET(*s))
151 len = 2;
152 else
153 len = 1;
154 return len;
155}
156
157static inline int
158pg_euc_dsplen(const unsigned char *s)
159{
160 int len;
161
162 if (*s == SS2)
163 len = 2;
164 else if (*s == SS3)
165 len = 2;
166 else if (IS_HIGHBIT_SET(*s))
167 len = 2;
168 else
169 len = pg_ascii_dsplen(s);
170 return len;
171}
172
173/*
174 * EUC_JP
175 */
176static int
177pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
178{
179 return pg_euc2wchar_with_len(from, to, len);
180}
181
/* EUC_JP character length: same rule as the generic EUC routine. */
static int
pg_eucjp_mblen(const unsigned char *s)
{
    int         nbytes = pg_euc_mblen(s);

    return nbytes;
}
187
188static int
189pg_eucjp_dsplen(const unsigned char *s)
190{
191 int len;
192
193 if (*s == SS2)
194 len = 1;
195 else if (*s == SS3)
196 len = 2;
197 else if (IS_HIGHBIT_SET(*s))
198 len = 2;
199 else
200 len = pg_ascii_dsplen(s);
201 return len;
202}
203
204/*
205 * EUC_KR
206 */
207static int
208pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
209{
210 return pg_euc2wchar_with_len(from, to, len);
211}
212
/* EUC_KR character length: same rule as the generic EUC routine. */
static int
pg_euckr_mblen(const unsigned char *s)
{
    int         nbytes = pg_euc_mblen(s);

    return nbytes;
}
218
/* EUC_KR display width: same rule as the generic EUC routine. */
static int
pg_euckr_dsplen(const unsigned char *s)
{
    int         width = pg_euc_dsplen(s);

    return width;
}
224
225/*
226 * EUC_CN
227 *
228 */
229static int
230pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
231{
232 int cnt = 0;
233
234 while (len > 0 && *from)
235 {
236 if (*from == SS2 && len >= 3) /* code set 2 (unused?) */
237 {
238 from++;
239 *to = (SS2 << 16) | (*from++ << 8);
240 *to |= *from++;
241 len -= 3;
242 }
243 else if (*from == SS3 && len >= 3) /* code set 3 (unused ?) */
244 {
245 from++;
246 *to = (SS3 << 16) | (*from++ << 8);
247 *to |= *from++;
248 len -= 3;
249 }
250 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
251 {
252 *to = *from++ << 8;
253 *to |= *from++;
254 len -= 2;
255 }
256 else
257 {
258 *to = *from++;
259 len--;
260 }
261 to++;
262 cnt++;
263 }
264 *to = 0;
265 return cnt;
266}
267
/* EUC_CN character length: 2 for a high-bit lead byte, otherwise 1. */
static int
pg_euccn_mblen(const unsigned char *s)
{
    return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
279
/* EUC_CN display width: 2 for a high-bit lead byte, else the ASCII width. */
static int
pg_euccn_dsplen(const unsigned char *s)
{
    if (IS_HIGHBIT_SET(*s))
        return 2;

    return pg_ascii_dsplen(s);
}
291
292/*
293 * EUC_TW
294 *
295 */
296static int
297pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
298{
299 int cnt = 0;
300
301 while (len > 0 && *from)
302 {
303 if (*from == SS2 && len >= 4) /* code set 2 */
304 {
305 from++;
306 *to = (((uint32) SS2) << 24) | (*from++ << 16);
307 *to |= *from++ << 8;
308 *to |= *from++;
309 len -= 4;
310 }
311 else if (*from == SS3 && len >= 3) /* code set 3 (unused?) */
312 {
313 from++;
314 *to = (SS3 << 16) | (*from++ << 8);
315 *to |= *from++;
316 len -= 3;
317 }
318 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
319 {
320 *to = *from++ << 8;
321 *to |= *from++;
322 len -= 2;
323 }
324 else
325 {
326 *to = *from++;
327 len--;
328 }
329 to++;
330 cnt++;
331 }
332 *to = 0;
333 return cnt;
334}
335
336static int
337pg_euctw_mblen(const unsigned char *s)
338{
339 int len;
340
341 if (*s == SS2)
342 len = 4;
343 else if (*s == SS3)
344 len = 3;
345 else if (IS_HIGHBIT_SET(*s))
346 len = 2;
347 else
348 len = 1;
349 return len;
350}
351
352static int
353pg_euctw_dsplen(const unsigned char *s)
354{
355 int len;
356
357 if (*s == SS2)
358 len = 2;
359 else if (*s == SS3)
360 len = 2;
361 else if (IS_HIGHBIT_SET(*s))
362 len = 2;
363 else
364 len = pg_ascii_dsplen(s);
365 return len;
366}
367
368/*
369 * Convert pg_wchar to EUC_* encoding.
370 * caller must allocate enough space for "to", including a trailing zero!
371 * len: length of from.
372 * "from" not necessarily null terminated.
373 */
374static int
375pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
376{
377 int cnt = 0;
378
379 while (len > 0 && *from)
380 {
381 unsigned char c;
382
383 if ((c = (*from >> 24)))
384 {
385 *to++ = c;
386 *to++ = (*from >> 16) & 0xff;
387 *to++ = (*from >> 8) & 0xff;
388 *to++ = *from & 0xff;
389 cnt += 4;
390 }
391 else if ((c = (*from >> 16)))
392 {
393 *to++ = c;
394 *to++ = (*from >> 8) & 0xff;
395 *to++ = *from & 0xff;
396 cnt += 3;
397 }
398 else if ((c = (*from >> 8)))
399 {
400 *to++ = c;
401 *to++ = *from & 0xff;
402 cnt += 2;
403 }
404 else
405 {
406 *to++ = *from;
407 cnt++;
408 }
409 from++;
410 len--;
411 }
412 *to = 0;
413 return cnt;
414}
415
416
417/*
418 * JOHAB
419 */
/* JOHAB character length: delegated to the generic EUC routine. */
static int
pg_johab_mblen(const unsigned char *s)
{
    int         nbytes = pg_euc_mblen(s);

    return nbytes;
}
425
/* JOHAB display width: delegated to the generic EUC routine. */
static int
pg_johab_dsplen(const unsigned char *s)
{
    int         width = pg_euc_dsplen(s);

    return width;
}
431
432/*
433 * convert UTF8 string to pg_wchar (UCS-4)
434 * caller must allocate enough space for "to", including a trailing zero!
435 * len: length of from.
436 * "from" not necessarily null terminated.
437 */
438static int
439pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
440{
441 int cnt = 0;
442 uint32 c1,
443 c2,
444 c3,
445 c4;
446
447 while (len > 0 && *from)
448 {
449 if ((*from & 0x80) == 0)
450 {
451 *to = *from++;
452 len--;
453 }
454 else if ((*from & 0xe0) == 0xc0)
455 {
456 if (len < 2)
457 break; /* drop trailing incomplete char */
458 c1 = *from++ & 0x1f;
459 c2 = *from++ & 0x3f;
460 *to = (c1 << 6) | c2;
461 len -= 2;
462 }
463 else if ((*from & 0xf0) == 0xe0)
464 {
465 if (len < 3)
466 break; /* drop trailing incomplete char */
467 c1 = *from++ & 0x0f;
468 c2 = *from++ & 0x3f;
469 c3 = *from++ & 0x3f;
470 *to = (c1 << 12) | (c2 << 6) | c3;
471 len -= 3;
472 }
473 else if ((*from & 0xf8) == 0xf0)
474 {
475 if (len < 4)
476 break; /* drop trailing incomplete char */
477 c1 = *from++ & 0x07;
478 c2 = *from++ & 0x3f;
479 c3 = *from++ & 0x3f;
480 c4 = *from++ & 0x3f;
481 *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
482 len -= 4;
483 }
484 else
485 {
486 /* treat a bogus char as length 1; not ours to raise error */
487 *to = *from++;
488 len--;
489 }
490 to++;
491 cnt++;
492 }
493 *to = 0;
494 return cnt;
495}
496
497
498/*
499 * Trivial conversion from pg_wchar to UTF-8.
500 * caller should allocate enough space for "to"
501 * len: length of from.
502 * "from" not necessarily null terminated.
503 */
504static int
505pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
506{
507 int cnt = 0;
508
509 while (len > 0 && *from)
510 {
511 int char_len;
512
513 unicode_to_utf8(*from, to);
514 char_len = pg_utf_mblen(to);
515 cnt += char_len;
516 to += char_len;
517 from++;
518 len--;
519 }
520 *to = 0;
521 return cnt;
522}
523
524/*
525 * Return the byte length of a UTF8 character pointed to by s
526 *
527 * Note: in the current implementation we do not support UTF8 sequences
528 * of more than 4 bytes; hence do NOT return a value larger than 4.
529 * We return "1" for any leading byte that is either flat-out illegal or
530 * indicates a length larger than we support.
531 *
532 * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
533 * other places would need to be fixed to change this.
534 */
int
pg_utf_mblen(const unsigned char *s)
{
    const unsigned char lead = *s;

    if ((lead & 0x80) == 0)
        return 1;               /* 0xxxxxxx */
    if ((lead & 0xe0) == 0xc0)
        return 2;               /* 110xxxxx */
    if ((lead & 0xf0) == 0xe0)
        return 3;               /* 1110xxxx */
    if ((lead & 0xf8) == 0xf0)
        return 4;               /* 11110xxx */
#ifdef NOT_USED
    if ((lead & 0xfc) == 0xf8)
        return 5;
    if ((lead & 0xfe) == 0xfc)
        return 6;
#endif

    /* any other lead byte is reported as length 1 */
    return 1;
}
558
559/*
560 * This is an implementation of wcwidth() and wcswidth() as defined in
561 * "The Single UNIX Specification, Version 2, The Open Group, 1997"
562 * <http://www.unix.org/online.html>
563 *
564 * Markus Kuhn -- 2001-09-08 -- public domain
565 *
566 * customised for PostgreSQL
567 *
568 * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
569 */
570
/*
 * One closed range [first, last] of code points; the interval tables
 * searched by mbbisearch() are arrays of these.
 */
struct mbinterval
{
    unsigned int first;
    unsigned int last;
};
576
577/* auxiliary function for binary search in interval table */
578static int
579mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
580{
581 int min = 0;
582 int mid;
583
584 if (ucs < table[0].first || ucs > table[max].last)
585 return 0;
586 while (max >= min)
587 {
588 mid = (min + max) / 2;
589 if (ucs > table[mid].last)
590 min = mid + 1;
591 else if (ucs < table[mid].first)
592 max = mid - 1;
593 else
594 return 1;
595 }
596
597 return 0;
598}
599
600
601/* The following functions define the column width of an ISO 10646
602 * character as follows:
603 *
604 * - The null character (U+0000) has a column width of 0.
605 *
606 * - Other C0/C1 control characters and DEL will lead to a return
607 * value of -1.
608 *
609 * - Non-spacing and enclosing combining characters (general
610 * category code Mn, Me or Cf in the Unicode database) have a
611 * column width of 0.
612 *
613 * - Spacing characters in the East Asian Wide (W) or East Asian
614 * FullWidth (F) category as defined in Unicode Technical
615 * Report #11 have a column width of 2.
616 *
617 * - All remaining characters (including all printable
618 * ISO 8859-1 and WGL4 characters, Unicode control characters,
619 * etc.) have a column width of 1.
620 *
621 * This implementation assumes that wchar_t characters are encoded
622 * in ISO 10646.
623 */
624
625static int
627{
630
631 /* test for 8-bit control characters */
632 if (ucs == 0)
633 return 0;
634
635 if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
636 return -1;
637
638 /*
639 * binary search in table of non-spacing characters
640 *
641 * XXX: In the official Unicode sources, it is possible for a character to
642 * be described as both non-spacing and wide at the same time. As of
643 * Unicode 13.0, treating the non-spacing property as the determining
644 * factor for display width leads to the correct behavior, so do that
645 * search first.
646 */
647 if (mbbisearch(ucs, nonspacing,
648 sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
649 return 0;
650
651 /* binary search in table of wide characters */
652 if (mbbisearch(ucs, east_asian_fw,
653 sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
654 return 2;
655
656 return 1;
657}
658
659static int
660pg_utf_dsplen(const unsigned char *s)
661{
662 return ucs_wcwidth(utf8_to_unicode(s));
663}
664
665/*
666 * convert mule internal code to pg_wchar
667 * caller should allocate enough space for "to"
668 * len: length of from.
669 * "from" not necessarily null terminated.
670 */
671static int
672pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
673{
674 int cnt = 0;
675
676 while (len > 0 && *from)
677 {
678 if (IS_LC1(*from) && len >= 2)
679 {
680 *to = *from++ << 16;
681 *to |= *from++;
682 len -= 2;
683 }
684 else if (IS_LCPRV1(*from) && len >= 3)
685 {
686 from++;
687 *to = *from++ << 16;
688 *to |= *from++;
689 len -= 3;
690 }
691 else if (IS_LC2(*from) && len >= 3)
692 {
693 *to = *from++ << 16;
694 *to |= *from++ << 8;
695 *to |= *from++;
696 len -= 3;
697 }
698 else if (IS_LCPRV2(*from) && len >= 4)
699 {
700 from++;
701 *to = *from++ << 16;
702 *to |= *from++ << 8;
703 *to |= *from++;
704 len -= 4;
705 }
706 else
707 { /* assume ASCII */
708 *to = (unsigned char) *from++;
709 len--;
710 }
711 to++;
712 cnt++;
713 }
714 *to = 0;
715 return cnt;
716}
717
718/*
719 * convert pg_wchar to mule internal code
720 * caller should allocate enough space for "to"
721 * len: length of from.
722 * "from" not necessarily null terminated.
723 */
724static int
725pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
726{
727 int cnt = 0;
728
729 while (len > 0 && *from)
730 {
731 unsigned char lb;
732
733 lb = (*from >> 16) & 0xff;
734 if (IS_LC1(lb))
735 {
736 *to++ = lb;
737 *to++ = *from & 0xff;
738 cnt += 2;
739 }
740 else if (IS_LC2(lb))
741 {
742 *to++ = lb;
743 *to++ = (*from >> 8) & 0xff;
744 *to++ = *from & 0xff;
745 cnt += 3;
746 }
747 else if (IS_LCPRV1_A_RANGE(lb))
748 {
749 *to++ = LCPRV1_A;
750 *to++ = lb;
751 *to++ = *from & 0xff;
752 cnt += 3;
753 }
754 else if (IS_LCPRV1_B_RANGE(lb))
755 {
756 *to++ = LCPRV1_B;
757 *to++ = lb;
758 *to++ = *from & 0xff;
759 cnt += 3;
760 }
761 else if (IS_LCPRV2_A_RANGE(lb))
762 {
763 *to++ = LCPRV2_A;
764 *to++ = lb;
765 *to++ = (*from >> 8) & 0xff;
766 *to++ = *from & 0xff;
767 cnt += 4;
768 }
769 else if (IS_LCPRV2_B_RANGE(lb))
770 {
771 *to++ = LCPRV2_B;
772 *to++ = lb;
773 *to++ = (*from >> 8) & 0xff;
774 *to++ = *from & 0xff;
775 cnt += 4;
776 }
777 else
778 {
779 *to++ = *from & 0xff;
780 cnt += 1;
781 }
782 from++;
783 len--;
784 }
785 *to = 0;
786 return cnt;
787}
788
789/* exported for direct use by conv.c */
int
pg_mule_mblen(const unsigned char *s)
{
    const unsigned char lead = *s;

    /* the tests are tried in this fixed order; the first match wins */
    if (IS_LC1(lead))
        return 2;
    if (IS_LCPRV1(lead))
        return 3;
    if (IS_LC2(lead))
        return 3;
    if (IS_LCPRV2(lead))
        return 4;

    return 1;                   /* assume ASCII */
}
807
/*
 * Display width of the MULE character starting at *s, judged from its
 * leading byte.
 *
 * Note: it's not really appropriate to assume that all multibyte charsets
 * are double-wide on screen.  But this seems an okay approximation for
 * the MULE charsets we currently support.
 */
static int
pg_mule_dsplen(const unsigned char *s)
{
    const unsigned char lead = *s;

    if (IS_LC1(lead) || IS_LCPRV1(lead))
        return 1;
    if (IS_LC2(lead) || IS_LCPRV2(lead))
        return 2;

    return 1;                   /* assume ASCII */
}
832
833/*
834 * ISO8859-1
835 */
836static int
837pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
838{
839 int cnt = 0;
840
841 while (len > 0 && *from)
842 {
843 *to++ = *from++;
844 len--;
845 cnt++;
846 }
847 *to = 0;
848 return cnt;
849}
850
851/*
852 * Trivial conversion from pg_wchar to single byte encoding. Just ignores
853 * high bits.
854 * caller should allocate enough space for "to"
855 * len: length of from.
856 * "from" not necessarily null terminated.
857 */
858static int
859pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
860{
861 int cnt = 0;
862
863 while (len > 0 && *from)
864 {
865 *to++ = *from++;
866 len--;
867 cnt++;
868 }
869 *to = 0;
870 return cnt;
871}
872
/* Every character occupies exactly one byte, whatever its value. */
static int
pg_latin1_mblen(const unsigned char *s)
{
    (void) s;                   /* the byte value is irrelevant */
    return 1;
}
878
/* Display width of one byte: delegated to the ASCII routine. */
static int
pg_latin1_dsplen(const unsigned char *s)
{
    int         width = pg_ascii_dsplen(s);

    return width;
}
884
885/*
886 * SJIS
887 */
/*
 * Byte length of the SJIS character starting at *s: lead bytes in
 * 0xa1..0xdf (1 byte kana?) and bytes without the high bit are one byte
 * long; every other high-bit lead byte (kanji?) starts a two-byte character.
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
    const unsigned char lead = *s;
    const int   is_kana = (lead >= 0xa1 && lead <= 0xdf);

    if (!is_kana && IS_HIGHBIT_SET(lead))
        return 2;

    return 1;
}
901
/*
 * Display width of the SJIS character starting at *s: 1 for lead bytes in
 * 0xa1..0xdf (1 byte kana?), 2 for any other high-bit lead byte (kanji?),
 * otherwise the ASCII width.
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
    const unsigned char lead = *s;

    if (lead >= 0xa1 && lead <= 0xdf)
        return 1;
    if (IS_HIGHBIT_SET(lead))
        return 2;

    return pg_ascii_dsplen(s);
}
915
916/*
917 * Big5
918 */
/* Big5 character length: 2 for a high-bit lead byte, otherwise 1. */
static int
pg_big5_mblen(const unsigned char *s)
{
    return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
930
/* Big5 display width: 2 for a high-bit lead byte, else the ASCII width. */
static int
pg_big5_dsplen(const unsigned char *s)
{
    if (IS_HIGHBIT_SET(*s))
        return 2;

    return pg_ascii_dsplen(s);
}
942
943/*
944 * GBK
945 */
/* GBK character length: 2 for a high-bit lead byte, otherwise 1. */
static int
pg_gbk_mblen(const unsigned char *s)
{
    return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
957
/* GBK display width: 2 for a high-bit lead byte, else the ASCII width. */
static int
pg_gbk_dsplen(const unsigned char *s)
{
    if (IS_HIGHBIT_SET(*s))
        return 2;

    return pg_ascii_dsplen(s);
}
969
970/*
971 * UHC
972 */
/* UHC character length: 2 for a high-bit lead byte, otherwise 1. */
static int
pg_uhc_mblen(const unsigned char *s)
{
    return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
984
/* UHC display width: 2 for a high-bit lead byte, else the ASCII width. */
static int
pg_uhc_dsplen(const unsigned char *s)
{
    if (IS_HIGHBIT_SET(*s))
        return 2;

    return pg_ascii_dsplen(s);
}
996
997/*
998 * GB18030
999 * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
1000 */
1001
1002/*
1003 * Unlike all other mblen() functions, this also looks at the second byte of
1004 * the input. However, if you only pass the first byte of a multi-byte
1005 * string, and \0 as the second byte, this still works in a predictable way:
1006 * a 4-byte character will be reported as two 2-byte characters. That's
1007 * enough for all current uses, as a client-only encoding. It works that
1008 * way, because in any valid 4-byte GB18030-encoded character, the third and
1009 * fourth byte look like a 2-byte encoded character, when looked at
1010 * separately.
1011 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
    if (!IS_HIGHBIT_SET(s[0]))
        return 1;               /* ASCII */

    /* a second byte in 0x30..0x39 marks the 4-byte form */
    return (s[1] >= 0x30 && s[1] <= 0x39) ? 4 : 2;
}
1025
/* GB18030 display width: 2 for a high-bit lead byte, else the ASCII width. */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
    if (IS_HIGHBIT_SET(*s))
        return 2;

    return pg_ascii_dsplen(s);  /* ASCII */
}
1037
1038/*
1039 *-------------------------------------------------------------------
1040 * multibyte sequence validators
1041 *
1042 * The verifychar functions accept "s", a pointer to the first byte of a
1043 * string, and "len", the remaining length of the string. If there is a
1044 * validly encoded character beginning at *s, return its length in bytes;
1045 * else return -1.
1046 *
1047 * The verifystr functions also accept "s", a pointer to a string and "len",
1048 * the length of the string. They verify the whole string, and return the
1049 * number of input bytes (<= len) that are valid. In other words, if the
1050 * whole string is valid, verifystr returns "len", otherwise it returns the
1051 * byte offset of the first invalid character. The verifystr functions must
1052 * test for and reject zeroes in the input.
1053 *
1054 * The verifychar functions can assume that len > 0 and that *s != '\0', but
1055 * they must test for and reject zeroes in any additional bytes of a
1056 * multibyte character. Note that this definition allows the function for a
1057 * single-byte encoding to be just "return 1".
1058 *-------------------------------------------------------------------
1059 */
/* Single-byte encoding: any byte handed to us is one valid character. */
static int
pg_ascii_verifychar(const unsigned char *s, int len)
{
    (void) s;
    (void) len;
    return 1;
}
1065
/*
 * Return the number of leading bytes of s[0..len) that precede the first
 * zero byte, or "len" if there is no zero byte.
 */
static int
pg_ascii_verifystr(const unsigned char *s, int len)
{
    const unsigned char *nul = memchr(s, 0, len);

    return (nul != NULL) ? (int) (nul - s) : len;
}
1076
/* Nonzero when byte "c" lies in 0xa1..0xfe; "c" may be evaluated twice. */
#define IS_EUC_RANGE_VALID(c)   (!((c) < 0xa1 || (c) > 0xfe))
1078
1079static int
1080pg_eucjp_verifychar(const unsigned char *s, int len)
1081{
1082 int l;
1083 unsigned char c1,
1084 c2;
1085
1086 c1 = *s++;
1087
1088 switch (c1)
1089 {
1090 case SS2: /* JIS X 0201 */
1091 l = 2;
1092 if (l > len)
1093 return -1;
1094 c2 = *s++;
1095 if (c2 < 0xa1 || c2 > 0xdf)
1096 return -1;
1097 break;
1098
1099 case SS3: /* JIS X 0212 */
1100 l = 3;
1101 if (l > len)
1102 return -1;
1103 c2 = *s++;
1104 if (!IS_EUC_RANGE_VALID(c2))
1105 return -1;
1106 c2 = *s++;
1107 if (!IS_EUC_RANGE_VALID(c2))
1108 return -1;
1109 break;
1110
1111 default:
1112 if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1113 {
1114 l = 2;
1115 if (l > len)
1116 return -1;
1117 if (!IS_EUC_RANGE_VALID(c1))
1118 return -1;
1119 c2 = *s++;
1120 if (!IS_EUC_RANGE_VALID(c2))
1121 return -1;
1122 }
1123 else
1124 /* must be ASCII */
1125 {
1126 l = 1;
1127 }
1128 break;
1129 }
1130
1131 return l;
1132}
1133
/*
 * Verify s[0..len) as EUC_JP.  Returns the number of leading bytes that
 * are valid: "len" if the whole string is fine, otherwise the offset of
 * the first zero byte or invalid character.
 */
static int
pg_eucjp_verifystr(const unsigned char *s, int len)
{
    int         done = 0;

    while (done < len)
    {
        int         l;

        if (IS_HIGHBIT_SET(s[done]))
        {
            l = pg_eucjp_verifychar(s + done, len - done);
            if (l == -1)
                break;
        }
        else if (s[done] == '\0')
            break;
        else
            l = 1;              /* fast path for ASCII-subset characters */

        done += l;
    }

    return done;
}
1162
/*
 * Verify one EUC_KR character at s, given "len" remaining bytes.  A
 * high-bit lead byte needs a second byte, and both must be in the EUC
 * range.  Returns the byte length, or -1 on failure.
 */
static int
pg_euckr_verifychar(const unsigned char *s, int len)
{
    const unsigned char c1 = s[0];

    /* must be ASCII */
    if (!IS_HIGHBIT_SET(c1))
        return 1;

    if (len < 2)
        return -1;
    if (!IS_EUC_RANGE_VALID(c1) || !IS_EUC_RANGE_VALID(s[1]))
        return -1;

    return 2;
}
1191
/*
 * Verify s[0..len) as EUC_KR.  Returns the number of leading bytes that
 * are valid: "len" if the whole string is fine, otherwise the offset of
 * the first zero byte or invalid character.
 */
static int
pg_euckr_verifystr(const unsigned char *s, int len)
{
    int         done = 0;

    while (done < len)
    {
        int         l;

        if (IS_HIGHBIT_SET(s[done]))
        {
            l = pg_euckr_verifychar(s + done, len - done);
            if (l == -1)
                break;
        }
        else if (s[done] == '\0')
            break;
        else
            l = 1;              /* fast path for ASCII-subset characters */

        done += l;
    }

    return done;
}
1220
1221/* EUC-CN byte sequences are exactly same as EUC-KR */
1222#define pg_euccn_verifychar pg_euckr_verifychar
1223#define pg_euccn_verifystr pg_euckr_verifystr
1224
1225static int
1226pg_euctw_verifychar(const unsigned char *s, int len)
1227{
1228 int l;
1229 unsigned char c1,
1230 c2;
1231
1232 c1 = *s++;
1233
1234 switch (c1)
1235 {
1236 case SS2: /* CNS 11643 Plane 1-7 */
1237 l = 4;
1238 if (l > len)
1239 return -1;
1240 c2 = *s++;
1241 if (c2 < 0xa1 || c2 > 0xa7)
1242 return -1;
1243 c2 = *s++;
1244 if (!IS_EUC_RANGE_VALID(c2))
1245 return -1;
1246 c2 = *s++;
1247 if (!IS_EUC_RANGE_VALID(c2))
1248 return -1;
1249 break;
1250
1251 case SS3: /* unused */
1252 return -1;
1253
1254 default:
1255 if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1256 {
1257 l = 2;
1258 if (l > len)
1259 return -1;
1260 /* no further range check on c1? */
1261 c2 = *s++;
1262 if (!IS_EUC_RANGE_VALID(c2))
1263 return -1;
1264 }
1265 else
1266 /* must be ASCII */
1267 {
1268 l = 1;
1269 }
1270 break;
1271 }
1272 return l;
1273}
1274
/*
 * Verify s[0..len) as EUC_TW.  Returns the number of leading bytes that
 * are valid: "len" if the whole string is fine, otherwise the offset of
 * the first zero byte or invalid character.
 */
static int
pg_euctw_verifystr(const unsigned char *s, int len)
{
    int         done = 0;

    while (done < len)
    {
        int         l;

        if (IS_HIGHBIT_SET(s[done]))
        {
            l = pg_euctw_verifychar(s + done, len - done);
            if (l == -1)
                break;
        }
        else if (s[done] == '\0')
            break;
        else
            l = 1;              /* fast path for ASCII-subset characters */

        done += l;
    }

    return done;
}
1303
/*
 * Verify one JOHAB character at s, given "len" remaining bytes.  The
 * length comes from pg_johab_mblen(); for a high-bit lead byte every
 * following byte must be in the EUC range.  Returns the byte length, or -1.
 */
static int
pg_johab_verifychar(const unsigned char *s, int len)
{
    const int   mbl = pg_johab_mblen(s);
    int         i;

    if (len < mbl)
        return -1;

    if (!IS_HIGHBIT_SET(s[0]))
        return mbl;

    /* check each trailing byte of the character */
    for (i = 1; i < mbl; i++)
    {
        if (!IS_EUC_RANGE_VALID(s[i]))
            return -1;
    }

    return mbl;
}
1327
/*
 * Verify s[0..len) as JOHAB.  Returns the number of leading bytes that
 * are valid: "len" if the whole string is fine, otherwise the offset of
 * the first zero byte or invalid character.
 */
static int
pg_johab_verifystr(const unsigned char *s, int len)
{
    int         done = 0;

    while (done < len)
    {
        int         l;

        if (IS_HIGHBIT_SET(s[done]))
        {
            l = pg_johab_verifychar(s + done, len - done);
            if (l == -1)
                break;
        }
        else if (s[done] == '\0')
            break;
        else
            l = 1;              /* fast path for ASCII-subset characters */

        done += l;
    }

    return done;
}
1356
/*
 * Verify one MULE character at s, given "len" remaining bytes.  The length
 * comes from pg_mule_mblen(); every byte after the first must have its
 * high bit set.  Returns the byte length, or -1.
 */
static int
pg_mule_verifychar(const unsigned char *s, int len)
{
    const int   mbl = pg_mule_mblen(s);
    int         i;

    if (len < mbl)
        return -1;

    for (i = 1; i < mbl; i++)
    {
        if (!IS_HIGHBIT_SET(s[i]))
            return -1;
    }

    return mbl;
}
1377
/*
 * Verify s[0..len) as MULE internal code.  Returns the number of leading
 * bytes that are valid: "len" if the whole string is fine, otherwise the
 * offset of the first zero byte or invalid character.
 */
static int
pg_mule_verifystr(const unsigned char *s, int len)
{
    int         done = 0;

    while (done < len)
    {
        int         l;

        if (IS_HIGHBIT_SET(s[done]))
        {
            l = pg_mule_verifychar(s + done, len - done);
            if (l == -1)
                break;
        }
        else if (s[done] == '\0')
            break;
        else
            l = 1;              /* fast path for ASCII-subset characters */

        done += l;
    }

    return done;
}
1406
/* Single-byte encoding: any byte handed to us is one valid character. */
static int
pg_latin1_verifychar(const unsigned char *s, int len)
{
    (void) s;
    (void) len;
    return 1;
}
1412
/*
 * Return the number of leading bytes of s[0..len) that precede the first
 * zero byte, or "len" if there is no zero byte.
 */
static int
pg_latin1_verifystr(const unsigned char *s, int len)
{
    const unsigned char *nul = memchr(s, 0, len);

    return (nul != NULL) ? (int) (nul - s) : len;
}
1423
/*
 * Verify one SJIS character at s, given "len" remaining bytes.  One-byte
 * characters are accepted as reported by pg_sjis_mblen(); two-byte ones
 * must pass both ISSJISHEAD and ISSJISTAIL.  Returns the byte length, or -1.
 */
static int
pg_sjis_verifychar(const unsigned char *s, int len)
{
    const int   mbl = pg_sjis_mblen(s);
    unsigned char head,
                tail;

    if (len < mbl)
        return -1;

    if (mbl == 1)               /* pg_sjis_mblen already verified it */
        return mbl;

    head = s[0];
    tail = s[1];
    if (ISSJISHEAD(head) && ISSJISTAIL(tail))
        return mbl;

    return -1;
}
1446
/*
 * Verify s[0..len) as SJIS.  Returns the number of leading bytes that are
 * valid: "len" if the whole string is fine, otherwise the offset of the
 * first zero byte or invalid character.
 */
static int
pg_sjis_verifystr(const unsigned char *s, int len)
{
    int         done = 0;

    while (done < len)
    {
        int         l;

        if (IS_HIGHBIT_SET(s[done]))
        {
            l = pg_sjis_verifychar(s + done, len - done);
            if (l == -1)
                break;
        }
        else if (s[done] == '\0')
            break;
        else
            l = 1;              /* fast path for ASCII-subset characters */

        done += l;
    }

    return done;
}
1475
1476static int
1477pg_big5_verifychar(const unsigned char *s, int len)
1478{
1479 int l,
1480 mbl;
1481
1482 l = mbl = pg_big5_mblen(s);
1483
1484 if (len < l)
1485 return -1;
1486
1487 if (l == 2 &&
1488 s[0] == NONUTF8_INVALID_BYTE0 &&
1489 s[1] == NONUTF8_INVALID_BYTE1)
1490 return -1;
1491
1492 while (--l > 0)
1493 {
1494 if (*++s == '\0')
1495 return -1;
1496 }
1497
1498 return mbl;
1499}
1500
/*
 * Verify s[0..len) as Big5.  Returns the number of leading bytes that are
 * valid: "len" if the whole string is fine, otherwise the offset of the
 * first zero byte or invalid character.
 */
static int
pg_big5_verifystr(const unsigned char *s, int len)
{
    int         done = 0;

    while (done < len)
    {
        int         l;

        if (IS_HIGHBIT_SET(s[done]))
        {
            l = pg_big5_verifychar(s + done, len - done);
            if (l == -1)
                break;
        }
        else if (s[done] == '\0')
            break;
        else
            l = 1;              /* fast path for ASCII-subset characters */

        done += l;
    }

    return done;
}
1529
1530static int
1531pg_gbk_verifychar(const unsigned char *s, int len)
1532{
1533 int l,
1534 mbl;
1535
1536 l = mbl = pg_gbk_mblen(s);
1537
1538 if (len < l)
1539 return -1;
1540
1541 if (l == 2 &&
1542 s[0] == NONUTF8_INVALID_BYTE0 &&
1543 s[1] == NONUTF8_INVALID_BYTE1)
1544 return -1;
1545
1546 while (--l > 0)
1547 {
1548 if (*++s == '\0')
1549 return -1;
1550 }
1551
1552 return mbl;
1553}
1554
/*
 * Verify s[0..len) as GBK.  Returns the number of leading bytes that are
 * valid: "len" if the whole string is fine, otherwise the offset of the
 * first zero byte or invalid character.
 */
static int
pg_gbk_verifystr(const unsigned char *s, int len)
{
    int         done = 0;

    while (done < len)
    {
        int         l;

        if (IS_HIGHBIT_SET(s[done]))
        {
            l = pg_gbk_verifychar(s + done, len - done);
            if (l == -1)
                break;
        }
        else if (s[done] == '\0')
            break;
        else
            l = 1;              /* fast path for ASCII-subset characters */

        done += l;
    }

    return done;
}
1583
1584static int
1585pg_uhc_verifychar(const unsigned char *s, int len)
1586{
1587 int l,
1588 mbl;
1589
1590 l = mbl = pg_uhc_mblen(s);
1591
1592 if (len < l)
1593 return -1;
1594
1595 if (l == 2 &&
1596 s[0] == NONUTF8_INVALID_BYTE0 &&
1597 s[1] == NONUTF8_INVALID_BYTE1)
1598 return -1;
1599
1600 while (--l > 0)
1601 {
1602 if (*++s == '\0')
1603 return -1;
1604 }
1605
1606 return mbl;
1607}
1608
/*
 * Verify a UHC string.
 *
 * Scans at most len bytes starting at s and returns how many leading bytes
 * were accepted.  Scanning stops early at the first NUL byte or at the first
 * character that pg_uhc_verifychar() rejects.
 */
static int
pg_uhc_verifystr(const unsigned char *s, int len)
{
	const unsigned char *cur = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			step = 1;

		if (IS_HIGHBIT_SET(*cur))
		{
			/* possible multibyte character: use the full verifier */
			step = pg_uhc_verifychar(cur, remaining);
			if (step == -1)
				break;
		}
		else if (*cur == '\0')
		{
			/* ASCII fast path, but an embedded NUL ends the valid prefix */
			break;
		}

		cur += step;
		remaining -= step;
	}

	return cur - s;
}
1637
/*
 * Verify a single GB18030 character.
 *
 * Returns 1, 2 or 4 (the character's byte length) when the bytes at s form
 * an acceptable character within the len bytes available, otherwise -1.
 */
static int
pg_gb18030_verifychar(const unsigned char *s, int len)
{
	/* ASCII: a single byte, no further checks are made here */
	if (!IS_HIGHBIT_SET(*s))
		return 1;

	/* A second byte in 0x30..0x39 means this should be a 4-byte character */
	if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
	{
		if (s[0] < 0x81 || s[0] > 0xfe)
			return -1;
		if (s[2] < 0x81 || s[2] > 0xfe)
			return -1;
		if (s[3] < 0x30 || s[3] > 0x39)
			return -1;
		return 4;
	}

	/* Otherwise it should be a 2-byte character with a valid lead byte */
	if (len >= 2 && s[0] >= 0x81 && s[0] <= 0xfe)
	{
		const unsigned char second = s[1];

		if ((second >= 0x40 && second <= 0x7e) ||
			(second >= 0x80 && second <= 0xfe))
			return 2;
		return -1;
	}

	/* bad lead byte, or not enough bytes left */
	return -1;
}
1668
/*
 * Verify a GB18030 string.
 *
 * Scans at most len bytes starting at s and returns how many leading bytes
 * were accepted.  Scanning stops early at the first NUL byte or at the first
 * character that pg_gb18030_verifychar() rejects.
 */
static int
pg_gb18030_verifystr(const unsigned char *s, int len)
{
	const unsigned char *cur = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			step = 1;

		if (IS_HIGHBIT_SET(*cur))
		{
			/* possible multibyte character: use the full verifier */
			step = pg_gb18030_verifychar(cur, remaining);
			if (step == -1)
				break;
		}
		else if (*cur == '\0')
		{
			/* ASCII fast path, but an embedded NUL ends the valid prefix */
			break;
		}

		cur += step;
		remaining -= step;
	}

	return cur - s;
}
1697
/*
 * Verify a single UTF-8 character.
 *
 * Returns the character's byte length (1..4) if it is acceptable, or -1 if
 * it is a NUL byte, does not fit in the len bytes available, or is rejected
 * by pg_utf8_islegal().
 */
static int
pg_utf8_verifychar(const unsigned char *s, int len)
{
	const unsigned char lead = *s;
	int			seqlen;

	/* Single-byte (ASCII) case: everything but NUL is fine */
	if ((lead & 0x80) == 0)
		return (lead == '\0') ? -1 : 1;

	/* Derive the expected sequence length from the lead byte's bit pattern */
	if ((lead & 0xe0) == 0xc0)
		seqlen = 2;
	else if ((lead & 0xf0) == 0xe0)
		seqlen = 3;
	else if ((lead & 0xf8) == 0xf0)
		seqlen = 4;
	else
		seqlen = 1;				/* not a lead byte; islegal will reject it */

	/* Must fit in the buffer and satisfy the per-byte range rules */
	if (len < seqlen || !pg_utf8_islegal(s, seqlen))
		return -1;

	return seqlen;
}
1726
1727/*
1728 * The fast path of the UTF-8 verifier uses a deterministic finite automaton
1729 * (DFA) for multibyte characters. In a traditional table-driven DFA, the
1730 * input byte and current state are used to compute an index into an array of
1731 * state transitions. Since the address of the next transition is dependent
1732 * on this computation, there is latency in executing the load instruction,
1733 * and the CPU is not kept busy.
1734 *
1735 * Instead, we use a "shift-based" DFA as described by Per Vognsen:
1736 *
1737 * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
1738 *
1739 * In a shift-based DFA, the input byte is an index into array of integers
1740 * whose bit pattern encodes the state transitions. To compute the next
1741 * state, we simply right-shift the integer by the current state and apply a
1742 * mask. In this scheme, the address of the transition only depends on the
1743 * input byte, so there is better pipelining.
1744 *
1745 * The naming convention for states and transitions was adopted from a UTF-8
1746 * to UTF-16/32 transcoder, whose table is reproduced below:
1747 *
1748 * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
1749 *
1750 * ILL ASC CR1 CR2 CR3 L2A L3A L3B L3C L4A L4B L4C CLASS / STATE
1751 * ==========================================================================
1752 * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B, | BGN/END
1753 * err, err, err, err, err, err, err, err, err, err, err, err, | ERR
1754 * |
1755 * err, err, END, END, END, err, err, err, err, err, err, err, | CS1
1756 * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err, | CS2
1757 * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err, | CS3
1758 * |
1759 * err, err, err, err, CS1, err, err, err, err, err, err, err, | P3A
1760 * err, err, CS1, CS1, err, err, err, err, err, err, err, err, | P3B
1761 * |
1762 * err, err, err, CS2, CS2, err, err, err, err, err, err, err, | P4A
1763 * err, err, CS2, err, err, err, err, err, err, err, err, err, | P4B
1764 *
1765 * In the most straightforward implementation, a shift-based DFA for UTF-8
1766 * requires 64-bit integers to encode the transitions, but with an SMT solver
1767 * it's possible to find state numbers such that the transitions fit within
1768 * 32-bit integers, as Dougall Johnson demonstrated:
1769 *
1770 * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
1771 *
1772 * This packed representation is the reason for the seemingly odd choice of
1773 * state values below.
1774 */
1775
/*
 * State numbers for the shift-based DFA.  Each state value doubles as the
 * bit offset at which that state's successor is stored inside a transition
 * word, so a transition is computed as (word >> state) & 31.
 */

/* Error */
#define ERR  0
/* Begin */
#define BGN 11
/* Continuation states, expect 1/2/3 continuation bytes */
#define CS1 16
#define CS2  1
#define CS3  5
/* Partial states, where the first continuation byte has a restricted range */
#define P3A  6					/* Lead was E0, check for 3-byte overlong */
#define P3B 20					/* Lead was ED, check for surrogate */
#define P4A 25					/* Lead was F0, check for 4-byte overlong */
#define P4B 30					/* Lead was F4, check for too-large */
/* Begin and End are the same state */
#define END BGN

/*
 * The encoded state transitions for the lookup table.  "(X << Y)" reads as
 * "when in state Y, go to state X"; any state not mentioned goes to ERR.
 */

/* ASCII */
#define ASC (END << BGN)
/* 2-byte lead */
#define L2A (CS1 << BGN)
/* 3-byte lead */
#define L3A (P3A << BGN)
#define L3B (CS2 << BGN)
#define L3C (P3B << BGN)
/* 4-byte lead */
#define L4A (P4A << BGN)
#define L4B (CS3 << BGN)
#define L4C (P4B << BGN)
/*
 * continuation byte
 *
 * The whole expansion is parenthesized so that the macros remain a single
 * operand when used inside a larger expression (e.g. "CR1 >> state").
 */
#define CR1 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B))
#define CR2 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A))
#define CR3 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A))
/* invalid byte */
#define ILL ERR
1812
1813static const uint32 Utf8Transition[256] =
1814{
1815 /* ASCII */
1816
1817 ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1818 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1819 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1820 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1821
1822 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1823 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1824 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1825 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1826
1827 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1828 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1829 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1830 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1831
1832 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1833 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1834 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1835 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1836
1837 /* continuation bytes */
1838
1839 /* 80..8F */
1840 CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1841 CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1842
1843 /* 90..9F */
1844 CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1845 CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1846
1847 /* A0..BF */
1848 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1849 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1850 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1851 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1852
1853 /* leading bytes */
1854
1855 /* C0..DF */
1856 ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
1857 L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1858 L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1859 L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1860
1861 /* E0..EF */
1862 L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
1863 L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
1864
1865 /* F0..FF */
1866 L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
1867 ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
1868};
1869
1870static void
1871utf8_advance(const unsigned char *s, uint32 *state, int len)
1872{
1873 /* Note: We deliberately don't check the state's value here. */
1874 while (len > 0)
1875 {
1876 /*
1877 * It's important that the mask value is 31: In most instruction sets,
1878 * a shift by a 32-bit operand is understood to be a shift by its mod
1879 * 32, so the compiler should elide the mask operation.
1880 */
1881 *state = Utf8Transition[*s++] >> (*state & 31);
1882 len--;
1883 }
1884
1885 *state &= 31;
1886}
1887
1888static int
1889pg_utf8_verifystr(const unsigned char *s, int len)
1890{
1891 const unsigned char *start = s;
1892 const int orig_len = len;
1893 uint32 state = BGN;
1894
1895/*
1896 * With a stride of two vector widths, gcc will unroll the loop. Even if
1897 * the compiler can unroll a longer loop, it's not worth it because we
1898 * must fall back to the byte-wise algorithm if we find any non-ASCII.
1899 */
1900#define STRIDE_LENGTH (2 * sizeof(Vector8))
1901
1902 if (len >= STRIDE_LENGTH)
1903 {
1904 while (len >= STRIDE_LENGTH)
1905 {
1906 /*
1907 * If the chunk is all ASCII, we can skip the full UTF-8 check,
1908 * but we must first check for a non-END state, which means the
1909 * previous chunk ended in the middle of a multibyte sequence.
1910 */
1911 if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
1913
1914 s += STRIDE_LENGTH;
1915 len -= STRIDE_LENGTH;
1916 }
1917
1918 /* The error state persists, so we only need to check for it here. */
1919 if (state == ERR)
1920 {
1921 /*
1922 * Start over from the beginning with the slow path so we can
1923 * count the valid bytes.
1924 */
1925 len = orig_len;
1926 s = start;
1927 }
1928 else if (state != END)
1929 {
1930 /*
1931 * The fast path exited in the middle of a multibyte sequence.
1932 * Walk backwards to find the leading byte so that the slow path
1933 * can resume checking from there. We must always backtrack at
1934 * least one byte, since the current byte could be e.g. an ASCII
1935 * byte after a 2-byte lead, which is invalid.
1936 */
1937 do
1938 {
1939 Assert(s > start);
1940 s--;
1941 len++;
1943 } while (pg_utf_mblen(s) <= 1);
1944 }
1945 }
1946
1947 /* check remaining bytes */
1948 while (len > 0)
1949 {
1950 int l;
1951
1952 /* fast path for ASCII-subset characters */
1953 if (!IS_HIGHBIT_SET(*s))
1954 {
1955 if (*s == '\0')
1956 break;
1957 l = 1;
1958 }
1959 else
1960 {
1961 l = pg_utf8_verifychar(s, len);
1962 if (l == -1)
1963 break;
1964 }
1965 s += l;
1966 len -= l;
1967 }
1968
1969 return s - start;
1970}
1971
/*
 * Check for validity of a single UTF-8 encoded character
 *
 * This applies the byte-range rules of RFC 3629.  The second byte's allowed
 * range depends on the lead byte; the narrower ranges exist so that a
 * character cannot be written with a longer-than-necessary sequence padded
 * with high-order zero bits.  Accepting such sequences would be a security
 * hazard (e.g. an apparently non-ASCII sequence that decodes to plain ASCII).
 *
 * length is assumed to have been obtained by pg_utf_mblen(), and the
 * caller must have checked that that many bytes are present in the buffer.
 * Lengths other than 1..4 are always rejected.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char lead;
	int			i;

	/* reject lengths 5 and 6 (and anything else out of range) for now */
	if (length < 1 || length > 4)
		return false;

	lead = source[0];

	/* third and fourth bytes, if present, are ordinary continuation bytes */
	for (i = length - 1; i >= 2; i--)
	{
		if (source[i] < 0x80 || source[i] > 0xBF)
			return false;
	}

	/* the second byte's permitted range depends on the lead byte */
	if (length >= 2)
	{
		unsigned char lo = 0x80;
		unsigned char hi = 0xBF;

		if (lead == 0xE0)
			lo = 0xA0;
		else if (lead == 0xED)
			hi = 0x9F;
		else if (lead == 0xF0)
			lo = 0x90;
		else if (lead == 0xF4)
			hi = 0x8F;

		if (source[1] < lo || source[1] > hi)
			return false;
	}

	/* finally, the lead byte itself: 0x80..0xC1 and above 0xF4 are illegal */
	if (lead >= 0x80 && lead < 0xC2)
		return false;
	if (lead > 0xF4)
		return false;

	return true;
}
2042
2043
2044/*
2045 * Fills the provided buffer with two bytes such that:
2046 * pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
2047 */
2048void
2050{
2052
2053 dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);
2054 dst[1] = NONUTF8_INVALID_BYTE1;
2055}
2056
2057/*
2058 *-------------------------------------------------------------------
2059 * encoding info table
2060 *-------------------------------------------------------------------
2061 */
2105};
2106
2107/*
2108 * Returns the byte length of a multibyte character.
2109 *
2110 * Caution: when dealing with text that is not certainly valid in the
2111 * specified encoding, the result may exceed the actual remaining
2112 * string length. Callers that are not prepared to deal with that
2113 * should use pg_encoding_mblen_bounded() instead.
2114 */
2115int
2116pg_encoding_mblen(int encoding, const char *mbstr)
2117{
2118 return (PG_VALID_ENCODING(encoding) ?
2119 pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
2120 pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
2121}
2122
/*
 * Returns the byte length of a multibyte character; but not more than
 * the distance to end of string.
 *
 * mbstr must be NUL-terminated: strnlen() is used to clamp the length
 * reported by pg_encoding_mblen() at the terminator.
 */
int
pg_encoding_mblen_bounded(int encoding, const char *mbstr)
{
	return strnlen(mbstr, pg_encoding_mblen(encoding, mbstr));
}
2132
2133/*
2134 * Returns the display length of a multibyte character.
2135 */
2136int
2137pg_encoding_dsplen(int encoding, const char *mbstr)
2138{
2139 return (PG_VALID_ENCODING(encoding) ?
2140 pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
2141 pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
2142}
2143
2144/*
2145 * Verify the first multibyte character of the given string.
2146 * Return its byte length if good, -1 if bad. (See comments above for
2147 * full details of the mbverifychar API.)
2148 */
2149int
2150pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
2151{
2152 return (PG_VALID_ENCODING(encoding) ?
2153 pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
2154 pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
2155}
2156
2157/*
2158 * Verify that a string is valid for the given encoding.
2159 * Returns the number of input bytes (<= len) that form a valid string.
2160 * (See comments above for full details of the mbverifystr API.)
2161 */
2162int
2163pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
2164{
2165 return (PG_VALID_ENCODING(encoding) ?
2166 pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
2167 pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
2168}
2169
2170/*
2171 * fetch maximum length of a given encoding
2172 */
2173int
2175{
2177
2178 /*
2179 * Check for the encoding despite the assert, due to some mingw versions
2180 * otherwise issuing bogus warnings.
2181 */
2182 return PG_VALID_ENCODING(encoding) ?
2185}
static bool is_valid_ascii(const unsigned char *s, int len)
Definition: ascii.h:25
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1126
uint32_t uint32
Definition: c.h:502
Assert(PointerIsAligned(start, uint64))
return str start
int a
Definition: isn.c:73
static pg_wchar utf8_to_unicode(const unsigned char *c)
Definition: mbprint.c:53
unsigned int pg_wchar
Definition: mbprint.c:31
const void size_t len
int32 encoding
Definition: pg_database.h:41
static const struct lconv_member_info table[]
static rewind_source * source
Definition: pg_rewind.c:89
#define IS_LCPRV2(c)
Definition: pg_wchar.h:164
#define ISSJISTAIL(c)
Definition: pg_wchar.h:45
@ PG_WIN1254
Definition: pg_wchar.h:257
@ PG_LATIN4
Definition: pg_wchar.h:237
@ PG_LATIN9
Definition: pg_wchar.h:242
@ PG_JOHAB
Definition: pg_wchar.h:269
@ PG_GB18030
Definition: pg_wchar.h:268
@ PG_SQL_ASCII
Definition: pg_wchar.h:226
@ PG_KOI8R
Definition: pg_wchar.h:248
@ PG_ISO_8859_6
Definition: pg_wchar.h:252
@ PG_WIN1253
Definition: pg_wchar.h:256
@ PG_KOI8U
Definition: pg_wchar.h:260
@ PG_LATIN6
Definition: pg_wchar.h:239
@ PG_MULE_INTERNAL
Definition: pg_wchar.h:233
@ PG_LATIN5
Definition: pg_wchar.h:238
@ PG_EUC_CN
Definition: pg_wchar.h:228
@ PG_UHC
Definition: pg_wchar.h:267
@ PG_LATIN2
Definition: pg_wchar.h:235
@ PG_ISO_8859_5
Definition: pg_wchar.h:251
@ PG_LATIN10
Definition: pg_wchar.h:243
@ PG_WIN1250
Definition: pg_wchar.h:255
@ PG_ISO_8859_7
Definition: pg_wchar.h:253
@ PG_SJIS
Definition: pg_wchar.h:264
@ PG_LATIN8
Definition: pg_wchar.h:241
@ PG_EUC_JP
Definition: pg_wchar.h:227
@ PG_GBK
Definition: pg_wchar.h:266
@ PG_LATIN3
Definition: pg_wchar.h:236
@ PG_WIN1256
Definition: pg_wchar.h:244
@ PG_LATIN1
Definition: pg_wchar.h:234
@ PG_EUC_TW
Definition: pg_wchar.h:230
@ PG_WIN1258
Definition: pg_wchar.h:245
@ PG_SHIFT_JIS_2004
Definition: pg_wchar.h:270
@ PG_WIN1252
Definition: pg_wchar.h:250
@ PG_LATIN7
Definition: pg_wchar.h:240
@ PG_UTF8
Definition: pg_wchar.h:232
@ PG_WIN1255
Definition: pg_wchar.h:258
@ PG_WIN1257
Definition: pg_wchar.h:259
@ PG_WIN1251
Definition: pg_wchar.h:249
@ PG_EUC_KR
Definition: pg_wchar.h:229
@ PG_WIN866
Definition: pg_wchar.h:246
@ PG_ISO_8859_8
Definition: pg_wchar.h:254
@ PG_WIN874
Definition: pg_wchar.h:247
@ PG_EUC_JIS_2004
Definition: pg_wchar.h:231
@ PG_BIG5
Definition: pg_wchar.h:265
#define LCPRV1_A
Definition: pg_wchar.h:150
#define LCPRV1_B
Definition: pg_wchar.h:151
#define IS_LC2(c)
Definition: pg_wchar.h:144
#define IS_LCPRV1(c)
Definition: pg_wchar.h:152
#define LCPRV2_A
Definition: pg_wchar.h:162
static unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
Definition: pg_wchar.h:575
#define IS_LCPRV2_B_RANGE(c)
Definition: pg_wchar.h:167
#define SS2
Definition: pg_wchar.h:38
#define IS_LCPRV1_A_RANGE(c)
Definition: pg_wchar.h:153
#define PG_VALID_ENCODING(_enc)
Definition: pg_wchar.h:287
#define IS_LCPRV1_B_RANGE(c)
Definition: pg_wchar.h:155
#define ISSJISHEAD(c)
Definition: pg_wchar.h:44
#define IS_LC1(c)
Definition: pg_wchar.h:126
#define IS_LCPRV2_A_RANGE(c)
Definition: pg_wchar.h:165
#define SS3
Definition: pg_wchar.h:39
#define LCPRV2_B
Definition: pg_wchar.h:163
size_t strnlen(const char *str, size_t maxlen)
Definition: strnlen.c:26
char * c
unsigned int first
Definition: wchar.c:573
unsigned int last
Definition: wchar.c:574
int maxmblen
Definition: pg_wchar.h:386
Definition: regguts.h:323
static const struct mbinterval east_asian_fw[]
static const struct mbinterval nonspacing[]
static int pg_uhc_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1610
static int pg_latin1_dsplen(const unsigned char *s)
Definition: wchar.c:880
int pg_encoding_mblen_bounded(int encoding, const char *mbstr)
Definition: wchar.c:2128
static int pg_euctw_mblen(const unsigned char *s)
Definition: wchar.c:337
static int pg_euckr_dsplen(const unsigned char *s)
Definition: wchar.c:220
static const uint32 Utf8Transition[256]
Definition: wchar.c:1813
bool pg_utf8_islegal(const unsigned char *source, int length)
Definition: wchar.c:1987
static int pg_ascii_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1067
static int pg_latin1_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1408
static int pg_sjis_dsplen(const unsigned char *s)
Definition: wchar.c:903
#define CR3
Definition: wchar.c:1809
static int pg_sjis_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1425
static int pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:103
static int pg_eucjp_dsplen(const unsigned char *s)
Definition: wchar.c:189
static int pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:68
#define L3B
Definition: wchar.c:1800
static int pg_uhc_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1585
#define L2A
Definition: wchar.c:1797
static int pg_gbk_dsplen(const unsigned char *s)
Definition: wchar.c:959
static int pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:208
static int pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:297
#define END
Definition: wchar.c:1790
#define pg_euccn_verifychar
Definition: wchar.c:1222
#define L4C
Definition: wchar.c:1805
static int pg_sjis_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1448
static int pg_johab_mblen(const unsigned char *s)
Definition: wchar.c:421
static int pg_johab_dsplen(const unsigned char *s)
Definition: wchar.c:427
static int pg_big5_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1502
#define CR2
Definition: wchar.c:1808
static int pg_mule_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1358
static int pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:177
static int pg_latin1_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1414
static int pg_latin1_mblen(const unsigned char *s)
Definition: wchar.c:874
static int pg_ascii_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1061
static int pg_ascii_mblen(const unsigned char *s)
Definition: wchar.c:83
void pg_encoding_set_invalid(int encoding, char *dst)
Definition: wchar.c:2049
static int mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
Definition: wchar.c:579
static int pg_big5_dsplen(const unsigned char *s)
Definition: wchar.c:932
#define pg_euccn_verifystr
Definition: wchar.c:1223
int pg_utf_mblen(const unsigned char *s)
Definition: wchar.c:536
#define NONUTF8_INVALID_BYTE0
Definition: wchar.c:34
static int pg_eucjp_mblen(const unsigned char *s)
Definition: wchar.c:183
static int pg_gbk_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1531
static int pg_big5_mblen(const unsigned char *s)
Definition: wchar.c:920
static int pg_euccn_dsplen(const unsigned char *s)
Definition: wchar.c:281
static int pg_euctw_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1226
static int pg_euckr_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1164
static int pg_euctw_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1276
static int pg_gbk_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1556
static int pg_gb18030_dsplen(const unsigned char *s)
Definition: wchar.c:1027
#define ERR
Definition: wchar.c:1777
static int pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:439
int pg_mule_mblen(const unsigned char *s)
Definition: wchar.c:791
static int pg_euccn_mblen(const unsigned char *s)
Definition: wchar.c:269
#define ASC
Definition: wchar.c:1795
static int pg_gbk_mblen(const unsigned char *s)
Definition: wchar.c:947
static int pg_eucjp_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1135
static int pg_johab_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1329
static int pg_euc_dsplen(const unsigned char *s)
Definition: wchar.c:158
static int pg_gb18030_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1670
static int pg_euckr_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1193
static int pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:725
static int pg_sjis_mblen(const unsigned char *s)
Definition: wchar.c:889
#define IS_EUC_RANGE_VALID(c)
Definition: wchar.c:1077
static int pg_uhc_dsplen(const unsigned char *s)
Definition: wchar.c:986
static int pg_eucjp_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1080
static int pg_big5_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1477
static int pg_gb18030_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1639
static int pg_mule_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1379
static int pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:375
#define L3C
Definition: wchar.c:1801
static int pg_utf8_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1699
#define CR1
Definition: wchar.c:1807
static int pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:859
static int pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:505
static int pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:230
static int pg_gb18030_mblen(const unsigned char *s)
Definition: wchar.c:1013
int pg_encoding_dsplen(int encoding, const char *mbstr)
Definition: wchar.c:2137
static void utf8_advance(const unsigned char *s, uint32 *state, int len)
Definition: wchar.c:1871
static int pg_euctw_dsplen(const unsigned char *s)
Definition: wchar.c:353
static int pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:837
static int pg_uhc_mblen(const unsigned char *s)
Definition: wchar.c:974
static int pg_euc_mblen(const unsigned char *s)
Definition: wchar.c:142
static int pg_mule_dsplen(const unsigned char *s)
Definition: wchar.c:809
#define L3A
Definition: wchar.c:1799
#define L4B
Definition: wchar.c:1804
int pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
Definition: wchar.c:2163
#define NONUTF8_INVALID_BYTE1
Definition: wchar.c:35
static int pg_utf8_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1889
static int pg_euckr_mblen(const unsigned char *s)
Definition: wchar.c:214
const pg_wchar_tbl pg_wchar_table[]
Definition: wchar.c:2062
static int pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:672
#define BGN
Definition: wchar.c:1779
int pg_encoding_max_length(int encoding)
Definition: wchar.c:2174
int pg_encoding_mblen(int encoding, const char *mbstr)
Definition: wchar.c:2116
static int pg_johab_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1305
#define ILL
Definition: wchar.c:1811
#define STRIDE_LENGTH
#define L4A
Definition: wchar.c:1803
static int pg_ascii_dsplen(const unsigned char *s)
Definition: wchar.c:89
int pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
Definition: wchar.c:2150
static int ucs_wcwidth(pg_wchar ucs)
Definition: wchar.c:626
static int pg_utf_dsplen(const unsigned char *s)
Definition: wchar.c:660