PostgreSQL Source Code git master
Loading...
Searching...
No Matches
wchar.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * wchar.c
4 * Functions for working with multibyte characters in various encodings.
5 *
6 * Portions Copyright (c) 1998-2026, PostgreSQL Global Development Group
7 *
8 * IDENTIFICATION
9 * src/common/wchar.c
10 *
11 *-------------------------------------------------------------------------
12 */
13#include "c.h"
14
15#include <limits.h>
16
17#include "mb/pg_wchar.h"
18#include "utils/ascii.h"
19
20
21/*
22 * In today's multibyte encodings other than UTF8, this two-byte sequence
23 * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
24 *
25 * For historical reasons, several verifychar implementations opt to reject
26 * this pair specifically. Byte pair range constraints, in encoding
27 * originator documentation, always excluded this pair. No core conversion
28 * could translate it. However, longstanding verifychar implementations
29 * accepted any non-NUL byte. big5_to_euc_tw even translates pairs not
30 * valid per encoding originator documentation. To avoid tightening core
31 * or non-core conversions in a security patch, we sought this one pair.
32 *
33 * PQescapeString() historically used spaces for BYTE1; many other values
34 * could suffice for BYTE1.
35 */
/* First byte of the deliberately-invalid two-byte sequence. */
#define NONUTF8_INVALID_BYTE0 (0x8d)
/* Second byte of that sequence: an ASCII space. */
#define NONUTF8_INVALID_BYTE1 (' ')
38
39
40/*
41 * Operations on multi-byte encodings are driven by a table of helper
42 * functions.
43 *
 * To add support for an encoding, define mblen(), dsplen(), verifychar() and
45 * verifystr() for the encoding. For server-encodings, also define mb2wchar()
46 * and wchar2mb() conversion functions.
47 *
48 * These functions generally assume that their input is validly formed.
49 * The "verifier" functions, further down in the file, have to be more
50 * paranoid.
51 *
52 * We expect that mblen() does not need to examine more than the first byte
53 * of the character to discover the correct length. GB18030 is an exception
54 * to that rule, though, as it also looks at second byte. But even that
55 * behaves in a predictable way, if you only pass the first byte: it will
56 * treat 4-byte encoded characters as two 2-byte encoded characters, which is
57 * good enough for all current uses.
58 *
59 * Note: for the display output of psql to work properly, the return values
60 * of the dsplen functions must conform to the Unicode standard. In particular
61 * the NUL character is zero width and control characters are generally
62 * width -1. It is recommended that non-ASCII encodings refer their ASCII
63 * subset to the ASCII routines to ensure consistency.
64 */
65
/*
 * No error-reporting facility. Ignore incomplete trailing byte sequence.
 *
 * Expands to a bare "if (...) break", so it may only be used directly inside
 * the loop that should be terminated when fewer than "need" bytes remain.
 */
#define MB2CHAR_NEED_AT_LEAST(len, need) if ((len) < (need)) break
68
69/*
70 * SQL/ASCII
71 */
72static int
73pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
74{
75 int cnt = 0;
76
77 while (len > 0 && *from)
78 {
79 *to++ = *from++;
80 len--;
81 cnt++;
82 }
83 *to = 0;
84 return cnt;
85}
86
/* Every character occupies exactly one byte, whatever its value. */
static int
pg_ascii_mblen(const unsigned char *s)
{
	(void) s;					/* the byte itself is never inspected */
	return 1;
}
92
/*
 * Display width of a single byte: 0 for NUL, -1 for bytes below 0x20 and
 * for 0x7f, otherwise 1.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
	const unsigned char ch = *s;

	if (ch == '\0')
		return 0;

	return (ch >= 0x20 && ch != 0x7f) ? 1 : -1;
}
103
104/*
105 * EUC
106 */
107static int
108pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
109{
110 int cnt = 0;
111
112 while (len > 0 && *from)
113 {
114 if (*from == SS2) /* JIS X 0201 (so called "1 byte KANA") */
115 {
117 from++;
118 *to = (SS2 << 8) | *from++;
119 len -= 2;
120 }
121 else if (*from == SS3) /* JIS X 0212 KANJI */
122 {
124 from++;
125 *to = (SS3 << 16) | (*from++ << 8);
126 *to |= *from++;
127 len -= 3;
128 }
129 else if (IS_HIGHBIT_SET(*from)) /* JIS X 0208 KANJI */
130 {
132 *to = *from++ << 8;
133 *to |= *from++;
134 len -= 2;
135 }
136 else /* must be ASCII */
137 {
138 *to = *from++;
139 len--;
140 }
141 to++;
142 cnt++;
143 }
144 *to = 0;
145 return cnt;
146}
147
148static inline int
149pg_euc_mblen(const unsigned char *s)
150{
151 int len;
152
153 if (*s == SS2)
154 len = 2;
155 else if (*s == SS3)
156 len = 3;
157 else if (IS_HIGHBIT_SET(*s))
158 len = 2;
159 else
160 len = 1;
161 return len;
162}
163
164static inline int
165pg_euc_dsplen(const unsigned char *s)
166{
167 int len;
168
169 if (*s == SS2)
170 len = 2;
171 else if (*s == SS3)
172 len = 2;
173 else if (IS_HIGHBIT_SET(*s))
174 len = 2;
175 else
176 len = pg_ascii_dsplen(s);
177 return len;
178}
179
180/*
181 * EUC_JP
182 */
183static int
184pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
185{
186 return pg_euc2wchar_with_len(from, to, len);
187}
188
/* EUC_JP character length: same rule as generic EUC. */
static int
pg_eucjp_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
194
195static int
196pg_eucjp_dsplen(const unsigned char *s)
197{
198 int len;
199
200 if (*s == SS2)
201 len = 1;
202 else if (*s == SS3)
203 len = 2;
204 else if (IS_HIGHBIT_SET(*s))
205 len = 2;
206 else
207 len = pg_ascii_dsplen(s);
208 return len;
209}
210
211/*
212 * EUC_KR
213 */
214static int
215pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
216{
217 return pg_euc2wchar_with_len(from, to, len);
218}
219
/* EUC_KR character length: same rule as generic EUC. */
static int
pg_euckr_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
225
/* EUC_KR display width: same rule as generic EUC. */
static int
pg_euckr_dsplen(const unsigned char *s)
{
	int			width = pg_euc_dsplen(s);

	return width;
}
231
232/*
233 * EUC_CN
234 *
235 */
236static int
237pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
238{
239 int cnt = 0;
240
241 while (len > 0 && *from)
242 {
243 if (*from == SS2) /* code set 2 (unused?) */
244 {
246 from++;
247 *to = (SS2 << 16) | (*from++ << 8);
248 *to |= *from++;
249 len -= 3;
250 }
251 else if (*from == SS3) /* code set 3 (unused ?) */
252 {
254 from++;
255 *to = (SS3 << 16) | (*from++ << 8);
256 *to |= *from++;
257 len -= 3;
258 }
259 else if (IS_HIGHBIT_SET(*from)) /* code set 1 */
260 {
262 *to = *from++ << 8;
263 *to |= *from++;
264 len -= 2;
265 }
266 else
267 {
268 *to = *from++;
269 len--;
270 }
271 to++;
272 cnt++;
273 }
274 *to = 0;
275 return cnt;
276}
277
278/*
279 * mbverifychar does not accept SS2 or SS3 (CS2 and CS3 are not defined for
280 * EUC_CN), but mb2wchar_with_len does. Tell a coherent story for code that
281 * relies on agreement between mb2wchar_with_len and mblen. Invalid text
282 * datums (e.g. from shared catalogs) reach this.
283 */
284static int
285pg_euccn_mblen(const unsigned char *s)
286{
287 int len;
288
289 if (*s == SS2)
290 len = 3;
291 else if (*s == SS3)
292 len = 3;
293 else if (IS_HIGHBIT_SET(*s))
294 len = 2;
295 else
296 len = 1;
297 return len;
298}
299
/* EUC_CN display width: two columns for high-bit leads, else the ASCII rule. */
static int
pg_euccn_dsplen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
311
312/*
313 * EUC_TW
314 *
315 */
316static int
317pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
318{
319 int cnt = 0;
320
321 while (len > 0 && *from)
322 {
323 if (*from == SS2) /* code set 2 */
324 {
326 from++;
327 *to = (((uint32) SS2) << 24) | (*from++ << 16);
328 *to |= *from++ << 8;
329 *to |= *from++;
330 len -= 4;
331 }
332 else if (*from == SS3) /* code set 3 (unused?) */
333 {
335 from++;
336 *to = (SS3 << 16) | (*from++ << 8);
337 *to |= *from++;
338 len -= 3;
339 }
340 else if (IS_HIGHBIT_SET(*from)) /* code set 2 */
341 {
343 *to = *from++ << 8;
344 *to |= *from++;
345 len -= 2;
346 }
347 else
348 {
349 *to = *from++;
350 len--;
351 }
352 to++;
353 cnt++;
354 }
355 *to = 0;
356 return cnt;
357}
358
359static int
360pg_euctw_mblen(const unsigned char *s)
361{
362 int len;
363
364 if (*s == SS2)
365 len = 4;
366 else if (*s == SS3)
367 len = 3;
368 else if (IS_HIGHBIT_SET(*s))
369 len = 2;
370 else
371 len = 1;
372 return len;
373}
374
375static int
376pg_euctw_dsplen(const unsigned char *s)
377{
378 int len;
379
380 if (*s == SS2)
381 len = 2;
382 else if (*s == SS3)
383 len = 2;
384 else if (IS_HIGHBIT_SET(*s))
385 len = 2;
386 else
387 len = pg_ascii_dsplen(s);
388 return len;
389}
390
391/*
392 * Convert pg_wchar to EUC_* encoding.
393 * caller must allocate enough space for "to", including a trailing zero!
394 * len: length of from.
395 * "from" not necessarily null terminated.
396 */
397static int
398pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
399{
400 int cnt = 0;
401
402 while (len > 0 && *from)
403 {
404 unsigned char c;
405
406 if ((c = (*from >> 24)))
407 {
408 *to++ = c;
409 *to++ = (*from >> 16) & 0xff;
410 *to++ = (*from >> 8) & 0xff;
411 *to++ = *from & 0xff;
412 cnt += 4;
413 }
414 else if ((c = (*from >> 16)))
415 {
416 *to++ = c;
417 *to++ = (*from >> 8) & 0xff;
418 *to++ = *from & 0xff;
419 cnt += 3;
420 }
421 else if ((c = (*from >> 8)))
422 {
423 *to++ = c;
424 *to++ = *from & 0xff;
425 cnt += 2;
426 }
427 else
428 {
429 *to++ = *from;
430 cnt++;
431 }
432 from++;
433 len--;
434 }
435 *to = 0;
436 return cnt;
437}
438
439
440/*
441 * JOHAB
442 */
/* JOHAB character length: computed with the generic EUC rule. */
static int
pg_johab_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
448
/* JOHAB display width: computed with the generic EUC rule. */
static int
pg_johab_dsplen(const unsigned char *s)
{
	int			width = pg_euc_dsplen(s);

	return width;
}
454
455/*
456 * convert UTF8 string to pg_wchar (UCS-4)
457 * caller must allocate enough space for "to", including a trailing zero!
458 * len: length of from.
459 * "from" not necessarily null terminated.
460 */
461static int
462pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
463{
464 int cnt = 0;
465 uint32 c1,
466 c2,
467 c3,
468 c4;
469
470 while (len > 0 && *from)
471 {
472 if ((*from & 0x80) == 0)
473 {
474 *to = *from++;
475 len--;
476 }
477 else if ((*from & 0xe0) == 0xc0)
478 {
480 c1 = *from++ & 0x1f;
481 c2 = *from++ & 0x3f;
482 *to = (c1 << 6) | c2;
483 len -= 2;
484 }
485 else if ((*from & 0xf0) == 0xe0)
486 {
488 c1 = *from++ & 0x0f;
489 c2 = *from++ & 0x3f;
490 c3 = *from++ & 0x3f;
491 *to = (c1 << 12) | (c2 << 6) | c3;
492 len -= 3;
493 }
494 else if ((*from & 0xf8) == 0xf0)
495 {
497 c1 = *from++ & 0x07;
498 c2 = *from++ & 0x3f;
499 c3 = *from++ & 0x3f;
500 c4 = *from++ & 0x3f;
501 *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
502 len -= 4;
503 }
504 else
505 {
506 /* treat a bogus char as length 1; not ours to raise error */
507 *to = *from++;
508 len--;
509 }
510 to++;
511 cnt++;
512 }
513 *to = 0;
514 return cnt;
515}
516
517
518/*
519 * Trivial conversion from pg_wchar to UTF-8.
520 * caller should allocate enough space for "to"
521 * len: length of from.
522 * "from" not necessarily null terminated.
523 */
524static int
525pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
526{
527 int cnt = 0;
528
529 while (len > 0 && *from)
530 {
531 int char_len;
532
533 unicode_to_utf8(*from, to);
535 cnt += char_len;
536 to += char_len;
537 from++;
538 len--;
539 }
540 *to = 0;
541 return cnt;
542}
543
544/*
545 * Return the byte length of a UTF8 character pointed to by s
546 *
547 * Note: in the current implementation we do not support UTF8 sequences
548 * of more than 4 bytes; hence do NOT return a value larger than 4.
549 * We return "1" for any leading byte that is either flat-out illegal or
550 * indicates a length larger than we support.
551 *
552 * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
553 * other places would need to be fixed to change this.
554 */
int
pg_utf_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	/* classify by the high bits of the lead byte */
	if ((lead & 0x80) == 0)
		return 1;
	if ((lead & 0xe0) == 0xc0)
		return 2;
	if ((lead & 0xf0) == 0xe0)
		return 3;
	if ((lead & 0xf8) == 0xf0)
		return 4;
#ifdef NOT_USED
	if ((lead & 0xfc) == 0xf8)
		return 5;
	if ((lead & 0xfe) == 0xfc)
		return 6;
#endif

	/* illegal or unsupported lead byte: report a single byte */
	return 1;
}
578
579/*
580 * This is an implementation of wcwidth() and wcswidth() as defined in
581 * "The Single UNIX Specification, Version 2, The Open Group, 1997"
582 * <http://www.unix.org/online.html>
583 *
584 * Markus Kuhn -- 2001-09-08 -- public domain
585 *
586 * customised for PostgreSQL
587 *
588 * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
589 */
590
/* One entry of a width lookup table: the inclusive range [first, last]. */
struct mbinterval
{
	unsigned int first;
	unsigned int last;
};
596
597/* auxiliary function for binary search in interval table */
598static int
599mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
600{
601 int min = 0;
602 int mid;
603
604 if (ucs < table[0].first || ucs > table[max].last)
605 return 0;
606 while (max >= min)
607 {
608 mid = (min + max) / 2;
609 if (ucs > table[mid].last)
610 min = mid + 1;
611 else if (ucs < table[mid].first)
612 max = mid - 1;
613 else
614 return 1;
615 }
616
617 return 0;
618}
619
620
621/*
622 * The following functions define the column width of an ISO 10646
623 * character as follows:
624 *
625 * - The null character (U+0000) has a column width of 0.
626 *
627 * - Other C0/C1 control characters and DEL will lead to a return
628 * value of -1.
629 *
630 * - Non-spacing and enclosing combining characters (general
631 * category code Mn, Me or Cf in the Unicode database) have a
632 * column width of 0.
633 *
634 * - Spacing characters in the East Asian Wide (W) or East Asian
635 * FullWidth (F) category as defined in Unicode Technical
636 * Report #11 have a column width of 2.
637 *
638 * - All remaining characters (including all printable
639 * ISO 8859-1 and WGL4 characters, Unicode control characters,
640 * etc.) have a column width of 1.
641 *
642 * This implementation assumes that wchar_t characters are encoded
643 * in ISO 10646.
644 */
645
646static int
648{
651
652 /* test for 8-bit control characters */
653 if (ucs == 0)
654 return 0;
655
656 if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
657 return -1;
658
659 /*
660 * binary search in table of non-spacing characters
661 *
662 * XXX: In the official Unicode sources, it is possible for a character to
663 * be described as both non-spacing and wide at the same time. As of
664 * Unicode 13.0, treating the non-spacing property as the determining
665 * factor for display width leads to the correct behavior, so do that
666 * search first.
667 */
669 sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
670 return 0;
671
672 /* binary search in table of wide characters */
674 sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
675 return 2;
676
677 return 1;
678}
679
680static int
681pg_utf_dsplen(const unsigned char *s)
682{
683 return ucs_wcwidth(utf8_to_unicode(s));
684}
685
686/*
687 * ISO8859-1
688 */
689static int
690pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
691{
692 int cnt = 0;
693
694 while (len > 0 && *from)
695 {
696 *to++ = *from++;
697 len--;
698 cnt++;
699 }
700 *to = 0;
701 return cnt;
702}
703
704/*
705 * Trivial conversion from pg_wchar to single byte encoding. Just ignores
706 * high bits.
707 * caller should allocate enough space for "to"
708 * len: length of from.
709 * "from" not necessarily null terminated.
710 */
711static int
712pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
713{
714 int cnt = 0;
715
716 while (len > 0 && *from)
717 {
718 *to++ = *from++;
719 len--;
720 cnt++;
721 }
722 *to = 0;
723 return cnt;
724}
725
/* Every character occupies exactly one byte, whatever its value. */
static int
pg_latin1_mblen(const unsigned char *s)
{
	(void) s;					/* the byte itself is never inspected */
	return 1;
}
731
/* Display width: identical to the ASCII rule. */
static int
pg_latin1_dsplen(const unsigned char *s)
{
	int			width = pg_ascii_dsplen(s);

	return width;
}
737
738/*
739 * SJIS
740 */
/*
 * SJIS character length from the first byte: bytes 0xa1..0xdf (1 byte
 * kana?) and bytes without the high bit are one byte long; any other
 * high-bit byte starts a two-byte character (kanji?).
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if (lead >= 0xa1 && lead <= 0xdf)
		return 1;

	return IS_HIGHBIT_SET(lead) ? 2 : 1;
}
754
/*
 * SJIS display width: one column for bytes 0xa1..0xdf (1 byte kana?), two
 * for any other high-bit lead byte (kanji?), else the ASCII rule.
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
	if (*s >= 0xa1 && *s <= 0xdf)
		return 1;
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
768
769/*
770 * Big5
771 */
/* Big5 character length: two bytes for a high-bit lead, else one. */
static int
pg_big5_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
783
/* Big5 display width: two columns for a high-bit lead, else the ASCII rule. */
static int
pg_big5_dsplen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
795
796/*
797 * GBK
798 */
/* GBK character length: two bytes for a high-bit lead, else one. */
static int
pg_gbk_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
810
/* GBK display width: two columns for a high-bit lead, else the ASCII rule. */
static int
pg_gbk_dsplen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
822
823/*
824 * UHC
825 */
/* UHC character length: two bytes for a high-bit lead, else one. */
static int
pg_uhc_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
837
/* UHC display width: two columns for a high-bit lead, else the ASCII rule. */
static int
pg_uhc_dsplen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
849
850/*
851 * GB18030
852 * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
853 */
854
855/*
856 * Unlike all other mblen() functions, this also looks at the second byte of
857 * the input. However, if you only pass the first byte of a multi-byte
858 * string, and \0 as the second byte, this still works in a predictable way:
859 * a 4-byte character will be reported as two 2-byte characters. That's
860 * enough for all current uses, as a client-only encoding. It works that
861 * way, because in any valid 4-byte GB18030-encoded character, the third and
862 * fourth byte look like a 2-byte encoded character, when looked at
863 * separately.
864 */
/*
 * GB18030 character length.  A byte without the high bit is a one-byte
 * character; otherwise the second byte decides: 0x30..0x39 means a
 * four-byte character, anything else a two-byte character.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;				/* ASCII */

	if (s[1] >= 0x30 && s[1] <= 0x39)
		return 4;

	return 2;
}
878
/* GB18030 display width: two columns for a high-bit lead, else the ASCII rule. */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
890
891/*
892 *-------------------------------------------------------------------
893 * multibyte sequence validators
894 *
895 * The verifychar functions accept "s", a pointer to the first byte of a
896 * string, and "len", the remaining length of the string. If there is a
897 * validly encoded character beginning at *s, return its length in bytes;
898 * else return -1.
899 *
900 * The verifystr functions also accept "s", a pointer to a string and "len",
901 * the length of the string. They verify the whole string, and return the
902 * number of input bytes (<= len) that are valid. In other words, if the
903 * whole string is valid, verifystr returns "len", otherwise it returns the
904 * byte offset of the first invalid character. The verifystr functions must
905 * test for and reject zeroes in the input.
906 *
907 * The verifychar functions can assume that len > 0 and that *s != '\0', but
908 * they must test for and reject zeroes in any additional bytes of a
909 * multibyte character. Note that this definition allows the function for a
910 * single-byte encoding to be just "return 1".
911 *-------------------------------------------------------------------
912 */
/* Single-byte encoding: any byte handed to us is a valid 1-byte character. */
static int
pg_ascii_verifychar(const unsigned char *s, int len)
{
	(void) s;
	(void) len;
	return 1;
}
918
/*
 * Returns the number of leading bytes of "s" (at most "len") that precede
 * the first zero byte; equals "len" when no zero byte is present.
 */
static int
pg_ascii_verifystr(const unsigned char *s, int len)
{
	const unsigned char *first_nul = memchr(s, 0, len);

	return (first_nul != NULL) ? first_nul - s : len;
}
929
/*
 * True when byte "c" lies in the inclusive range 0xa1..0xfe.
 * Note: evaluates its argument twice.
 */
#define IS_EUC_RANGE_VALID(c) ((c) >= 0xa1 && (c) <= 0xfe)
931
932static int
933pg_eucjp_verifychar(const unsigned char *s, int len)
934{
935 int l;
936 unsigned char c1,
937 c2;
938
939 c1 = *s++;
940
941 switch (c1)
942 {
943 case SS2: /* JIS X 0201 */
944 l = 2;
945 if (l > len)
946 return -1;
947 c2 = *s++;
948 if (c2 < 0xa1 || c2 > 0xdf)
949 return -1;
950 break;
951
952 case SS3: /* JIS X 0212 */
953 l = 3;
954 if (l > len)
955 return -1;
956 c2 = *s++;
958 return -1;
959 c2 = *s++;
961 return -1;
962 break;
963
964 default:
965 if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
966 {
967 l = 2;
968 if (l > len)
969 return -1;
971 return -1;
972 c2 = *s++;
974 return -1;
975 }
976 else
977 /* must be ASCII */
978 {
979 l = 1;
980 }
981 break;
982 }
983
984 return l;
985}
986
/*
 * Verify an EUC_JP string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if the whole string is fine, otherwise the
 * offset of the first zero byte or invalid character.
 */
static int
pg_eucjp_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_eucjp_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path still rejects NUL */

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1015
/*
 * Verify one EUC_KR character starting at "s", with "len" bytes remaining.
 * A byte without the high bit is a valid single byte; otherwise both the
 * lead byte and the following byte must lie in the EUC range.  Returns the
 * length in bytes, or -1 if invalid.
 */
static int
pg_euckr_verifychar(const unsigned char *s, int len)
{
	const unsigned char lead = s[0];

	if (!IS_HIGHBIT_SET(lead))
		return 1;				/* must be ASCII */

	if (len < 2)
		return -1;
	if (!IS_EUC_RANGE_VALID(lead))
		return -1;
	if (!IS_EUC_RANGE_VALID(s[1]))
		return -1;

	return 2;
}
1044
/*
 * Verify an EUC_KR string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if the whole string is fine, otherwise the
 * offset of the first zero byte or invalid character.
 */
static int
pg_euckr_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_euckr_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path still rejects NUL */

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1073
/*
 * EUC-CN byte sequences are exactly the same as EUC-KR, so the EUC-CN
 * verifiers are plain aliases for the EUC-KR ones.
 */
#define pg_euccn_verifychar pg_euckr_verifychar
#define pg_euccn_verifystr pg_euckr_verifystr
1077
1078static int
1079pg_euctw_verifychar(const unsigned char *s, int len)
1080{
1081 int l;
1082 unsigned char c1,
1083 c2;
1084
1085 c1 = *s++;
1086
1087 switch (c1)
1088 {
1089 case SS2: /* CNS 11643 Plane 1-7 */
1090 l = 4;
1091 if (l > len)
1092 return -1;
1093 c2 = *s++;
1094 if (c2 < 0xa1 || c2 > 0xa7)
1095 return -1;
1096 c2 = *s++;
1097 if (!IS_EUC_RANGE_VALID(c2))
1098 return -1;
1099 c2 = *s++;
1100 if (!IS_EUC_RANGE_VALID(c2))
1101 return -1;
1102 break;
1103
1104 case SS3: /* unused */
1105 return -1;
1106
1107 default:
1108 if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1109 {
1110 l = 2;
1111 if (l > len)
1112 return -1;
1113 /* no further range check on c1? */
1114 c2 = *s++;
1115 if (!IS_EUC_RANGE_VALID(c2))
1116 return -1;
1117 }
1118 else
1119 /* must be ASCII */
1120 {
1121 l = 1;
1122 }
1123 break;
1124 }
1125 return l;
1126}
1127
/*
 * Verify an EUC_TW string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if the whole string is fine, otherwise the
 * offset of the first zero byte or invalid character.
 */
static int
pg_euctw_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_euctw_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path still rejects NUL */

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1156
/*
 * Verify one JOHAB character starting at "s", with "len" bytes remaining.
 * The length comes from pg_johab_mblen(); for a high-bit lead byte, every
 * trailing byte must lie in the EUC range.  Returns the length in bytes, or
 * -1 if the character is truncated or invalid.
 */
static int
pg_johab_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_johab_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	if (!IS_HIGHBIT_SET(s[0]))
		return mbl;

	for (i = 1; i < mbl; i++)
	{
		const unsigned char trail = s[i];

		if (!IS_EUC_RANGE_VALID(trail))
			return -1;
	}

	return mbl;
}
1180
/*
 * Verify a JOHAB string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if the whole string is fine, otherwise the
 * offset of the first zero byte or invalid character.
 */
static int
pg_johab_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_johab_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path still rejects NUL */

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1209
/* Single-byte encoding: any byte handed to us is a valid 1-byte character. */
static int
pg_latin1_verifychar(const unsigned char *s, int len)
{
	(void) s;
	(void) len;
	return 1;
}
1215
/*
 * Returns the number of leading bytes of "s" (at most "len") that precede
 * the first zero byte; equals "len" when no zero byte is present.
 */
static int
pg_latin1_verifystr(const unsigned char *s, int len)
{
	const unsigned char *first_nul = memchr(s, 0, len);

	return (first_nul != NULL) ? first_nul - s : len;
}
1226
/*
 * Verify one SJIS character starting at "s", with "len" bytes remaining.
 * One-byte characters (as classified by pg_sjis_mblen) are accepted as is;
 * a two-byte character must pass ISSJISHEAD on its first byte and
 * ISSJISTAIL on its second.  Returns the length in bytes, or -1.
 */
static int
pg_sjis_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_sjis_mblen(s);
	unsigned char head,
				tail;

	if (len < mbl)
		return -1;

	if (mbl == 1)				/* pg_sjis_mblen already verified it */
		return mbl;

	head = s[0];
	tail = s[1];

	return (ISSJISHEAD(head) && ISSJISTAIL(tail)) ? mbl : -1;
}
1249
/*
 * Verify an SJIS string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if the whole string is fine, otherwise the
 * offset of the first zero byte or invalid character.
 */
static int
pg_sjis_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_sjis_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path still rejects NUL */

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1278
1279static int
1280pg_big5_verifychar(const unsigned char *s, int len)
1281{
1282 int l,
1283 mbl;
1284
1285 l = mbl = pg_big5_mblen(s);
1286
1287 if (len < l)
1288 return -1;
1289
1290 if (l == 2 &&
1291 s[0] == NONUTF8_INVALID_BYTE0 &&
1292 s[1] == NONUTF8_INVALID_BYTE1)
1293 return -1;
1294
1295 while (--l > 0)
1296 {
1297 if (*++s == '\0')
1298 return -1;
1299 }
1300
1301 return mbl;
1302}
1303
/*
 * Verify a Big5 string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if the whole string is fine, otherwise the
 * offset of the first zero byte or invalid character.
 */
static int
pg_big5_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_big5_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path still rejects NUL */

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1332
1333static int
1334pg_gbk_verifychar(const unsigned char *s, int len)
1335{
1336 int l,
1337 mbl;
1338
1339 l = mbl = pg_gbk_mblen(s);
1340
1341 if (len < l)
1342 return -1;
1343
1344 if (l == 2 &&
1345 s[0] == NONUTF8_INVALID_BYTE0 &&
1346 s[1] == NONUTF8_INVALID_BYTE1)
1347 return -1;
1348
1349 while (--l > 0)
1350 {
1351 if (*++s == '\0')
1352 return -1;
1353 }
1354
1355 return mbl;
1356}
1357
/*
 * Verify a GBK string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if the whole string is fine, otherwise the
 * offset of the first zero byte or invalid character.
 */
static int
pg_gbk_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_gbk_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path still rejects NUL */

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1386
1387static int
1388pg_uhc_verifychar(const unsigned char *s, int len)
1389{
1390 int l,
1391 mbl;
1392
1393 l = mbl = pg_uhc_mblen(s);
1394
1395 if (len < l)
1396 return -1;
1397
1398 if (l == 2 &&
1399 s[0] == NONUTF8_INVALID_BYTE0 &&
1400 s[1] == NONUTF8_INVALID_BYTE1)
1401 return -1;
1402
1403 while (--l > 0)
1404 {
1405 if (*++s == '\0')
1406 return -1;
1407 }
1408
1409 return mbl;
1410}
1411
/*
 * Verify a UHC string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if the whole string is fine, otherwise the
 * offset of the first zero byte or invalid character.
 */
static int
pg_uhc_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_uhc_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path still rejects NUL */

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1440
/*
 * Verify one GB18030 character starting at "s", with "len" bytes remaining.
 * Returns 1 for a byte without the high bit, 4 or 2 for a well-formed
 * four-byte or two-byte character, and -1 for anything else.
 *
 * Four-byte form: bytes 1 and 3 in 0x81..0xfe, bytes 2 and 4 in 0x30..0x39.
 * Two-byte form: byte 1 in 0x81..0xfe, byte 2 in 0x40..0x7e or 0x80..0xfe.
 */
static int
pg_gb18030_verifychar(const unsigned char *s, int len)
{
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;				/* ASCII */

	if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
	{
		/* Should be 4-byte, validate remaining bytes */
		if (s[0] >= 0x81 && s[0] <= 0xfe &&
			s[2] >= 0x81 && s[2] <= 0xfe &&
			s[3] >= 0x30 && s[3] <= 0x39)
			return 4;
		return -1;
	}

	if (len >= 2 && s[0] >= 0x81 && s[0] <= 0xfe)
	{
		/* Should be 2-byte, validate */
		if ((s[1] >= 0x40 && s[1] <= 0x7e) ||
			(s[1] >= 0x80 && s[1] <= 0xfe))
			return 2;
		return -1;
	}

	return -1;
}
1471
/*
 * Verify a GB18030 string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if the whole string is fine, otherwise the
 * offset of the first zero byte or invalid character.
 */
static int
pg_gb18030_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_gb18030_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path still rejects NUL */

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1500
/*
 * Verify one UTF-8 character starting at "s", with "len" bytes remaining.
 * A zero byte is rejected.  For non-ASCII lead bytes the expected length
 * (2 to 4, or 1 for an unrecognized lead) is derived from the lead byte,
 * checked against "len", and then passed to pg_utf8_islegal().
 * Returns the length in bytes, or -1 if invalid.
 */
static int
pg_utf8_verifychar(const unsigned char *s, int len)
{
	const unsigned char lead = *s;
	int			expected;

	if ((lead & 0x80) == 0)
		return (lead == '\0') ? -1 : 1;

	if ((lead & 0xe0) == 0xc0)
		expected = 2;
	else if ((lead & 0xf0) == 0xe0)
		expected = 3;
	else if ((lead & 0xf8) == 0xf0)
		expected = 4;
	else
		expected = 1;

	if (expected > len)
		return -1;

	return pg_utf8_islegal(s, expected) ? expected : -1;
}
1529
1530/*
1531 * The fast path of the UTF-8 verifier uses a deterministic finite automaton
1532 * (DFA) for multibyte characters. In a traditional table-driven DFA, the
1533 * input byte and current state are used to compute an index into an array of
1534 * state transitions. Since the address of the next transition is dependent
1535 * on this computation, there is latency in executing the load instruction,
1536 * and the CPU is not kept busy.
1537 *
1538 * Instead, we use a "shift-based" DFA as described by Per Vognsen:
1539 *
1540 * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
1541 *
1542 * In a shift-based DFA, the input byte is an index into array of integers
1543 * whose bit pattern encodes the state transitions. To compute the next
1544 * state, we simply right-shift the integer by the current state and apply a
1545 * mask. In this scheme, the address of the transition only depends on the
1546 * input byte, so there is better pipelining.
1547 *
1548 * The naming convention for states and transitions was adopted from a UTF-8
1549 * to UTF-16/32 transcoder, whose table is reproduced below:
1550 *
1551 * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
1552 *
1553 * ILL ASC CR1 CR2 CR3 L2A L3A L3B L3C L4A L4B L4C CLASS / STATE
1554 * ==========================================================================
1555 * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B, | BGN/END
1556 * err, err, err, err, err, err, err, err, err, err, err, err, | ERR
1557 * |
1558 * err, err, END, END, END, err, err, err, err, err, err, err, | CS1
1559 * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err, | CS2
1560 * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err, | CS3
1561 * |
1562 * err, err, err, err, CS1, err, err, err, err, err, err, err, | P3A
1563 * err, err, CS1, CS1, err, err, err, err, err, err, err, err, | P3B
1564 * |
1565 * err, err, err, CS2, CS2, err, err, err, err, err, err, err, | P4A
1566 * err, err, CS2, err, err, err, err, err, err, err, err, err, | P4B
1567 *
1568 * In the most straightforward implementation, a shift-based DFA for UTF-8
1569 * requires 64-bit integers to encode the transitions, but with an SMT solver
1570 * it's possible to find state numbers such that the transitions fit within
1571 * 32-bit integers, as Dougall Johnson demonstrated:
1572 *
1573 * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
1574 *
1575 * This packed representation is the reason for the seemingly odd choice of
1576 * state values below.
1577 */
1578
/*
 * DFA state numbers.  Each state is also used as a shift count into a
 * 32-bit transition word, with the next state read from the 5 bits found
 * at that offset; hence every value is below 32 and the fields are allowed
 * to overlap where the packing permits it.
 */

/* Error */
#define ERR  0
/* Begin */
#define BGN 11
/* Continuation states, expect 1/2/3 continuation bytes */
#define CS1 16
#define CS2  1
#define CS3  5
/* Partial states, where the first continuation byte has a restricted range */
#define P3A  6					/* Lead was E0, check for 3-byte overlong */
#define P3B 20					/* Lead was ED, check for surrogate */
#define P4A 25					/* Lead was F0, check for 4-byte overlong */
#define P4B 30					/* Lead was F4, check for too-large */
/* Begin and End are the same state */
#define END BGN

/*
 * The encoded state transitions for the lookup table.
 *
 * A term (NEXT << CUR) means "when in state CUR, go to state NEXT".  Any
 * state not mentioned in a word reads back as ERR (zero).
 */

/* ASCII */
#define ASC (END << BGN)
/* 2-byte lead */
#define L2A (CS1 << BGN)
/* 3-byte lead */
#define L3A (P3A << BGN)
#define L3B (CS2 << BGN)
#define L3C (P3B << BGN)
/* 4-byte lead */
#define L4A (P4A << BGN)
#define L4B (CS3 << BGN)
#define L4C (P4B << BGN)

/*
 * Continuation bytes.  All three classes are accepted in CS1/CS2/CS3; they
 * differ only in which partial states accept them:
 *   CR1 (80..8F): P3B and P4B
 *   CR2 (90..9F): P3B and P4A
 *   CR3 (A0..BF): P3A and P4A
 *
 * The whole expansion is parenthesized so that the macros remain correct
 * when used inside a larger expression (e.g. "CR1 >> state"), not only as
 * bare array initializers.
 */
#define CR1 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B))
#define CR2 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A))
#define CR3 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A))
/* invalid byte */
#define ILL ERR
1615
1616static const uint32 Utf8Transition[256] =
1617{
1618 /* ASCII */
1619
1620 ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1621 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1622 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1623 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1624
1625 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1626 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1627 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1628 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1629
1630 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1631 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1632 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1633 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1634
1635 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1636 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1637 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1638 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1639
1640 /* continuation bytes */
1641
1642 /* 80..8F */
1643 CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1644 CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1645
1646 /* 90..9F */
1647 CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1648 CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1649
1650 /* A0..BF */
1651 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1652 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1653 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1654 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1655
1656 /* leading bytes */
1657
1658 /* C0..DF */
1659 ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
1660 L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1661 L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1662 L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1663
1664 /* E0..EF */
1665 L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
1666 L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
1667
1668 /* F0..FF */
1669 L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
1670 ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
1671};
1672
1673static void
1674utf8_advance(const unsigned char *s, uint32 *state, int len)
1675{
1676 /* Note: We deliberately don't check the state's value here. */
1677 while (len > 0)
1678 {
1679 /*
1680 * It's important that the mask value is 31: In most instruction sets,
1681 * a shift by a 32-bit operand is understood to be a shift by its mod
1682 * 32, so the compiler should elide the mask operation.
1683 */
1684 *state = Utf8Transition[*s++] >> (*state & 31);
1685 len--;
1686 }
1687
1688 *state &= 31;
1689}
1690
1691static int
1692pg_utf8_verifystr(const unsigned char *s, int len)
1693{
1694 const unsigned char *start = s;
1695 const int orig_len = len;
1696 uint32 state = BGN;
1697
1698/*
1699 * With a stride of two vector widths, gcc will unroll the loop. Even if
1700 * the compiler can unroll a longer loop, it's not worth it because we
1701 * must fall back to the byte-wise algorithm if we find any non-ASCII.
1702 */
1703#define STRIDE_LENGTH (2 * sizeof(Vector8))
1704
1705 if (len >= STRIDE_LENGTH)
1706 {
1707 while (len >= STRIDE_LENGTH)
1708 {
1709 /*
1710 * If the chunk is all ASCII, we can skip the full UTF-8 check,
1711 * but we must first check for a non-END state, which means the
1712 * previous chunk ended in the middle of a multibyte sequence.
1713 */
1714 if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
1716
1717 s += STRIDE_LENGTH;
1718 len -= STRIDE_LENGTH;
1719 }
1720
1721 /* The error state persists, so we only need to check for it here. */
1722 if (state == ERR)
1723 {
1724 /*
1725 * Start over from the beginning with the slow path so we can
1726 * count the valid bytes.
1727 */
1728 len = orig_len;
1729 s = start;
1730 }
1731 else if (state != END)
1732 {
1733 /*
1734 * The fast path exited in the middle of a multibyte sequence.
1735 * Walk backwards to find the leading byte so that the slow path
1736 * can resume checking from there. We must always backtrack at
1737 * least one byte, since the current byte could be e.g. an ASCII
1738 * byte after a 2-byte lead, which is invalid.
1739 */
1740 do
1741 {
1742 Assert(s > start);
1743 s--;
1744 len++;
1746 } while (pg_utf_mblen(s) <= 1);
1747 }
1748 }
1749
1750 /* check remaining bytes */
1751 while (len > 0)
1752 {
1753 int l;
1754
1755 /* fast path for ASCII-subset characters */
1756 if (!IS_HIGHBIT_SET(*s))
1757 {
1758 if (*s == '\0')
1759 break;
1760 l = 1;
1761 }
1762 else
1763 {
1764 l = pg_utf8_verifychar(s, len);
1765 if (l == -1)
1766 break;
1767 }
1768 s += l;
1769 len -= l;
1770 }
1771
1772 return s - start;
1773}
1774
/*
 * Check for validity of a single UTF-8 encoded character
 *
 * This follows the rules of RFC3629.  The second byte is held to a range
 * that depends on the lead byte so that no code point has more than one
 * encoding: a longer-than-necessary sequence padded with high-order zero
 * bits is rejected, since accepting it would be a security hazard (e.g. an
 * apparently non-ASCII character that decodes to plain ASCII).
 *
 * length is assumed to have been obtained by pg_utf_mblen(), and the
 * caller must have checked that that many bytes are present in the buffer.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char lead;
	unsigned char second;
	unsigned char lo;
	unsigned char hi;
	int			i;

	/* only lengths 1..4 are accepted; lengths 5 and 6 are rejected for now */
	if (length < 1 || length > 4)
		return false;

	/* fourth and third bytes, in that order: plain continuation bytes */
	for (i = length - 1; i >= 2; i--)
	{
		if (source[i] < 0x80 || source[i] > 0xBF)
			return false;
	}

	lead = source[0];

	if (length >= 2)
	{
		/* second byte: allowed range is narrowed for four lead bytes */
		second = source[1];
		lo = 0x80;
		hi = 0xBF;

		if (lead == 0xE0)
			lo = 0xA0;
		else if (lead == 0xED)
			hi = 0x9F;
		else if (lead == 0xF0)
			lo = 0x90;
		else if (lead == 0xF4)
			hi = 0x8F;

		if (second < lo || second > hi)
			return false;
	}

	/* lead byte: 80..C1 and anything above F4 can never start a character */
	if (lead >= 0x80 && lead < 0xC2)
		return false;
	if (lead > 0xF4)
		return false;

	return true;
}
1845
1846
1847/*
1848 * Fills the provided buffer with two bytes such that:
1849 * pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
1850 */
1851void
1859
1860/*
1861 *-------------------------------------------------------------------
1862 * encoding info table
1863 *-------------------------------------------------------------------
1864 */
1907};
1908
1909/*
1910 * Returns the byte length of a multibyte character.
1911 *
1912 * Choose "mblen" functions based on the input string characteristics.
1913 * pg_encoding_mblen() can be used when ANY of these conditions are met:
1914 *
1915 * - The input string is zero-terminated
1916 *
1917 * - The input string is known to be valid in the encoding (e.g., string
1918 * converted from database encoding)
1919 *
1920 * - The encoding is not GB18030 (e.g., when only database encodings are
1921 * passed to 'encoding' parameter)
1922 *
1923 * encoding==GB18030 requires examining up to two bytes to determine character
1924 * length. Therefore, callers satisfying none of those conditions must use
1925 * pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be
1926 * guaranteed to be within allocation bounds.
1927 *
1928 * When dealing with text that is not certainly valid in the specified
1929 * encoding, the result may exceed the actual remaining string length.
1930 * Callers that are not prepared to deal with that should use Min(remaining,
1931 * pg_encoding_mblen_or_incomplete()). For zero-terminated strings, that and
1932 * pg_encoding_mblen_bounded() are interchangeable.
1933 */
1934int
1936{
1937 return (PG_VALID_ENCODING(encoding) ?
1938 pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
1939 pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
1940}
1941
1942/*
1943 * Returns the byte length of a multibyte character (possibly not
1944 * zero-terminated), or INT_MAX if too few bytes remain to determine a length.
1945 */
1946int
1948 size_t remaining)
1949{
1950 /*
1951 * Define zero remaining as too few, even for single-byte encodings.
1952 * pg_gb18030_mblen() reads one or two bytes; single-byte encodings read
1953 * zero; others read one.
1954 */
1955 if (remaining < 1 ||
1957 return INT_MAX;
1959}
1960
1961/*
1962 * Returns the byte length of a multibyte character; but not more than the
1963 * distance to the terminating zero byte. For input that might lack a
1964 * terminating zero, use Min(remaining, pg_encoding_mblen_or_incomplete()).
1965 */
1966int
1971
1972/*
1973 * Returns the display length of a multibyte character.
1974 */
1975int
1977{
1978 return (PG_VALID_ENCODING(encoding) ?
1979 pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
1980 pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
1981}
1982
1983/*
1984 * Verify the first multibyte character of the given string.
1985 * Return its byte length if good, -1 if bad. (See comments above for
1986 * full details of the mbverifychar API.)
1987 */
1988int
1990{
1991 return (PG_VALID_ENCODING(encoding) ?
1992 pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
1993 pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
1994}
1995
1996/*
1997 * Verify that a string is valid for the given encoding.
1998 * Returns the number of input bytes (<= len) that form a valid string.
1999 * (See comments above for full details of the mbverifystr API.)
2000 */
2001int
2003{
2004 return (PG_VALID_ENCODING(encoding) ?
2005 pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
2006 pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
2007}
2008
2009/*
2010 * fetch maximum length of a given encoding
2011 */
2012int
2014{
2016
2017 /*
2018 * Check for the encoding despite the assert, due to some mingw versions
2019 * otherwise issuing bogus warnings.
2020 */
2021 return PG_VALID_ENCODING(encoding) ?
2024}
static bool is_valid_ascii(const unsigned char *s, int len)
Definition ascii.h:25
#define IS_HIGHBIT_SET(ch)
Definition c.h:1244
#define Assert(condition)
Definition c.h:943
uint32_t uint32
Definition c.h:624
#define pg_fallthrough
Definition c.h:161
return str start
int remaining
Definition informix.c:692
static char * encoding
Definition initdb.c:139
int a
Definition isn.c:73
#define PG_UTF8
Definition mbprint.c:43
unsigned int pg_wchar
Definition mbprint.c:31
static char32_t utf8_to_unicode(const unsigned char *c)
Definition mbprint.c:53
const void size_t len
static const struct lconv_member_info table[]
static rewind_source * source
Definition pg_rewind.c:89
#define ISSJISTAIL(c)
Definition pg_wchar.h:56
#define pg_utf_mblen
Definition pg_wchar.h:486
@ PG_WIN1254
Definition pg_wchar.h:107
@ PG_LATIN4
Definition pg_wchar.h:87
@ PG_LATIN9
Definition pg_wchar.h:92
@ PG_JOHAB
Definition pg_wchar.h:119
@ PG_GB18030
Definition pg_wchar.h:118
@ PG_SQL_ASCII
Definition pg_wchar.h:76
@ PG_KOI8R
Definition pg_wchar.h:98
@ PG_ISO_8859_6
Definition pg_wchar.h:102
@ PG_WIN1253
Definition pg_wchar.h:106
@ PG_KOI8U
Definition pg_wchar.h:110
@ PG_LATIN6
Definition pg_wchar.h:89
@ PG_LATIN5
Definition pg_wchar.h:88
@ PG_EUC_CN
Definition pg_wchar.h:78
@ PG_UHC
Definition pg_wchar.h:117
@ PG_LATIN2
Definition pg_wchar.h:85
@ PG_ISO_8859_5
Definition pg_wchar.h:101
@ PG_LATIN10
Definition pg_wchar.h:93
@ PG_WIN1250
Definition pg_wchar.h:105
@ PG_ISO_8859_7
Definition pg_wchar.h:103
@ PG_SJIS
Definition pg_wchar.h:114
@ PG_LATIN8
Definition pg_wchar.h:91
@ PG_EUC_JP
Definition pg_wchar.h:77
@ PG_GBK
Definition pg_wchar.h:116
@ PG_LATIN3
Definition pg_wchar.h:86
@ PG_WIN1256
Definition pg_wchar.h:94
@ PG_LATIN1
Definition pg_wchar.h:84
@ PG_EUC_TW
Definition pg_wchar.h:80
@ PG_WIN1258
Definition pg_wchar.h:95
@ PG_SHIFT_JIS_2004
Definition pg_wchar.h:120
@ PG_WIN1252
Definition pg_wchar.h:100
@ PG_LATIN7
Definition pg_wchar.h:90
@ PG_WIN1255
Definition pg_wchar.h:108
@ PG_WIN1257
Definition pg_wchar.h:109
@ PG_WIN1251
Definition pg_wchar.h:99
@ PG_EUC_KR
Definition pg_wchar.h:79
@ PG_WIN866
Definition pg_wchar.h:96
@ PG_ISO_8859_8
Definition pg_wchar.h:104
@ PG_WIN874
Definition pg_wchar.h:97
@ PG_EUC_JIS_2004
Definition pg_wchar.h:81
@ PG_BIG5
Definition pg_wchar.h:115
static unsigned char * unicode_to_utf8(char32_t c, unsigned char *utf8string)
Definition pg_wchar.h:428
#define SS2
Definition pg_wchar.h:38
#define PG_VALID_ENCODING(_enc)
Definition pg_wchar.h:140
#define ISSJISHEAD(c)
Definition pg_wchar.h:55
#define SS3
Definition pg_wchar.h:39
char * c
static int fb(int x)
unsigned int first
Definition wchar.c:593
unsigned int last
Definition wchar.c:594
mbstr_verifier mbverifystr
Definition pg_wchar.h:238
mblen_converter mblen
Definition pg_wchar.h:235
mbdisplaylen_converter dsplen
Definition pg_wchar.h:236
mbchar_verifier mbverifychar
Definition pg_wchar.h:237
static const struct mbinterval east_asian_fw[]
static const struct mbinterval nonspacing[]
static int pg_uhc_verifystr(const unsigned char *s, int len)
Definition wchar.c:1413
static int pg_latin1_dsplen(const unsigned char *s)
Definition wchar.c:733
int pg_encoding_mblen_bounded(int encoding, const char *mbstr)
Definition wchar.c:1967
static int pg_euctw_mblen(const unsigned char *s)
Definition wchar.c:360
static int pg_euckr_dsplen(const unsigned char *s)
Definition wchar.c:227
static const uint32 Utf8Transition[256]
Definition wchar.c:1616
bool pg_utf8_islegal(const unsigned char *source, int length)
Definition wchar.c:1790
static int pg_ascii_verifystr(const unsigned char *s, int len)
Definition wchar.c:920
static int pg_latin1_verifychar(const unsigned char *s, int len)
Definition wchar.c:1211
static int pg_sjis_dsplen(const unsigned char *s)
Definition wchar.c:756
#define CR3
Definition wchar.c:1612
static int pg_sjis_verifychar(const unsigned char *s, int len)
Definition wchar.c:1228
static int pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:108
static int pg_eucjp_dsplen(const unsigned char *s)
Definition wchar.c:196
static int pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:73
#define L3B
Definition wchar.c:1603
static int pg_uhc_verifychar(const unsigned char *s, int len)
Definition wchar.c:1388
#define L2A
Definition wchar.c:1600
static int pg_gbk_dsplen(const unsigned char *s)
Definition wchar.c:812
static int pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:215
static int pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:317
#define END
Definition wchar.c:1593
#define pg_euccn_verifychar
Definition wchar.c:1075
#define L4C
Definition wchar.c:1608
static int pg_sjis_verifystr(const unsigned char *s, int len)
Definition wchar.c:1251
static int pg_johab_mblen(const unsigned char *s)
Definition wchar.c:444
static int pg_johab_dsplen(const unsigned char *s)
Definition wchar.c:450
static int pg_big5_verifystr(const unsigned char *s, int len)
Definition wchar.c:1305
#define CR2
Definition wchar.c:1611
static int pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:184
static int pg_latin1_verifystr(const unsigned char *s, int len)
Definition wchar.c:1217
static int pg_latin1_mblen(const unsigned char *s)
Definition wchar.c:727
static int pg_ascii_verifychar(const unsigned char *s, int len)
Definition wchar.c:914
static int pg_ascii_mblen(const unsigned char *s)
Definition wchar.c:88
void pg_encoding_set_invalid(int encoding, char *dst)
Definition wchar.c:1852
static int mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
Definition wchar.c:599
static int pg_big5_dsplen(const unsigned char *s)
Definition wchar.c:785
#define pg_euccn_verifystr
Definition wchar.c:1076
int pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr, size_t remaining)
Definition wchar.c:1947
#define NONUTF8_INVALID_BYTE0
Definition wchar.c:36
static int pg_eucjp_mblen(const unsigned char *s)
Definition wchar.c:190
static int pg_gbk_verifychar(const unsigned char *s, int len)
Definition wchar.c:1334
static int pg_big5_mblen(const unsigned char *s)
Definition wchar.c:773
static int pg_euccn_dsplen(const unsigned char *s)
Definition wchar.c:301
static int pg_euctw_verifychar(const unsigned char *s, int len)
Definition wchar.c:1079
static int pg_euckr_verifychar(const unsigned char *s, int len)
Definition wchar.c:1017
static int pg_euctw_verifystr(const unsigned char *s, int len)
Definition wchar.c:1129
static int pg_gbk_verifystr(const unsigned char *s, int len)
Definition wchar.c:1359
static int pg_gb18030_dsplen(const unsigned char *s)
Definition wchar.c:880
#define ERR
Definition wchar.c:1580
static int pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:462
static int pg_euccn_mblen(const unsigned char *s)
Definition wchar.c:285
#define ASC
Definition wchar.c:1598
static int pg_gbk_mblen(const unsigned char *s)
Definition wchar.c:800
static int pg_eucjp_verifystr(const unsigned char *s, int len)
Definition wchar.c:988
static int pg_johab_verifystr(const unsigned char *s, int len)
Definition wchar.c:1182
static int pg_euc_dsplen(const unsigned char *s)
Definition wchar.c:165
static int pg_gb18030_verifystr(const unsigned char *s, int len)
Definition wchar.c:1473
static int pg_euckr_verifystr(const unsigned char *s, int len)
Definition wchar.c:1046
static int pg_sjis_mblen(const unsigned char *s)
Definition wchar.c:742
#define IS_EUC_RANGE_VALID(c)
Definition wchar.c:930
static int pg_uhc_dsplen(const unsigned char *s)
Definition wchar.c:839
static int pg_eucjp_verifychar(const unsigned char *s, int len)
Definition wchar.c:933
static int pg_big5_verifychar(const unsigned char *s, int len)
Definition wchar.c:1280
static int pg_gb18030_verifychar(const unsigned char *s, int len)
Definition wchar.c:1442
static int pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition wchar.c:398
#define L3C
Definition wchar.c:1604
static int pg_utf8_verifychar(const unsigned char *s, int len)
Definition wchar.c:1502
#define MB2CHAR_NEED_AT_LEAST(len, need)
Definition wchar.c:67
#define CR1
Definition wchar.c:1610
static int pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition wchar.c:712
static int pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition wchar.c:525
static int pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:237
static int pg_gb18030_mblen(const unsigned char *s)
Definition wchar.c:866
int pg_encoding_dsplen(int encoding, const char *mbstr)
Definition wchar.c:1976
static void utf8_advance(const unsigned char *s, uint32 *state, int len)
Definition wchar.c:1674
static int pg_euctw_dsplen(const unsigned char *s)
Definition wchar.c:376
static int pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:690
static int pg_uhc_mblen(const unsigned char *s)
Definition wchar.c:827
static int pg_euc_mblen(const unsigned char *s)
Definition wchar.c:149
#define L3A
Definition wchar.c:1602
#define L4B
Definition wchar.c:1607
int pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
Definition wchar.c:2002
#define NONUTF8_INVALID_BYTE1
Definition wchar.c:37
static int pg_utf8_verifystr(const unsigned char *s, int len)
Definition wchar.c:1692
static int pg_euckr_mblen(const unsigned char *s)
Definition wchar.c:221
const pg_wchar_tbl pg_wchar_table[]
Definition wchar.c:1865
#define BGN
Definition wchar.c:1582
int pg_encoding_max_length(int encoding)
Definition wchar.c:2013
int pg_encoding_mblen(int encoding, const char *mbstr)
Definition wchar.c:1935
static int pg_johab_verifychar(const unsigned char *s, int len)
Definition wchar.c:1158
#define ILL
Definition wchar.c:1614
#define STRIDE_LENGTH
#define L4A
Definition wchar.c:1606
static int pg_ascii_dsplen(const unsigned char *s)
Definition wchar.c:94
int pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
Definition wchar.c:1989
static int ucs_wcwidth(pg_wchar ucs)
Definition wchar.c:647
static int pg_utf_dsplen(const unsigned char *s)
Definition wchar.c:681