PostgreSQL Source Code git master
Loading...
Searching...
No Matches
wchar.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * wchar.c
4 * Functions for working with multibyte characters in various encodings.
5 *
6 * Portions Copyright (c) 1998-2026, PostgreSQL Global Development Group
7 *
8 * IDENTIFICATION
9 * src/common/wchar.c
10 *
11 *-------------------------------------------------------------------------
12 */
13#include "c.h"
14
15#include <limits.h>
16
17#include "mb/pg_wchar.h"
18#include "utils/ascii.h"
19
20
21/*
22 * In today's multibyte encodings other than UTF8, this two-byte sequence
23 * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
24 *
25 * For historical reasons, several verifychar implementations opt to reject
26 * this pair specifically. Byte pair range constraints, in encoding
27 * originator documentation, always excluded this pair. No core conversion
28 * could translate it. However, longstanding verifychar implementations
29 * accepted any non-NUL byte. big5_to_euc_tw and big5_to_mic even translate
30 * pairs not valid per encoding originator documentation. To avoid tightening
31 * core or non-core conversions in a security patch, we sought this one pair.
32 *
33 * PQescapeString() historically used spaces for BYTE1; many other values
34 * could suffice for BYTE1.
35 */
/* Lead byte of the reserved, always-rejected two-byte pair described above. */
#define NONUTF8_INVALID_BYTE0	0x8d
/* Trail byte of that pair: a plain ASCII space. */
#define NONUTF8_INVALID_BYTE1	' '
38
39
40/*
41 * Operations on multi-byte encodings are driven by a table of helper
42 * functions.
43 *
44 * To add support for an encoding, define mblen(), dsplen(), verifychar() and
45 * verifystr() for the encoding. For server-encodings, also define mb2wchar()
46 * and wchar2mb() conversion functions.
47 *
48 * These functions generally assume that their input is validly formed.
49 * The "verifier" functions, further down in the file, have to be more
50 * paranoid.
51 *
52 * We expect that mblen() does not need to examine more than the first byte
53 * of the character to discover the correct length. GB18030 is an exception
54 * to that rule, though, as it also looks at second byte. But even that
55 * behaves in a predictable way, if you only pass the first byte: it will
56 * treat 4-byte encoded characters as two 2-byte encoded characters, which is
57 * good enough for all current uses.
58 *
59 * Note: for the display output of psql to work properly, the return values
60 * of the dsplen functions must conform to the Unicode standard. In particular
61 * the NUL character is zero width and control characters are generally
62 * width -1. It is recommended that non-ASCII encodings refer their ASCII
63 * subset to the ASCII routines to ensure consistency.
64 */
65
/*
 * No error-reporting facility.  Ignore incomplete trailing byte sequence:
 * leave the enclosing loop when fewer than "need" bytes remain in "len".
 *
 * The expansion contains a bare "break", so the macro must be used directly
 * inside the loop it is meant to terminate.  The trailing "else" arm turns
 * the expansion into a complete if/else statement, so using the macro as the
 * body of an unbraced "if ... else" cannot capture the caller's "else".
 */
#define MB2CHAR_NEED_AT_LEAST(len, need) \
	if ((len) < (need)) break; else ((void) 0)
68
69/*
70 * SQL/ASCII
71 */
72static int
73pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
74{
75 int cnt = 0;
76
77 while (len > 0 && *from)
78 {
79 *to++ = *from++;
80 len--;
81 cnt++;
82 }
83 *to = 0;
84 return cnt;
85}
86
/* Byte length of the character at "s": always one; "s" is not examined. */
static int
pg_ascii_mblen(const unsigned char *s)
{
	(void) s;
	return 1;
}
92
/*
 * Display width of the single byte at "s": 0 for NUL, -1 for bytes below
 * 0x20 and for 0x7f, and 1 for every other byte value.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
	const unsigned char ch = *s;

	if (ch == '\0')
		return 0;

	return (ch >= 0x20 && ch != 0x7f) ? 1 : -1;
}
103
104/*
105 * EUC
106 */
107static int
108pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
109{
110 int cnt = 0;
111
112 while (len > 0 && *from)
113 {
114 if (*from == SS2) /* JIS X 0201 (so called "1 byte KANA") */
115 {
117 from++;
118 *to = (SS2 << 8) | *from++;
119 len -= 2;
120 }
121 else if (*from == SS3) /* JIS X 0212 KANJI */
122 {
124 from++;
125 *to = (SS3 << 16) | (*from++ << 8);
126 *to |= *from++;
127 len -= 3;
128 }
129 else if (IS_HIGHBIT_SET(*from)) /* JIS X 0208 KANJI */
130 {
132 *to = *from++ << 8;
133 *to |= *from++;
134 len -= 2;
135 }
136 else /* must be ASCII */
137 {
138 *to = *from++;
139 len--;
140 }
141 to++;
142 cnt++;
143 }
144 *to = 0;
145 return cnt;
146}
147
148static inline int
149pg_euc_mblen(const unsigned char *s)
150{
151 int len;
152
153 if (*s == SS2)
154 len = 2;
155 else if (*s == SS3)
156 len = 3;
157 else if (IS_HIGHBIT_SET(*s))
158 len = 2;
159 else
160 len = 1;
161 return len;
162}
163
164static inline int
165pg_euc_dsplen(const unsigned char *s)
166{
167 int len;
168
169 if (*s == SS2)
170 len = 2;
171 else if (*s == SS3)
172 len = 2;
173 else if (IS_HIGHBIT_SET(*s))
174 len = 2;
175 else
176 len = pg_ascii_dsplen(s);
177 return len;
178}
179
180/*
181 * EUC_JP
182 */
183static int
184pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
185{
186 return pg_euc2wchar_with_len(from, to, len);
187}
188
/* EUC_JP byte length: same rule as the generic EUC helper. */
static int
pg_eucjp_mblen(const unsigned char *s)
{
	const int	nbytes = pg_euc_mblen(s);

	return nbytes;
}
194
195static int
196pg_eucjp_dsplen(const unsigned char *s)
197{
198 int len;
199
200 if (*s == SS2)
201 len = 1;
202 else if (*s == SS3)
203 len = 2;
204 else if (IS_HIGHBIT_SET(*s))
205 len = 2;
206 else
207 len = pg_ascii_dsplen(s);
208 return len;
209}
210
211/*
212 * EUC_KR
213 */
214static int
215pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
216{
217 return pg_euc2wchar_with_len(from, to, len);
218}
219
/* EUC_KR byte length: same rule as the generic EUC helper. */
static int
pg_euckr_mblen(const unsigned char *s)
{
	const int	nbytes = pg_euc_mblen(s);

	return nbytes;
}
225
/* EUC_KR display width: same rule as the generic EUC helper. */
static int
pg_euckr_dsplen(const unsigned char *s)
{
	const int	width = pg_euc_dsplen(s);

	return width;
}
231
232/*
233 * EUC_CN
234 *
235 */
236static int
237pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
238{
239 int cnt = 0;
240
241 while (len > 0 && *from)
242 {
243 if (*from == SS2) /* code set 2 (unused?) */
244 {
246 from++;
247 *to = (SS2 << 16) | (*from++ << 8);
248 *to |= *from++;
249 len -= 3;
250 }
251 else if (*from == SS3) /* code set 3 (unused ?) */
252 {
254 from++;
255 *to = (SS3 << 16) | (*from++ << 8);
256 *to |= *from++;
257 len -= 3;
258 }
259 else if (IS_HIGHBIT_SET(*from)) /* code set 1 */
260 {
262 *to = *from++ << 8;
263 *to |= *from++;
264 len -= 2;
265 }
266 else
267 {
268 *to = *from++;
269 len--;
270 }
271 to++;
272 cnt++;
273 }
274 *to = 0;
275 return cnt;
276}
277
278/*
279 * mbverifychar does not accept SS2 or SS3 (CS2 and CS3 are not defined for
280 * EUC_CN), but mb2wchar_with_len does. Tell a coherent story for code that
281 * relies on agreement between mb2wchar_with_len and mblen. Invalid text
282 * datums (e.g. from shared catalogs) reach this.
283 */
284static int
285pg_euccn_mblen(const unsigned char *s)
286{
287 int len;
288
289 if (*s == SS2)
290 len = 3;
291 else if (*s == SS3)
292 len = 3;
293 else if (IS_HIGHBIT_SET(*s))
294 len = 2;
295 else
296 len = 1;
297 return len;
298}
299
/*
 * Display width of the EUC_CN character at "s": 2 for any high-bit lead
 * byte, otherwise the result of pg_ascii_dsplen().
 */
static int
pg_euccn_dsplen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
311
312/*
313 * EUC_TW
314 *
315 */
316static int
317pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
318{
319 int cnt = 0;
320
321 while (len > 0 && *from)
322 {
323 if (*from == SS2) /* code set 2 */
324 {
326 from++;
327 *to = (((uint32) SS2) << 24) | (*from++ << 16);
328 *to |= *from++ << 8;
329 *to |= *from++;
330 len -= 4;
331 }
332 else if (*from == SS3) /* code set 3 (unused?) */
333 {
335 from++;
336 *to = (SS3 << 16) | (*from++ << 8);
337 *to |= *from++;
338 len -= 3;
339 }
340 else if (IS_HIGHBIT_SET(*from)) /* code set 2 */
341 {
343 *to = *from++ << 8;
344 *to |= *from++;
345 len -= 2;
346 }
347 else
348 {
349 *to = *from++;
350 len--;
351 }
352 to++;
353 cnt++;
354 }
355 *to = 0;
356 return cnt;
357}
358
359static int
360pg_euctw_mblen(const unsigned char *s)
361{
362 int len;
363
364 if (*s == SS2)
365 len = 4;
366 else if (*s == SS3)
367 len = 3;
368 else if (IS_HIGHBIT_SET(*s))
369 len = 2;
370 else
371 len = 1;
372 return len;
373}
374
375static int
376pg_euctw_dsplen(const unsigned char *s)
377{
378 int len;
379
380 if (*s == SS2)
381 len = 2;
382 else if (*s == SS3)
383 len = 2;
384 else if (IS_HIGHBIT_SET(*s))
385 len = 2;
386 else
387 len = pg_ascii_dsplen(s);
388 return len;
389}
390
391/*
392 * Convert pg_wchar to EUC_* encoding.
393 * caller must allocate enough space for "to", including a trailing zero!
394 * len: length of from.
395 * "from" not necessarily null terminated.
396 */
397static int
398pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
399{
400 int cnt = 0;
401
402 while (len > 0 && *from)
403 {
404 unsigned char c;
405
406 if ((c = (*from >> 24)))
407 {
408 *to++ = c;
409 *to++ = (*from >> 16) & 0xff;
410 *to++ = (*from >> 8) & 0xff;
411 *to++ = *from & 0xff;
412 cnt += 4;
413 }
414 else if ((c = (*from >> 16)))
415 {
416 *to++ = c;
417 *to++ = (*from >> 8) & 0xff;
418 *to++ = *from & 0xff;
419 cnt += 3;
420 }
421 else if ((c = (*from >> 8)))
422 {
423 *to++ = c;
424 *to++ = *from & 0xff;
425 cnt += 2;
426 }
427 else
428 {
429 *to++ = *from;
430 cnt++;
431 }
432 from++;
433 len--;
434 }
435 *to = 0;
436 return cnt;
437}
438
439
440/*
441 * JOHAB
442 */
/* JOHAB byte length: computed with the generic EUC lead-byte rule. */
static int
pg_johab_mblen(const unsigned char *s)
{
	const int	nbytes = pg_euc_mblen(s);

	return nbytes;
}
448
/* JOHAB display width: computed with the generic EUC rule. */
static int
pg_johab_dsplen(const unsigned char *s)
{
	const int	width = pg_euc_dsplen(s);

	return width;
}
454
455/*
456 * convert UTF8 string to pg_wchar (UCS-4)
457 * caller must allocate enough space for "to", including a trailing zero!
458 * len: length of from.
459 * "from" not necessarily null terminated.
460 */
461static int
462pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
463{
464 int cnt = 0;
465 uint32 c1,
466 c2,
467 c3,
468 c4;
469
470 while (len > 0 && *from)
471 {
472 if ((*from & 0x80) == 0)
473 {
474 *to = *from++;
475 len--;
476 }
477 else if ((*from & 0xe0) == 0xc0)
478 {
480 c1 = *from++ & 0x1f;
481 c2 = *from++ & 0x3f;
482 *to = (c1 << 6) | c2;
483 len -= 2;
484 }
485 else if ((*from & 0xf0) == 0xe0)
486 {
488 c1 = *from++ & 0x0f;
489 c2 = *from++ & 0x3f;
490 c3 = *from++ & 0x3f;
491 *to = (c1 << 12) | (c2 << 6) | c3;
492 len -= 3;
493 }
494 else if ((*from & 0xf8) == 0xf0)
495 {
497 c1 = *from++ & 0x07;
498 c2 = *from++ & 0x3f;
499 c3 = *from++ & 0x3f;
500 c4 = *from++ & 0x3f;
501 *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
502 len -= 4;
503 }
504 else
505 {
506 /* treat a bogus char as length 1; not ours to raise error */
507 *to = *from++;
508 len--;
509 }
510 to++;
511 cnt++;
512 }
513 *to = 0;
514 return cnt;
515}
516
517
518/*
519 * Trivial conversion from pg_wchar to UTF-8.
520 * caller should allocate enough space for "to"
521 * len: length of from.
522 * "from" not necessarily null terminated.
523 */
524static int
525pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
526{
527 int cnt = 0;
528
529 while (len > 0 && *from)
530 {
531 int char_len;
532
533 unicode_to_utf8(*from, to);
535 cnt += char_len;
536 to += char_len;
537 from++;
538 len--;
539 }
540 *to = 0;
541 return cnt;
542}
543
/*
 * Return the byte length of the UTF8 character whose lead byte is *s.
 *
 * Sequences longer than 4 bytes are not supported, so the result never
 * exceeds 4 (unless NOT_USED is defined).  A lead byte that is flat-out
 * illegal, or that announces a longer sequence than we support, yields 1.
 *
 * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
 * other places would need to be fixed to change this.
 */
int
pg_utf_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if ((lead & 0x80) == 0)
		return 1;				/* 0xxxxxxx */
	if ((lead & 0xe0) == 0xc0)
		return 2;				/* 110xxxxx */
	if ((lead & 0xf0) == 0xe0)
		return 3;				/* 1110xxxx */
	if ((lead & 0xf8) == 0xf0)
		return 4;				/* 11110xxx */
#ifdef NOT_USED
	if ((lead & 0xfc) == 0xf8)
		return 5;
	if ((lead & 0xfe) == 0xfc)
		return 6;
#endif

	/* stray continuation byte or unsupported lead byte */
	return 1;
}
578
579/*
580 * This is an implementation of wcwidth() and wcswidth() as defined in
581 * "The Single UNIX Specification, Version 2, The Open Group, 1997"
582 * <http://www.unix.org/online.html>
583 *
584 * Markus Kuhn -- 2001-09-08 -- public domain
585 *
586 * customised for PostgreSQL
587 *
588 * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
589 */
590
/*
 * One closed range [first, last] of code points; mbbisearch() searches
 * arrays of these.
 */
struct mbinterval
{
	unsigned int first;			/* lowest code point in the range */
	unsigned int last;			/* highest code point in the range */
};
596
597/* auxiliary function for binary search in interval table */
598static int
599mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
600{
601 int min = 0;
602 int mid;
603
604 if (ucs < table[0].first || ucs > table[max].last)
605 return 0;
606 while (max >= min)
607 {
608 mid = (min + max) / 2;
609 if (ucs > table[mid].last)
610 min = mid + 1;
611 else if (ucs < table[mid].first)
612 max = mid - 1;
613 else
614 return 1;
615 }
616
617 return 0;
618}
619
620
621/* The following functions define the column width of an ISO 10646
622 * character as follows:
623 *
624 * - The null character (U+0000) has a column width of 0.
625 *
626 * - Other C0/C1 control characters and DEL will lead to a return
627 * value of -1.
628 *
629 * - Non-spacing and enclosing combining characters (general
630 * category code Mn, Me or Cf in the Unicode database) have a
631 * column width of 0.
632 *
633 * - Spacing characters in the East Asian Wide (W) or East Asian
634 * FullWidth (F) category as defined in Unicode Technical
635 * Report #11 have a column width of 2.
636 *
637 * - All remaining characters (including all printable
638 * ISO 8859-1 and WGL4 characters, Unicode control characters,
639 * etc.) have a column width of 1.
640 *
641 * This implementation assumes that wchar_t characters are encoded
642 * in ISO 10646.
643 */
644
645static int
647{
650
651 /* test for 8-bit control characters */
652 if (ucs == 0)
653 return 0;
654
655 if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
656 return -1;
657
658 /*
659 * binary search in table of non-spacing characters
660 *
661 * XXX: In the official Unicode sources, it is possible for a character to
662 * be described as both non-spacing and wide at the same time. As of
663 * Unicode 13.0, treating the non-spacing property as the determining
664 * factor for display width leads to the correct behavior, so do that
665 * search first.
666 */
668 sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
669 return 0;
670
671 /* binary search in table of wide characters */
673 sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
674 return 2;
675
676 return 1;
677}
678
679static int
680pg_utf_dsplen(const unsigned char *s)
681{
682 return ucs_wcwidth(utf8_to_unicode(s));
683}
684
685/*
686 * ISO8859-1
687 */
688static int
689pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
690{
691 int cnt = 0;
692
693 while (len > 0 && *from)
694 {
695 *to++ = *from++;
696 len--;
697 cnt++;
698 }
699 *to = 0;
700 return cnt;
701}
702
703/*
704 * Trivial conversion from pg_wchar to single byte encoding. Just ignores
705 * high bits.
706 * caller should allocate enough space for "to"
707 * len: length of from.
708 * "from" not necessarily null terminated.
709 */
710static int
711pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
712{
713 int cnt = 0;
714
715 while (len > 0 && *from)
716 {
717 *to++ = *from++;
718 len--;
719 cnt++;
720 }
721 *to = 0;
722 return cnt;
723}
724
/* Byte length of the character at "s": always one; "s" is not examined. */
static int
pg_latin1_mblen(const unsigned char *s)
{
	(void) s;
	return 1;
}
730
/* Display width for single-byte encodings: same rule as pg_ascii_dsplen(). */
static int
pg_latin1_dsplen(const unsigned char *s)
{
	const int	width = pg_ascii_dsplen(s);

	return width;
}
736
737/*
738 * SJIS
739 */
/*
 * Byte length of the SJIS character whose lead byte is *s: bytes in
 * 0xa1..0xdf (1 byte kana?) and bytes without the high bit (should be ASCII)
 * are single-byte; any other high-bit byte starts a 2-byte character
 * (kanji?).
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
	const int	is_kana = (*s >= 0xa1 && *s <= 0xdf);

	if (!is_kana && IS_HIGHBIT_SET(*s))
		return 2;

	return 1;
}
753
/*
 * Display width of the SJIS character at "s": 1 for bytes in 0xa1..0xdf
 * (1 byte kana?), 2 for any other high-bit lead byte (kanji?), otherwise the
 * result of pg_ascii_dsplen().
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
	if (*s >= 0xa1 && *s <= 0xdf)
		return 1;
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* should be ASCII */
}
767
768/*
769 * Big5
770 */
/* Big5 byte length: 2 when the lead byte has its high bit set, else 1. */
static int
pg_big5_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
782
/*
 * Big5 display width: 2 for a high-bit lead byte, otherwise the result of
 * pg_ascii_dsplen().
 */
static int
pg_big5_dsplen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
794
795/*
796 * GBK
797 */
/* GBK byte length: 2 when the lead byte has its high bit set, else 1. */
static int
pg_gbk_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
809
/*
 * GBK display width: 2 for a high-bit lead byte, otherwise the result of
 * pg_ascii_dsplen().
 */
static int
pg_gbk_dsplen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
821
822/*
823 * UHC
824 */
/* UHC byte length: 2 when the lead byte has its high bit set, else 1. */
static int
pg_uhc_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
836
/*
 * UHC display width: 2 for a high-bit lead byte, otherwise the result of
 * pg_ascii_dsplen().
 */
static int
pg_uhc_dsplen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
848
849/*
850 * GB18030
851 * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
852 */
853
854/*
855 * Unlike all other mblen() functions, this also looks at the second byte of
856 * the input. However, if you only pass the first byte of a multi-byte
857 * string, and \0 as the second byte, this still works in a predictable way:
858 * a 4-byte character will be reported as two 2-byte characters. That's
859 * enough for all current uses, as a client-only encoding. It works that
860 * way, because in any valid 4-byte GB18030-encoded character, the third and
861 * fourth byte look like a 2-byte encoded character, when looked at
862 * separately.
863 */
/*
 * Byte length of the GB18030 character at "s".  A lead byte without the high
 * bit is ASCII (length 1).  Otherwise the second byte is inspected: a value
 * in 0x30..0x39 means a 4-byte character, anything else a 2-byte one.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	unsigned char second;

	if (!IS_HIGHBIT_SET(*s))
		return 1;				/* ASCII */

	second = s[1];
	return (second >= 0x30 && second <= 0x39) ? 4 : 2;
}
877
/*
 * GB18030 display width: 2 for a high-bit lead byte, otherwise the result of
 * pg_ascii_dsplen().
 */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
889
890/*
891 *-------------------------------------------------------------------
892 * multibyte sequence validators
893 *
894 * The verifychar functions accept "s", a pointer to the first byte of a
895 * string, and "len", the remaining length of the string. If there is a
896 * validly encoded character beginning at *s, return its length in bytes;
897 * else return -1.
898 *
899 * The verifystr functions also accept "s", a pointer to a string and "len",
900 * the length of the string. They verify the whole string, and return the
901 * number of input bytes (<= len) that are valid. In other words, if the
902 * whole string is valid, verifystr returns "len", otherwise it returns the
903 * byte offset of the first invalid character. The verifystr functions must
904 * test for and reject zeroes in the input.
905 *
906 * The verifychar functions can assume that len > 0 and that *s != '\0', but
907 * they must test for and reject zeroes in any additional bytes of a
908 * multibyte character. Note that this definition allows the function for a
909 * single-byte encoding to be just "return 1".
910 *-------------------------------------------------------------------
911 */
/*
 * Single-byte verifier: per the verifychar contract the caller guarantees
 * len > 0 and *s != '\0', so every character is valid and one byte long.
 */
static int
pg_ascii_verifychar(const unsigned char *s, int len)
{
	(void) s;
	(void) len;
	return 1;
}
917
/*
 * Verify "len" bytes at "s".  Only a zero byte is invalid, so the result is
 * the offset of the first zero byte, or "len" if there is none.
 */
static int
pg_ascii_verifystr(const unsigned char *s, int len)
{
	const unsigned char *nul = memchr(s, 0, len);

	return (nul != NULL) ? (int) (nul - s) : len;
}
928
/*
 * True (1) when byte value "c" lies in 0xa1..0xfe, else 0.  Note that the
 * argument is evaluated twice.
 */
#define IS_EUC_RANGE_VALID(c)	(!((c) < 0xa1 || (c) > 0xfe))
930
931static int
932pg_eucjp_verifychar(const unsigned char *s, int len)
933{
934 int l;
935 unsigned char c1,
936 c2;
937
938 c1 = *s++;
939
940 switch (c1)
941 {
942 case SS2: /* JIS X 0201 */
943 l = 2;
944 if (l > len)
945 return -1;
946 c2 = *s++;
947 if (c2 < 0xa1 || c2 > 0xdf)
948 return -1;
949 break;
950
951 case SS3: /* JIS X 0212 */
952 l = 3;
953 if (l > len)
954 return -1;
955 c2 = *s++;
957 return -1;
958 c2 = *s++;
960 return -1;
961 break;
962
963 default:
964 if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
965 {
966 l = 2;
967 if (l > len)
968 return -1;
970 return -1;
971 c2 = *s++;
973 return -1;
974 }
975 else
976 /* must be ASCII */
977 {
978 l = 1;
979 }
980 break;
981 }
982
983 return l;
984}
985
/*
 * Verify "len" bytes of EUC_JP text at "s".  Returns the number of leading
 * bytes that are valid: "len" if everything is, otherwise the offset of the
 * first invalid character (a zero byte counts as invalid).
 */
static int
pg_eucjp_verifystr(const unsigned char *s, int len)
{
	int			done = 0;		/* bytes verified so far */

	while (done < len)
	{
		const unsigned char *p = s + done;
		int			chlen;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_eucjp_verifychar(p, len - done);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			chlen = 1;			/* fast path for ASCII-subset characters */

		done += chlen;
	}

	return done;
}
1014
/*
 * Verify the single EUC_KR character starting at "s", with "len" bytes
 * available.  A lead byte without the high bit is a 1-byte character;
 * otherwise both bytes of the pair must satisfy IS_EUC_RANGE_VALID().
 * Returns the byte length, or -1 if the character is invalid or truncated.
 */
static int
pg_euckr_verifychar(const unsigned char *s, int len)
{
	/* must be ASCII */
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;

	if (len < 2)
		return -1;

	if (!IS_EUC_RANGE_VALID(s[0]) || !IS_EUC_RANGE_VALID(s[1]))
		return -1;

	return 2;
}
1043
/*
 * Verify "len" bytes of EUC_KR text at "s".  Returns the number of leading
 * bytes that are valid: "len" if everything is, otherwise the offset of the
 * first invalid character (a zero byte counts as invalid).
 */
static int
pg_euckr_verifystr(const unsigned char *s, int len)
{
	int			done = 0;		/* bytes verified so far */

	while (done < len)
	{
		const unsigned char *p = s + done;
		int			chlen;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_euckr_verifychar(p, len - done);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			chlen = 1;			/* fast path for ASCII-subset characters */

		done += chlen;
	}

	return done;
}
1072
/*
 * EUC-CN byte sequences are exactly the same as EUC-KR ones, so the EUC_CN
 * verifiers are plain aliases for the EUC_KR implementations.
 */
#define pg_euccn_verifychar		pg_euckr_verifychar
#define pg_euccn_verifystr		pg_euckr_verifystr
1076
1077static int
1078pg_euctw_verifychar(const unsigned char *s, int len)
1079{
1080 int l;
1081 unsigned char c1,
1082 c2;
1083
1084 c1 = *s++;
1085
1086 switch (c1)
1087 {
1088 case SS2: /* CNS 11643 Plane 1-7 */
1089 l = 4;
1090 if (l > len)
1091 return -1;
1092 c2 = *s++;
1093 if (c2 < 0xa1 || c2 > 0xa7)
1094 return -1;
1095 c2 = *s++;
1096 if (!IS_EUC_RANGE_VALID(c2))
1097 return -1;
1098 c2 = *s++;
1099 if (!IS_EUC_RANGE_VALID(c2))
1100 return -1;
1101 break;
1102
1103 case SS3: /* unused */
1104 return -1;
1105
1106 default:
1107 if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1108 {
1109 l = 2;
1110 if (l > len)
1111 return -1;
1112 /* no further range check on c1? */
1113 c2 = *s++;
1114 if (!IS_EUC_RANGE_VALID(c2))
1115 return -1;
1116 }
1117 else
1118 /* must be ASCII */
1119 {
1120 l = 1;
1121 }
1122 break;
1123 }
1124 return l;
1125}
1126
/*
 * Verify "len" bytes of EUC_TW text at "s".  Returns the number of leading
 * bytes that are valid: "len" if everything is, otherwise the offset of the
 * first invalid character (a zero byte counts as invalid).
 */
static int
pg_euctw_verifystr(const unsigned char *s, int len)
{
	int			done = 0;		/* bytes verified so far */

	while (done < len)
	{
		const unsigned char *p = s + done;
		int			chlen;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_euctw_verifychar(p, len - done);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			chlen = 1;			/* fast path for ASCII-subset characters */

		done += chlen;
	}

	return done;
}
1155
/*
 * Verify the single JOHAB character starting at "s", with "len" bytes
 * available.  The length comes from pg_johab_mblen(); for a high-bit lead
 * byte every following byte must satisfy IS_EUC_RANGE_VALID().
 * Returns the byte length, or -1 if the character is invalid or truncated.
 */
static int
pg_johab_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_johab_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	/* a lead byte without the high bit needs no further checks */
	if (!IS_HIGHBIT_SET(*s))
		return mbl;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_EUC_RANGE_VALID(s[i]))
			return -1;
	}

	return mbl;
}
1179
/*
 * Verify "len" bytes of JOHAB text at "s".  Returns the number of leading
 * bytes that are valid: "len" if everything is, otherwise the offset of the
 * first invalid character (a zero byte counts as invalid).
 */
static int
pg_johab_verifystr(const unsigned char *s, int len)
{
	int			done = 0;		/* bytes verified so far */

	while (done < len)
	{
		const unsigned char *p = s + done;
		int			chlen;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_johab_verifychar(p, len - done);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			chlen = 1;			/* fast path for ASCII-subset characters */

		done += chlen;
	}

	return done;
}
1208
/*
 * Single-byte verifier: per the verifychar contract the caller guarantees
 * len > 0 and *s != '\0', so every character is valid and one byte long.
 */
static int
pg_latin1_verifychar(const unsigned char *s, int len)
{
	(void) s;
	(void) len;
	return 1;
}
1214
/*
 * Verify "len" bytes at "s".  Only a zero byte is invalid, so the result is
 * the offset of the first zero byte, or "len" if there is none.
 */
static int
pg_latin1_verifystr(const unsigned char *s, int len)
{
	const unsigned char *nul = memchr(s, 0, len);

	return (nul != NULL) ? (int) (nul - s) : len;
}
1225
/*
 * Verify the single SJIS character starting at "s", with "len" bytes
 * available.  One-byte characters are accepted on the strength of
 * pg_sjis_mblen() alone; a two-byte character needs a lead byte passing
 * ISSJISHEAD() and a trail byte passing ISSJISTAIL().
 * Returns the byte length, or -1 if the character is invalid or truncated.
 */
static int
pg_sjis_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_sjis_mblen(s);
	unsigned char lead,
				trail;

	if (len < mbl)
		return -1;

	if (mbl == 1)				/* pg_sjis_mblen already verified it */
		return mbl;

	lead = s[0];
	trail = s[1];
	if (ISSJISHEAD(lead) && ISSJISTAIL(trail))
		return mbl;

	return -1;
}
1248
/*
 * Verify "len" bytes of SJIS text at "s".  Returns the number of leading
 * bytes that are valid: "len" if everything is, otherwise the offset of the
 * first invalid character (a zero byte counts as invalid).
 */
static int
pg_sjis_verifystr(const unsigned char *s, int len)
{
	int			done = 0;		/* bytes verified so far */

	while (done < len)
	{
		const unsigned char *p = s + done;
		int			chlen;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_sjis_verifychar(p, len - done);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			chlen = 1;			/* fast path for ASCII-subset characters */

		done += chlen;
	}

	return done;
}
1277
1278static int
1279pg_big5_verifychar(const unsigned char *s, int len)
1280{
1281 int l,
1282 mbl;
1283
1284 l = mbl = pg_big5_mblen(s);
1285
1286 if (len < l)
1287 return -1;
1288
1289 if (l == 2 &&
1290 s[0] == NONUTF8_INVALID_BYTE0 &&
1291 s[1] == NONUTF8_INVALID_BYTE1)
1292 return -1;
1293
1294 while (--l > 0)
1295 {
1296 if (*++s == '\0')
1297 return -1;
1298 }
1299
1300 return mbl;
1301}
1302
/*
 * Verify "len" bytes of Big5 text at "s".  Returns the number of leading
 * bytes that are valid: "len" if everything is, otherwise the offset of the
 * first invalid character (a zero byte counts as invalid).
 */
static int
pg_big5_verifystr(const unsigned char *s, int len)
{
	int			done = 0;		/* bytes verified so far */

	while (done < len)
	{
		const unsigned char *p = s + done;
		int			chlen;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_big5_verifychar(p, len - done);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			chlen = 1;			/* fast path for ASCII-subset characters */

		done += chlen;
	}

	return done;
}
1331
1332static int
1333pg_gbk_verifychar(const unsigned char *s, int len)
1334{
1335 int l,
1336 mbl;
1337
1338 l = mbl = pg_gbk_mblen(s);
1339
1340 if (len < l)
1341 return -1;
1342
1343 if (l == 2 &&
1344 s[0] == NONUTF8_INVALID_BYTE0 &&
1345 s[1] == NONUTF8_INVALID_BYTE1)
1346 return -1;
1347
1348 while (--l > 0)
1349 {
1350 if (*++s == '\0')
1351 return -1;
1352 }
1353
1354 return mbl;
1355}
1356
/*
 * Verify "len" bytes of GBK text at "s".  Returns the number of leading
 * bytes that are valid: "len" if everything is, otherwise the offset of the
 * first invalid character (a zero byte counts as invalid).
 */
static int
pg_gbk_verifystr(const unsigned char *s, int len)
{
	int			done = 0;		/* bytes verified so far */

	while (done < len)
	{
		const unsigned char *p = s + done;
		int			chlen;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_gbk_verifychar(p, len - done);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			chlen = 1;			/* fast path for ASCII-subset characters */

		done += chlen;
	}

	return done;
}
1385
1386static int
1387pg_uhc_verifychar(const unsigned char *s, int len)
1388{
1389 int l,
1390 mbl;
1391
1392 l = mbl = pg_uhc_mblen(s);
1393
1394 if (len < l)
1395 return -1;
1396
1397 if (l == 2 &&
1398 s[0] == NONUTF8_INVALID_BYTE0 &&
1399 s[1] == NONUTF8_INVALID_BYTE1)
1400 return -1;
1401
1402 while (--l > 0)
1403 {
1404 if (*++s == '\0')
1405 return -1;
1406 }
1407
1408 return mbl;
1409}
1410
/*
 * Verify "len" bytes of UHC text at "s".  Returns the number of leading
 * bytes that are valid: "len" if everything is, otherwise the offset of the
 * first invalid character (a zero byte counts as invalid).
 */
static int
pg_uhc_verifystr(const unsigned char *s, int len)
{
	int			done = 0;		/* bytes verified so far */

	while (done < len)
	{
		const unsigned char *p = s + done;
		int			chlen;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_uhc_verifychar(p, len - done);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			chlen = 1;			/* fast path for ASCII-subset characters */

		done += chlen;
	}

	return done;
}
1439
/*
 * Verify the single GB18030 character starting at "s", with "len" bytes
 * available.  Returns 1, 2 or 4 for a valid character, else -1.
 *
 * The second byte is examined only after confirming that enough input
 * remains, so nothing beyond "len" is read.
 */
static int
pg_gb18030_verifychar(const unsigned char *s, int len)
{
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;				/* ASCII */

	if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
	{
		/* Should be 4-byte, validate remaining bytes */
		if (s[0] < 0x81 || s[0] > 0xfe)
			return -1;
		if (s[2] < 0x81 || s[2] > 0xfe)
			return -1;
		if (s[3] < 0x30 || s[3] > 0x39)
			return -1;
		return 4;
	}

	if (len >= 2 && s[0] >= 0x81 && s[0] <= 0xfe)
	{
		/* Should be 2-byte, validate */
		const unsigned char trail = s[1];

		if (trail >= 0x40 && trail <= 0x7e)
			return 2;
		if (trail >= 0x80 && trail <= 0xfe)
			return 2;
		return -1;
	}

	return -1;
}
1470
/*
 * Verify "len" bytes of GB18030 text at "s".  Returns the number of leading
 * bytes that are valid: "len" if everything is, otherwise the offset of the
 * first invalid character (a zero byte counts as invalid).
 */
static int
pg_gb18030_verifystr(const unsigned char *s, int len)
{
	int			done = 0;		/* bytes verified so far */

	while (done < len)
	{
		const unsigned char *p = s + done;
		int			chlen;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_gb18030_verifychar(p, len - done);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			chlen = 1;			/* fast path for ASCII-subset characters */

		done += chlen;
	}

	return done;
}
1499
/*
 * Verify the single UTF8 character starting at "s", with "len" bytes
 * available.  Returns its byte length, or -1 if it is a zero byte, is
 * truncated, or is rejected by pg_utf8_islegal().
 */
static int
pg_utf8_verifychar(const unsigned char *s, int len)
{
	const unsigned char lead = *s;
	int			l;

	if (lead == '\0')
		return -1;
	if ((lead & 0x80) == 0)
		return 1;				/* plain ASCII needs no further checks */

	if ((lead & 0xe0) == 0xc0)
		l = 2;
	else if ((lead & 0xf0) == 0xe0)
		l = 3;
	else if ((lead & 0xf8) == 0xf0)
		l = 4;
	else
		l = 1;					/* bogus lead byte; pg_utf8_islegal decides */

	/* only consult pg_utf8_islegal() when all "l" bytes are available */
	return (l <= len && pg_utf8_islegal(s, l)) ? l : -1;
}
1528
1529/*
1530 * The fast path of the UTF-8 verifier uses a deterministic finite automaton
1531 * (DFA) for multibyte characters. In a traditional table-driven DFA, the
1532 * input byte and current state are used to compute an index into an array of
1533 * state transitions. Since the address of the next transition is dependent
1534 * on this computation, there is latency in executing the load instruction,
1535 * and the CPU is not kept busy.
1536 *
1537 * Instead, we use a "shift-based" DFA as described by Per Vognsen:
1538 *
1539 * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
1540 *
1541 * In a shift-based DFA, the input byte is an index into an array of integers
1542 * whose bit pattern encodes the state transitions. To compute the next
1543 * state, we simply right-shift the integer by the current state and apply a
1544 * mask. In this scheme, the address of the transition only depends on the
1545 * input byte, so there is better pipelining.
1546 *
1547 * The naming convention for states and transitions was adopted from a UTF-8
1548 * to UTF-16/32 transcoder, whose table is reproduced below:
1549 *
1550 * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
1551 *
1552 * ILL ASC CR1 CR2 CR3 L2A L3A L3B L3C L4A L4B L4C CLASS / STATE
1553 * ==========================================================================
1554 * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B, | BGN/END
1555 * err, err, err, err, err, err, err, err, err, err, err, err, | ERR
1556 * |
1557 * err, err, END, END, END, err, err, err, err, err, err, err, | CS1
1558 * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err, | CS2
1559 * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err, | CS3
1560 * |
1561 * err, err, err, err, CS1, err, err, err, err, err, err, err, | P3A
1562 * err, err, CS1, CS1, err, err, err, err, err, err, err, err, | P3B
1563 * |
1564 * err, err, err, CS2, CS2, err, err, err, err, err, err, err, | P4A
1565 * err, err, CS2, err, err, err, err, err, err, err, err, err, | P4B
1566 *
1567 * In the most straightforward implementation, a shift-based DFA for UTF-8
1568 * requires 64-bit integers to encode the transitions, but with an SMT solver
1569 * it's possible to find state numbers such that the transitions fit within
1570 * 32-bit integers, as Dougall Johnson demonstrated:
1571 *
1572 * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
1573 *
1574 * This packed representation is the reason for the seemingly odd choice of
1575 * state values below.
1576 */
1577
/*
 * DFA state numbers.  A state's value is also the bit offset at which the
 * transition words below store the next state for that current state, which
 * is why every transition macro has the form (next_state << current_state).
 */

/* Error */
#define ERR  0
/* Begin */
#define BGN 11
/* Continuation states, expect 1/2/3 continuation bytes */
#define CS1 16
#define CS2  1
#define CS3  5
/* Partial states, where the first continuation byte has a restricted range */
#define P3A  6					/* Lead was E0, check for 3-byte overlong */
#define P3B 20					/* Lead was ED, check for surrogate */
#define P4A 25					/* Lead was F0, check for 4-byte overlong */
#define P4B 30					/* Lead was F4, check for too-large */
/* Begin and End are the same state */
#define END BGN

/* the encoded state transitions for the lookup table */

/* ASCII */
#define ASC (END << BGN)
/* 2-byte lead */
#define L2A (CS1 << BGN)
/* 3-byte lead */
#define L3A (P3A << BGN)
#define L3B (CS2 << BGN)
#define L3C (P3B << BGN)
/* 4-byte lead */
#define L4A (P4A << BGN)
#define L4B (CS3 << BGN)
#define L4C (P4B << BGN)

/*
 * continuation byte
 *
 * Each expansion is wrapped in parentheses so that the macro behaves as a
 * single operand wherever it is used (e.g. "CR1 >> state"), not only inside
 * a comma-separated initializer list.
 */
#define CR1 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B))
#define CR2 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A))
#define CR3 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A))
/* invalid byte */
#define ILL ERR
1614
1615static const uint32 Utf8Transition[256] =
1616{
1617 /* ASCII */
1618
1619 ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1620 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1621 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1622 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1623
1624 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1625 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1626 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1627 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1628
1629 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1630 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1631 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1632 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1633
1634 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1635 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1636 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1637 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1638
1639 /* continuation bytes */
1640
1641 /* 80..8F */
1642 CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1643 CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1644
1645 /* 90..9F */
1646 CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1647 CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1648
1649 /* A0..BF */
1650 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1651 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1652 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1653 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1654
1655 /* leading bytes */
1656
1657 /* C0..DF */
1658 ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
1659 L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1660 L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1661 L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1662
1663 /* E0..EF */
1664 L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
1665 L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
1666
1667 /* F0..FF */
1668 L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
1669 ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
1670};
1671
1672static void
1673utf8_advance(const unsigned char *s, uint32 *state, int len)
1674{
1675 /* Note: We deliberately don't check the state's value here. */
1676 while (len > 0)
1677 {
1678 /*
1679 * It's important that the mask value is 31: In most instruction sets,
1680 * a shift by a 32-bit operand is understood to be a shift by its mod
1681 * 32, so the compiler should elide the mask operation.
1682 */
1683 *state = Utf8Transition[*s++] >> (*state & 31);
1684 len--;
1685 }
1686
1687 *state &= 31;
1688}
1689
1690static int
1691pg_utf8_verifystr(const unsigned char *s, int len)
1692{
1693 const unsigned char *start = s;
1694 const int orig_len = len;
1695 uint32 state = BGN;
1696
1697/*
1698 * With a stride of two vector widths, gcc will unroll the loop. Even if
1699 * the compiler can unroll a longer loop, it's not worth it because we
1700 * must fall back to the byte-wise algorithm if we find any non-ASCII.
1701 */
1702#define STRIDE_LENGTH (2 * sizeof(Vector8))
1703
1704 if (len >= STRIDE_LENGTH)
1705 {
1706 while (len >= STRIDE_LENGTH)
1707 {
1708 /*
1709 * If the chunk is all ASCII, we can skip the full UTF-8 check,
1710 * but we must first check for a non-END state, which means the
1711 * previous chunk ended in the middle of a multibyte sequence.
1712 */
1713 if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
1715
1716 s += STRIDE_LENGTH;
1717 len -= STRIDE_LENGTH;
1718 }
1719
1720 /* The error state persists, so we only need to check for it here. */
1721 if (state == ERR)
1722 {
1723 /*
1724 * Start over from the beginning with the slow path so we can
1725 * count the valid bytes.
1726 */
1727 len = orig_len;
1728 s = start;
1729 }
1730 else if (state != END)
1731 {
1732 /*
1733 * The fast path exited in the middle of a multibyte sequence.
1734 * Walk backwards to find the leading byte so that the slow path
1735 * can resume checking from there. We must always backtrack at
1736 * least one byte, since the current byte could be e.g. an ASCII
1737 * byte after a 2-byte lead, which is invalid.
1738 */
1739 do
1740 {
1741 Assert(s > start);
1742 s--;
1743 len++;
1745 } while (pg_utf_mblen(s) <= 1);
1746 }
1747 }
1748
1749 /* check remaining bytes */
1750 while (len > 0)
1751 {
1752 int l;
1753
1754 /* fast path for ASCII-subset characters */
1755 if (!IS_HIGHBIT_SET(*s))
1756 {
1757 if (*s == '\0')
1758 break;
1759 l = 1;
1760 }
1761 else
1762 {
1763 l = pg_utf8_verifychar(s, len);
1764 if (l == -1)
1765 break;
1766 }
1767 s += l;
1768 len -= l;
1769 }
1770
1771 return s - start;
1772}
1773
1774/*
1775 * Check for validity of a single UTF-8 encoded character
1776 *
1777 * This directly implements the rules in RFC3629. The bizarre-looking
1778 * restrictions on the second byte are meant to ensure that there isn't
1779 * more than one encoding of a given Unicode character point; that is,
1780 * you may not use a longer-than-necessary byte sequence with high order
1781 * zero bits to represent a character that would fit in fewer bytes.
1782 * To do otherwise is to create security hazards (eg, create an apparent
1783 * non-ASCII character that decodes to plain ASCII).
1784 *
1785 * length is assumed to have been obtained by pg_utf_mblen(), and the
1786 * caller must have checked that that many bytes are present in the buffer.
1787 */
1788bool
1789pg_utf8_islegal(const unsigned char *source, int length)
1790{
1791 unsigned char a;
1792
1793 switch (length)
1794 {
1795 default:
1796 /* reject lengths 5 and 6 for now */
1797 return false;
1798 case 4:
1799 a = source[3];
1800 if (a < 0x80 || a > 0xBF)
1801 return false;
1803 case 3:
1804 a = source[2];
1805 if (a < 0x80 || a > 0xBF)
1806 return false;
1808 case 2:
1809 a = source[1];
1810 switch (*source)
1811 {
1812 case 0xE0:
1813 if (a < 0xA0 || a > 0xBF)
1814 return false;
1815 break;
1816 case 0xED:
1817 if (a < 0x80 || a > 0x9F)
1818 return false;
1819 break;
1820 case 0xF0:
1821 if (a < 0x90 || a > 0xBF)
1822 return false;
1823 break;
1824 case 0xF4:
1825 if (a < 0x80 || a > 0x8F)
1826 return false;
1827 break;
1828 default:
1829 if (a < 0x80 || a > 0xBF)
1830 return false;
1831 break;
1832 }
1834 case 1:
1835 a = *source;
1836 if (a >= 0x80 && a < 0xC2)
1837 return false;
1838 if (a > 0xF4)
1839 return false;
1840 break;
1841 }
1842 return true;
1843}
1844
1845
1846/*
1847 * Fills the provided buffer with two bytes such that:
1848 * pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
1849 */
1850void
1858
1859/*
1860 *-------------------------------------------------------------------
1861 * encoding info table
1862 *-------------------------------------------------------------------
1863 */
1906};
1907
1908/*
1909 * Returns the byte length of a multibyte character.
1910 *
1911 * Choose "mblen" functions based on the input string characteristics.
1912 * pg_encoding_mblen() can be used when ANY of these conditions are met:
1913 *
1914 * - The input string is zero-terminated
1915 *
1916 * - The input string is known to be valid in the encoding (e.g., string
1917 * converted from database encoding)
1918 *
1919 * - The encoding is not GB18030 (e.g., when only database encodings are
1920 * passed to 'encoding' parameter)
1921 *
1922 * encoding==GB18030 requires examining up to two bytes to determine character
1923 * length. Therefore, callers satisfying none of those conditions must use
1924 * pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be
1925 * guaranteed to be within allocation bounds.
1926 *
1927 * When dealing with text that is not certainly valid in the specified
1928 * encoding, the result may exceed the actual remaining string length.
1929 * Callers that are not prepared to deal with that should use Min(remaining,
1930 * pg_encoding_mblen_or_incomplete()). For zero-terminated strings, that and
1931 * pg_encoding_mblen_bounded() are interchangeable.
1932 */
1933int
1935{
1936 return (PG_VALID_ENCODING(encoding) ?
1937 pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
1938 pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
1939}
1940
1941/*
1942 * Returns the byte length of a multibyte character (possibly not
1943 * zero-terminated), or INT_MAX if too few bytes remain to determine a length.
1944 */
1945int
1947 size_t remaining)
1948{
1949 /*
1950 * Define zero remaining as too few, even for single-byte encodings.
1951 * pg_gb18030_mblen() reads one or two bytes; single-byte encodings read
1952 * zero; others read one.
1953 */
1954 if (remaining < 1 ||
1956 return INT_MAX;
1958}
1959
1960/*
1961 * Returns the byte length of a multibyte character; but not more than the
1962 * distance to the terminating zero byte. For input that might lack a
1963 * terminating zero, use Min(remaining, pg_encoding_mblen_or_incomplete()).
1964 */
1965int
1970
1971/*
1972 * Returns the display length of a multibyte character.
1973 */
1974int
1976{
1977 return (PG_VALID_ENCODING(encoding) ?
1978 pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
1979 pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
1980}
1981
1982/*
1983 * Verify the first multibyte character of the given string.
1984 * Return its byte length if good, -1 if bad. (See comments above for
1985 * full details of the mbverifychar API.)
1986 */
1987int
1989{
1990 return (PG_VALID_ENCODING(encoding) ?
1991 pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
1992 pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
1993}
1994
1995/*
1996 * Verify that a string is valid for the given encoding.
1997 * Returns the number of input bytes (<= len) that form a valid string.
1998 * (See comments above for full details of the mbverifystr API.)
1999 */
2000int
2002{
2003 return (PG_VALID_ENCODING(encoding) ?
2004 pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
2005 pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
2006}
2007
2008/*
2009 * fetch maximum length of a given encoding
2010 */
2011int
2013{
2015
2016 /*
2017 * Check for the encoding despite the assert, due to some mingw versions
2018 * otherwise issuing bogus warnings.
2019 */
2020 return PG_VALID_ENCODING(encoding) ?
2023}
static bool is_valid_ascii(const unsigned char *s, int len)
Definition ascii.h:25
#define IS_HIGHBIT_SET(ch)
Definition c.h:1244
#define Assert(condition)
Definition c.h:943
uint32_t uint32
Definition c.h:624
#define pg_fallthrough
Definition c.h:161
return str start
int remaining
Definition informix.c:692
static char * encoding
Definition initdb.c:139
int a
Definition isn.c:73
#define PG_UTF8
Definition mbprint.c:43
unsigned int pg_wchar
Definition mbprint.c:31
static char32_t utf8_to_unicode(const unsigned char *c)
Definition mbprint.c:53
const void size_t len
static const struct lconv_member_info table[]
static rewind_source * source
Definition pg_rewind.c:89
#define ISSJISTAIL(c)
Definition pg_wchar.h:56
#define pg_utf_mblen
Definition pg_wchar.h:486
@ PG_WIN1254
Definition pg_wchar.h:107
@ PG_LATIN4
Definition pg_wchar.h:87
@ PG_LATIN9
Definition pg_wchar.h:92
@ PG_JOHAB
Definition pg_wchar.h:119
@ PG_GB18030
Definition pg_wchar.h:118
@ PG_SQL_ASCII
Definition pg_wchar.h:76
@ PG_KOI8R
Definition pg_wchar.h:98
@ PG_ISO_8859_6
Definition pg_wchar.h:102
@ PG_WIN1253
Definition pg_wchar.h:106
@ PG_KOI8U
Definition pg_wchar.h:110
@ PG_LATIN6
Definition pg_wchar.h:89
@ PG_LATIN5
Definition pg_wchar.h:88
@ PG_EUC_CN
Definition pg_wchar.h:78
@ PG_UHC
Definition pg_wchar.h:117
@ PG_LATIN2
Definition pg_wchar.h:85
@ PG_ISO_8859_5
Definition pg_wchar.h:101
@ PG_LATIN10
Definition pg_wchar.h:93
@ PG_WIN1250
Definition pg_wchar.h:105
@ PG_ISO_8859_7
Definition pg_wchar.h:103
@ PG_SJIS
Definition pg_wchar.h:114
@ PG_LATIN8
Definition pg_wchar.h:91
@ PG_EUC_JP
Definition pg_wchar.h:77
@ PG_GBK
Definition pg_wchar.h:116
@ PG_LATIN3
Definition pg_wchar.h:86
@ PG_WIN1256
Definition pg_wchar.h:94
@ PG_LATIN1
Definition pg_wchar.h:84
@ PG_EUC_TW
Definition pg_wchar.h:80
@ PG_WIN1258
Definition pg_wchar.h:95
@ PG_SHIFT_JIS_2004
Definition pg_wchar.h:120
@ PG_WIN1252
Definition pg_wchar.h:100
@ PG_LATIN7
Definition pg_wchar.h:90
@ PG_WIN1255
Definition pg_wchar.h:108
@ PG_WIN1257
Definition pg_wchar.h:109
@ PG_WIN1251
Definition pg_wchar.h:99
@ PG_EUC_KR
Definition pg_wchar.h:79
@ PG_WIN866
Definition pg_wchar.h:96
@ PG_ISO_8859_8
Definition pg_wchar.h:104
@ PG_WIN874
Definition pg_wchar.h:97
@ PG_EUC_JIS_2004
Definition pg_wchar.h:81
@ PG_BIG5
Definition pg_wchar.h:115
static unsigned char * unicode_to_utf8(char32_t c, unsigned char *utf8string)
Definition pg_wchar.h:428
#define SS2
Definition pg_wchar.h:38
#define PG_VALID_ENCODING(_enc)
Definition pg_wchar.h:140
#define ISSJISHEAD(c)
Definition pg_wchar.h:55
#define SS3
Definition pg_wchar.h:39
char * c
static int fb(int x)
unsigned int first
Definition wchar.c:593
unsigned int last
Definition wchar.c:594
mbstr_verifier mbverifystr
Definition pg_wchar.h:238
mblen_converter mblen
Definition pg_wchar.h:235
mbdisplaylen_converter dsplen
Definition pg_wchar.h:236
mbchar_verifier mbverifychar
Definition pg_wchar.h:237
static const struct mbinterval east_asian_fw[]
static const struct mbinterval nonspacing[]
static int pg_uhc_verifystr(const unsigned char *s, int len)
Definition wchar.c:1412
static int pg_latin1_dsplen(const unsigned char *s)
Definition wchar.c:732
int pg_encoding_mblen_bounded(int encoding, const char *mbstr)
Definition wchar.c:1966
static int pg_euctw_mblen(const unsigned char *s)
Definition wchar.c:360
static int pg_euckr_dsplen(const unsigned char *s)
Definition wchar.c:227
static const uint32 Utf8Transition[256]
Definition wchar.c:1615
bool pg_utf8_islegal(const unsigned char *source, int length)
Definition wchar.c:1789
static int pg_ascii_verifystr(const unsigned char *s, int len)
Definition wchar.c:919
static int pg_latin1_verifychar(const unsigned char *s, int len)
Definition wchar.c:1210
static int pg_sjis_dsplen(const unsigned char *s)
Definition wchar.c:755
#define CR3
Definition wchar.c:1611
static int pg_sjis_verifychar(const unsigned char *s, int len)
Definition wchar.c:1227
static int pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:108
static int pg_eucjp_dsplen(const unsigned char *s)
Definition wchar.c:196
static int pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:73
#define L3B
Definition wchar.c:1602
static int pg_uhc_verifychar(const unsigned char *s, int len)
Definition wchar.c:1387
#define L2A
Definition wchar.c:1599
static int pg_gbk_dsplen(const unsigned char *s)
Definition wchar.c:811
static int pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:215
static int pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:317
#define END
Definition wchar.c:1592
#define pg_euccn_verifychar
Definition wchar.c:1074
#define L4C
Definition wchar.c:1607
static int pg_sjis_verifystr(const unsigned char *s, int len)
Definition wchar.c:1250
static int pg_johab_mblen(const unsigned char *s)
Definition wchar.c:444
static int pg_johab_dsplen(const unsigned char *s)
Definition wchar.c:450
static int pg_big5_verifystr(const unsigned char *s, int len)
Definition wchar.c:1304
#define CR2
Definition wchar.c:1610
static int pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:184
static int pg_latin1_verifystr(const unsigned char *s, int len)
Definition wchar.c:1216
static int pg_latin1_mblen(const unsigned char *s)
Definition wchar.c:726
static int pg_ascii_verifychar(const unsigned char *s, int len)
Definition wchar.c:913
static int pg_ascii_mblen(const unsigned char *s)
Definition wchar.c:88
void pg_encoding_set_invalid(int encoding, char *dst)
Definition wchar.c:1851
static int mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
Definition wchar.c:599
static int pg_big5_dsplen(const unsigned char *s)
Definition wchar.c:784
#define pg_euccn_verifystr
Definition wchar.c:1075
int pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr, size_t remaining)
Definition wchar.c:1946
#define NONUTF8_INVALID_BYTE0
Definition wchar.c:36
static int pg_eucjp_mblen(const unsigned char *s)
Definition wchar.c:190
static int pg_gbk_verifychar(const unsigned char *s, int len)
Definition wchar.c:1333
static int pg_big5_mblen(const unsigned char *s)
Definition wchar.c:772
static int pg_euccn_dsplen(const unsigned char *s)
Definition wchar.c:301
static int pg_euctw_verifychar(const unsigned char *s, int len)
Definition wchar.c:1078
static int pg_euckr_verifychar(const unsigned char *s, int len)
Definition wchar.c:1016
static int pg_euctw_verifystr(const unsigned char *s, int len)
Definition wchar.c:1128
static int pg_gbk_verifystr(const unsigned char *s, int len)
Definition wchar.c:1358
static int pg_gb18030_dsplen(const unsigned char *s)
Definition wchar.c:879
#define ERR
Definition wchar.c:1579
static int pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:462
static int pg_euccn_mblen(const unsigned char *s)
Definition wchar.c:285
#define ASC
Definition wchar.c:1597
static int pg_gbk_mblen(const unsigned char *s)
Definition wchar.c:799
static int pg_eucjp_verifystr(const unsigned char *s, int len)
Definition wchar.c:987
static int pg_johab_verifystr(const unsigned char *s, int len)
Definition wchar.c:1181
static int pg_euc_dsplen(const unsigned char *s)
Definition wchar.c:165
static int pg_gb18030_verifystr(const unsigned char *s, int len)
Definition wchar.c:1472
static int pg_euckr_verifystr(const unsigned char *s, int len)
Definition wchar.c:1045
static int pg_sjis_mblen(const unsigned char *s)
Definition wchar.c:741
#define IS_EUC_RANGE_VALID(c)
Definition wchar.c:929
static int pg_uhc_dsplen(const unsigned char *s)
Definition wchar.c:838
static int pg_eucjp_verifychar(const unsigned char *s, int len)
Definition wchar.c:932
static int pg_big5_verifychar(const unsigned char *s, int len)
Definition wchar.c:1279
static int pg_gb18030_verifychar(const unsigned char *s, int len)
Definition wchar.c:1441
static int pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition wchar.c:398
#define L3C
Definition wchar.c:1603
static int pg_utf8_verifychar(const unsigned char *s, int len)
Definition wchar.c:1501
#define MB2CHAR_NEED_AT_LEAST(len, need)
Definition wchar.c:67
#define CR1
Definition wchar.c:1609
static int pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition wchar.c:711
static int pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition wchar.c:525
static int pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:237
static int pg_gb18030_mblen(const unsigned char *s)
Definition wchar.c:865
int pg_encoding_dsplen(int encoding, const char *mbstr)
Definition wchar.c:1975
static void utf8_advance(const unsigned char *s, uint32 *state, int len)
Definition wchar.c:1673
static int pg_euctw_dsplen(const unsigned char *s)
Definition wchar.c:376
static int pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition wchar.c:689
static int pg_uhc_mblen(const unsigned char *s)
Definition wchar.c:826
static int pg_euc_mblen(const unsigned char *s)
Definition wchar.c:149
#define L3A
Definition wchar.c:1601
#define L4B
Definition wchar.c:1606
int pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
Definition wchar.c:2001
#define NONUTF8_INVALID_BYTE1
Definition wchar.c:37
static int pg_utf8_verifystr(const unsigned char *s, int len)
Definition wchar.c:1691
static int pg_euckr_mblen(const unsigned char *s)
Definition wchar.c:221
const pg_wchar_tbl pg_wchar_table[]
Definition wchar.c:1864
#define BGN
Definition wchar.c:1581
int pg_encoding_max_length(int encoding)
Definition wchar.c:2012
int pg_encoding_mblen(int encoding, const char *mbstr)
Definition wchar.c:1934
static int pg_johab_verifychar(const unsigned char *s, int len)
Definition wchar.c:1157
#define ILL
Definition wchar.c:1613
#define STRIDE_LENGTH
#define L4A
Definition wchar.c:1605
static int pg_ascii_dsplen(const unsigned char *s)
Definition wchar.c:94
int pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
Definition wchar.c:1988
static int ucs_wcwidth(pg_wchar ucs)
Definition wchar.c:646
static int pg_utf_dsplen(const unsigned char *s)
Definition wchar.c:680