PostgreSQL Source Code  git master
wchar.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * wchar.c
4  * Functions for working with multibyte characters in various encodings.
5  *
6  * Portions Copyright (c) 1998-2024, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  * src/common/wchar.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "c.h"
14 
15 #include "mb/pg_wchar.h"
16 #include "utils/ascii.h"
17 
18 
19 /*
20  * Operations on multi-byte encodings are driven by a table of helper
21  * functions.
22  *
23  * To add an encoding support, define mblen(), dsplen(), verifychar() and
24  * verifystr() for the encoding. For server-encodings, also define mb2wchar()
25  * and wchar2mb() conversion functions.
26  *
27  * These functions generally assume that their input is validly formed.
28  * The "verifier" functions, further down in the file, have to be more
29  * paranoid.
30  *
31  * We expect that mblen() does not need to examine more than the first byte
32  * of the character to discover the correct length. GB18030 is an exception
33  * to that rule, though, as it also looks at second byte. But even that
34  * behaves in a predictable way, if you only pass the first byte: it will
35  * treat 4-byte encoded characters as two 2-byte encoded characters, which is
36  * good enough for all current uses.
37  *
38  * Note: for the display output of psql to work properly, the return values
39  * of the dsplen functions must conform to the Unicode standard. In particular
40  * the NUL character is zero width and control characters are generally
41  * width -1. It is recommended that non-ASCII encodings refer their ASCII
42  * subset to the ASCII routines to ensure consistency.
43  */
44 
45 /*
46  * SQL/ASCII
47  */
48 static int
49 pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
50 {
51  int cnt = 0;
52 
53  while (len > 0 && *from)
54  {
55  *to++ = *from++;
56  len--;
57  cnt++;
58  }
59  *to = 0;
60  return cnt;
61 }
62 
/*
 * Byte length of the character at "s": always 1, whatever the byte is.
 */
static int
pg_ascii_mblen(const unsigned char *s)
{
	return 1;
}
68 
/*
 * Display width of the single byte at "s": 0 for NUL, -1 for the control
 * bytes below 0x20 and for DEL (0x7f), and 1 for everything else.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
	const unsigned char ch = *s;

	if (ch == '\0')
		return 0;

	return (ch < 0x20 || ch == 0x7f) ? -1 : 1;
}
79 
80 /*
81  * EUC
82  */
83 static int
84 pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
85 {
86  int cnt = 0;
87 
88  while (len > 0 && *from)
89  {
90  if (*from == SS2 && len >= 2) /* JIS X 0201 (so called "1 byte
91  * KANA") */
92  {
93  from++;
94  *to = (SS2 << 8) | *from++;
95  len -= 2;
96  }
97  else if (*from == SS3 && len >= 3) /* JIS X 0212 KANJI */
98  {
99  from++;
100  *to = (SS3 << 16) | (*from++ << 8);
101  *to |= *from++;
102  len -= 3;
103  }
104  else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
105  {
106  *to = *from++ << 8;
107  *to |= *from++;
108  len -= 2;
109  }
110  else /* must be ASCII */
111  {
112  *to = *from++;
113  len--;
114  }
115  to++;
116  cnt++;
117  }
118  *to = 0;
119  return cnt;
120 }
121 
122 static inline int
123 pg_euc_mblen(const unsigned char *s)
124 {
125  int len;
126 
127  if (*s == SS2)
128  len = 2;
129  else if (*s == SS3)
130  len = 3;
131  else if (IS_HIGHBIT_SET(*s))
132  len = 2;
133  else
134  len = 1;
135  return len;
136 }
137 
138 static inline int
139 pg_euc_dsplen(const unsigned char *s)
140 {
141  int len;
142 
143  if (*s == SS2)
144  len = 2;
145  else if (*s == SS3)
146  len = 2;
147  else if (IS_HIGHBIT_SET(*s))
148  len = 2;
149  else
150  len = pg_ascii_dsplen(s);
151  return len;
152 }
153 
154 /*
155  * EUC_JP
156  */
157 static int
158 pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
159 {
160  return pg_euc2wchar_with_len(from, to, len);
161 }
162 
/*
 * Byte length of an EUC_JP character; identical to the generic EUC rule.
 */
static int
pg_eucjp_mblen(const unsigned char *s)
{
	const int	nbytes = pg_euc_mblen(s);

	return nbytes;
}
168 
169 static int
170 pg_eucjp_dsplen(const unsigned char *s)
171 {
172  int len;
173 
174  if (*s == SS2)
175  len = 1;
176  else if (*s == SS3)
177  len = 2;
178  else if (IS_HIGHBIT_SET(*s))
179  len = 2;
180  else
181  len = pg_ascii_dsplen(s);
182  return len;
183 }
184 
185 /*
186  * EUC_KR
187  */
188 static int
189 pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
190 {
191  return pg_euc2wchar_with_len(from, to, len);
192 }
193 
/*
 * Byte length of an EUC_KR character; identical to the generic EUC rule.
 */
static int
pg_euckr_mblen(const unsigned char *s)
{
	const int	nbytes = pg_euc_mblen(s);

	return nbytes;
}
199 
/*
 * Display width of an EUC_KR character; identical to the generic EUC rule.
 */
static int
pg_euckr_dsplen(const unsigned char *s)
{
	const int	width = pg_euc_dsplen(s);

	return width;
}
205 
206 /*
207  * EUC_CN
208  *
209  */
210 static int
211 pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
212 {
213  int cnt = 0;
214 
215  while (len > 0 && *from)
216  {
217  if (*from == SS2 && len >= 3) /* code set 2 (unused?) */
218  {
219  from++;
220  *to = (SS2 << 16) | (*from++ << 8);
221  *to |= *from++;
222  len -= 3;
223  }
224  else if (*from == SS3 && len >= 3) /* code set 3 (unused ?) */
225  {
226  from++;
227  *to = (SS3 << 16) | (*from++ << 8);
228  *to |= *from++;
229  len -= 3;
230  }
231  else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
232  {
233  *to = *from++ << 8;
234  *to |= *from++;
235  len -= 2;
236  }
237  else
238  {
239  *to = *from++;
240  len--;
241  }
242  to++;
243  cnt++;
244  }
245  *to = 0;
246  return cnt;
247 }
248 
/*
 * Byte length of the EUC_CN character at "s": 2 when the first byte has its
 * high bit set, otherwise 1.
 */
static int
pg_euccn_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
260 
/*
 * Display width of the EUC_CN character at "s": two columns for a high-bit
 * lead byte, otherwise whatever the ASCII rules say.
 */
static int
pg_euccn_dsplen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
272 
273 /*
274  * EUC_TW
275  *
276  */
277 static int
278 pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
279 {
280  int cnt = 0;
281 
282  while (len > 0 && *from)
283  {
284  if (*from == SS2 && len >= 4) /* code set 2 */
285  {
286  from++;
287  *to = (((uint32) SS2) << 24) | (*from++ << 16);
288  *to |= *from++ << 8;
289  *to |= *from++;
290  len -= 4;
291  }
292  else if (*from == SS3 && len >= 3) /* code set 3 (unused?) */
293  {
294  from++;
295  *to = (SS3 << 16) | (*from++ << 8);
296  *to |= *from++;
297  len -= 3;
298  }
299  else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
300  {
301  *to = *from++ << 8;
302  *to |= *from++;
303  len -= 2;
304  }
305  else
306  {
307  *to = *from++;
308  len--;
309  }
310  to++;
311  cnt++;
312  }
313  *to = 0;
314  return cnt;
315 }
316 
317 static int
318 pg_euctw_mblen(const unsigned char *s)
319 {
320  int len;
321 
322  if (*s == SS2)
323  len = 4;
324  else if (*s == SS3)
325  len = 3;
326  else if (IS_HIGHBIT_SET(*s))
327  len = 2;
328  else
329  len = 1;
330  return len;
331 }
332 
333 static int
334 pg_euctw_dsplen(const unsigned char *s)
335 {
336  int len;
337 
338  if (*s == SS2)
339  len = 2;
340  else if (*s == SS3)
341  len = 2;
342  else if (IS_HIGHBIT_SET(*s))
343  len = 2;
344  else
345  len = pg_ascii_dsplen(s);
346  return len;
347 }
348 
349 /*
350  * Convert pg_wchar to EUC_* encoding.
351  * caller must allocate enough space for "to", including a trailing zero!
352  * len: length of from.
353  * "from" not necessarily null terminated.
354  */
355 static int
356 pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
357 {
358  int cnt = 0;
359 
360  while (len > 0 && *from)
361  {
362  unsigned char c;
363 
364  if ((c = (*from >> 24)))
365  {
366  *to++ = c;
367  *to++ = (*from >> 16) & 0xff;
368  *to++ = (*from >> 8) & 0xff;
369  *to++ = *from & 0xff;
370  cnt += 4;
371  }
372  else if ((c = (*from >> 16)))
373  {
374  *to++ = c;
375  *to++ = (*from >> 8) & 0xff;
376  *to++ = *from & 0xff;
377  cnt += 3;
378  }
379  else if ((c = (*from >> 8)))
380  {
381  *to++ = c;
382  *to++ = *from & 0xff;
383  cnt += 2;
384  }
385  else
386  {
387  *to++ = *from;
388  cnt++;
389  }
390  from++;
391  len--;
392  }
393  *to = 0;
394  return cnt;
395 }
396 
397 
398 /*
399  * JOHAB
400  */
/*
 * Byte length of a JOHAB character; uses the generic EUC first-byte rule.
 */
static int
pg_johab_mblen(const unsigned char *s)
{
	const int	nbytes = pg_euc_mblen(s);

	return nbytes;
}
406 
/*
 * Display width of a JOHAB character; uses the generic EUC rule.
 */
static int
pg_johab_dsplen(const unsigned char *s)
{
	const int	width = pg_euc_dsplen(s);

	return width;
}
412 
413 /*
414  * convert UTF8 string to pg_wchar (UCS-4)
415  * caller must allocate enough space for "to", including a trailing zero!
416  * len: length of from.
417  * "from" not necessarily null terminated.
418  */
419 static int
420 pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
421 {
422  int cnt = 0;
423  uint32 c1,
424  c2,
425  c3,
426  c4;
427 
428  while (len > 0 && *from)
429  {
430  if ((*from & 0x80) == 0)
431  {
432  *to = *from++;
433  len--;
434  }
435  else if ((*from & 0xe0) == 0xc0)
436  {
437  if (len < 2)
438  break; /* drop trailing incomplete char */
439  c1 = *from++ & 0x1f;
440  c2 = *from++ & 0x3f;
441  *to = (c1 << 6) | c2;
442  len -= 2;
443  }
444  else if ((*from & 0xf0) == 0xe0)
445  {
446  if (len < 3)
447  break; /* drop trailing incomplete char */
448  c1 = *from++ & 0x0f;
449  c2 = *from++ & 0x3f;
450  c3 = *from++ & 0x3f;
451  *to = (c1 << 12) | (c2 << 6) | c3;
452  len -= 3;
453  }
454  else if ((*from & 0xf8) == 0xf0)
455  {
456  if (len < 4)
457  break; /* drop trailing incomplete char */
458  c1 = *from++ & 0x07;
459  c2 = *from++ & 0x3f;
460  c3 = *from++ & 0x3f;
461  c4 = *from++ & 0x3f;
462  *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
463  len -= 4;
464  }
465  else
466  {
467  /* treat a bogus char as length 1; not ours to raise error */
468  *to = *from++;
469  len--;
470  }
471  to++;
472  cnt++;
473  }
474  *to = 0;
475  return cnt;
476 }
477 
478 
479 /*
480  * Trivial conversion from pg_wchar to UTF-8.
481  * caller should allocate enough space for "to"
482  * len: length of from.
483  * "from" not necessarily null terminated.
484  */
485 static int
486 pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
487 {
488  int cnt = 0;
489 
490  while (len > 0 && *from)
491  {
492  int char_len;
493 
494  unicode_to_utf8(*from, to);
495  char_len = pg_utf_mblen(to);
496  cnt += char_len;
497  to += char_len;
498  from++;
499  len--;
500  }
501  *to = 0;
502  return cnt;
503 }
504 
/*
 * Return the byte length of a UTF8 character pointed to by s
 *
 * Only the first byte is examined.  Sequences longer than 4 bytes are not
 * supported in the current implementation, so the result is never larger
 * than 4: a leading byte that is flat-out illegal, or that announces a
 * longer sequence, is reported as length 1.
 *
 * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
 * other places would need to be fixed to change this.
 */
int
pg_utf_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if ((lead & 0x80) == 0)
		return 1;
	if ((lead & 0xe0) == 0xc0)
		return 2;
	if ((lead & 0xf0) == 0xe0)
		return 3;
	if ((lead & 0xf8) == 0xf0)
		return 4;
#ifdef NOT_USED
	if ((lead & 0xfc) == 0xf8)
		return 5;
	if ((lead & 0xfe) == 0xfc)
		return 6;
#endif
	return 1;
}
539 
540 /*
541  * This is an implementation of wcwidth() and wcswidth() as defined in
542  * "The Single UNIX Specification, Version 2, The Open Group, 1997"
543  * <http://www.unix.org/online.html>
544  *
545  * Markus Kuhn -- 2001-09-08 -- public domain
546  *
547  * customised for PostgreSQL
548  *
549  * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
550  */
551 
/*
 * One inclusive range [first, last] of code points; arrays of these are the
 * interval tables searched by mbbisearch().
 */
struct mbinterval
{
	unsigned int first;
	unsigned int last;
};
557 
558 /* auxiliary function for binary search in interval table */
559 static int
560 mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
561 {
562  int min = 0;
563  int mid;
564 
565  if (ucs < table[0].first || ucs > table[max].last)
566  return 0;
567  while (max >= min)
568  {
569  mid = (min + max) / 2;
570  if (ucs > table[mid].last)
571  min = mid + 1;
572  else if (ucs < table[mid].first)
573  max = mid - 1;
574  else
575  return 1;
576  }
577 
578  return 0;
579 }
580 
581 
582 /* The following functions define the column width of an ISO 10646
583  * character as follows:
584  *
585  * - The null character (U+0000) has a column width of 0.
586  *
587  * - Other C0/C1 control characters and DEL will lead to a return
588  * value of -1.
589  *
590  * - Non-spacing and enclosing combining characters (general
591  * category code Mn, Me or Cf in the Unicode database) have a
592  * column width of 0.
593  *
594  * - Spacing characters in the East Asian Wide (W) or East Asian
595  * FullWidth (F) category as defined in Unicode Technical
596  * Report #11 have a column width of 2.
597  *
598  * - All remaining characters (including all printable
599  * ISO 8859-1 and WGL4 characters, Unicode control characters,
600  * etc.) have a column width of 1.
601  *
602  * This implementation assumes that wchar_t characters are encoded
603  * in ISO 10646.
604  */
605 
606 static int
608 {
611 
612  /* test for 8-bit control characters */
613  if (ucs == 0)
614  return 0;
615 
616  if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
617  return -1;
618 
619  /*
620  * binary search in table of non-spacing characters
621  *
622  * XXX: In the official Unicode sources, it is possible for a character to
623  * be described as both non-spacing and wide at the same time. As of
624  * Unicode 13.0, treating the non-spacing property as the determining
625  * factor for display width leads to the correct behavior, so do that
626  * search first.
627  */
628  if (mbbisearch(ucs, nonspacing,
629  sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
630  return 0;
631 
632  /* binary search in table of wide characters */
633  if (mbbisearch(ucs, east_asian_fw,
634  sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
635  return 2;
636 
637  return 1;
638 }
639 
640 static int
641 pg_utf_dsplen(const unsigned char *s)
642 {
643  return ucs_wcwidth(utf8_to_unicode(s));
644 }
645 
646 /*
647  * convert mule internal code to pg_wchar
648  * caller should allocate enough space for "to"
649  * len: length of from.
650  * "from" not necessarily null terminated.
651  */
652 static int
653 pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
654 {
655  int cnt = 0;
656 
657  while (len > 0 && *from)
658  {
659  if (IS_LC1(*from) && len >= 2)
660  {
661  *to = *from++ << 16;
662  *to |= *from++;
663  len -= 2;
664  }
665  else if (IS_LCPRV1(*from) && len >= 3)
666  {
667  from++;
668  *to = *from++ << 16;
669  *to |= *from++;
670  len -= 3;
671  }
672  else if (IS_LC2(*from) && len >= 3)
673  {
674  *to = *from++ << 16;
675  *to |= *from++ << 8;
676  *to |= *from++;
677  len -= 3;
678  }
679  else if (IS_LCPRV2(*from) && len >= 4)
680  {
681  from++;
682  *to = *from++ << 16;
683  *to |= *from++ << 8;
684  *to |= *from++;
685  len -= 4;
686  }
687  else
688  { /* assume ASCII */
689  *to = (unsigned char) *from++;
690  len--;
691  }
692  to++;
693  cnt++;
694  }
695  *to = 0;
696  return cnt;
697 }
698 
699 /*
700  * convert pg_wchar to mule internal code
701  * caller should allocate enough space for "to"
702  * len: length of from.
703  * "from" not necessarily null terminated.
704  */
705 static int
706 pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
707 {
708  int cnt = 0;
709 
710  while (len > 0 && *from)
711  {
712  unsigned char lb;
713 
714  lb = (*from >> 16) & 0xff;
715  if (IS_LC1(lb))
716  {
717  *to++ = lb;
718  *to++ = *from & 0xff;
719  cnt += 2;
720  }
721  else if (IS_LC2(lb))
722  {
723  *to++ = lb;
724  *to++ = (*from >> 8) & 0xff;
725  *to++ = *from & 0xff;
726  cnt += 3;
727  }
728  else if (IS_LCPRV1_A_RANGE(lb))
729  {
730  *to++ = LCPRV1_A;
731  *to++ = lb;
732  *to++ = *from & 0xff;
733  cnt += 3;
734  }
735  else if (IS_LCPRV1_B_RANGE(lb))
736  {
737  *to++ = LCPRV1_B;
738  *to++ = lb;
739  *to++ = *from & 0xff;
740  cnt += 3;
741  }
742  else if (IS_LCPRV2_A_RANGE(lb))
743  {
744  *to++ = LCPRV2_A;
745  *to++ = lb;
746  *to++ = (*from >> 8) & 0xff;
747  *to++ = *from & 0xff;
748  cnt += 4;
749  }
750  else if (IS_LCPRV2_B_RANGE(lb))
751  {
752  *to++ = LCPRV2_B;
753  *to++ = lb;
754  *to++ = (*from >> 8) & 0xff;
755  *to++ = *from & 0xff;
756  cnt += 4;
757  }
758  else
759  {
760  *to++ = *from & 0xff;
761  cnt += 1;
762  }
763  from++;
764  len--;
765  }
766  *to = 0;
767  return cnt;
768 }
769 
/*
 * Byte length of the mule internal code character at "s", from its first
 * byte: 2 for LC1, 3 for LCPRV1 or LC2, 4 for LCPRV2, otherwise 1.
 *
 * exported for direct use by conv.c
 */
int
pg_mule_mblen(const unsigned char *s)
{
	if (IS_LC1(*s))
		return 2;
	if (IS_LCPRV1(*s) || IS_LC2(*s))
		return 3;
	if (IS_LCPRV2(*s))
		return 4;

	return 1;					/* assume ASCII */
}
788 
/*
 * Display width of the mule internal code character at "s".
 *
 * Note: it's not really appropriate to assume that all multibyte charsets
 * are double-wide on screen.  But this seems an okay approximation for the
 * MULE charsets we currently support.
 *
 * Unlike the other dsplen functions in this file, bytes that match none of
 * the leading-byte classes are not passed to pg_ascii_dsplen(); they always
 * report width 1, NUL and control characters included.
 */
static int
pg_mule_dsplen(const unsigned char *s)
{
	if (IS_LC1(*s) || IS_LCPRV1(*s))
		return 1;
	if (IS_LC2(*s) || IS_LCPRV2(*s))
		return 2;

	return 1;					/* assume ASCII */
}
813 
814 /*
815  * ISO8859-1
816  */
817 static int
818 pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
819 {
820  int cnt = 0;
821 
822  while (len > 0 && *from)
823  {
824  *to++ = *from++;
825  len--;
826  cnt++;
827  }
828  *to = 0;
829  return cnt;
830 }
831 
832 /*
833  * Trivial conversion from pg_wchar to single byte encoding. Just ignores
834  * high bits.
835  * caller should allocate enough space for "to"
836  * len: length of from.
837  * "from" not necessarily null terminated.
838  */
839 static int
840 pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
841 {
842  int cnt = 0;
843 
844  while (len > 0 && *from)
845  {
846  *to++ = *from++;
847  len--;
848  cnt++;
849  }
850  *to = 0;
851  return cnt;
852 }
853 
/*
 * Byte length of the character at "s": always 1, whatever the byte is.
 */
static int
pg_latin1_mblen(const unsigned char *s)
{
	return 1;
}
859 
/*
 * Display width of a single-byte character; same rules as for ASCII.
 */
static int
pg_latin1_dsplen(const unsigned char *s)
{
	const int	width = pg_ascii_dsplen(s);

	return width;
}
865 
866 /*
867  * SJIS
868  */
/*
 * Byte length of the SJIS character at "s", from its first byte: bytes in
 * 0xa1..0xdf (1 byte kana?) and bytes without the high bit are one byte
 * long; any other high-bit byte starts a two-byte character (kanji?).
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
	const int	one_byte_kana = (*s >= 0xa1 && *s <= 0xdf);

	if (!one_byte_kana && IS_HIGHBIT_SET(*s))
		return 2;

	return 1;
}
882 
/*
 * Display width of the SJIS character at "s": one column for bytes in
 * 0xa1..0xdf (1 byte kana?), two for any other high-bit lead byte (kanji?),
 * and the ASCII rules for everything else.
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
	if (*s >= 0xa1 && *s <= 0xdf)
		return 1;
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* should be ASCII */
}
896 
897 /*
898  * Big5
899  */
/*
 * Byte length of the Big5 character at "s": 2 when the first byte has its
 * high bit set, otherwise 1 (should be ASCII).
 */
static int
pg_big5_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
911 
/*
 * Display width of the Big5 character at "s": two columns for a high-bit
 * lead byte, otherwise the ASCII rules apply.
 */
static int
pg_big5_dsplen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
923 
924 /*
925  * GBK
926  */
/*
 * Byte length of the GBK character at "s": 2 when the first byte has its
 * high bit set, otherwise 1 (should be ASCII).
 */
static int
pg_gbk_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
938 
/*
 * Display width of the GBK character at "s": two columns for a high-bit
 * lead byte, otherwise the ASCII rules apply.
 */
static int
pg_gbk_dsplen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
950 
951 /*
952  * UHC
953  */
/*
 * Byte length of the UHC character at "s": 2 when the first byte has its
 * high bit set, otherwise 1 (should be ASCII).
 */
static int
pg_uhc_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
965 
/*
 * Display width of the UHC character at "s": two columns for a high-bit
 * lead byte, otherwise the ASCII rules apply.
 */
static int
pg_uhc_dsplen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
977 
978 /*
979  * GB18030
980  * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
981  */
982 
/*
 * Byte length of the GB18030 character at "s": 1 for ASCII, 4 when the
 * second byte is in 0x30..0x39, otherwise 2.
 *
 * Unlike all other mblen() functions, this also looks at the second byte of
 * the input.  However, if you only pass the first byte of a multi-byte
 * string, and \0 as the second byte, this still works in a predictable way:
 * a 4-byte character will be reported as two 2-byte characters.  That's
 * enough for all current uses, as a client-only encoding.  It works that
 * way, because in any valid 4-byte GB18030-encoded character, the third and
 * fourth byte look like a 2-byte encoded character, when looked at
 * separately.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;				/* ASCII */

	return (s[1] >= 0x30 && s[1] <= 0x39) ? 4 : 2;
}
1006 
/*
 * Display width of the GB18030 character at "s": two columns for a high-bit
 * lead byte, otherwise the ASCII rules apply.
 */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
1018 
1019 /*
1020  *-------------------------------------------------------------------
1021  * multibyte sequence validators
1022  *
1023  * The verifychar functions accept "s", a pointer to the first byte of a
1024  * string, and "len", the remaining length of the string. If there is a
1025  * validly encoded character beginning at *s, return its length in bytes;
1026  * else return -1.
1027  *
1028  * The verifystr functions also accept "s", a pointer to a string and "len",
1029  * the length of the string. They verify the whole string, and return the
1030  * number of input bytes (<= len) that are valid. In other words, if the
1031  * whole string is valid, verifystr returns "len", otherwise it returns the
1032  * byte offset of the first invalid character. The verifystr functions must
1033  * test for and reject zeroes in the input.
1034  *
1035  * The verifychar functions can assume that len > 0 and that *s != '\0', but
1036  * they must test for and reject zeroes in any additional bytes of a
1037  * multibyte character. Note that this definition allows the function for a
1038  * single-byte encoding to be just "return 1".
1039  *-------------------------------------------------------------------
1040  */
/*
 * Verify one character of a single-byte encoding.  The verifychar contract
 * already guarantees len > 0 and *s != '\0', so the answer is always 1.
 */
static int
pg_ascii_verifychar(const unsigned char *s, int len)
{
	return 1;
}
1046 
/*
 * Verify a string in a single-byte encoding: every byte is acceptable
 * except NUL.  Returns the number of valid leading bytes, i.e. "len" when
 * the string holds no zero byte, else the offset of the first one.
 */
static int
pg_ascii_verifystr(const unsigned char *s, int len)
{
	const unsigned char *nul = memchr(s, 0, len);

	return (nul != NULL) ? (int) (nul - s) : len;
}
1057 
/* True when byte "c" lies in 0xa1..0xfe; note that "c" is evaluated twice. */
#define IS_EUC_RANGE_VALID(c)	((c) >= 0xa1 && (c) <= 0xfe)
1059 
1060 static int
1061 pg_eucjp_verifychar(const unsigned char *s, int len)
1062 {
1063  int l;
1064  unsigned char c1,
1065  c2;
1066 
1067  c1 = *s++;
1068 
1069  switch (c1)
1070  {
1071  case SS2: /* JIS X 0201 */
1072  l = 2;
1073  if (l > len)
1074  return -1;
1075  c2 = *s++;
1076  if (c2 < 0xa1 || c2 > 0xdf)
1077  return -1;
1078  break;
1079 
1080  case SS3: /* JIS X 0212 */
1081  l = 3;
1082  if (l > len)
1083  return -1;
1084  c2 = *s++;
1085  if (!IS_EUC_RANGE_VALID(c2))
1086  return -1;
1087  c2 = *s++;
1088  if (!IS_EUC_RANGE_VALID(c2))
1089  return -1;
1090  break;
1091 
1092  default:
1093  if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1094  {
1095  l = 2;
1096  if (l > len)
1097  return -1;
1098  if (!IS_EUC_RANGE_VALID(c1))
1099  return -1;
1100  c2 = *s++;
1101  if (!IS_EUC_RANGE_VALID(c2))
1102  return -1;
1103  }
1104  else
1105  /* must be ASCII */
1106  {
1107  l = 1;
1108  }
1109  break;
1110  }
1111 
1112  return l;
1113 }
1114 
/*
 * Verify an EUC_JP string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" for a fully valid string, otherwise the offset
 * of the first invalid character or zero byte.
 */
static int
pg_eucjp_verifystr(const unsigned char *s, int len)
{
	const unsigned char *const start = s;
	int			l;

	for (; len > 0; s += l, len -= l)
	{
		if (IS_HIGHBIT_SET(*s))
		{
			l = pg_eucjp_verifychar(s, len);
			if (l == -1)
				break;
		}
		else if (*s == '\0')
			break;
		else
			l = 1;				/* fast path for ASCII-subset characters */
	}

	return s - start;
}
1143 
/*
 * Verify the EUC_KR character starting at "s", with "len" bytes available.
 * Returns its length in bytes if it is validly encoded, else -1.
 */
static int
pg_euckr_verifychar(const unsigned char *s, int len)
{
	const unsigned char c1 = s[0];

	if (!IS_HIGHBIT_SET(c1))
		return 1;				/* must be ASCII */

	/* two-byte character: both bytes must be in the EUC range */
	if (len < 2)
		return -1;
	if (!IS_EUC_RANGE_VALID(c1) || !IS_EUC_RANGE_VALID(s[1]))
		return -1;
	return 2;
}
1172 
/*
 * Verify an EUC_KR string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" for a fully valid string, otherwise the offset
 * of the first invalid character or zero byte.
 */
static int
pg_euckr_verifystr(const unsigned char *s, int len)
{
	const unsigned char *const start = s;
	int			l;

	for (; len > 0; s += l, len -= l)
	{
		if (IS_HIGHBIT_SET(*s))
		{
			l = pg_euckr_verifychar(s, len);
			if (l == -1)
				break;
		}
		else if (*s == '\0')
			break;
		else
			l = 1;				/* fast path for ASCII-subset characters */
	}

	return s - start;
}
1201 
/*
 * EUC-CN byte sequences are exactly same as EUC-KR, so the EUC-CN verifiers
 * are plain aliases for the EUC-KR ones.
 */
#define pg_euccn_verifychar	pg_euckr_verifychar
#define pg_euccn_verifystr	pg_euckr_verifystr
1205 
1206 static int
1207 pg_euctw_verifychar(const unsigned char *s, int len)
1208 {
1209  int l;
1210  unsigned char c1,
1211  c2;
1212 
1213  c1 = *s++;
1214 
1215  switch (c1)
1216  {
1217  case SS2: /* CNS 11643 Plane 1-7 */
1218  l = 4;
1219  if (l > len)
1220  return -1;
1221  c2 = *s++;
1222  if (c2 < 0xa1 || c2 > 0xa7)
1223  return -1;
1224  c2 = *s++;
1225  if (!IS_EUC_RANGE_VALID(c2))
1226  return -1;
1227  c2 = *s++;
1228  if (!IS_EUC_RANGE_VALID(c2))
1229  return -1;
1230  break;
1231 
1232  case SS3: /* unused */
1233  return -1;
1234 
1235  default:
1236  if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1237  {
1238  l = 2;
1239  if (l > len)
1240  return -1;
1241  /* no further range check on c1? */
1242  c2 = *s++;
1243  if (!IS_EUC_RANGE_VALID(c2))
1244  return -1;
1245  }
1246  else
1247  /* must be ASCII */
1248  {
1249  l = 1;
1250  }
1251  break;
1252  }
1253  return l;
1254 }
1255 
/*
 * Verify an EUC_TW string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" for a fully valid string, otherwise the offset
 * of the first invalid character or zero byte.
 */
static int
pg_euctw_verifystr(const unsigned char *s, int len)
{
	const unsigned char *const start = s;
	int			l;

	for (; len > 0; s += l, len -= l)
	{
		if (IS_HIGHBIT_SET(*s))
		{
			l = pg_euctw_verifychar(s, len);
			if (l == -1)
				break;
		}
		else if (*s == '\0')
			break;
		else
			l = 1;				/* fast path for ASCII-subset characters */
	}

	return s - start;
}
1284 
/*
 * Verify the JOHAB character starting at "s", with "len" bytes available.
 * Returns its length in bytes if it fits in "len" and, for a high-bit lead
 * byte, every following byte is in the EUC range; otherwise -1.
 */
static int
pg_johab_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_johab_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	if (!IS_HIGHBIT_SET(*s))
		return mbl;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_EUC_RANGE_VALID(s[i]))
			return -1;
	}
	return mbl;
}
1308 
/*
 * Verify a JOHAB string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" for a fully valid string, otherwise the offset
 * of the first invalid character or zero byte.
 */
static int
pg_johab_verifystr(const unsigned char *s, int len)
{
	const unsigned char *const start = s;
	int			l;

	for (; len > 0; s += l, len -= l)
	{
		if (IS_HIGHBIT_SET(*s))
		{
			l = pg_johab_verifychar(s, len);
			if (l == -1)
				break;
		}
		else if (*s == '\0')
			break;
		else
			l = 1;				/* fast path for ASCII-subset characters */
	}

	return s - start;
}
1337 
/*
 * Verify the mule internal code character starting at "s", with "len" bytes
 * available.  Returns its length in bytes if it fits in "len" and every
 * byte after the first has its high bit set; otherwise -1.
 */
static int
pg_mule_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_mule_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_HIGHBIT_SET(s[i]))
			return -1;
	}
	return mbl;
}
1358 
/*
 * Verify a mule internal code string of "len" bytes.  Returns the number of
 * leading bytes that are valid: "len" for a fully valid string, otherwise
 * the offset of the first invalid character or zero byte.
 */
static int
pg_mule_verifystr(const unsigned char *s, int len)
{
	const unsigned char *const start = s;
	int			l;

	for (; len > 0; s += l, len -= l)
	{
		if (IS_HIGHBIT_SET(*s))
		{
			l = pg_mule_verifychar(s, len);
			if (l == -1)
				break;
		}
		else if (*s == '\0')
			break;
		else
			l = 1;				/* fast path for ASCII-subset characters */
	}

	return s - start;
}
1387 
/*
 * Verify one character of a single-byte encoding.  The verifychar contract
 * already guarantees len > 0 and *s != '\0', so the answer is always 1.
 */
static int
pg_latin1_verifychar(const unsigned char *s, int len)
{
	return 1;
}
1393 
/*
 * Verify a string in a single-byte encoding: every byte is acceptable
 * except NUL.  Returns the number of valid leading bytes, i.e. "len" when
 * the string holds no zero byte, else the offset of the first one.
 */
static int
pg_latin1_verifystr(const unsigned char *s, int len)
{
	const unsigned char *nul = memchr(s, 0, len);

	return (nul != NULL) ? (int) (nul - s) : len;
}
1404 
/*
 * Verify the SJIS character starting at "s", with "len" bytes available.
 * Returns its length in bytes if it is validly encoded, else -1.
 */
static int
pg_sjis_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_sjis_mblen(s);

	if (len < mbl)
		return -1;

	if (mbl == 1)				/* pg_sjis_mblen already verified it */
		return mbl;

	/* two-byte character: need a valid head byte and a valid tail byte */
	if (ISSJISHEAD(s[0]) && ISSJISTAIL(s[1]))
		return mbl;

	return -1;
}
1427 
/*
 * Verify an SJIS string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" for a fully valid string, otherwise the offset
 * of the first invalid character or zero byte.
 */
static int
pg_sjis_verifystr(const unsigned char *s, int len)
{
	const unsigned char *const start = s;
	int			l;

	for (; len > 0; s += l, len -= l)
	{
		if (IS_HIGHBIT_SET(*s))
		{
			l = pg_sjis_verifychar(s, len);
			if (l == -1)
				break;
		}
		else if (*s == '\0')
			break;
		else
			l = 1;				/* fast path for ASCII-subset characters */
	}

	return s - start;
}
1456 
/*
 * Verify the Big5 character starting at "s", with "len" bytes available.
 * The only checks made are that the character fits in "len" and that no
 * byte after the first is zero.  Returns its length in bytes, or -1.
 */
static int
pg_big5_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_big5_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1476 
/*
 * Verify a Big5 string of "len" bytes.  Returns the number of leading bytes
 * that are valid: "len" for a fully valid string, otherwise the offset of
 * the first invalid character or zero byte.
 */
static int
pg_big5_verifystr(const unsigned char *s, int len)
{
	const unsigned char *const start = s;
	int			l;

	for (; len > 0; s += l, len -= l)
	{
		if (IS_HIGHBIT_SET(*s))
		{
			l = pg_big5_verifychar(s, len);
			if (l == -1)
				break;
		}
		else if (*s == '\0')
			break;
		else
			l = 1;				/* fast path for ASCII-subset characters */
	}

	return s - start;
}
1505 
/*
 * Verify the GBK character starting at "s", with "len" bytes available.
 * The only checks made are that the character fits in "len" and that no
 * byte after the first is zero.  Returns its length in bytes, or -1.
 */
static int
pg_gbk_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_gbk_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1525 
/*
 * Verify a GBK string of "len" bytes.  Returns the number of leading bytes
 * that are valid: "len" for a fully valid string, otherwise the offset of
 * the first invalid character or zero byte.
 */
static int
pg_gbk_verifystr(const unsigned char *s, int len)
{
	const unsigned char *const start = s;
	int			l;

	for (; len > 0; s += l, len -= l)
	{
		if (IS_HIGHBIT_SET(*s))
		{
			l = pg_gbk_verifychar(s, len);
			if (l == -1)
				break;
		}
		else if (*s == '\0')
			break;
		else
			l = 1;				/* fast path for ASCII-subset characters */
	}

	return s - start;
}
1554 
/*
 * Verify one UHC character starting at s, with len bytes available.
 *
 * The expected byte length is taken from pg_uhc_mblen().  Returns that
 * length on success, or -1 if fewer than that many bytes are available or
 * if any byte after the first is NUL.
 */
static int
pg_uhc_verifychar(const unsigned char *s, int len)
{
	const int	charlen = pg_uhc_mblen(s);
	int			i;

	/* character would run past the end of the input */
	if (charlen > len)
		return -1;

	/* no trailing byte may be a NUL */
	for (i = 1; i < charlen; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return charlen;
}
1574 
/*
 * Verify a UHC string of up to len bytes.
 *
 * Returns the number of leading bytes that were verified; scanning stops at
 * the first NUL byte or at the first character that pg_uhc_verifychar()
 * rejects.
 */
static int
pg_uhc_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			step = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			/* multibyte character: defer to the single-char verifier */
			step = pg_uhc_verifychar(p, remaining);
			if (step == -1)
				break;
		}
		else if (*p == '\0')
		{
			/* ASCII-subset fast path, but an embedded NUL ends the scan */
			break;
		}

		p += step;
		remaining -= step;
	}

	return p - s;
}
1603 
/*
 * Verify one GB18030 character starting at s, with len bytes available.
 *
 * Returns 1 for a byte without the high bit set, 2 or 4 for a well-formed
 * two- or four-byte sequence, and -1 otherwise.  The second byte decides
 * which multibyte form is being attempted: a digit-range byte (0x30..0x39)
 * selects the four-byte form when at least four bytes are available.
 */
static int
pg_gb18030_verifychar(const unsigned char *s, int len)
{
	/* ASCII */
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;

	/* Should be 4-byte: validate the lead and the remaining bytes */
	if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
	{
		if (s[0] < 0x81 || s[0] > 0xfe)
			return -1;
		if (s[2] < 0x81 || s[2] > 0xfe)
			return -1;
		if (s[3] < 0x30 || s[3] > 0x39)
			return -1;
		return 4;
	}

	/* Should be 2-byte: second byte must be 0x40..0x7e or 0x80..0xfe */
	if (len >= 2 && s[0] >= 0x81 && s[0] <= 0xfe)
	{
		if (s[1] < 0x40 || s[1] == 0x7f || s[1] > 0xfe)
			return -1;
		return 2;
	}

	/* invalid lead byte, or not enough input left */
	return -1;
}
1634 
/*
 * Verify a GB18030 string of up to len bytes.
 *
 * Returns the number of leading bytes that were verified; scanning stops at
 * the first NUL byte or at the first character that
 * pg_gb18030_verifychar() rejects.
 */
static int
pg_gb18030_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			step = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			/* multibyte character: defer to the single-char verifier */
			step = pg_gb18030_verifychar(p, remaining);
			if (step == -1)
				break;
		}
		else if (*p == '\0')
		{
			/* ASCII-subset fast path, but an embedded NUL ends the scan */
			break;
		}

		p += step;
		remaining -= step;
	}

	return p - s;
}
1663 
/*
 * Verify one UTF-8 character starting at s, with len bytes available.
 *
 * Returns the character's byte length (1..4) if it is acceptable, else -1.
 * A NUL byte is rejected.  The sequence length is derived from the lead
 * byte and the detailed checks are delegated to pg_utf8_islegal().
 */
static int
pg_utf8_verifychar(const unsigned char *s, int len)
{
	const unsigned char first = *s;
	int			nbytes;

	/* single-byte (ASCII) case; only NUL is refused */
	if (first < 0x80)
		return (first == '\0') ? -1 : 1;

	/* classify the lead byte by range */
	if (first >= 0xf8)
		nbytes = 1;				/* 11111xxx: let pg_utf8_islegal reject it */
	else if (first >= 0xf0)
		nbytes = 4;				/* 11110xxx */
	else if (first >= 0xe0)
		nbytes = 3;				/* 1110xxxx */
	else if (first >= 0xc0)
		nbytes = 2;				/* 110xxxxx */
	else
		nbytes = 1;				/* 10xxxxxx: stray continuation byte */

	/* must fit in the input before pg_utf8_islegal may look at it */
	if (nbytes > len || !pg_utf8_islegal(s, nbytes))
		return -1;

	return nbytes;
}
1692 
1693 /*
1694  * The fast path of the UTF-8 verifier uses a deterministic finite automaton
1695  * (DFA) for multibyte characters. In a traditional table-driven DFA, the
1696  * input byte and current state are used to compute an index into an array of
1697  * state transitions. Since the address of the next transition is dependent
1698  * on this computation, there is latency in executing the load instruction,
1699  * and the CPU is not kept busy.
1700  *
1701  * Instead, we use a "shift-based" DFA as described by Per Vognsen:
1702  *
1703  * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
1704  *
1705  * In a shift-based DFA, the input byte is an index into an array of integers
1706  * whose bit pattern encodes the state transitions. To compute the next
1707  * state, we simply right-shift the integer by the current state and apply a
1708  * mask. In this scheme, the address of the transition only depends on the
1709  * input byte, so there is better pipelining.
1710  *
1711  * The naming convention for states and transitions was adopted from a UTF-8
1712  * to UTF-16/32 transcoder, whose table is reproduced below:
1713  *
1714  * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
1715  *
1716  * ILL ASC CR1 CR2 CR3 L2A L3A L3B L3C L4A L4B L4C CLASS / STATE
1717  * ==========================================================================
1718  * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B, | BGN/END
1719  * err, err, err, err, err, err, err, err, err, err, err, err, | ERR
1720  * |
1721  * err, err, END, END, END, err, err, err, err, err, err, err, | CS1
1722  * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err, | CS2
1723  * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err, | CS3
1724  * |
1725  * err, err, err, err, CS1, err, err, err, err, err, err, err, | P3A
1726  * err, err, CS1, CS1, err, err, err, err, err, err, err, err, | P3B
1727  * |
1728  * err, err, err, CS2, CS2, err, err, err, err, err, err, err, | P4A
1729  * err, err, CS2, err, err, err, err, err, err, err, err, err, | P4B
1730  *
1731  * In the most straightforward implementation, a shift-based DFA for UTF-8
1732  * requires 64-bit integers to encode the transitions, but with an SMT solver
1733  * it's possible to find state numbers such that the transitions fit within
1734  * 32-bit integers, as Dougall Johnson demonstrated:
1735  *
1736  * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
1737  *
1738  * This packed representation is the reason for the seemingly odd choice of
1739  * state values below.
1740  */
1741 
/*
 * State numbers for the UTF-8 DFA.
 *
 * A state's number doubles as a bit offset: utf8_advance() right-shifts the
 * table entry for the input byte by the current state and masks with 31 to
 * obtain the next state.  Hence each transition macro below is built by
 * left-shifting the *next* state by the *current* state.
 */

/* Error */
#define ERR 0
/* Begin */
#define BGN 11
/* Continuation states, expect 1/2/3 continuation bytes */
#define CS1 16
#define CS2 1
#define CS3 5
/* Partial states, where the first continuation byte has a restricted range */
#define P3A 6					/* Lead was E0, check for 3-byte overlong */
#define P3B 20					/* Lead was ED, check for surrogate */
#define P4A 25					/* Lead was F0, check for 4-byte overlong */
#define P4B 30					/* Lead was F4, check for too-large */
/* Begin and End are the same state */
#define END BGN

/* the encoded state transitions for the lookup table */

/* ASCII */
#define ASC (END << BGN)
/* 2-byte lead */
#define L2A (CS1 << BGN)
/* 3-byte lead */
#define L3A (P3A << BGN)
#define L3B (CS2 << BGN)
#define L3C (P3B << BGN)
/* 4-byte lead */
#define L4A (P4A << BGN)
#define L4B (CS3 << BGN)
#define L4C (P4B << BGN)

/*
 * continuation byte
 *
 * The whole expansion is parenthesized so that these macros remain a single
 * operand when used inside a larger expression (e.g. "CR1 >> state").
 */
#define CR1 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B))
#define CR2 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A))
#define CR3 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A))
/* invalid byte */
#define ILL ERR
1778 
1779 static const uint32 Utf8Transition[256] =
1780 {
1781  /* ASCII */
1782 
1783  ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1784  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1785  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1786  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1787 
1788  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1789  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1790  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1791  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1792 
1793  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1794  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1795  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1796  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1797 
1798  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1799  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1800  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1801  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1802 
1803  /* continuation bytes */
1804 
1805  /* 80..8F */
1806  CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1807  CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1808 
1809  /* 90..9F */
1810  CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1811  CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1812 
1813  /* A0..BF */
1814  CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1815  CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1816  CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1817  CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1818 
1819  /* leading bytes */
1820 
1821  /* C0..DF */
1822  ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
1823  L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1824  L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1825  L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1826 
1827  /* E0..EF */
1828  L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
1829  L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
1830 
1831  /* F0..FF */
1832  L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
1833  ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
1834 };
1835 
1836 static void
1837 utf8_advance(const unsigned char *s, uint32 *state, int len)
1838 {
1839  /* Note: We deliberately don't check the state's value here. */
1840  while (len > 0)
1841  {
1842  /*
1843  * It's important that the mask value is 31: In most instruction sets,
1844  * a shift by a 32-bit operand is understood to be a shift by its mod
1845  * 32, so the compiler should elide the mask operation.
1846  */
1847  *state = Utf8Transition[*s++] >> (*state & 31);
1848  len--;
1849  }
1850 
1851  *state &= 31;
1852 }
1853 
1854 static int
1855 pg_utf8_verifystr(const unsigned char *s, int len)
1856 {
1857  const unsigned char *start = s;
1858  const int orig_len = len;
1859  uint32 state = BGN;
1860 
1861 /*
1862  * With a stride of two vector widths, gcc will unroll the loop. Even if
1863  * the compiler can unroll a longer loop, it's not worth it because we
1864  * must fall back to the byte-wise algorithm if we find any non-ASCII.
1865  */
1866 #define STRIDE_LENGTH (2 * sizeof(Vector8))
1867 
1868  if (len >= STRIDE_LENGTH)
1869  {
1870  while (len >= STRIDE_LENGTH)
1871  {
1872  /*
1873  * If the chunk is all ASCII, we can skip the full UTF-8 check,
1874  * but we must first check for a non-END state, which means the
1875  * previous chunk ended in the middle of a multibyte sequence.
1876  */
1877  if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
1879 
1880  s += STRIDE_LENGTH;
1881  len -= STRIDE_LENGTH;
1882  }
1883 
1884  /* The error state persists, so we only need to check for it here. */
1885  if (state == ERR)
1886  {
1887  /*
1888  * Start over from the beginning with the slow path so we can
1889  * count the valid bytes.
1890  */
1891  len = orig_len;
1892  s = start;
1893  }
1894  else if (state != END)
1895  {
1896  /*
1897  * The fast path exited in the middle of a multibyte sequence.
1898  * Walk backwards to find the leading byte so that the slow path
1899  * can resume checking from there. We must always backtrack at
1900  * least one byte, since the current byte could be e.g. an ASCII
1901  * byte after a 2-byte lead, which is invalid.
1902  */
1903  do
1904  {
1905  Assert(s > start);
1906  s--;
1907  len++;
1908  Assert(IS_HIGHBIT_SET(*s));
1909  } while (pg_utf_mblen(s) <= 1);
1910  }
1911  }
1912 
1913  /* check remaining bytes */
1914  while (len > 0)
1915  {
1916  int l;
1917 
1918  /* fast path for ASCII-subset characters */
1919  if (!IS_HIGHBIT_SET(*s))
1920  {
1921  if (*s == '\0')
1922  break;
1923  l = 1;
1924  }
1925  else
1926  {
1927  l = pg_utf8_verifychar(s, len);
1928  if (l == -1)
1929  break;
1930  }
1931  s += l;
1932  len -= l;
1933  }
1934 
1935  return s - start;
1936 }
1937 
/*
 * Check for validity of a single UTF-8 encoded character
 *
 * This follows the rules of RFC3629.  The second byte is held to a tighter
 * range after certain lead bytes so that every Unicode code point has
 * exactly one encoding: a longer-than-necessary sequence whose high-order
 * bits are zero must not be accepted for a character that fits in fewer
 * bytes.  Accepting such sequences would be a security hazard (eg, an
 * apparently non-ASCII character that decodes to plain ASCII).
 *
 * length is assumed to have been obtained by pg_utf_mblen(), and the
 * caller must have checked that that many bytes are present in the buffer.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char lead;
	int			i;

	/* only lengths 1..4 are accepted; reject lengths 5 and 6 for now */
	if (length < 1 || length > 4)
		return false;

	/* third and fourth bytes, if present, are plain continuation bytes */
	for (i = length - 1; i >= 2; i--)
	{
		if (source[i] < 0x80 || source[i] > 0xBF)
			return false;
	}

	lead = source[0];

	/* second byte: allowed range depends on the lead byte */
	if (length >= 2)
	{
		unsigned char lo = 0x80;
		unsigned char hi = 0xBF;

		if (lead == 0xE0)
			lo = 0xA0;
		else if (lead == 0xED)
			hi = 0x9F;
		else if (lead == 0xF0)
			lo = 0x90;
		else if (lead == 0xF4)
			hi = 0x8F;

		if (source[1] < lo || source[1] > hi)
			return false;
	}

	/* lead byte: 80..C1 and anything above F4 can never start a character */
	if (lead >= 0x80 && lead < 0xC2)
		return false;
	if (lead > 0xF4)
		return false;

	return true;
}
2008 
2009 
2010 /*
2011  *-------------------------------------------------------------------
2012  * encoding info table
2013  *-------------------------------------------------------------------
2014  */
2058 };
2059 
2060 /*
2061  * Returns the byte length of a multibyte character.
2062  *
2063  * Caution: when dealing with text that is not certainly valid in the
2064  * specified encoding, the result may exceed the actual remaining
2065  * string length. Callers that are not prepared to deal with that
2066  * should use pg_encoding_mblen_bounded() instead.
2067  */
2068 int
2069 pg_encoding_mblen(int encoding, const char *mbstr)
2070 {
2071  return (PG_VALID_ENCODING(encoding) ?
2072  pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
2073  pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
2074 }
2075 
/*
 * Returns the byte length of a multibyte character; but not more than
 * the distance to end of string.
 */
int
pg_encoding_mblen_bounded(int encoding, const char *mbstr)
{
	/* length the encoding claims for this character */
	int			claimed = pg_encoding_mblen(encoding, mbstr);

	/* strnlen() stops early if a NUL terminator comes first */
	return strnlen(mbstr, claimed);
}
2085 
2086 /*
2087  * Returns the display length of a multibyte character.
2088  */
2089 int
2090 pg_encoding_dsplen(int encoding, const char *mbstr)
2091 {
2092  return (PG_VALID_ENCODING(encoding) ?
2093  pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
2094  pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
2095 }
2096 
2097 /*
2098  * Verify the first multibyte character of the given string.
2099  * Return its byte length if good, -1 if bad. (See comments above for
2100  * full details of the mbverifychar API.)
2101  */
2102 int
2103 pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
2104 {
2105  return (PG_VALID_ENCODING(encoding) ?
2106  pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
2107  pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
2108 }
2109 
2110 /*
2111  * Verify that a string is valid for the given encoding.
2112  * Returns the number of input bytes (<= len) that form a valid string.
2113  * (See comments above for full details of the mbverifystr API.)
2114  */
2115 int
2116 pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
2117 {
2118  return (PG_VALID_ENCODING(encoding) ?
2119  pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
2120  pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
2121 }
2122 
2123 /*
2124  * fetch maximum length of a given encoding
2125  */
2126 int
2128 {
2130 
2132 }
static bool is_valid_ascii(const unsigned char *s, int len)
Definition: ascii.h:25
unsigned int uint32
Definition: c.h:506
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1155
#define Assert(condition)
Definition: c.h:858
return str start
int a
Definition: isn.c:69
static pg_wchar utf8_to_unicode(const unsigned char *c)
Definition: mbprint.c:53
unsigned int pg_wchar
Definition: mbprint.c:31
const void size_t len
int32 encoding
Definition: pg_database.h:41
static rewind_source * source
Definition: pg_rewind.c:89
#define IS_LCPRV2(c)
Definition: pg_wchar.h:164
#define ISSJISTAIL(c)
Definition: pg_wchar.h:45
@ PG_WIN1254
Definition: pg_wchar.h:257
@ PG_LATIN4
Definition: pg_wchar.h:237
@ PG_LATIN9
Definition: pg_wchar.h:242
@ PG_JOHAB
Definition: pg_wchar.h:269
@ PG_GB18030
Definition: pg_wchar.h:268
@ PG_SQL_ASCII
Definition: pg_wchar.h:226
@ PG_KOI8R
Definition: pg_wchar.h:248
@ PG_ISO_8859_6
Definition: pg_wchar.h:252
@ PG_WIN1253
Definition: pg_wchar.h:256
@ PG_KOI8U
Definition: pg_wchar.h:260
@ PG_LATIN6
Definition: pg_wchar.h:239
@ PG_MULE_INTERNAL
Definition: pg_wchar.h:233
@ PG_LATIN5
Definition: pg_wchar.h:238
@ PG_EUC_CN
Definition: pg_wchar.h:228
@ PG_UHC
Definition: pg_wchar.h:267
@ PG_LATIN2
Definition: pg_wchar.h:235
@ PG_ISO_8859_5
Definition: pg_wchar.h:251
@ PG_LATIN10
Definition: pg_wchar.h:243
@ PG_WIN1250
Definition: pg_wchar.h:255
@ PG_ISO_8859_7
Definition: pg_wchar.h:253
@ PG_SJIS
Definition: pg_wchar.h:264
@ PG_LATIN8
Definition: pg_wchar.h:241
@ PG_EUC_JP
Definition: pg_wchar.h:227
@ PG_GBK
Definition: pg_wchar.h:266
@ PG_LATIN3
Definition: pg_wchar.h:236
@ PG_WIN1256
Definition: pg_wchar.h:244
@ PG_LATIN1
Definition: pg_wchar.h:234
@ PG_EUC_TW
Definition: pg_wchar.h:230
@ PG_WIN1258
Definition: pg_wchar.h:245
@ PG_SHIFT_JIS_2004
Definition: pg_wchar.h:270
@ PG_WIN1252
Definition: pg_wchar.h:250
@ PG_LATIN7
Definition: pg_wchar.h:240
@ PG_UTF8
Definition: pg_wchar.h:232
@ PG_WIN1255
Definition: pg_wchar.h:258
@ PG_WIN1257
Definition: pg_wchar.h:259
@ PG_WIN1251
Definition: pg_wchar.h:249
@ PG_EUC_KR
Definition: pg_wchar.h:229
@ PG_WIN866
Definition: pg_wchar.h:246
@ PG_ISO_8859_8
Definition: pg_wchar.h:254
@ PG_WIN874
Definition: pg_wchar.h:247
@ PG_EUC_JIS_2004
Definition: pg_wchar.h:231
@ PG_BIG5
Definition: pg_wchar.h:265
#define LCPRV1_A
Definition: pg_wchar.h:150
#define LCPRV1_B
Definition: pg_wchar.h:151
#define IS_LC2(c)
Definition: pg_wchar.h:144
#define IS_LCPRV1(c)
Definition: pg_wchar.h:152
#define LCPRV2_A
Definition: pg_wchar.h:162
static unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
Definition: pg_wchar.h:575
#define IS_LCPRV2_B_RANGE(c)
Definition: pg_wchar.h:167
#define SS2
Definition: pg_wchar.h:38
#define IS_LCPRV1_A_RANGE(c)
Definition: pg_wchar.h:153
#define PG_VALID_ENCODING(_enc)
Definition: pg_wchar.h:287
#define IS_LCPRV1_B_RANGE(c)
Definition: pg_wchar.h:155
#define ISSJISHEAD(c)
Definition: pg_wchar.h:44
#define IS_LC1(c)
Definition: pg_wchar.h:126
#define IS_LCPRV2_A_RANGE(c)
Definition: pg_wchar.h:165
#define SS3
Definition: pg_wchar.h:39
#define LCPRV2_B
Definition: pg_wchar.h:163
size_t strnlen(const char *str, size_t maxlen)
Definition: strnlen.c:26
char * c
unsigned int first
Definition: wchar.c:554
unsigned int last
Definition: wchar.c:555
int maxmblen
Definition: pg_wchar.h:386
Definition: regguts.h:323
static const struct mbinterval east_asian_fw[]
static const struct mbinterval nonspacing[]
static int pg_uhc_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1576
static int pg_latin1_dsplen(const unsigned char *s)
Definition: wchar.c:861
int pg_encoding_mblen_bounded(int encoding, const char *mbstr)
Definition: wchar.c:2081
static int pg_euctw_mblen(const unsigned char *s)
Definition: wchar.c:318
static int pg_euckr_dsplen(const unsigned char *s)
Definition: wchar.c:201
static const uint32 Utf8Transition[256]
Definition: wchar.c:1779
bool pg_utf8_islegal(const unsigned char *source, int length)
Definition: wchar.c:1953
static int pg_ascii_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1048
static int pg_latin1_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1389
static int pg_sjis_dsplen(const unsigned char *s)
Definition: wchar.c:884
#define CR3
Definition: wchar.c:1775
static int pg_sjis_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1406
static int pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:84
static int pg_eucjp_dsplen(const unsigned char *s)
Definition: wchar.c:170
static int pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:49
#define L3B
Definition: wchar.c:1766
static int pg_uhc_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1556
#define L2A
Definition: wchar.c:1763
static int pg_gbk_dsplen(const unsigned char *s)
Definition: wchar.c:940
static int pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:189
static int pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:278
#define END
Definition: wchar.c:1756
#define pg_euccn_verifychar
Definition: wchar.c:1203
#define L4C
Definition: wchar.c:1771
static int pg_sjis_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1429
static int pg_johab_mblen(const unsigned char *s)
Definition: wchar.c:402
static int pg_johab_dsplen(const unsigned char *s)
Definition: wchar.c:408
static int pg_big5_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1478
#define CR2
Definition: wchar.c:1774
static int pg_mule_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1339
static int pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:158
static int pg_latin1_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1395
static int pg_latin1_mblen(const unsigned char *s)
Definition: wchar.c:855
static int pg_ascii_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1042
static int pg_ascii_mblen(const unsigned char *s)
Definition: wchar.c:64
static int mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
Definition: wchar.c:560
static int pg_big5_dsplen(const unsigned char *s)
Definition: wchar.c:913
#define pg_euccn_verifystr
Definition: wchar.c:1204
int pg_utf_mblen(const unsigned char *s)
Definition: wchar.c:517
static int pg_eucjp_mblen(const unsigned char *s)
Definition: wchar.c:164
static int pg_gbk_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1507
static int pg_big5_mblen(const unsigned char *s)
Definition: wchar.c:901
static int pg_euccn_dsplen(const unsigned char *s)
Definition: wchar.c:262
static int pg_euctw_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1207
static int pg_euckr_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1145
static int pg_euctw_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1257
static int pg_gbk_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1527
static int pg_gb18030_dsplen(const unsigned char *s)
Definition: wchar.c:1008
#define ERR
Definition: wchar.c:1743
static int pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:420
int pg_mule_mblen(const unsigned char *s)
Definition: wchar.c:772
static int pg_euccn_mblen(const unsigned char *s)
Definition: wchar.c:250
#define ASC
Definition: wchar.c:1761
static int pg_gbk_mblen(const unsigned char *s)
Definition: wchar.c:928
static int pg_eucjp_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1116
static int pg_johab_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1310
static int pg_euc_dsplen(const unsigned char *s)
Definition: wchar.c:139
static int pg_gb18030_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1636
static int pg_euckr_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1174
static int pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:706
static int pg_sjis_mblen(const unsigned char *s)
Definition: wchar.c:870
#define IS_EUC_RANGE_VALID(c)
Definition: wchar.c:1058
static int pg_uhc_dsplen(const unsigned char *s)
Definition: wchar.c:967
static int pg_eucjp_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1061
static int pg_big5_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1458
static int pg_gb18030_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1605
static int pg_mule_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1360
static int pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:356
#define L3C
Definition: wchar.c:1767
static int pg_utf8_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1665
#define CR1
Definition: wchar.c:1773
static int pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:840
static int pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:486
static int pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:211
static int pg_gb18030_mblen(const unsigned char *s)
Definition: wchar.c:994
int pg_encoding_dsplen(int encoding, const char *mbstr)
Definition: wchar.c:2090
static void utf8_advance(const unsigned char *s, uint32 *state, int len)
Definition: wchar.c:1837
static int pg_euctw_dsplen(const unsigned char *s)
Definition: wchar.c:334
static int pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:818
static int pg_uhc_mblen(const unsigned char *s)
Definition: wchar.c:955
static int pg_euc_mblen(const unsigned char *s)
Definition: wchar.c:123
static int pg_mule_dsplen(const unsigned char *s)
Definition: wchar.c:790
#define L3A
Definition: wchar.c:1765
#define L4B
Definition: wchar.c:1770
int pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
Definition: wchar.c:2116
static int pg_utf8_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1855
static int pg_euckr_mblen(const unsigned char *s)
Definition: wchar.c:195
const pg_wchar_tbl pg_wchar_table[]
Definition: wchar.c:2015
static int pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:653
#define BGN
Definition: wchar.c:1745
int pg_encoding_max_length(int encoding)
Definition: wchar.c:2127
int pg_encoding_mblen(int encoding, const char *mbstr)
Definition: wchar.c:2069
static int pg_johab_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1286
#define ILL
Definition: wchar.c:1777
#define STRIDE_LENGTH
#define L4A
Definition: wchar.c:1769
static int pg_ascii_dsplen(const unsigned char *s)
Definition: wchar.c:70
int pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
Definition: wchar.c:2103
static int ucs_wcwidth(pg_wchar ucs)
Definition: wchar.c:607
static int pg_utf_dsplen(const unsigned char *s)
Definition: wchar.c:641