PostgreSQL Source Code  git master
wchar.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * wchar.c
4  * Functions for working with multibyte characters in various encodings.
5  *
6  * Portions Copyright (c) 1998-2023, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  * src/common/wchar.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "c.h"
14 
15 #include "mb/pg_wchar.h"
16 
17 
18 /*
19  * Operations on multi-byte encodings are driven by a table of helper
20  * functions.
21  *
22  * To add support for an encoding, define mblen(), dsplen(), verifychar() and
23  * verifystr() for the encoding. For server-encodings, also define mb2wchar()
24  * and wchar2mb() conversion functions.
25  *
26  * These functions generally assume that their input is validly formed.
27  * The "verifier" functions, further down in the file, have to be more
28  * paranoid.
29  *
30  * We expect that mblen() does not need to examine more than the first byte
31  * of the character to discover the correct length. GB18030 is an exception
32  * to that rule, though, as it also looks at second byte. But even that
33  * behaves in a predictable way, if you only pass the first byte: it will
34  * treat 4-byte encoded characters as two 2-byte encoded characters, which is
35  * good enough for all current uses.
36  *
37  * Note: for the display output of psql to work properly, the return values
38  * of the dsplen functions must conform to the Unicode standard. In particular
39  * the NUL character is zero width and control characters are generally
40  * width -1. It is recommended that non-ASCII encodings refer their ASCII
41  * subset to the ASCII routines to ensure consistency.
42  */
43 
44 /*
45  * SQL/ASCII
46  */
47 static int
48 pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
49 {
50  int cnt = 0;
51 
52  while (len > 0 && *from)
53  {
54  *to++ = *from++;
55  len--;
56  cnt++;
57  }
58  *to = 0;
59  return cnt;
60 }
61 
/*
 * Byte length of the character at "s": always 1, whatever the byte is.
 */
static int
pg_ascii_mblen(const unsigned char *s)
{
	(void) s;					/* the answer does not depend on the input */
	return 1;
}
67 
/*
 * Display width of the single byte at "s": 0 for NUL, -1 for bytes below
 * 0x20 and for 0x7f, 1 for everything else.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
	const unsigned char ch = *s;

	if (ch == '\0')
		return 0;

	return (ch < 0x20 || ch == 0x7f) ? -1 : 1;
}
78 
79 /*
80  * EUC
81  */
82 static int
83 pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
84 {
85  int cnt = 0;
86 
87  while (len > 0 && *from)
88  {
89  if (*from == SS2 && len >= 2) /* JIS X 0201 (so called "1 byte
90  * KANA") */
91  {
92  from++;
93  *to = (SS2 << 8) | *from++;
94  len -= 2;
95  }
96  else if (*from == SS3 && len >= 3) /* JIS X 0212 KANJI */
97  {
98  from++;
99  *to = (SS3 << 16) | (*from++ << 8);
100  *to |= *from++;
101  len -= 3;
102  }
103  else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
104  {
105  *to = *from++ << 8;
106  *to |= *from++;
107  len -= 2;
108  }
109  else /* must be ASCII */
110  {
111  *to = *from++;
112  len--;
113  }
114  to++;
115  cnt++;
116  }
117  *to = 0;
118  return cnt;
119 }
120 
121 static inline int
122 pg_euc_mblen(const unsigned char *s)
123 {
124  int len;
125 
126  if (*s == SS2)
127  len = 2;
128  else if (*s == SS3)
129  len = 3;
130  else if (IS_HIGHBIT_SET(*s))
131  len = 2;
132  else
133  len = 1;
134  return len;
135 }
136 
137 static inline int
138 pg_euc_dsplen(const unsigned char *s)
139 {
140  int len;
141 
142  if (*s == SS2)
143  len = 2;
144  else if (*s == SS3)
145  len = 2;
146  else if (IS_HIGHBIT_SET(*s))
147  len = 2;
148  else
149  len = pg_ascii_dsplen(s);
150  return len;
151 }
152 
153 /*
154  * EUC_JP
155  */
156 static int
157 pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
158 {
159  return pg_euc2wchar_with_len(from, to, len);
160 }
161 
/* EUC_JP character byte length: same rule as generic EUC. */
static int
pg_eucjp_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
167 
168 static int
169 pg_eucjp_dsplen(const unsigned char *s)
170 {
171  int len;
172 
173  if (*s == SS2)
174  len = 1;
175  else if (*s == SS3)
176  len = 2;
177  else if (IS_HIGHBIT_SET(*s))
178  len = 2;
179  else
180  len = pg_ascii_dsplen(s);
181  return len;
182 }
183 
184 /*
185  * EUC_KR
186  */
187 static int
188 pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
189 {
190  return pg_euc2wchar_with_len(from, to, len);
191 }
192 
/* EUC_KR character byte length: same rule as generic EUC. */
static int
pg_euckr_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
198 
/* EUC_KR display width: same rule as generic EUC. */
static int
pg_euckr_dsplen(const unsigned char *s)
{
	int			width = pg_euc_dsplen(s);

	return width;
}
204 
205 /*
206  * EUC_CN
207  *
208  */
209 static int
210 pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
211 {
212  int cnt = 0;
213 
214  while (len > 0 && *from)
215  {
216  if (*from == SS2 && len >= 3) /* code set 2 (unused?) */
217  {
218  from++;
219  *to = (SS2 << 16) | (*from++ << 8);
220  *to |= *from++;
221  len -= 3;
222  }
223  else if (*from == SS3 && len >= 3) /* code set 3 (unused ?) */
224  {
225  from++;
226  *to = (SS3 << 16) | (*from++ << 8);
227  *to |= *from++;
228  len -= 3;
229  }
230  else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
231  {
232  *to = *from++ << 8;
233  *to |= *from++;
234  len -= 2;
235  }
236  else
237  {
238  *to = *from++;
239  len--;
240  }
241  to++;
242  cnt++;
243  }
244  *to = 0;
245  return cnt;
246 }
247 
/*
 * Byte length of the EUC_CN character starting at "s": 2 when the first
 * byte satisfies IS_HIGHBIT_SET, otherwise 1.
 */
static int
pg_euccn_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
259 
/*
 * Display width of the EUC_CN character starting at "s": 2 columns for a
 * first byte satisfying IS_HIGHBIT_SET, otherwise the ASCII routine's answer.
 */
static int
pg_euccn_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
271 
272 /*
273  * EUC_TW
274  *
275  */
276 static int
277 pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
278 {
279  int cnt = 0;
280 
281  while (len > 0 && *from)
282  {
283  if (*from == SS2 && len >= 4) /* code set 2 */
284  {
285  from++;
286  *to = (((uint32) SS2) << 24) | (*from++ << 16);
287  *to |= *from++ << 8;
288  *to |= *from++;
289  len -= 4;
290  }
291  else if (*from == SS3 && len >= 3) /* code set 3 (unused?) */
292  {
293  from++;
294  *to = (SS3 << 16) | (*from++ << 8);
295  *to |= *from++;
296  len -= 3;
297  }
298  else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
299  {
300  *to = *from++ << 8;
301  *to |= *from++;
302  len -= 2;
303  }
304  else
305  {
306  *to = *from++;
307  len--;
308  }
309  to++;
310  cnt++;
311  }
312  *to = 0;
313  return cnt;
314 }
315 
316 static int
317 pg_euctw_mblen(const unsigned char *s)
318 {
319  int len;
320 
321  if (*s == SS2)
322  len = 4;
323  else if (*s == SS3)
324  len = 3;
325  else if (IS_HIGHBIT_SET(*s))
326  len = 2;
327  else
328  len = 1;
329  return len;
330 }
331 
332 static int
333 pg_euctw_dsplen(const unsigned char *s)
334 {
335  int len;
336 
337  if (*s == SS2)
338  len = 2;
339  else if (*s == SS3)
340  len = 2;
341  else if (IS_HIGHBIT_SET(*s))
342  len = 2;
343  else
344  len = pg_ascii_dsplen(s);
345  return len;
346 }
347 
348 /*
349  * Convert pg_wchar to EUC_* encoding.
350  * caller must allocate enough space for "to", including a trailing zero!
351  * len: length of from.
352  * "from" not necessarily null terminated.
353  */
354 static int
355 pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
356 {
357  int cnt = 0;
358 
359  while (len > 0 && *from)
360  {
361  unsigned char c;
362 
363  if ((c = (*from >> 24)))
364  {
365  *to++ = c;
366  *to++ = (*from >> 16) & 0xff;
367  *to++ = (*from >> 8) & 0xff;
368  *to++ = *from & 0xff;
369  cnt += 4;
370  }
371  else if ((c = (*from >> 16)))
372  {
373  *to++ = c;
374  *to++ = (*from >> 8) & 0xff;
375  *to++ = *from & 0xff;
376  cnt += 3;
377  }
378  else if ((c = (*from >> 8)))
379  {
380  *to++ = c;
381  *to++ = *from & 0xff;
382  cnt += 2;
383  }
384  else
385  {
386  *to++ = *from;
387  cnt++;
388  }
389  from++;
390  len--;
391  }
392  *to = 0;
393  return cnt;
394 }
395 
396 
397 /*
398  * JOHAB
399  */
/* JOHAB character byte length: delegated to the generic EUC rule. */
static int
pg_johab_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
405 
/* JOHAB display width: delegated to the generic EUC rule. */
static int
pg_johab_dsplen(const unsigned char *s)
{
	int			width = pg_euc_dsplen(s);

	return width;
}
411 
412 /*
413  * convert UTF8 string to pg_wchar (UCS-4)
414  * caller must allocate enough space for "to", including a trailing zero!
415  * len: length of from.
416  * "from" not necessarily null terminated.
417  */
418 static int
419 pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
420 {
421  int cnt = 0;
422  uint32 c1,
423  c2,
424  c3,
425  c4;
426 
427  while (len > 0 && *from)
428  {
429  if ((*from & 0x80) == 0)
430  {
431  *to = *from++;
432  len--;
433  }
434  else if ((*from & 0xe0) == 0xc0)
435  {
436  if (len < 2)
437  break; /* drop trailing incomplete char */
438  c1 = *from++ & 0x1f;
439  c2 = *from++ & 0x3f;
440  *to = (c1 << 6) | c2;
441  len -= 2;
442  }
443  else if ((*from & 0xf0) == 0xe0)
444  {
445  if (len < 3)
446  break; /* drop trailing incomplete char */
447  c1 = *from++ & 0x0f;
448  c2 = *from++ & 0x3f;
449  c3 = *from++ & 0x3f;
450  *to = (c1 << 12) | (c2 << 6) | c3;
451  len -= 3;
452  }
453  else if ((*from & 0xf8) == 0xf0)
454  {
455  if (len < 4)
456  break; /* drop trailing incomplete char */
457  c1 = *from++ & 0x07;
458  c2 = *from++ & 0x3f;
459  c3 = *from++ & 0x3f;
460  c4 = *from++ & 0x3f;
461  *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
462  len -= 4;
463  }
464  else
465  {
466  /* treat a bogus char as length 1; not ours to raise error */
467  *to = *from++;
468  len--;
469  }
470  to++;
471  cnt++;
472  }
473  *to = 0;
474  return cnt;
475 }
476 
477 
478 /*
479  * Map a Unicode code point to UTF-8. utf8string must have 4 bytes of
480  * space allocated.
481  */
482 unsigned char *
483 unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
484 {
485  if (c <= 0x7F)
486  {
487  utf8string[0] = c;
488  }
489  else if (c <= 0x7FF)
490  {
491  utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
492  utf8string[1] = 0x80 | (c & 0x3F);
493  }
494  else if (c <= 0xFFFF)
495  {
496  utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
497  utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
498  utf8string[2] = 0x80 | (c & 0x3F);
499  }
500  else
501  {
502  utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
503  utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
504  utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
505  utf8string[3] = 0x80 | (c & 0x3F);
506  }
507 
508  return utf8string;
509 }
510 
511 /*
512  * Trivial conversion from pg_wchar to UTF-8.
513  * caller should allocate enough space for "to"
514  * len: length of from.
515  * "from" not necessarily null terminated.
516  */
517 static int
518 pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
519 {
520  int cnt = 0;
521 
522  while (len > 0 && *from)
523  {
524  int char_len;
525 
526  unicode_to_utf8(*from, to);
527  char_len = pg_utf_mblen(to);
528  cnt += char_len;
529  to += char_len;
530  from++;
531  len--;
532  }
533  *to = 0;
534  return cnt;
535 }
536 
/*
 * Return the byte length of the UTF8 character pointed to by s, judging by
 * its first byte only.
 *
 * Note: the current implementation does not support UTF8 sequences of more
 * than 4 bytes, so no value larger than 4 is returned.  A leading byte that
 * is flat-out illegal, or that announces a longer sequence than we support,
 * yields 1.
 *
 * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
 * other places would need to be fixed to change this.
 */
int
pg_utf_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if ((lead & 0x80) == 0)
		return 1;
	if ((lead & 0xe0) == 0xc0)
		return 2;
	if ((lead & 0xf0) == 0xe0)
		return 3;
	if ((lead & 0xf8) == 0xf0)
		return 4;
#ifdef NOT_USED
	if ((lead & 0xfc) == 0xf8)
		return 5;
	if ((lead & 0xfe) == 0xfc)
		return 6;
#endif

	return 1;
}
571 
572 /*
573  * This is an implementation of wcwidth() and wcswidth() as defined in
574  * "The Single UNIX Specification, Version 2, The Open Group, 1997"
575  * <http://www.unix.org/online.html>
576  *
577  * Markus Kuhn -- 2001-09-08 -- public domain
578  *
579  * customised for PostgreSQL
580  *
581  * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
582  */
583 
/*
 * One closed range [first, last] of code points.  Interval tables built
 * from this type are searched by mbbisearch().
 */
struct mbinterval
{
	unsigned int first;
	unsigned int last;
};
589 
590 /* auxiliary function for binary search in interval table */
591 static int
592 mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
593 {
594  int min = 0;
595  int mid;
596 
597  if (ucs < table[0].first || ucs > table[max].last)
598  return 0;
599  while (max >= min)
600  {
601  mid = (min + max) / 2;
602  if (ucs > table[mid].last)
603  min = mid + 1;
604  else if (ucs < table[mid].first)
605  max = mid - 1;
606  else
607  return 1;
608  }
609 
610  return 0;
611 }
612 
613 
614 /* The following functions define the column width of an ISO 10646
615  * character as follows:
616  *
617  * - The null character (U+0000) has a column width of 0.
618  *
619  * - Other C0/C1 control characters and DEL will lead to a return
620  * value of -1.
621  *
622  * - Non-spacing and enclosing combining characters (general
623  * category code Mn, Me or Cf in the Unicode database) have a
624  * column width of 0.
625  *
626  * - Spacing characters in the East Asian Wide (W) or East Asian
627  * FullWidth (F) category as defined in Unicode Technical
628  * Report #11 have a column width of 2.
629  *
630  * - All remaining characters (including all printable
631  * ISO 8859-1 and WGL4 characters, Unicode control characters,
632  * etc.) have a column width of 1.
633  *
634  * This implementation assumes that wchar_t characters are encoded
635  * in ISO 10646.
636  */
637 
638 static int
640 {
643 
644  /* test for 8-bit control characters */
645  if (ucs == 0)
646  return 0;
647 
648  if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
649  return -1;
650 
651  /*
652  * binary search in table of non-spacing characters
653  *
654  * XXX: In the official Unicode sources, it is possible for a character to
655  * be described as both non-spacing and wide at the same time. As of
656  * Unicode 13.0, treating the non-spacing property as the determining
657  * factor for display width leads to the correct behavior, so do that
658  * search first.
659  */
660  if (mbbisearch(ucs, nonspacing,
661  sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
662  return 0;
663 
664  /* binary search in table of wide characters */
665  if (mbbisearch(ucs, east_asian_fw,
666  sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
667  return 2;
668 
669  return 1;
670 }
671 
672 /*
673  * Convert a UTF-8 character to a Unicode code point.
674  * This is a one-character version of pg_utf2wchar_with_len.
675  *
676  * No error checks here, c must point to a long-enough string.
677  */
678 pg_wchar
679 utf8_to_unicode(const unsigned char *c)
680 {
681  if ((*c & 0x80) == 0)
682  return (pg_wchar) c[0];
683  else if ((*c & 0xe0) == 0xc0)
684  return (pg_wchar) (((c[0] & 0x1f) << 6) |
685  (c[1] & 0x3f));
686  else if ((*c & 0xf0) == 0xe0)
687  return (pg_wchar) (((c[0] & 0x0f) << 12) |
688  ((c[1] & 0x3f) << 6) |
689  (c[2] & 0x3f));
690  else if ((*c & 0xf8) == 0xf0)
691  return (pg_wchar) (((c[0] & 0x07) << 18) |
692  ((c[1] & 0x3f) << 12) |
693  ((c[2] & 0x3f) << 6) |
694  (c[3] & 0x3f));
695  else
696  /* that is an invalid code on purpose */
697  return 0xffffffff;
698 }
699 
700 static int
701 pg_utf_dsplen(const unsigned char *s)
702 {
703  return ucs_wcwidth(utf8_to_unicode(s));
704 }
705 
706 /*
707  * convert mule internal code to pg_wchar
708  * caller should allocate enough space for "to"
709  * len: length of from.
710  * "from" not necessarily null terminated.
711  */
712 static int
713 pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
714 {
715  int cnt = 0;
716 
717  while (len > 0 && *from)
718  {
719  if (IS_LC1(*from) && len >= 2)
720  {
721  *to = *from++ << 16;
722  *to |= *from++;
723  len -= 2;
724  }
725  else if (IS_LCPRV1(*from) && len >= 3)
726  {
727  from++;
728  *to = *from++ << 16;
729  *to |= *from++;
730  len -= 3;
731  }
732  else if (IS_LC2(*from) && len >= 3)
733  {
734  *to = *from++ << 16;
735  *to |= *from++ << 8;
736  *to |= *from++;
737  len -= 3;
738  }
739  else if (IS_LCPRV2(*from) && len >= 4)
740  {
741  from++;
742  *to = *from++ << 16;
743  *to |= *from++ << 8;
744  *to |= *from++;
745  len -= 4;
746  }
747  else
748  { /* assume ASCII */
749  *to = (unsigned char) *from++;
750  len--;
751  }
752  to++;
753  cnt++;
754  }
755  *to = 0;
756  return cnt;
757 }
758 
759 /*
760  * convert pg_wchar to mule internal code
761  * caller should allocate enough space for "to"
762  * len: length of from.
763  * "from" not necessarily null terminated.
764  */
765 static int
766 pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
767 {
768  int cnt = 0;
769 
770  while (len > 0 && *from)
771  {
772  unsigned char lb;
773 
774  lb = (*from >> 16) & 0xff;
775  if (IS_LC1(lb))
776  {
777  *to++ = lb;
778  *to++ = *from & 0xff;
779  cnt += 2;
780  }
781  else if (IS_LC2(lb))
782  {
783  *to++ = lb;
784  *to++ = (*from >> 8) & 0xff;
785  *to++ = *from & 0xff;
786  cnt += 3;
787  }
788  else if (IS_LCPRV1_A_RANGE(lb))
789  {
790  *to++ = LCPRV1_A;
791  *to++ = lb;
792  *to++ = *from & 0xff;
793  cnt += 3;
794  }
795  else if (IS_LCPRV1_B_RANGE(lb))
796  {
797  *to++ = LCPRV1_B;
798  *to++ = lb;
799  *to++ = *from & 0xff;
800  cnt += 3;
801  }
802  else if (IS_LCPRV2_A_RANGE(lb))
803  {
804  *to++ = LCPRV2_A;
805  *to++ = lb;
806  *to++ = (*from >> 8) & 0xff;
807  *to++ = *from & 0xff;
808  cnt += 4;
809  }
810  else if (IS_LCPRV2_B_RANGE(lb))
811  {
812  *to++ = LCPRV2_B;
813  *to++ = lb;
814  *to++ = (*from >> 8) & 0xff;
815  *to++ = *from & 0xff;
816  cnt += 4;
817  }
818  else
819  {
820  *to++ = *from & 0xff;
821  cnt += 1;
822  }
823  from++;
824  len--;
825  }
826  *to = 0;
827  return cnt;
828 }
829 
/*
 * Byte length of the mule-internal-code character starting at "s", decided
 * from its first byte.  Exported for direct use by conv.c.
 */
int
pg_mule_mblen(const unsigned char *s)
{
	if (IS_LC1(*s))
		return 2;
	if (IS_LCPRV1(*s) || IS_LC2(*s))
		return 3;
	if (IS_LCPRV2(*s))
		return 4;

	return 1;					/* assume ASCII */
}
848 
/*
 * Display width of the mule-internal-code character starting at "s".
 *
 * Note: it's not really appropriate to assume that all multibyte charsets
 * are double-wide on screen.  But this seems an okay approximation for the
 * MULE charsets we currently support.
 */
static int
pg_mule_dsplen(const unsigned char *s)
{
	if (IS_LC1(*s) || IS_LCPRV1(*s))
		return 1;
	if (IS_LC2(*s) || IS_LCPRV2(*s))
		return 2;

	return 1;					/* assume ASCII */
}
873 
874 /*
875  * ISO8859-1
876  */
877 static int
878 pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
879 {
880  int cnt = 0;
881 
882  while (len > 0 && *from)
883  {
884  *to++ = *from++;
885  len--;
886  cnt++;
887  }
888  *to = 0;
889  return cnt;
890 }
891 
892 /*
893  * Trivial conversion from pg_wchar to single byte encoding. Just ignores
894  * high bits.
895  * caller should allocate enough space for "to"
896  * len: length of from.
897  * "from" not necessarily null terminated.
898  */
899 static int
900 pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
901 {
902  int cnt = 0;
903 
904  while (len > 0 && *from)
905  {
906  *to++ = *from++;
907  len--;
908  cnt++;
909  }
910  *to = 0;
911  return cnt;
912 }
913 
/*
 * Byte length of the character at "s": always 1, whatever the byte is.
 */
static int
pg_latin1_mblen(const unsigned char *s)
{
	(void) s;					/* the answer does not depend on the input */
	return 1;
}
919 
/* Single-byte display width: delegated to the ASCII routine. */
static int
pg_latin1_dsplen(const unsigned char *s)
{
	int			width = pg_ascii_dsplen(s);

	return width;
}
925 
926 /*
927  * SJIS
928  */
/*
 * Byte length of the SJIS character starting at "s": 2 when the first byte
 * satisfies IS_HIGHBIT_SET and lies outside 0xa1..0xdf (kanji?), otherwise 1
 * (1 byte kana, or what should be ASCII).
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
	const int	is_kana = (*s >= 0xa1 && *s <= 0xdf);

	if (!is_kana && IS_HIGHBIT_SET(*s))
		return 2;

	return 1;
}
942 
/*
 * Display width of the SJIS character starting at "s": 1 column for a first
 * byte in 0xa1..0xdf (1 byte kana?), 2 for any other byte satisfying
 * IS_HIGHBIT_SET (kanji?), and the ASCII routine's answer otherwise.
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
	if (*s >= 0xa1 && *s <= 0xdf)
		return 1;
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* should be ASCII */
}
956 
957 /*
958  * Big5
959  */
/*
 * Byte length of the Big5 character starting at "s": 2 when the first byte
 * satisfies IS_HIGHBIT_SET, otherwise 1 (should be ASCII).
 */
static int
pg_big5_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
971 
/*
 * Display width of the Big5 character starting at "s": 2 columns when the
 * first byte satisfies IS_HIGHBIT_SET, otherwise the ASCII routine's answer.
 */
static int
pg_big5_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* should be ASCII */
}
983 
984 /*
985  * GBK
986  */
/*
 * Byte length of the GBK character starting at "s": 2 when the first byte
 * satisfies IS_HIGHBIT_SET, otherwise 1 (should be ASCII).
 */
static int
pg_gbk_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
998 
/*
 * Display width of the GBK character starting at "s": 2 columns when the
 * first byte satisfies IS_HIGHBIT_SET, otherwise the ASCII routine's answer.
 */
static int
pg_gbk_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* should be ASCII */
}
1010 
1011 /*
1012  * UHC
1013  */
/*
 * Byte length of the UHC character starting at "s": 2 when the first byte
 * satisfies IS_HIGHBIT_SET, otherwise 1 (should be ASCII).
 */
static int
pg_uhc_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
1025 
/*
 * Display width of the UHC character starting at "s": 2 columns when the
 * first byte satisfies IS_HIGHBIT_SET, otherwise the ASCII routine's answer.
 */
static int
pg_uhc_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* should be ASCII */
}
1037 
1038 /*
1039  * GB18030
1040  * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
1041  */
1042 
/*
 * Byte length of the GB18030 character starting at "s".
 *
 * Unlike all other mblen() functions, this one also inspects the second
 * byte of the input.  If the caller supplies only the first byte of a
 * multi-byte character followed by \0, the result is still predictable: a
 * 4-byte character is then reported as two 2-byte characters.  That is
 * enough for all current uses, as a client-only encoding.  It works out
 * because in any valid 4-byte GB18030-encoded character, the third and
 * fourth bytes, looked at on their own, resemble a 2-byte encoded character.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	if (!IS_HIGHBIT_SET(*s))
		return 1;				/* ASCII */

	/* a second byte in 0x30..0x39 marks the 4-byte form */
	return (s[1] >= 0x30 && s[1] <= 0x39) ? 4 : 2;
}
1066 
/*
 * Display width of the GB18030 character starting at "s": 2 columns when
 * the first byte satisfies IS_HIGHBIT_SET, otherwise the ASCII routine's
 * answer.
 */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* ASCII */
}
1078 
1079 /*
1080  *-------------------------------------------------------------------
1081  * multibyte sequence validators
1082  *
1083  * The verifychar functions accept "s", a pointer to the first byte of a
1084  * string, and "len", the remaining length of the string. If there is a
1085  * validly encoded character beginning at *s, return its length in bytes;
1086  * else return -1.
1087  *
1088  * The verifystr functions also accept "s", a pointer to a string and "len",
1089  * the length of the string. They verify the whole string, and return the
1090  * number of input bytes (<= len) that are valid. In other words, if the
1091  * whole string is valid, verifystr returns "len", otherwise it returns the
1092  * byte offset of the first invalid character. The verifystr functions must
1093  * test for and reject zeroes in the input.
1094  *
1095  * The verifychar functions can assume that len > 0 and that *s != '\0', but
1096  * they must test for and reject zeroes in any additional bytes of a
1097  * multibyte character. Note that this definition allows the function for a
1098  * single-byte encoding to be just "return 1".
1099  *-------------------------------------------------------------------
1100  */
/*
 * Verify one character of a single-byte encoding: there is nothing to
 * check, so the answer is always a valid length of 1.
 */
static int
pg_ascii_verifychar(const unsigned char *s, int len)
{
	(void) s;
	(void) len;
	return 1;
}
1106 
/*
 * Verify a string of "len" bytes: every byte is acceptable except zero.
 * Returns "len" if no zero byte is present, else the offset of the first
 * zero byte.
 */
static int
pg_ascii_verifystr(const unsigned char *s, int len)
{
	const unsigned char *nul = memchr(s, 0, len);

	return (nul != NULL) ? (int) (nul - s) : len;
}
1117 
/* True when byte value "c" lies in the closed range 0xa1..0xfe. */
#define IS_EUC_RANGE_VALID(c)	(0xa1 <= (c) && (c) <= 0xfe)
1119 
1120 static int
1121 pg_eucjp_verifychar(const unsigned char *s, int len)
1122 {
1123  int l;
1124  unsigned char c1,
1125  c2;
1126 
1127  c1 = *s++;
1128 
1129  switch (c1)
1130  {
1131  case SS2: /* JIS X 0201 */
1132  l = 2;
1133  if (l > len)
1134  return -1;
1135  c2 = *s++;
1136  if (c2 < 0xa1 || c2 > 0xdf)
1137  return -1;
1138  break;
1139 
1140  case SS3: /* JIS X 0212 */
1141  l = 3;
1142  if (l > len)
1143  return -1;
1144  c2 = *s++;
1145  if (!IS_EUC_RANGE_VALID(c2))
1146  return -1;
1147  c2 = *s++;
1148  if (!IS_EUC_RANGE_VALID(c2))
1149  return -1;
1150  break;
1151 
1152  default:
1153  if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1154  {
1155  l = 2;
1156  if (l > len)
1157  return -1;
1158  if (!IS_EUC_RANGE_VALID(c1))
1159  return -1;
1160  c2 = *s++;
1161  if (!IS_EUC_RANGE_VALID(c2))
1162  return -1;
1163  }
1164  else
1165  /* must be ASCII */
1166  {
1167  l = 1;
1168  }
1169  break;
1170  }
1171 
1172  return l;
1173 }
1174 
/*
 * Verify an EUC_JP string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if the whole string is fine, else the offset
 * of the first zero byte or badly formed character.
 */
static int
pg_eucjp_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_eucjp_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path, but zero is not allowed */

		p += chlen;
		remaining -= chlen;
	}

	return (int) (p - s);
}
1203 
/*
 * Verify the EUC_KR character starting at "s", with "len" bytes available.
 * Returns its length in bytes (1 or 2) if it is well formed, else -1.
 */
static int
pg_euckr_verifychar(const unsigned char *s, int len)
{
	/* must be ASCII */
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;

	if (len < 2)
		return -1;

	/* both bytes of a 2-byte character must be in the EUC range */
	if (!IS_EUC_RANGE_VALID(s[0]) || !IS_EUC_RANGE_VALID(s[1]))
		return -1;

	return 2;
}
1232 
/*
 * Verify an EUC_KR string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if the whole string is fine, else the offset
 * of the first zero byte or badly formed character.
 */
static int
pg_euckr_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_euckr_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path, but zero is not allowed */

		p += chlen;
		remaining -= chlen;
	}

	return (int) (p - s);
}
1261 
1262 /* EUC-CN byte sequences are exactly the same as EUC-KR, so the EUC-KR verifiers are reused under EUC-CN names */
1263 #define pg_euccn_verifychar pg_euckr_verifychar
1264 #define pg_euccn_verifystr pg_euckr_verifystr
1265 
1266 static int
1267 pg_euctw_verifychar(const unsigned char *s, int len)
1268 {
1269  int l;
1270  unsigned char c1,
1271  c2;
1272 
1273  c1 = *s++;
1274 
1275  switch (c1)
1276  {
1277  case SS2: /* CNS 11643 Plane 1-7 */
1278  l = 4;
1279  if (l > len)
1280  return -1;
1281  c2 = *s++;
1282  if (c2 < 0xa1 || c2 > 0xa7)
1283  return -1;
1284  c2 = *s++;
1285  if (!IS_EUC_RANGE_VALID(c2))
1286  return -1;
1287  c2 = *s++;
1288  if (!IS_EUC_RANGE_VALID(c2))
1289  return -1;
1290  break;
1291 
1292  case SS3: /* unused */
1293  return -1;
1294 
1295  default:
1296  if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1297  {
1298  l = 2;
1299  if (l > len)
1300  return -1;
1301  /* no further range check on c1? */
1302  c2 = *s++;
1303  if (!IS_EUC_RANGE_VALID(c2))
1304  return -1;
1305  }
1306  else
1307  /* must be ASCII */
1308  {
1309  l = 1;
1310  }
1311  break;
1312  }
1313  return l;
1314 }
1315 
/*
 * Verify an EUC_TW string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if the whole string is fine, else the offset
 * of the first zero byte or badly formed character.
 */
static int
pg_euctw_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_euctw_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path, but zero is not allowed */

		p += chlen;
		remaining -= chlen;
	}

	return (int) (p - s);
}
1344 
/*
 * Verify the JOHAB character starting at "s", with "len" bytes available.
 * The expected length comes from pg_johab_mblen(); for a multibyte
 * character every trailing byte must be in the EUC range.  Returns the
 * length in bytes if the character is acceptable, else -1.
 */
static int
pg_johab_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_johab_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	if (!IS_HIGHBIT_SET(*s))
		return mbl;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_EUC_RANGE_VALID(s[i]))
			return -1;
	}

	return mbl;
}
1368 
/*
 * Verify a JOHAB string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if the whole string is fine, else the offset
 * of the first zero byte or badly formed character.
 */
static int
pg_johab_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_johab_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path, but zero is not allowed */

		p += chlen;
		remaining -= chlen;
	}

	return (int) (p - s);
}
1397 
/*
 * Verify the mule-internal-code character starting at "s", with "len"
 * bytes available.  The expected length comes from pg_mule_mblen(); every
 * trailing byte must satisfy IS_HIGHBIT_SET.  Returns the length in bytes
 * if the character is acceptable, else -1.
 */
static int
pg_mule_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_mule_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_HIGHBIT_SET(s[i]))
			return -1;
	}

	return mbl;
}
1418 
/*
 * Verify a mule-internal-code string of "len" bytes.  Returns the number
 * of leading bytes that are valid: "len" if the whole string is fine, else
 * the offset of the first zero byte or badly formed character.
 */
static int
pg_mule_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_mule_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path, but zero is not allowed */

		p += chlen;
		remaining -= chlen;
	}

	return (int) (p - s);
}
1447 
/*
 * Verify one character of a single-byte encoding: there is nothing to
 * check, so the answer is always a valid length of 1.
 */
static int
pg_latin1_verifychar(const unsigned char *s, int len)
{
	(void) s;
	(void) len;
	return 1;
}
1453 
/*
 * Verify a string of "len" bytes: every byte is acceptable except zero.
 * Returns "len" if no zero byte is present, else the offset of the first
 * zero byte.
 */
static int
pg_latin1_verifystr(const unsigned char *s, int len)
{
	const unsigned char *nul = memchr(s, 0, len);

	return (nul != NULL) ? (int) (nul - s) : len;
}
1464 
/*
 * Verify the SJIS character starting at "s", with "len" bytes available.
 * The expected length comes from pg_sjis_mblen(); a 2-byte character must
 * pass ISSJISHEAD on its first byte and ISSJISTAIL on its second.  Returns
 * the length in bytes if the character is acceptable, else -1.
 */
static int
pg_sjis_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_sjis_mblen(s);

	if (len < mbl)
		return -1;

	/* pg_sjis_mblen already verified single-byte characters */
	if (mbl == 1)
		return mbl;

	return (ISSJISHEAD(s[0]) && ISSJISTAIL(s[1])) ? mbl : -1;
}
1487 
/*
 * Verify an SJIS string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if the whole string is fine, else the offset
 * of the first zero byte or badly formed character.
 */
static int
pg_sjis_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_sjis_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path, but zero is not allowed */

		p += chlen;
		remaining -= chlen;
	}

	return (int) (p - s);
}
1516 
/*
 * Verify one Big5 character located at s, with len bytes available.
 *
 * Returns the character's byte length as reported by pg_big5_mblen(), or
 * -1 if fewer than that many bytes are available or if any byte after
 * the first is NUL.
 */
static int
pg_big5_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_big5_mblen(s);
	int			i;

	if (mbl > len)
		return -1;

	/* no trailing byte of the character may be a NUL */
	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1536 
/*
 * Verify a Big5 string of at most len bytes.
 *
 * Scanning stops at the first NUL byte or at the first character that
 * pg_big5_verifychar() rejects.  The return value is the number of bytes
 * at the start of s that were accepted.
 */
static int
pg_big5_verifystr(const unsigned char *s, int len)
{
	const unsigned char *cur = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*cur))
		{
			/* high-bit byte: needs the full per-character check */
			chlen = pg_big5_verifychar(cur, remaining);
			if (chlen == -1)
				break;
		}
		else if (*cur == '\0')
		{
			/* ASCII-subset fast path; an embedded NUL ends the scan */
			break;
		}

		cur += chlen;
		remaining -= chlen;
	}

	return cur - s;
}
1565 
/*
 * Verify one GBK character located at s, with len bytes available.
 *
 * Returns the character's byte length as reported by pg_gbk_mblen(), or
 * -1 if fewer than that many bytes are available or if any byte after
 * the first is NUL.
 */
static int
pg_gbk_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_gbk_mblen(s);
	int			i;

	if (mbl > len)
		return -1;

	/* no trailing byte of the character may be a NUL */
	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1585 
/*
 * Verify a GBK string of at most len bytes.
 *
 * Scanning stops at the first NUL byte or at the first character that
 * pg_gbk_verifychar() rejects.  The return value is the number of bytes
 * at the start of s that were accepted.
 */
static int
pg_gbk_verifystr(const unsigned char *s, int len)
{
	const unsigned char *cur = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*cur))
		{
			/* high-bit byte: needs the full per-character check */
			chlen = pg_gbk_verifychar(cur, remaining);
			if (chlen == -1)
				break;
		}
		else if (*cur == '\0')
		{
			/* ASCII-subset fast path; an embedded NUL ends the scan */
			break;
		}

		cur += chlen;
		remaining -= chlen;
	}

	return cur - s;
}
1614 
/*
 * Verify one UHC character located at s, with len bytes available.
 *
 * Returns the character's byte length as reported by pg_uhc_mblen(), or
 * -1 if fewer than that many bytes are available or if any byte after
 * the first is NUL.
 */
static int
pg_uhc_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_uhc_mblen(s);
	int			i;

	if (mbl > len)
		return -1;

	/* no trailing byte of the character may be a NUL */
	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1634 
/*
 * Verify a UHC string of at most len bytes.
 *
 * Scanning stops at the first NUL byte or at the first character that
 * pg_uhc_verifychar() rejects.  The return value is the number of bytes
 * at the start of s that were accepted.
 */
static int
pg_uhc_verifystr(const unsigned char *s, int len)
{
	const unsigned char *cur = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*cur))
		{
			/* high-bit byte: needs the full per-character check */
			chlen = pg_uhc_verifychar(cur, remaining);
			if (chlen == -1)
				break;
		}
		else if (*cur == '\0')
		{
			/* ASCII-subset fast path; an embedded NUL ends the scan */
			break;
		}

		cur += chlen;
		remaining -= chlen;
	}

	return cur - s;
}
1663 
/*
 * Verify one GB18030 character located at s, with len bytes available.
 *
 * Returns 1 for a byte without the high bit set, 4 or 2 for a well-formed
 * four- or two-byte sequence, and -1 for anything else (including a
 * sequence for which too few bytes are available).
 *
 * The second byte is only examined after confirming that len covers it.
 */
static int
pg_gb18030_verifychar(const unsigned char *s, int len)
{
	/* ASCII */
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;

	/* a second byte in 0x30..0x39 means this should be a 4-byte form */
	if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
	{
		if (s[0] >= 0x81 && s[0] <= 0xfe &&
			s[2] >= 0x81 && s[2] <= 0xfe &&
			s[3] >= 0x30 && s[3] <= 0x39)
			return 4;

		return -1;
	}

	/* otherwise it should be a 2-byte form */
	if (len >= 2 && s[0] >= 0x81 && s[0] <= 0xfe)
	{
		if ((s[1] >= 0x40 && s[1] <= 0x7e) ||
			(s[1] >= 0x80 && s[1] <= 0xfe))
			return 2;

		return -1;
	}

	return -1;
}
1694 
/*
 * Verify a GB18030 string of at most len bytes.
 *
 * Scanning stops at the first NUL byte or at the first character that
 * pg_gb18030_verifychar() rejects.  The return value is the number of
 * bytes at the start of s that were accepted.
 */
static int
pg_gb18030_verifystr(const unsigned char *s, int len)
{
	const unsigned char *cur = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*cur))
		{
			/* high-bit byte: needs the full per-character check */
			chlen = pg_gb18030_verifychar(cur, remaining);
			if (chlen == -1)
				break;
		}
		else if (*cur == '\0')
		{
			/* ASCII-subset fast path; an embedded NUL ends the scan */
			break;
		}

		cur += chlen;
		remaining -= chlen;
	}

	return cur - s;
}
1723 
/*
 * Verify one UTF-8 character located at s, with len bytes available.
 *
 * Returns the character's byte length (1..4), or -1 if the first byte is
 * NUL, if fewer bytes are available than the lead byte calls for, or if
 * pg_utf8_islegal() rejects the sequence.
 */
static int
pg_utf8_verifychar(const unsigned char *s, int len)
{
	const unsigned char lead = *s;
	int			l;

	/* single-byte (ASCII) case, where only NUL is disallowed */
	if ((lead & 0x80) == 0)
		return (lead == '\0') ? -1 : 1;

	/* derive the expected length from the lead byte's bit pattern */
	if ((lead & 0xe0) == 0xc0)
		l = 2;
	else if ((lead & 0xf0) == 0xe0)
		l = 3;
	else if ((lead & 0xf8) == 0xf0)
		l = 4;
	else
		l = 1;

	if (l > len || !pg_utf8_islegal(s, l))
		return -1;

	return l;
}
1752 
1753 /*
1754  * The fast path of the UTF-8 verifier uses a deterministic finite automaton
1755  * (DFA) for multibyte characters. In a traditional table-driven DFA, the
1756  * input byte and current state are used to compute an index into an array of
1757  * state transitions. Since the address of the next transition is dependent
1758  * on this computation, there is latency in executing the load instruction,
1759  * and the CPU is not kept busy.
1760  *
1761  * Instead, we use a "shift-based" DFA as described by Per Vognsen:
1762  *
1763  * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
1764  *
 * In a shift-based DFA, the input byte is an index into an array of integers
1766  * whose bit pattern encodes the state transitions. To compute the next
1767  * state, we simply right-shift the integer by the current state and apply a
1768  * mask. In this scheme, the address of the transition only depends on the
1769  * input byte, so there is better pipelining.
1770  *
1771  * The naming convention for states and transitions was adopted from a UTF-8
1772  * to UTF-16/32 transcoder, whose table is reproduced below:
1773  *
1774  * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
1775  *
1776  * ILL ASC CR1 CR2 CR3 L2A L3A L3B L3C L4A L4B L4C CLASS / STATE
1777  * ==========================================================================
1778  * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B, | BGN/END
1779  * err, err, err, err, err, err, err, err, err, err, err, err, | ERR
1780  * |
1781  * err, err, END, END, END, err, err, err, err, err, err, err, | CS1
1782  * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err, | CS2
1783  * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err, | CS3
1784  * |
1785  * err, err, err, err, CS1, err, err, err, err, err, err, err, | P3A
1786  * err, err, CS1, CS1, err, err, err, err, err, err, err, err, | P3B
1787  * |
1788  * err, err, err, CS2, CS2, err, err, err, err, err, err, err, | P4A
1789  * err, err, CS2, err, err, err, err, err, err, err, err, err, | P4B
1790  *
1791  * In the most straightforward implementation, a shift-based DFA for UTF-8
1792  * requires 64-bit integers to encode the transitions, but with an SMT solver
1793  * it's possible to find state numbers such that the transitions fit within
1794  * 32-bit integers, as Dougall Johnson demonstrated:
1795  *
1796  * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
1797  *
1798  * This packed representation is the reason for the seemingly odd choice of
1799  * state values below.
1800  */
1801 
/*
 * DFA state numbers.  Each state value doubles as the bit offset at which
 * the next state is stored inside a transition word, so every value must
 * stay within 0..31 and the 5-bit fields must pack into 32 bits.
 */

/* Error */
#define	ERR  0
/* Begin */
#define	BGN 11
/* Continuation states, expect 1/2/3 continuation bytes */
#define	CS1 16
#define	CS2  1
#define	CS3  5
/* Partial states, where the first continuation byte has a restricted range */
#define	P3A  6					/* Lead was E0, check for 3-byte overlong */
#define	P3B 20					/* Lead was ED, check for surrogate */
#define	P4A 25					/* Lead was F0, check for 4-byte overlong */
#define	P4B 30					/* Lead was F4, check for too-large */
/* Begin and End are the same state */
#define	END BGN

/*
 * The encoded state transitions for the lookup table.  A transition word
 * holds "next state" shifted left by "current state" for every current
 * state from which the byte class is acceptable; all other fields are
 * zero, i.e. ERR.
 *
 * Every expansion is fully parenthesized so that the macros remain safe
 * when used inside larger expressions (e.g. "CR1 >> state" or "CR1 & 31"),
 * not only as bare array initializers.
 */

/* ASCII */
#define ASC (END << BGN)
/* 2-byte lead */
#define L2A (CS1 << BGN)
/* 3-byte lead */
#define L3A (P3A << BGN)
#define L3B (CS2 << BGN)
#define L3C (P3B << BGN)
/* 4-byte lead */
#define L4A (P4A << BGN)
#define L4B (CS3 << BGN)
#define L4C (P4B << BGN)
/* continuation byte */
#define CR1 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B))
#define CR2 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A))
#define CR3 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A))
/* invalid byte */
#define ILL (ERR)
1838 
1839 static const uint32 Utf8Transition[256] =
1840 {
1841  /* ASCII */
1842 
1843  ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1844  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1845  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1846  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1847 
1848  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1849  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1850  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1851  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1852 
1853  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1854  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1855  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1856  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1857 
1858  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1859  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1860  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1861  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1862 
1863  /* continuation bytes */
1864 
1865  /* 80..8F */
1866  CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1867  CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1868 
1869  /* 90..9F */
1870  CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1871  CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1872 
1873  /* A0..BF */
1874  CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1875  CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1876  CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1877  CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1878 
1879  /* leading bytes */
1880 
1881  /* C0..DF */
1882  ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
1883  L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1884  L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1885  L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1886 
1887  /* E0..EF */
1888  L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
1889  L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
1890 
1891  /* F0..FF */
1892  L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
1893  ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
1894 };
1895 
1896 static void
1897 utf8_advance(const unsigned char *s, uint32 *state, int len)
1898 {
1899  /* Note: We deliberately don't check the state's value here. */
1900  while (len > 0)
1901  {
1902  /*
1903  * It's important that the mask value is 31: In most instruction sets,
1904  * a shift by a 32-bit operand is understood to be a shift by its mod
1905  * 32, so the compiler should elide the mask operation.
1906  */
1907  *state = Utf8Transition[*s++] >> (*state & 31);
1908  len--;
1909  }
1910 
1911  *state &= 31;
1912 }
1913 
1914 static int
1915 pg_utf8_verifystr(const unsigned char *s, int len)
1916 {
1917  const unsigned char *start = s;
1918  const int orig_len = len;
1919  uint32 state = BGN;
1920 
1921 /*
1922  * With a stride of two vector widths, gcc will unroll the loop. Even if
1923  * the compiler can unroll a longer loop, it's not worth it because we
1924  * must fall back to the byte-wise algorithm if we find any non-ASCII.
1925  */
1926 #define STRIDE_LENGTH (2 * sizeof(Vector8))
1927 
1928  if (len >= STRIDE_LENGTH)
1929  {
1930  while (len >= STRIDE_LENGTH)
1931  {
1932  /*
1933  * If the chunk is all ASCII, we can skip the full UTF-8 check,
1934  * but we must first check for a non-END state, which means the
1935  * previous chunk ended in the middle of a multibyte sequence.
1936  */
1937  if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
1939 
1940  s += STRIDE_LENGTH;
1941  len -= STRIDE_LENGTH;
1942  }
1943 
1944  /* The error state persists, so we only need to check for it here. */
1945  if (state == ERR)
1946  {
1947  /*
1948  * Start over from the beginning with the slow path so we can
1949  * count the valid bytes.
1950  */
1951  len = orig_len;
1952  s = start;
1953  }
1954  else if (state != END)
1955  {
1956  /*
1957  * The fast path exited in the middle of a multibyte sequence.
1958  * Walk backwards to find the leading byte so that the slow path
1959  * can resume checking from there. We must always backtrack at
1960  * least one byte, since the current byte could be e.g. an ASCII
1961  * byte after a 2-byte lead, which is invalid.
1962  */
1963  do
1964  {
1965  Assert(s > start);
1966  s--;
1967  len++;
1968  Assert(IS_HIGHBIT_SET(*s));
1969  } while (pg_utf_mblen(s) <= 1);
1970  }
1971  }
1972 
1973  /* check remaining bytes */
1974  while (len > 0)
1975  {
1976  int l;
1977 
1978  /* fast path for ASCII-subset characters */
1979  if (!IS_HIGHBIT_SET(*s))
1980  {
1981  if (*s == '\0')
1982  break;
1983  l = 1;
1984  }
1985  else
1986  {
1987  l = pg_utf8_verifychar(s, len);
1988  if (l == -1)
1989  break;
1990  }
1991  s += l;
1992  len -= l;
1993  }
1994 
1995  return s - start;
1996 }
1997 
/*
 * Check for validity of a single UTF-8 encoded character
 *
 * This is a direct rendering of the RFC3629 rules.  The odd-looking limits
 * on the second byte exist so that each Unicode code point has exactly one
 * encoding: a longer-than-necessary sequence padded with high-order zero
 * bits is rejected, because accepting it would be a security hazard (eg,
 * an apparently non-ASCII character that decodes to plain ASCII).
 *
 * length is assumed to have been obtained by pg_utf_mblen(), and the
 * caller must have checked that that many bytes are present in the buffer.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char lead;
	int			i;

	/* only lengths 1..4 are accepted; 5 and 6 are rejected for now */
	if (length < 1 || length > 4)
		return false;

	/* fourth and third bytes, when present, are plain continuation bytes */
	for (i = length - 1; i >= 2; i--)
	{
		if (source[i] < 0x80 || source[i] > 0xBF)
			return false;
	}

	lead = source[0];

	/* the second byte's permitted range depends on the lead byte */
	if (length >= 2)
	{
		const unsigned char second = source[1];
		unsigned char lo = 0x80;
		unsigned char hi = 0xBF;

		if (lead == 0xE0)
			lo = 0xA0;
		else if (lead == 0xED)
			hi = 0x9F;
		else if (lead == 0xF0)
			lo = 0x90;
		else if (lead == 0xF4)
			hi = 0x8F;

		if (second < lo || second > hi)
			return false;
	}

	/* finally the lead byte: 0x80..0xC1 and anything above 0xF4 are out */
	if (lead >= 0x80 && lead < 0xC2)
		return false;

	return lead <= 0xF4;
}
2068 
2069 
2070 /*
2071  *-------------------------------------------------------------------
2072  * encoding info table
2073  * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
2074  *-------------------------------------------------------------------
2075  */
2114  {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2}, /* PG_GBK */
2115  {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2}, /* PG_UHC */
2118  {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2} /* PG_SHIFT_JIS_2004 */
2119 };
2120 
2121 /*
2122  * Returns the byte length of a multibyte character.
2123  *
2124  * Caution: when dealing with text that is not certainly valid in the
2125  * specified encoding, the result may exceed the actual remaining
2126  * string length. Callers that are not prepared to deal with that
2127  * should use pg_encoding_mblen_bounded() instead.
2128  */
2129 int
2130 pg_encoding_mblen(int encoding, const char *mbstr)
2131 {
2132  return (PG_VALID_ENCODING(encoding) ?
2133  pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
2134  pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
2135 }
2136 
/*
 * Returns the byte length of a multibyte character, capped at the
 * distance to the end of the (NUL-terminated) string.
 */
int
pg_encoding_mblen_bounded(int encoding, const char *mbstr)
{
	int			claimed_len = pg_encoding_mblen(encoding, mbstr);

	/* strnlen() stops early if a NUL byte precedes the claimed length */
	return strnlen(mbstr, claimed_len);
}
2146 
2147 /*
2148  * Returns the display length of a multibyte character.
2149  */
2150 int
2151 pg_encoding_dsplen(int encoding, const char *mbstr)
2152 {
2153  return (PG_VALID_ENCODING(encoding) ?
2154  pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
2155  pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
2156 }
2157 
2158 /*
2159  * Verify the first multibyte character of the given string.
2160  * Return its byte length if good, -1 if bad. (See comments above for
2161  * full details of the mbverifychar API.)
2162  */
2163 int
2164 pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
2165 {
2166  return (PG_VALID_ENCODING(encoding) ?
2167  pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
2168  pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
2169 }
2170 
2171 /*
2172  * Verify that a string is valid for the given encoding.
2173  * Returns the number of input bytes (<= len) that form a valid string.
2174  * (See comments above for full details of the mbverifystr API.)
2175  */
2176 int
2177 pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
2178 {
2179  return (PG_VALID_ENCODING(encoding) ?
2180  pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
2181  pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
2182 }
2183 
2184 /*
2185  * fetch maximum length of a given encoding
2186  */
2187 int
2189 {
2191 
2193 }
unsigned int uint32
Definition: c.h:490
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1163
int a
Definition: isn.c:69
Assert(fmt[strlen(fmt) - 1] !='\n')
unsigned int pg_wchar
Definition: mbprint.c:31
const void size_t len
int32 encoding
Definition: pg_database.h:41
static rewind_source * source
Definition: pg_rewind.c:87
#define IS_LCPRV2(c)
Definition: pg_wchar.h:163
#define ISSJISTAIL(c)
Definition: pg_wchar.h:44
@ PG_SQL_ASCII
Definition: pg_wchar.h:226
#define LCPRV1_A
Definition: pg_wchar.h:149
#define LCPRV1_B
Definition: pg_wchar.h:150
#define IS_LC2(c)
Definition: pg_wchar.h:143
#define IS_LCPRV1(c)
Definition: pg_wchar.h:151
#define LCPRV2_A
Definition: pg_wchar.h:161
#define IS_LCPRV2_B_RANGE(c)
Definition: pg_wchar.h:166
#define SS2
Definition: pg_wchar.h:37
static bool is_valid_ascii(const unsigned char *s, int len)
Definition: pg_wchar.h:697
#define IS_LCPRV1_A_RANGE(c)
Definition: pg_wchar.h:152
#define PG_VALID_ENCODING(_enc)
Definition: pg_wchar.h:287
#define IS_LCPRV1_B_RANGE(c)
Definition: pg_wchar.h:154
#define ISSJISHEAD(c)
Definition: pg_wchar.h:43
#define IS_LC1(c)
Definition: pg_wchar.h:125
#define IS_LCPRV2_A_RANGE(c)
Definition: pg_wchar.h:164
#define SS3
Definition: pg_wchar.h:38
#define LCPRV2_B
Definition: pg_wchar.h:162
size_t strnlen(const char *str, size_t maxlen)
Definition: strnlen.c:26
char * c
unsigned int first
Definition: wchar.c:586
unsigned int last
Definition: wchar.c:587
int maxmblen
Definition: pg_wchar.h:392
Definition: regguts.h:323
static const struct mbinterval east_asian_fw[]
static const struct mbinterval nonspacing[]
static int pg_uhc_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1636
static int pg_latin1_dsplen(const unsigned char *s)
Definition: wchar.c:921
int pg_encoding_mblen_bounded(int encoding, const char *mbstr)
Definition: wchar.c:2142
static int pg_euctw_mblen(const unsigned char *s)
Definition: wchar.c:317
static int pg_euckr_dsplen(const unsigned char *s)
Definition: wchar.c:200
static const uint32 Utf8Transition[256]
Definition: wchar.c:1839
bool pg_utf8_islegal(const unsigned char *source, int length)
Definition: wchar.c:2013
static int pg_ascii_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1108
static int pg_latin1_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1449
static int pg_sjis_dsplen(const unsigned char *s)
Definition: wchar.c:944
#define CR3
Definition: wchar.c:1835
static int pg_sjis_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1466
static int pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:83
static int pg_eucjp_dsplen(const unsigned char *s)
Definition: wchar.c:169
static int pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:48
#define L3B
Definition: wchar.c:1826
static int pg_uhc_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1616
#define L2A
Definition: wchar.c:1823
static int pg_gbk_dsplen(const unsigned char *s)
Definition: wchar.c:1000
static int pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:188
static int pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:277
#define END
Definition: wchar.c:1816
#define pg_euccn_verifychar
Definition: wchar.c:1263
#define L4C
Definition: wchar.c:1831
static int pg_sjis_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1489
static int pg_johab_mblen(const unsigned char *s)
Definition: wchar.c:401
unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
Definition: wchar.c:483
static int pg_johab_dsplen(const unsigned char *s)
Definition: wchar.c:407
static int pg_big5_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1538
#define CR2
Definition: wchar.c:1834
static int pg_mule_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1399
static int pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:157
static int pg_latin1_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1455
static int pg_latin1_mblen(const unsigned char *s)
Definition: wchar.c:915
static int pg_ascii_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1102
static int pg_ascii_mblen(const unsigned char *s)
Definition: wchar.c:63
static int mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
Definition: wchar.c:592
static int pg_big5_dsplen(const unsigned char *s)
Definition: wchar.c:973
#define pg_euccn_verifystr
Definition: wchar.c:1264
int pg_utf_mblen(const unsigned char *s)
Definition: wchar.c:549
static int pg_eucjp_mblen(const unsigned char *s)
Definition: wchar.c:163
static int pg_gbk_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1567
static int pg_big5_mblen(const unsigned char *s)
Definition: wchar.c:961
static int pg_euccn_dsplen(const unsigned char *s)
Definition: wchar.c:261
static int pg_euctw_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1267
static int pg_euckr_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1205
static int pg_euctw_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1317
static int pg_gbk_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1587
static int pg_gb18030_dsplen(const unsigned char *s)
Definition: wchar.c:1068
#define ERR
Definition: wchar.c:1803
static int pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:419
int pg_mule_mblen(const unsigned char *s)
Definition: wchar.c:832
static int pg_euccn_mblen(const unsigned char *s)
Definition: wchar.c:249
#define ASC
Definition: wchar.c:1821
static int pg_gbk_mblen(const unsigned char *s)
Definition: wchar.c:988
static int pg_eucjp_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1176
static int pg_johab_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1370
static int pg_euc_dsplen(const unsigned char *s)
Definition: wchar.c:138
static int pg_gb18030_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1696
static int pg_euckr_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1234
static int pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:766
static int pg_sjis_mblen(const unsigned char *s)
Definition: wchar.c:930
#define IS_EUC_RANGE_VALID(c)
Definition: wchar.c:1118
pg_wchar utf8_to_unicode(const unsigned char *c)
Definition: wchar.c:679
static int pg_uhc_dsplen(const unsigned char *s)
Definition: wchar.c:1027
static int pg_eucjp_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1121
static int pg_big5_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1518
static int pg_gb18030_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1665
static int pg_mule_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1420
static int pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:355
#define L3C
Definition: wchar.c:1827
static int pg_utf8_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1725
#define CR1
Definition: wchar.c:1833
static int pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:900
static int pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:518
static int pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:210
static int pg_gb18030_mblen(const unsigned char *s)
Definition: wchar.c:1054
int pg_encoding_dsplen(int encoding, const char *mbstr)
Definition: wchar.c:2151
static void utf8_advance(const unsigned char *s, uint32 *state, int len)
Definition: wchar.c:1897
static int pg_euctw_dsplen(const unsigned char *s)
Definition: wchar.c:333
static int pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:878
static int pg_uhc_mblen(const unsigned char *s)
Definition: wchar.c:1015
static int pg_euc_mblen(const unsigned char *s)
Definition: wchar.c:122
static int pg_mule_dsplen(const unsigned char *s)
Definition: wchar.c:850
#define L3A
Definition: wchar.c:1825
#define L4B
Definition: wchar.c:1830
int pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
Definition: wchar.c:2177
static int pg_utf8_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1915
static int pg_euckr_mblen(const unsigned char *s)
Definition: wchar.c:194
const pg_wchar_tbl pg_wchar_table[]
Definition: wchar.c:2076
static int pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:713
#define BGN
Definition: wchar.c:1805
int pg_encoding_max_length(int encoding)
Definition: wchar.c:2188
int pg_encoding_mblen(int encoding, const char *mbstr)
Definition: wchar.c:2130
static int pg_johab_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1346
#define ILL
Definition: wchar.c:1837
#define STRIDE_LENGTH
#define L4A
Definition: wchar.c:1829
static int pg_ascii_dsplen(const unsigned char *s)
Definition: wchar.c:69
int pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
Definition: wchar.c:2164
static int ucs_wcwidth(pg_wchar ucs)
Definition: wchar.c:639
static int pg_utf_dsplen(const unsigned char *s)
Definition: wchar.c:701