PostgreSQL Source Code  git master
wchar.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * wchar.c
4  * Functions for working with multibyte characters in various encodings.
5  *
6  * Portions Copyright (c) 1998-2024, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  * src/common/wchar.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "c.h"
14 
15 #include "mb/pg_wchar.h"
16 #include "utils/ascii.h"
17 
18 
19 /*
20  * Operations on multi-byte encodings are driven by a table of helper
21  * functions.
22  *
23  * To add support for an encoding, define mblen(), dsplen(), verifychar() and
24  * verifystr() for the encoding. For server-encodings, also define mb2wchar()
25  * and wchar2mb() conversion functions.
26  *
27  * These functions generally assume that their input is validly formed.
28  * The "verifier" functions, further down in the file, have to be more
29  * paranoid.
30  *
31  * We expect that mblen() does not need to examine more than the first byte
32  * of the character to discover the correct length. GB18030 is an exception
33  * to that rule, though, as it also looks at second byte. But even that
34  * behaves in a predictable way, if you only pass the first byte: it will
35  * treat 4-byte encoded characters as two 2-byte encoded characters, which is
36  * good enough for all current uses.
37  *
38  * Note: for the display output of psql to work properly, the return values
39  * of the dsplen functions must conform to the Unicode standard. In particular
40  * the NUL character is zero width and control characters are generally
41  * width -1. It is recommended that non-ASCII encodings refer their ASCII
42  * subset to the ASCII routines to ensure consistency.
43  */
44 
45 /*
46  * SQL/ASCII
47  */
48 static int
49 pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
50 {
51  int cnt = 0;
52 
53  while (len > 0 && *from)
54  {
55  *to++ = *from++;
56  len--;
57  cnt++;
58  }
59  *to = 0;
60  return cnt;
61 }
62 
/*
 * Byte length of the character at s: always one byte, whatever s holds.
 */
static int
pg_ascii_mblen(const unsigned char *s)
{
	const int	one_byte = 1;

	return one_byte;
}
68 
/*
 * Display width of the single byte at s:
 *	 0	for NUL,
 *	-1	for bytes below 0x20 and for 0x7f,
 *	 1	for everything else.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
	const unsigned char ch = *s;

	if (ch == '\0')
		return 0;

	return (ch < 0x20 || ch == 0x7f) ? -1 : 1;
}
79 
80 /*
81  * EUC
82  */
83 static int
84 pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
85 {
86  int cnt = 0;
87 
88  while (len > 0 && *from)
89  {
90  if (*from == SS2 && len >= 2) /* JIS X 0201 (so called "1 byte
91  * KANA") */
92  {
93  from++;
94  *to = (SS2 << 8) | *from++;
95  len -= 2;
96  }
97  else if (*from == SS3 && len >= 3) /* JIS X 0212 KANJI */
98  {
99  from++;
100  *to = (SS3 << 16) | (*from++ << 8);
101  *to |= *from++;
102  len -= 3;
103  }
104  else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
105  {
106  *to = *from++ << 8;
107  *to |= *from++;
108  len -= 2;
109  }
110  else /* must be ASCII */
111  {
112  *to = *from++;
113  len--;
114  }
115  to++;
116  cnt++;
117  }
118  *to = 0;
119  return cnt;
120 }
121 
122 static inline int
123 pg_euc_mblen(const unsigned char *s)
124 {
125  int len;
126 
127  if (*s == SS2)
128  len = 2;
129  else if (*s == SS3)
130  len = 3;
131  else if (IS_HIGHBIT_SET(*s))
132  len = 2;
133  else
134  len = 1;
135  return len;
136 }
137 
138 static inline int
139 pg_euc_dsplen(const unsigned char *s)
140 {
141  int len;
142 
143  if (*s == SS2)
144  len = 2;
145  else if (*s == SS3)
146  len = 2;
147  else if (IS_HIGHBIT_SET(*s))
148  len = 2;
149  else
150  len = pg_ascii_dsplen(s);
151  return len;
152 }
153 
154 /*
155  * EUC_JP
156  */
157 static int
158 pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
159 {
160  return pg_euc2wchar_with_len(from, to, len);
161 }
162 
/*
 * Byte length of an EUC_JP character: same rule as generic EUC.
 */
static int
pg_eucjp_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
168 
169 static int
170 pg_eucjp_dsplen(const unsigned char *s)
171 {
172  int len;
173 
174  if (*s == SS2)
175  len = 1;
176  else if (*s == SS3)
177  len = 2;
178  else if (IS_HIGHBIT_SET(*s))
179  len = 2;
180  else
181  len = pg_ascii_dsplen(s);
182  return len;
183 }
184 
185 /*
186  * EUC_KR
187  */
188 static int
189 pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
190 {
191  return pg_euc2wchar_with_len(from, to, len);
192 }
193 
/*
 * Byte length of an EUC_KR character: same rule as generic EUC.
 */
static int
pg_euckr_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
199 
/*
 * Display width of an EUC_KR character: same rule as generic EUC.
 */
static int
pg_euckr_dsplen(const unsigned char *s)
{
	int			width = pg_euc_dsplen(s);

	return width;
}
205 
206 /*
207  * EUC_CN
208  *
209  */
210 static int
211 pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
212 {
213  int cnt = 0;
214 
215  while (len > 0 && *from)
216  {
217  if (*from == SS2 && len >= 3) /* code set 2 (unused?) */
218  {
219  from++;
220  *to = (SS2 << 16) | (*from++ << 8);
221  *to |= *from++;
222  len -= 3;
223  }
224  else if (*from == SS3 && len >= 3) /* code set 3 (unused ?) */
225  {
226  from++;
227  *to = (SS3 << 16) | (*from++ << 8);
228  *to |= *from++;
229  len -= 3;
230  }
231  else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
232  {
233  *to = *from++ << 8;
234  *to |= *from++;
235  len -= 2;
236  }
237  else
238  {
239  *to = *from++;
240  len--;
241  }
242  to++;
243  cnt++;
244  }
245  *to = 0;
246  return cnt;
247 }
248 
/*
 * Byte length of an EUC_CN character: 2 when the first byte has its high
 * bit set, otherwise 1.
 */
static int
pg_euccn_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
260 
/*
 * Display width of an EUC_CN character: 2 columns for a high-bit lead byte,
 * otherwise the pg_ascii_dsplen() result.
 */
static int
pg_euccn_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
272 
273 /*
274  * EUC_TW
275  *
276  */
277 static int
278 pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
279 {
280  int cnt = 0;
281 
282  while (len > 0 && *from)
283  {
284  if (*from == SS2 && len >= 4) /* code set 2 */
285  {
286  from++;
287  *to = (((uint32) SS2) << 24) | (*from++ << 16);
288  *to |= *from++ << 8;
289  *to |= *from++;
290  len -= 4;
291  }
292  else if (*from == SS3 && len >= 3) /* code set 3 (unused?) */
293  {
294  from++;
295  *to = (SS3 << 16) | (*from++ << 8);
296  *to |= *from++;
297  len -= 3;
298  }
299  else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
300  {
301  *to = *from++ << 8;
302  *to |= *from++;
303  len -= 2;
304  }
305  else
306  {
307  *to = *from++;
308  len--;
309  }
310  to++;
311  cnt++;
312  }
313  *to = 0;
314  return cnt;
315 }
316 
317 static int
318 pg_euctw_mblen(const unsigned char *s)
319 {
320  int len;
321 
322  if (*s == SS2)
323  len = 4;
324  else if (*s == SS3)
325  len = 3;
326  else if (IS_HIGHBIT_SET(*s))
327  len = 2;
328  else
329  len = 1;
330  return len;
331 }
332 
333 static int
334 pg_euctw_dsplen(const unsigned char *s)
335 {
336  int len;
337 
338  if (*s == SS2)
339  len = 2;
340  else if (*s == SS3)
341  len = 2;
342  else if (IS_HIGHBIT_SET(*s))
343  len = 2;
344  else
345  len = pg_ascii_dsplen(s);
346  return len;
347 }
348 
349 /*
350  * Convert pg_wchar to EUC_* encoding.
351  * caller must allocate enough space for "to", including a trailing zero!
352  * len: length of from.
353  * "from" not necessarily null terminated.
354  */
355 static int
356 pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
357 {
358  int cnt = 0;
359 
360  while (len > 0 && *from)
361  {
362  unsigned char c;
363 
364  if ((c = (*from >> 24)))
365  {
366  *to++ = c;
367  *to++ = (*from >> 16) & 0xff;
368  *to++ = (*from >> 8) & 0xff;
369  *to++ = *from & 0xff;
370  cnt += 4;
371  }
372  else if ((c = (*from >> 16)))
373  {
374  *to++ = c;
375  *to++ = (*from >> 8) & 0xff;
376  *to++ = *from & 0xff;
377  cnt += 3;
378  }
379  else if ((c = (*from >> 8)))
380  {
381  *to++ = c;
382  *to++ = *from & 0xff;
383  cnt += 2;
384  }
385  else
386  {
387  *to++ = *from;
388  cnt++;
389  }
390  from++;
391  len--;
392  }
393  *to = 0;
394  return cnt;
395 }
396 
397 
398 /*
399  * JOHAB
400  */
/*
 * Byte length of a JOHAB character: delegated to the generic EUC rule.
 */
static int
pg_johab_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
406 
/*
 * Display width of a JOHAB character: delegated to the generic EUC rule.
 */
static int
pg_johab_dsplen(const unsigned char *s)
{
	int			width = pg_euc_dsplen(s);

	return width;
}
412 
413 /*
414  * convert UTF8 string to pg_wchar (UCS-4)
415  * caller must allocate enough space for "to", including a trailing zero!
416  * len: length of from.
417  * "from" not necessarily null terminated.
418  */
419 static int
420 pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
421 {
422  int cnt = 0;
423  uint32 c1,
424  c2,
425  c3,
426  c4;
427 
428  while (len > 0 && *from)
429  {
430  if ((*from & 0x80) == 0)
431  {
432  *to = *from++;
433  len--;
434  }
435  else if ((*from & 0xe0) == 0xc0)
436  {
437  if (len < 2)
438  break; /* drop trailing incomplete char */
439  c1 = *from++ & 0x1f;
440  c2 = *from++ & 0x3f;
441  *to = (c1 << 6) | c2;
442  len -= 2;
443  }
444  else if ((*from & 0xf0) == 0xe0)
445  {
446  if (len < 3)
447  break; /* drop trailing incomplete char */
448  c1 = *from++ & 0x0f;
449  c2 = *from++ & 0x3f;
450  c3 = *from++ & 0x3f;
451  *to = (c1 << 12) | (c2 << 6) | c3;
452  len -= 3;
453  }
454  else if ((*from & 0xf8) == 0xf0)
455  {
456  if (len < 4)
457  break; /* drop trailing incomplete char */
458  c1 = *from++ & 0x07;
459  c2 = *from++ & 0x3f;
460  c3 = *from++ & 0x3f;
461  c4 = *from++ & 0x3f;
462  *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
463  len -= 4;
464  }
465  else
466  {
467  /* treat a bogus char as length 1; not ours to raise error */
468  *to = *from++;
469  len--;
470  }
471  to++;
472  cnt++;
473  }
474  *to = 0;
475  return cnt;
476 }
477 
478 
479 /*
480  * Map a Unicode code point to UTF-8. utf8string must have at least
481  * unicode_utf8len(c) bytes available.
482  */
483 unsigned char *
484 unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
485 {
486  if (c <= 0x7F)
487  {
488  utf8string[0] = c;
489  }
490  else if (c <= 0x7FF)
491  {
492  utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
493  utf8string[1] = 0x80 | (c & 0x3F);
494  }
495  else if (c <= 0xFFFF)
496  {
497  utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
498  utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
499  utf8string[2] = 0x80 | (c & 0x3F);
500  }
501  else
502  {
503  utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
504  utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
505  utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
506  utf8string[3] = 0x80 | (c & 0x3F);
507  }
508 
509  return utf8string;
510 }
511 
512 /*
513  * Trivial conversion from pg_wchar to UTF-8.
514  * caller should allocate enough space for "to"
515  * len: length of from.
516  * "from" not necessarily null terminated.
517  */
518 static int
519 pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
520 {
521  int cnt = 0;
522 
523  while (len > 0 && *from)
524  {
525  int char_len;
526 
527  unicode_to_utf8(*from, to);
528  char_len = pg_utf_mblen(to);
529  cnt += char_len;
530  to += char_len;
531  from++;
532  len--;
533  }
534  *to = 0;
535  return cnt;
536 }
537 
/*
 * Return the byte length of a UTF8 character pointed to by s
 *
 * Only the first byte is examined.
 *
 * Note: in the current implementation we do not support UTF8 sequences
 * of more than 4 bytes; hence do NOT return a value larger than 4.
 * We return "1" for any leading byte that is either flat-out illegal or
 * indicates a length larger than we support.
 *
 * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
 * other places would need to be fixed to change this.
 */
int
pg_utf_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if ((lead & 0x80) == 0)
		return 1;
	if ((lead & 0xe0) == 0xc0)
		return 2;
	if ((lead & 0xf0) == 0xe0)
		return 3;
	if ((lead & 0xf8) == 0xf0)
		return 4;
#ifdef NOT_USED
	if ((lead & 0xfc) == 0xf8)
		return 5;
	if ((lead & 0xfe) == 0xfc)
		return 6;
#endif

	/* illegal or unsupported lead byte */
	return 1;
}
572 
573 /*
574  * This is an implementation of wcwidth() and wcswidth() as defined in
575  * "The Single UNIX Specification, Version 2, The Open Group, 1997"
576  * <http://www.unix.org/online.html>
577  *
578  * Markus Kuhn -- 2001-09-08 -- public domain
579  *
580  * customised for PostgreSQL
581  *
582  * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
583  */
584 
/*
 * One entry of an interval table searched by mbbisearch(): the inclusive
 * range of code points first..last.
 */
struct mbinterval
{
	unsigned int first;
	unsigned int last;
};
590 
591 /* auxiliary function for binary search in interval table */
592 static int
593 mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
594 {
595  int min = 0;
596  int mid;
597 
598  if (ucs < table[0].first || ucs > table[max].last)
599  return 0;
600  while (max >= min)
601  {
602  mid = (min + max) / 2;
603  if (ucs > table[mid].last)
604  min = mid + 1;
605  else if (ucs < table[mid].first)
606  max = mid - 1;
607  else
608  return 1;
609  }
610 
611  return 0;
612 }
613 
614 
615 /* The following functions define the column width of an ISO 10646
616  * character as follows:
617  *
618  * - The null character (U+0000) has a column width of 0.
619  *
620  * - Other C0/C1 control characters and DEL will lead to a return
621  * value of -1.
622  *
623  * - Non-spacing and enclosing combining characters (general
624  * category code Mn, Me or Cf in the Unicode database) have a
625  * column width of 0.
626  *
627  * - Spacing characters in the East Asian Wide (W) or East Asian
628  * FullWidth (F) category as defined in Unicode Technical
629  * Report #11 have a column width of 2.
630  *
631  * - All remaining characters (including all printable
632  * ISO 8859-1 and WGL4 characters, Unicode control characters,
633  * etc.) have a column width of 1.
634  *
635  * This implementation assumes that wchar_t characters are encoded
636  * in ISO 10646.
637  */
638 
639 static int
641 {
644 
645  /* test for 8-bit control characters */
646  if (ucs == 0)
647  return 0;
648 
649  if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
650  return -1;
651 
652  /*
653  * binary search in table of non-spacing characters
654  *
655  * XXX: In the official Unicode sources, it is possible for a character to
656  * be described as both non-spacing and wide at the same time. As of
657  * Unicode 13.0, treating the non-spacing property as the determining
658  * factor for display width leads to the correct behavior, so do that
659  * search first.
660  */
661  if (mbbisearch(ucs, nonspacing,
662  sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
663  return 0;
664 
665  /* binary search in table of wide characters */
666  if (mbbisearch(ucs, east_asian_fw,
667  sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
668  return 2;
669 
670  return 1;
671 }
672 
673 /*
674  * Convert a UTF-8 character to a Unicode code point.
675  * This is a one-character version of pg_utf2wchar_with_len.
676  *
677  * No error checks here, c must point to a long-enough string.
678  */
679 pg_wchar
680 utf8_to_unicode(const unsigned char *c)
681 {
682  if ((*c & 0x80) == 0)
683  return (pg_wchar) c[0];
684  else if ((*c & 0xe0) == 0xc0)
685  return (pg_wchar) (((c[0] & 0x1f) << 6) |
686  (c[1] & 0x3f));
687  else if ((*c & 0xf0) == 0xe0)
688  return (pg_wchar) (((c[0] & 0x0f) << 12) |
689  ((c[1] & 0x3f) << 6) |
690  (c[2] & 0x3f));
691  else if ((*c & 0xf8) == 0xf0)
692  return (pg_wchar) (((c[0] & 0x07) << 18) |
693  ((c[1] & 0x3f) << 12) |
694  ((c[2] & 0x3f) << 6) |
695  (c[3] & 0x3f));
696  else
697  /* that is an invalid code on purpose */
698  return 0xffffffff;
699 }
700 
701 static int
702 pg_utf_dsplen(const unsigned char *s)
703 {
704  return ucs_wcwidth(utf8_to_unicode(s));
705 }
706 
707 /*
708  * convert mule internal code to pg_wchar
709  * caller should allocate enough space for "to"
710  * len: length of from.
711  * "from" not necessarily null terminated.
712  */
713 static int
714 pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
715 {
716  int cnt = 0;
717 
718  while (len > 0 && *from)
719  {
720  if (IS_LC1(*from) && len >= 2)
721  {
722  *to = *from++ << 16;
723  *to |= *from++;
724  len -= 2;
725  }
726  else if (IS_LCPRV1(*from) && len >= 3)
727  {
728  from++;
729  *to = *from++ << 16;
730  *to |= *from++;
731  len -= 3;
732  }
733  else if (IS_LC2(*from) && len >= 3)
734  {
735  *to = *from++ << 16;
736  *to |= *from++ << 8;
737  *to |= *from++;
738  len -= 3;
739  }
740  else if (IS_LCPRV2(*from) && len >= 4)
741  {
742  from++;
743  *to = *from++ << 16;
744  *to |= *from++ << 8;
745  *to |= *from++;
746  len -= 4;
747  }
748  else
749  { /* assume ASCII */
750  *to = (unsigned char) *from++;
751  len--;
752  }
753  to++;
754  cnt++;
755  }
756  *to = 0;
757  return cnt;
758 }
759 
760 /*
761  * convert pg_wchar to mule internal code
762  * caller should allocate enough space for "to"
763  * len: length of from.
764  * "from" not necessarily null terminated.
765  */
766 static int
767 pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
768 {
769  int cnt = 0;
770 
771  while (len > 0 && *from)
772  {
773  unsigned char lb;
774 
775  lb = (*from >> 16) & 0xff;
776  if (IS_LC1(lb))
777  {
778  *to++ = lb;
779  *to++ = *from & 0xff;
780  cnt += 2;
781  }
782  else if (IS_LC2(lb))
783  {
784  *to++ = lb;
785  *to++ = (*from >> 8) & 0xff;
786  *to++ = *from & 0xff;
787  cnt += 3;
788  }
789  else if (IS_LCPRV1_A_RANGE(lb))
790  {
791  *to++ = LCPRV1_A;
792  *to++ = lb;
793  *to++ = *from & 0xff;
794  cnt += 3;
795  }
796  else if (IS_LCPRV1_B_RANGE(lb))
797  {
798  *to++ = LCPRV1_B;
799  *to++ = lb;
800  *to++ = *from & 0xff;
801  cnt += 3;
802  }
803  else if (IS_LCPRV2_A_RANGE(lb))
804  {
805  *to++ = LCPRV2_A;
806  *to++ = lb;
807  *to++ = (*from >> 8) & 0xff;
808  *to++ = *from & 0xff;
809  cnt += 4;
810  }
811  else if (IS_LCPRV2_B_RANGE(lb))
812  {
813  *to++ = LCPRV2_B;
814  *to++ = lb;
815  *to++ = (*from >> 8) & 0xff;
816  *to++ = *from & 0xff;
817  cnt += 4;
818  }
819  else
820  {
821  *to++ = *from & 0xff;
822  cnt += 1;
823  }
824  from++;
825  len--;
826  }
827  *to = 0;
828  return cnt;
829 }
830 
/*
 * Byte length of a mule internal code character, decided from its first
 * byte: LC1 => 2, LCPRV1 => 3, LC2 => 3, LCPRV2 => 4, anything else => 1.
 *
 * exported for direct use by conv.c
 */
int
pg_mule_mblen(const unsigned char *s)
{
	if (IS_LC1(*s))
		return 2;
	if (IS_LCPRV1(*s))
		return 3;
	if (IS_LC2(*s))
		return 3;
	if (IS_LCPRV2(*s))
		return 4;

	return 1;					/* assume ASCII */
}
849 
/*
 * Display width of a mule internal code character, decided from its first
 * byte: 1 column for LC1 and LCPRV1, 2 columns for LC2 and LCPRV2.
 *
 * Any other byte is handed to pg_ascii_dsplen(), so that NUL is reported as
 * zero width and control characters as -1, in line with the other dsplen
 * functions in this file.  Bytes with the high bit set that match none of
 * the lead-byte classes still come out as width 1.
 */
static int
pg_mule_dsplen(const unsigned char *s)
{
	int			len;

	/*
	 * Note: it's not really appropriate to assume that all multibyte charsets
	 * are double-wide on screen.  But this seems an okay approximation for
	 * the MULE charsets we currently support.
	 */

	if (IS_LC1(*s))
		len = 1;
	else if (IS_LCPRV1(*s))
		len = 1;
	else if (IS_LC2(*s))
		len = 2;
	else if (IS_LCPRV2(*s))
		len = 2;
	else
		len = pg_ascii_dsplen(s);	/* assume ASCII */

	return len;
}
874 
875 /*
876  * ISO8859-1
877  */
878 static int
879 pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
880 {
881  int cnt = 0;
882 
883  while (len > 0 && *from)
884  {
885  *to++ = *from++;
886  len--;
887  cnt++;
888  }
889  *to = 0;
890  return cnt;
891 }
892 
893 /*
894  * Trivial conversion from pg_wchar to single byte encoding. Just ignores
895  * high bits.
896  * caller should allocate enough space for "to"
897  * len: length of from.
898  * "from" not necessarily null terminated.
899  */
900 static int
901 pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
902 {
903  int cnt = 0;
904 
905  while (len > 0 && *from)
906  {
907  *to++ = *from++;
908  len--;
909  cnt++;
910  }
911  *to = 0;
912  return cnt;
913 }
914 
/*
 * Byte length of the character at s: always one byte.
 */
static int
pg_latin1_mblen(const unsigned char *s)
{
	const int	one_byte = 1;

	return one_byte;
}
920 
/*
 * Display width of the byte at s: whatever pg_ascii_dsplen() reports.
 */
static int
pg_latin1_dsplen(const unsigned char *s)
{
	int			width = pg_ascii_dsplen(s);

	return width;
}
926 
927 /*
928  * SJIS
929  */
/*
 * Byte length of an SJIS character, decided from its first byte: bytes in
 * 0xa1..0xdf are one byte (1 byte kana?), other bytes with the high bit set
 * start a two-byte character (kanji?), and the rest should be ASCII.
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
	const int	is_kana = (*s >= 0xa1 && *s <= 0xdf);

	if (!is_kana && IS_HIGHBIT_SET(*s))
		return 2;

	return 1;
}
943 
/*
 * Display width of an SJIS character: 1 column for first bytes in
 * 0xa1..0xdf (1 byte kana?), 2 columns for other bytes with the high bit
 * set (kanji?), and the pg_ascii_dsplen() result for the rest.
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
	if (*s >= 0xa1 && *s <= 0xdf)
		return 1;

	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
957 
958 /*
959  * Big5
960  */
/*
 * Byte length of a Big5 character: 2 when the first byte has its high bit
 * set, otherwise 1 (should be ASCII).
 */
static int
pg_big5_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
972 
/*
 * Display width of a Big5 character: 2 columns for a high-bit first byte,
 * otherwise the pg_ascii_dsplen() result.
 */
static int
pg_big5_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
984 
985 /*
986  * GBK
987  */
/*
 * Byte length of a GBK character: 2 when the first byte has its high bit
 * set, otherwise 1 (should be ASCII).
 */
static int
pg_gbk_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
999 
/*
 * Display width of a GBK character: 2 columns for a high-bit first byte,
 * otherwise the pg_ascii_dsplen() result.
 */
static int
pg_gbk_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
1011 
1012 /*
1013  * UHC
1014  */
/*
 * Byte length of a UHC character: 2 when the first byte has its high bit
 * set, otherwise 1 (should be ASCII).
 */
static int
pg_uhc_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
1026 
/*
 * Display width of a UHC character: 2 columns for a high-bit first byte,
 * otherwise the pg_ascii_dsplen() result.
 */
static int
pg_uhc_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
1038 
1039 /*
1040  * GB18030
1041  * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
1042  */
1043 
/*
 * Byte length of a GB18030 character.
 *
 * Unlike all other mblen() functions, this also looks at the second byte of
 * the input: a high-bit first byte followed by a byte in 0x30..0x39 starts
 * a 4-byte character, any other high-bit first byte a 2-byte character.
 *
 * However, if you only pass the first byte of a multi-byte string, and \0
 * as the second byte, this still works in a predictable way: a 4-byte
 * character will be reported as two 2-byte characters.  That's enough for
 * all current uses, as a client-only encoding.  It works that way, because
 * in any valid 4-byte GB18030-encoded character, the third and fourth byte
 * look like a 2-byte encoded character, when looked at separately.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	if (!IS_HIGHBIT_SET(*s))
		return 1;				/* ASCII */

	return (s[1] >= 0x30 && s[1] <= 0x39) ? 4 : 2;
}
1067 
/*
 * Display width of a GB18030 character: 2 columns for a high-bit first
 * byte, otherwise the pg_ascii_dsplen() result.
 */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* ASCII */
}
1079 
1080 /*
1081  *-------------------------------------------------------------------
1082  * multibyte sequence validators
1083  *
1084  * The verifychar functions accept "s", a pointer to the first byte of a
1085  * string, and "len", the remaining length of the string. If there is a
1086  * validly encoded character beginning at *s, return its length in bytes;
1087  * else return -1.
1088  *
1089  * The verifystr functions also accept "s", a pointer to a string and "len",
1090  * the length of the string. They verify the whole string, and return the
1091  * number of input bytes (<= len) that are valid. In other words, if the
1092  * whole string is valid, verifystr returns "len", otherwise it returns the
1093  * byte offset of the first invalid character. The verifystr functions must
1094  * test for and reject zeroes in the input.
1095  *
1096  * The verifychar functions can assume that len > 0 and that *s != '\0', but
1097  * they must test for and reject zeroes in any additional bytes of a
1098  * multibyte character. Note that this definition allows the function for a
1099  * single-byte encoding to be just "return 1".
1100  *-------------------------------------------------------------------
1101  */
/*
 * Verify one character of a single-byte encoding: every byte is accepted,
 * so the result is always a length of 1.  Neither argument is examined.
 */
static int
pg_ascii_verifychar(const unsigned char *s, int len)
{
	const int	one_byte = 1;

	return one_byte;
}
1107 
/*
 * Verify a string in a single-byte encoding: the only thing that can be
 * wrong is an embedded zero byte.  Returns the offset of the first zero
 * byte within the first len bytes of s, or len if there is none.
 */
static int
pg_ascii_verifystr(const unsigned char *s, int len)
{
	const unsigned char *first_nul = memchr(s, 0, len);

	return (first_nul != NULL) ? first_nul - s : len;
}
1118 
/* true when byte value c lies in 0xa1..0xfe inclusive; c is evaluated twice */
#define IS_EUC_RANGE_VALID(c)	(0xa1 <= (c) && (c) <= 0xfe)
1120 
1121 static int
1122 pg_eucjp_verifychar(const unsigned char *s, int len)
1123 {
1124  int l;
1125  unsigned char c1,
1126  c2;
1127 
1128  c1 = *s++;
1129 
1130  switch (c1)
1131  {
1132  case SS2: /* JIS X 0201 */
1133  l = 2;
1134  if (l > len)
1135  return -1;
1136  c2 = *s++;
1137  if (c2 < 0xa1 || c2 > 0xdf)
1138  return -1;
1139  break;
1140 
1141  case SS3: /* JIS X 0212 */
1142  l = 3;
1143  if (l > len)
1144  return -1;
1145  c2 = *s++;
1146  if (!IS_EUC_RANGE_VALID(c2))
1147  return -1;
1148  c2 = *s++;
1149  if (!IS_EUC_RANGE_VALID(c2))
1150  return -1;
1151  break;
1152 
1153  default:
1154  if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1155  {
1156  l = 2;
1157  if (l > len)
1158  return -1;
1159  if (!IS_EUC_RANGE_VALID(c1))
1160  return -1;
1161  c2 = *s++;
1162  if (!IS_EUC_RANGE_VALID(c2))
1163  return -1;
1164  }
1165  else
1166  /* must be ASCII */
1167  {
1168  l = 1;
1169  }
1170  break;
1171  }
1172 
1173  return l;
1174 }
1175 
/*
 * Verify an EUC_JP string of len bytes.  Returns the number of leading
 * bytes that are valid: len if the whole string is fine, otherwise the
 * offset of the first zero byte or invalid character.
 */
static int
pg_eucjp_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;

	while (len > 0)
	{
		int			chlen;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_eucjp_verifychar(p, len);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			chlen = 1;			/* fast path for ASCII-subset characters */

		p += chlen;
		len -= chlen;
	}

	return p - s;
}
1204 
/*
 * Verify one EUC_KR character starting at s, with len bytes remaining.
 *
 * A byte without the high bit is a one-byte character (must be ASCII).
 * Otherwise the character is two bytes, both of which must lie in the
 * valid EUC range.  Returns the length in bytes, or -1 if the character is
 * truncated or out of range.
 */
static int
pg_euckr_verifychar(const unsigned char *s, int len)
{
	const unsigned char c1 = s[0];

	if (!IS_HIGHBIT_SET(c1))
		return 1;

	if (len < 2)
		return -1;
	if (!IS_EUC_RANGE_VALID(c1) || !IS_EUC_RANGE_VALID(s[1]))
		return -1;

	return 2;
}
1233 
/*
 * Verify an EUC_KR string of len bytes.  Returns the number of leading
 * bytes that are valid: len if the whole string is fine, otherwise the
 * offset of the first zero byte or invalid character.
 */
static int
pg_euckr_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;

	while (len > 0)
	{
		int			chlen;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_euckr_verifychar(p, len);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			chlen = 1;			/* fast path for ASCII-subset characters */

		p += chlen;
		len -= chlen;
	}

	return p - s;
}
1262 
/*
 * EUC-CN byte sequences are exactly the same as EUC-KR, so the EUC-KR
 * verifiers are reused under EUC-CN names.
 */
#define pg_euccn_verifychar	pg_euckr_verifychar
#define pg_euccn_verifystr	pg_euckr_verifystr
1266 
1267 static int
1268 pg_euctw_verifychar(const unsigned char *s, int len)
1269 {
1270  int l;
1271  unsigned char c1,
1272  c2;
1273 
1274  c1 = *s++;
1275 
1276  switch (c1)
1277  {
1278  case SS2: /* CNS 11643 Plane 1-7 */
1279  l = 4;
1280  if (l > len)
1281  return -1;
1282  c2 = *s++;
1283  if (c2 < 0xa1 || c2 > 0xa7)
1284  return -1;
1285  c2 = *s++;
1286  if (!IS_EUC_RANGE_VALID(c2))
1287  return -1;
1288  c2 = *s++;
1289  if (!IS_EUC_RANGE_VALID(c2))
1290  return -1;
1291  break;
1292 
1293  case SS3: /* unused */
1294  return -1;
1295 
1296  default:
1297  if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1298  {
1299  l = 2;
1300  if (l > len)
1301  return -1;
1302  /* no further range check on c1? */
1303  c2 = *s++;
1304  if (!IS_EUC_RANGE_VALID(c2))
1305  return -1;
1306  }
1307  else
1308  /* must be ASCII */
1309  {
1310  l = 1;
1311  }
1312  break;
1313  }
1314  return l;
1315 }
1316 
/*
 * Verify an EUC_TW string of len bytes.  Returns the number of leading
 * bytes that are valid: len if the whole string is fine, otherwise the
 * offset of the first zero byte or invalid character.
 */
static int
pg_euctw_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;

	while (len > 0)
	{
		int			chlen;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_euctw_verifychar(p, len);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			chlen = 1;			/* fast path for ASCII-subset characters */

		p += chlen;
		len -= chlen;
	}

	return p - s;
}
1345 
/*
 * Verify one JOHAB character starting at s, with len bytes remaining.
 *
 * The length comes from pg_johab_mblen().  Returns -1 if fewer than that
 * many bytes remain.  A first byte without the high bit is accepted as is;
 * otherwise every byte after the first must lie in the valid EUC range.
 * Returns the length in bytes on success.
 */
static int
pg_johab_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_johab_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	if (!IS_HIGHBIT_SET(*s))
		return mbl;

	for (i = 1; i < mbl; i++)
	{
		const unsigned char c = s[i];

		if (!IS_EUC_RANGE_VALID(c))
			return -1;
	}

	return mbl;
}
1369 
/*
 * Verify a JOHAB string of len bytes.  Returns the number of leading bytes
 * that are valid: len if the whole string is fine, otherwise the offset of
 * the first zero byte or invalid character.
 */
static int
pg_johab_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;

	while (len > 0)
	{
		int			chlen;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_johab_verifychar(p, len);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			chlen = 1;			/* fast path for ASCII-subset characters */

		p += chlen;
		len -= chlen;
	}

	return p - s;
}
1398 
/*
 * Verify one mule internal code character starting at s, with len bytes
 * remaining.
 *
 * The length comes from pg_mule_mblen().  Returns -1 if fewer than that
 * many bytes remain, or if any byte after the first lacks the high bit.
 * Returns the length in bytes on success.
 */
static int
pg_mule_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_mule_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		const unsigned char c = s[i];

		if (!IS_HIGHBIT_SET(c))
			return -1;
	}

	return mbl;
}
1419 
/*
 * Verify a mule internal code string of len bytes.  Returns the number of
 * leading bytes that are valid: len if the whole string is fine, otherwise
 * the offset of the first zero byte or invalid character.
 */
static int
pg_mule_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;

	while (len > 0)
	{
		int			chlen;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_mule_verifychar(p, len);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			chlen = 1;			/* fast path for ASCII-subset characters */

		p += chlen;
		len -= chlen;
	}

	return p - s;
}
1448 
/*
 * Verify one character of a single-byte encoding: every byte is accepted,
 * so the result is always a length of 1.  Neither argument is examined.
 */
static int
pg_latin1_verifychar(const unsigned char *s, int len)
{
	const int	one_byte = 1;

	return one_byte;
}
1454 
/*
 * Verify a string in a single-byte encoding: the only thing that can be
 * wrong is an embedded zero byte.  Returns the offset of the first zero
 * byte within the first len bytes of s, or len if there is none.
 */
static int
pg_latin1_verifystr(const unsigned char *s, int len)
{
	const unsigned char *first_nul = memchr(s, 0, len);

	return (first_nul != NULL) ? first_nul - s : len;
}
1465 
/*
 * Verify one SJIS character starting at s, with len bytes remaining.
 *
 * The length comes from pg_sjis_mblen().  Returns -1 if fewer than that
 * many bytes remain.  One-byte characters need no further checking; for a
 * two-byte character the first byte must satisfy ISSJISHEAD and the second
 * ISSJISTAIL.  Returns the length in bytes on success.
 */
static int
pg_sjis_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_sjis_mblen(s);
	unsigned char head;
	unsigned char tail;

	if (len < mbl)
		return -1;

	if (mbl == 1)				/* pg_sjis_mblen already verified it */
		return mbl;

	head = s[0];
	tail = s[1];
	if (ISSJISHEAD(head) && ISSJISTAIL(tail))
		return mbl;

	return -1;
}
1488 
/*
 * Verify an SJIS string of len bytes.  Returns the number of leading bytes
 * that are valid: len if the whole string is fine, otherwise the offset of
 * the first zero byte or invalid character.
 */
static int
pg_sjis_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;

	while (len > 0)
	{
		int			chlen;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_sjis_verifychar(p, len);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			chlen = 1;			/* fast path for ASCII-subset characters */

		p += chlen;
		len -= chlen;
	}

	return p - s;
}
1517 
/*
 * Verify one BIG5 character.
 *
 * Returns the character's byte length as reported by pg_big5_mblen(), or
 * -1 if the input is shorter than that or any byte after the first is NUL.
 */
static int
pg_big5_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_big5_mblen(s);
	int			i;

	if (mbl > len)
		return -1;

	/* trailing bytes must not be zero */
	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1537 
/*
 * Verify a BIG5 string.
 *
 * Walks the input one character at a time and stops at the first NUL byte
 * or at the first character rejected by pg_big5_verifychar().  Returns the
 * number of bytes, counted from the start of "s", that were accepted.
 */
static int
pg_big5_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			/* multibyte character: let the per-character verifier decide */
			chlen = pg_big5_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
		{
			/* an embedded NUL terminates the valid prefix */
			break;
		}

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1566 
/*
 * Verify one GBK character.
 *
 * Returns the character's byte length as reported by pg_gbk_mblen(), or
 * -1 if the input is shorter than that or any byte after the first is NUL.
 */
static int
pg_gbk_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_gbk_mblen(s);
	int			i;

	if (mbl > len)
		return -1;

	/* trailing bytes must not be zero */
	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1586 
/*
 * Verify a GBK string.
 *
 * Walks the input one character at a time and stops at the first NUL byte
 * or at the first character rejected by pg_gbk_verifychar().  Returns the
 * number of bytes, counted from the start of "s", that were accepted.
 */
static int
pg_gbk_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			/* multibyte character: let the per-character verifier decide */
			chlen = pg_gbk_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
		{
			/* an embedded NUL terminates the valid prefix */
			break;
		}

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1615 
/*
 * Verify one UHC character.
 *
 * Returns the character's byte length as reported by pg_uhc_mblen(), or
 * -1 if the input is shorter than that or any byte after the first is NUL.
 */
static int
pg_uhc_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_uhc_mblen(s);
	int			i;

	if (mbl > len)
		return -1;

	/* trailing bytes must not be zero */
	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1635 
/*
 * Verify a UHC string.
 *
 * Walks the input one character at a time and stops at the first NUL byte
 * or at the first character rejected by pg_uhc_verifychar().  Returns the
 * number of bytes, counted from the start of "s", that were accepted.
 */
static int
pg_uhc_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			/* multibyte character: let the per-character verifier decide */
			chlen = pg_uhc_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
		{
			/* an embedded NUL terminates the valid prefix */
			break;
		}

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1664 
/*
 * Verify one GB18030 character.
 *
 * Returns 1 for a byte without the high bit set, 4 for a well-formed
 * four-byte sequence, 2 for a well-formed two-byte sequence, and -1 for
 * anything else (including a sequence truncated by "len").
 *
 * The second byte decides the shape: a digit (0x30..0x39) announces a
 * four-byte character, provided at least four bytes are available.
 */
static int
pg_gb18030_verifychar(const unsigned char *s, int len)
{
	const unsigned char b0 = s[0];

	/* ASCII */
	if ((b0 & 0x80) == 0)
		return 1;

	if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
	{
		/* should be 4-byte: check first, third and fourth bytes */
		if (b0 >= 0x81 && b0 <= 0xfe &&
			s[2] >= 0x81 && s[2] <= 0xfe &&
			s[3] >= 0x30 && s[3] <= 0x39)
			return 4;
		return -1;
	}

	if (len >= 2 && b0 >= 0x81 && b0 <= 0xfe)
	{
		const unsigned char b1 = s[1];

		/* should be 2-byte: second byte is 0x40..0x7e or 0x80..0xfe */
		if ((b1 >= 0x40 && b1 <= 0x7e) ||
			(b1 >= 0x80 && b1 <= 0xfe))
			return 2;
		return -1;
	}

	return -1;
}
1695 
/*
 * Verify a GB18030 string.
 *
 * Walks the input one character at a time and stops at the first NUL byte
 * or at the first character rejected by pg_gb18030_verifychar().  Returns
 * the number of bytes, counted from the start of "s", that were accepted.
 */
static int
pg_gb18030_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			/* multibyte character: let the per-character verifier decide */
			chlen = pg_gb18030_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
		{
			/* an embedded NUL terminates the valid prefix */
			break;
		}

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1724 
/*
 * Verify one UTF-8 character.
 *
 * The expected length is derived from the lead byte; a NUL byte, a
 * sequence longer than "len", or one rejected by pg_utf8_islegal() yields
 * -1.  Otherwise the character's byte length is returned.
 */
static int
pg_utf8_verifychar(const unsigned char *s, int len)
{
	const unsigned char lead = *s;
	int			l;

	/* single-byte (ASCII) case; NUL is never acceptable */
	if ((lead & 0x80) == 0)
		return (lead == '\0') ? -1 : 1;

	if ((lead & 0xe0) == 0xc0)
		l = 2;
	else if ((lead & 0xf0) == 0xe0)
		l = 3;
	else if ((lead & 0xf8) == 0xf0)
		l = 4;
	else
		l = 1;				/* not a valid lead byte; pg_utf8_islegal
							 * gets the final say */

	if (len < l || !pg_utf8_islegal(s, l))
		return -1;

	return l;
}
1753 
1754 /*
1755  * The fast path of the UTF-8 verifier uses a deterministic finite automaton
1756  * (DFA) for multibyte characters. In a traditional table-driven DFA, the
1757  * input byte and current state are used to compute an index into an array of
1758  * state transitions. Since the address of the next transition is dependent
1759  * on this computation, there is latency in executing the load instruction,
1760  * and the CPU is not kept busy.
1761  *
1762  * Instead, we use a "shift-based" DFA as described by Per Vognsen:
1763  *
1764  * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
1765  *
1766  * In a shift-based DFA, the input byte is an index into an array of integers
1767  * whose bit pattern encodes the state transitions. To compute the next
1768  * state, we simply right-shift the integer by the current state and apply a
1769  * mask. In this scheme, the address of the transition only depends on the
1770  * input byte, so there is better pipelining.
1771  *
1772  * The naming convention for states and transitions was adopted from a UTF-8
1773  * to UTF-16/32 transcoder, whose table is reproduced below:
1774  *
1775  * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
1776  *
1777  * ILL ASC CR1 CR2 CR3 L2A L3A L3B L3C L4A L4B L4C CLASS / STATE
1778  * ==========================================================================
1779  * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B, | BGN/END
1780  * err, err, err, err, err, err, err, err, err, err, err, err, | ERR
1781  * |
1782  * err, err, END, END, END, err, err, err, err, err, err, err, | CS1
1783  * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err, | CS2
1784  * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err, | CS3
1785  * |
1786  * err, err, err, err, CS1, err, err, err, err, err, err, err, | P3A
1787  * err, err, CS1, CS1, err, err, err, err, err, err, err, err, | P3B
1788  * |
1789  * err, err, err, CS2, CS2, err, err, err, err, err, err, err, | P4A
1790  * err, err, CS2, err, err, err, err, err, err, err, err, err, | P4B
1791  *
1792  * In the most straightforward implementation, a shift-based DFA for UTF-8
1793  * requires 64-bit integers to encode the transitions, but with an SMT solver
1794  * it's possible to find state numbers such that the transitions fit within
1795  * 32-bit integers, as Dougall Johnson demonstrated:
1796  *
1797  * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
1798  *
1799  * This packed representation is the reason for the seemingly odd choice of
1800  * state values below.
1801  */
1802 
/*
 * State numbers.  Each state doubles as a bit offset: a transition word
 * stores the next state at the bit position named by the current state,
 * and utf8_advance() extracts it with a right shift and a mask of 31.
 */

/* Error */
#define	ERR  0
/* Begin */
#define	BGN 11
/* Continuation states, expect 1/2/3 continuation bytes */
#define	CS1 16
#define	CS2  1
#define	CS3  5
/* Partial states, where the first continuation byte has a restricted range */
#define	P3A  6					/* Lead was E0, check for 3-byte overlong */
#define	P3B 20					/* Lead was ED, check for surrogate */
#define	P4A 25					/* Lead was F0, check for 4-byte overlong */
#define	P4B 30					/* Lead was F4, check for too-large */
/* Begin and End are the same state */
#define	END BGN

/* the encoded state transitions for the lookup table */

/* ASCII */
#define ASC (END << BGN)
/* 2-byte lead */
#define L2A (CS1 << BGN)
/* 3-byte lead */
#define L3A (P3A << BGN)
#define L3B (CS2 << BGN)
#define L3C (P3B << BGN)
/* 4-byte lead */
#define L4A (P4A << BGN)
#define L4B (CS3 << BGN)
#define L4C (P4B << BGN)
/*
 * continuation byte
 *
 * Each expansion is wrapped in parentheses so that it remains a single
 * operand when used inside a larger expression (e.g. "CR1 >> CS1").
 */
#define CR1 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B))
#define CR2 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A))
#define CR3 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A))
/* invalid byte */
#define ILL ERR
1839 
1840 static const uint32 Utf8Transition[256] =
1841 {
1842  /* ASCII */
1843 
1844  ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1845  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1846  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1847  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1848 
1849  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1850  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1851  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1852  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1853 
1854  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1855  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1856  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1857  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1858 
1859  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1860  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1861  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1862  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1863 
1864  /* continuation bytes */
1865 
1866  /* 80..8F */
1867  CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1868  CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1869 
1870  /* 90..9F */
1871  CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1872  CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1873 
1874  /* A0..BF */
1875  CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1876  CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1877  CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1878  CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1879 
1880  /* leading bytes */
1881 
1882  /* C0..DF */
1883  ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
1884  L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1885  L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1886  L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1887 
1888  /* E0..EF */
1889  L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
1890  L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
1891 
1892  /* F0..FF */
1893  L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
1894  ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
1895 };
1896 
1897 static void
1898 utf8_advance(const unsigned char *s, uint32 *state, int len)
1899 {
1900  /* Note: We deliberately don't check the state's value here. */
1901  while (len > 0)
1902  {
1903  /*
1904  * It's important that the mask value is 31: In most instruction sets,
1905  * a shift by a 32-bit operand is understood to be a shift by its mod
1906  * 32, so the compiler should elide the mask operation.
1907  */
1908  *state = Utf8Transition[*s++] >> (*state & 31);
1909  len--;
1910  }
1911 
1912  *state &= 31;
1913 }
1914 
1915 static int
1916 pg_utf8_verifystr(const unsigned char *s, int len)
1917 {
1918  const unsigned char *start = s;
1919  const int orig_len = len;
1920  uint32 state = BGN;
1921 
1922 /*
1923  * With a stride of two vector widths, gcc will unroll the loop. Even if
1924  * the compiler can unroll a longer loop, it's not worth it because we
1925  * must fall back to the byte-wise algorithm if we find any non-ASCII.
1926  */
1927 #define STRIDE_LENGTH (2 * sizeof(Vector8))
1928 
1929  if (len >= STRIDE_LENGTH)
1930  {
1931  while (len >= STRIDE_LENGTH)
1932  {
1933  /*
1934  * If the chunk is all ASCII, we can skip the full UTF-8 check,
1935  * but we must first check for a non-END state, which means the
1936  * previous chunk ended in the middle of a multibyte sequence.
1937  */
1938  if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
1940 
1941  s += STRIDE_LENGTH;
1942  len -= STRIDE_LENGTH;
1943  }
1944 
1945  /* The error state persists, so we only need to check for it here. */
1946  if (state == ERR)
1947  {
1948  /*
1949  * Start over from the beginning with the slow path so we can
1950  * count the valid bytes.
1951  */
1952  len = orig_len;
1953  s = start;
1954  }
1955  else if (state != END)
1956  {
1957  /*
1958  * The fast path exited in the middle of a multibyte sequence.
1959  * Walk backwards to find the leading byte so that the slow path
1960  * can resume checking from there. We must always backtrack at
1961  * least one byte, since the current byte could be e.g. an ASCII
1962  * byte after a 2-byte lead, which is invalid.
1963  */
1964  do
1965  {
1966  Assert(s > start);
1967  s--;
1968  len++;
1969  Assert(IS_HIGHBIT_SET(*s));
1970  } while (pg_utf_mblen(s) <= 1);
1971  }
1972  }
1973 
1974  /* check remaining bytes */
1975  while (len > 0)
1976  {
1977  int l;
1978 
1979  /* fast path for ASCII-subset characters */
1980  if (!IS_HIGHBIT_SET(*s))
1981  {
1982  if (*s == '\0')
1983  break;
1984  l = 1;
1985  }
1986  else
1987  {
1988  l = pg_utf8_verifychar(s, len);
1989  if (l == -1)
1990  break;
1991  }
1992  s += l;
1993  len -= l;
1994  }
1995 
1996  return s - start;
1997 }
1998 
/*
 * Check for validity of a single UTF-8 encoded character
 *
 * This implements the byte-range rules of RFC3629.  The second byte is
 * restricted more tightly after certain lead bytes so that every code
 * point has exactly one encoding: a longer-than-necessary sequence with
 * high-order zero bits is rejected, since accepting it would be a security
 * hazard (e.g. an apparently non-ASCII sequence decoding to plain ASCII).
 * The same mechanism excludes surrogates and values past U+10FFFF.
 *
 * length is assumed to have been obtained by pg_utf_mblen(), and the
 * caller must have checked that that many bytes are present in the buffer.
 * Lengths outside 1..4 are rejected.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char lead;
	int			i;

	/* reject lengths 5 and 6 (and anything else out of range) for now */
	if (length < 1 || length > 4)
		return false;

	/* third and fourth bytes, when present, are plain continuation bytes */
	for (i = length - 1; i >= 2; i--)
	{
		if (source[i] < 0x80 || source[i] > 0xBF)
			return false;
	}

	lead = source[0];

	if (length >= 2)
	{
		/* allowed range of the second byte depends on the lead byte */
		unsigned char lo = 0x80;
		unsigned char hi = 0xBF;

		if (lead == 0xE0)
			lo = 0xA0;
		else if (lead == 0xED)
			hi = 0x9F;
		else if (lead == 0xF0)
			lo = 0x90;
		else if (lead == 0xF4)
			hi = 0x8F;

		if (source[1] < lo || source[1] > hi)
			return false;
	}

	/* finally the lead byte itself: no 80..C1, nothing above F4 */
	if (lead >= 0x80 && lead < 0xC2)
		return false;
	if (lead > 0xF4)
		return false;

	return true;
}
2069 
2070 
2071 /*
2072  *-------------------------------------------------------------------
2073  * encoding info table
2074  *-------------------------------------------------------------------
2075  */
2119 };
2120 
2121 /*
2122  * Returns the byte length of a multibyte character.
2123  *
2124  * Caution: when dealing with text that is not certainly valid in the
2125  * specified encoding, the result may exceed the actual remaining
2126  * string length. Callers that are not prepared to deal with that
2127  * should use pg_encoding_mblen_bounded() instead.
2128  */
2129 int
2130 pg_encoding_mblen(int encoding, const char *mbstr)
2131 {
2132  return (PG_VALID_ENCODING(encoding) ?
2133  pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
2134  pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
2135 }
2136 
/*
 * Returns the byte length of a multibyte character; but not more than
 * the distance to end of string.
 *
 * The length reported by pg_encoding_mblen() is used as the upper bound
 * for strnlen(), so a NUL inside the character shortens the result.
 */
int
pg_encoding_mblen_bounded(int encoding, const char *mbstr)
{
	const int	claimed = pg_encoding_mblen(encoding, mbstr);

	return strnlen(mbstr, claimed);
}
2146 
2147 /*
2148  * Returns the display length of a multibyte character.
2149  */
2150 int
2151 pg_encoding_dsplen(int encoding, const char *mbstr)
2152 {
2153  return (PG_VALID_ENCODING(encoding) ?
2154  pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
2155  pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
2156 }
2157 
2158 /*
2159  * Verify the first multibyte character of the given string.
2160  * Return its byte length if good, -1 if bad. (See comments above for
2161  * full details of the mbverifychar API.)
2162  */
2163 int
2164 pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
2165 {
2166  return (PG_VALID_ENCODING(encoding) ?
2167  pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
2168  pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
2169 }
2170 
2171 /*
2172  * Verify that a string is valid for the given encoding.
2173  * Returns the number of input bytes (<= len) that form a valid string.
2174  * (See comments above for full details of the mbverifystr API.)
2175  */
2176 int
2177 pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
2178 {
2179  return (PG_VALID_ENCODING(encoding) ?
2180  pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
2181  pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
2182 }
2183 
2184 /*
2185  * fetch maximum length of a given encoding
2186  */
2187 int
2189 {
2191 
2193 }
static bool is_valid_ascii(const unsigned char *s, int len)
Definition: ascii.h:25
unsigned int uint32
Definition: c.h:493
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1142
int a
Definition: isn.c:69
Assert(fmt[strlen(fmt) - 1] !='\n')
unsigned int pg_wchar
Definition: mbprint.c:31
const void size_t len
int32 encoding
Definition: pg_database.h:41
static rewind_source * source
Definition: pg_rewind.c:89
#define IS_LCPRV2(c)
Definition: pg_wchar.h:164
#define ISSJISTAIL(c)
Definition: pg_wchar.h:45
@ PG_WIN1254
Definition: pg_wchar.h:257
@ PG_LATIN4
Definition: pg_wchar.h:237
@ PG_LATIN9
Definition: pg_wchar.h:242
@ PG_JOHAB
Definition: pg_wchar.h:269
@ PG_GB18030
Definition: pg_wchar.h:268
@ PG_SQL_ASCII
Definition: pg_wchar.h:226
@ PG_KOI8R
Definition: pg_wchar.h:248
@ PG_ISO_8859_6
Definition: pg_wchar.h:252
@ PG_WIN1253
Definition: pg_wchar.h:256
@ PG_KOI8U
Definition: pg_wchar.h:260
@ PG_LATIN6
Definition: pg_wchar.h:239
@ PG_MULE_INTERNAL
Definition: pg_wchar.h:233
@ PG_LATIN5
Definition: pg_wchar.h:238
@ PG_EUC_CN
Definition: pg_wchar.h:228
@ PG_UHC
Definition: pg_wchar.h:267
@ PG_LATIN2
Definition: pg_wchar.h:235
@ PG_ISO_8859_5
Definition: pg_wchar.h:251
@ PG_LATIN10
Definition: pg_wchar.h:243
@ PG_WIN1250
Definition: pg_wchar.h:255
@ PG_ISO_8859_7
Definition: pg_wchar.h:253
@ PG_SJIS
Definition: pg_wchar.h:264
@ PG_LATIN8
Definition: pg_wchar.h:241
@ PG_EUC_JP
Definition: pg_wchar.h:227
@ PG_GBK
Definition: pg_wchar.h:266
@ PG_LATIN3
Definition: pg_wchar.h:236
@ PG_WIN1256
Definition: pg_wchar.h:244
@ PG_LATIN1
Definition: pg_wchar.h:234
@ PG_EUC_TW
Definition: pg_wchar.h:230
@ PG_WIN1258
Definition: pg_wchar.h:245
@ PG_SHIFT_JIS_2004
Definition: pg_wchar.h:270
@ PG_WIN1252
Definition: pg_wchar.h:250
@ PG_LATIN7
Definition: pg_wchar.h:240
@ PG_UTF8
Definition: pg_wchar.h:232
@ PG_WIN1255
Definition: pg_wchar.h:258
@ PG_WIN1257
Definition: pg_wchar.h:259
@ PG_WIN1251
Definition: pg_wchar.h:249
@ PG_EUC_KR
Definition: pg_wchar.h:229
@ PG_WIN866
Definition: pg_wchar.h:246
@ PG_ISO_8859_8
Definition: pg_wchar.h:254
@ PG_WIN874
Definition: pg_wchar.h:247
@ PG_EUC_JIS_2004
Definition: pg_wchar.h:231
@ PG_BIG5
Definition: pg_wchar.h:265
#define LCPRV1_A
Definition: pg_wchar.h:150
#define LCPRV1_B
Definition: pg_wchar.h:151
#define IS_LC2(c)
Definition: pg_wchar.h:144
#define IS_LCPRV1(c)
Definition: pg_wchar.h:152
#define LCPRV2_A
Definition: pg_wchar.h:162
#define IS_LCPRV2_B_RANGE(c)
Definition: pg_wchar.h:167
#define SS2
Definition: pg_wchar.h:38
#define IS_LCPRV1_A_RANGE(c)
Definition: pg_wchar.h:153
#define PG_VALID_ENCODING(_enc)
Definition: pg_wchar.h:287
#define IS_LCPRV1_B_RANGE(c)
Definition: pg_wchar.h:155
#define ISSJISHEAD(c)
Definition: pg_wchar.h:44
#define IS_LC1(c)
Definition: pg_wchar.h:126
#define IS_LCPRV2_A_RANGE(c)
Definition: pg_wchar.h:165
#define SS3
Definition: pg_wchar.h:39
#define LCPRV2_B
Definition: pg_wchar.h:163
size_t strnlen(const char *str, size_t maxlen)
Definition: strnlen.c:26
char * c
unsigned int first
Definition: wchar.c:587
unsigned int last
Definition: wchar.c:588
int maxmblen
Definition: pg_wchar.h:386
Definition: regguts.h:323
static const struct mbinterval east_asian_fw[]
static const struct mbinterval nonspacing[]
static int pg_uhc_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1637
static int pg_latin1_dsplen(const unsigned char *s)
Definition: wchar.c:922
int pg_encoding_mblen_bounded(int encoding, const char *mbstr)
Definition: wchar.c:2142
static int pg_euctw_mblen(const unsigned char *s)
Definition: wchar.c:318
static int pg_euckr_dsplen(const unsigned char *s)
Definition: wchar.c:201
static const uint32 Utf8Transition[256]
Definition: wchar.c:1840
bool pg_utf8_islegal(const unsigned char *source, int length)
Definition: wchar.c:2014
static int pg_ascii_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1109
static int pg_latin1_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1450
static int pg_sjis_dsplen(const unsigned char *s)
Definition: wchar.c:945
#define CR3
Definition: wchar.c:1836
static int pg_sjis_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1467
static int pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:84
static int pg_eucjp_dsplen(const unsigned char *s)
Definition: wchar.c:170
static int pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:49
#define L3B
Definition: wchar.c:1827
static int pg_uhc_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1617
#define L2A
Definition: wchar.c:1824
static int pg_gbk_dsplen(const unsigned char *s)
Definition: wchar.c:1001
static int pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:189
static int pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:278
#define END
Definition: wchar.c:1817
#define pg_euccn_verifychar
Definition: wchar.c:1264
#define L4C
Definition: wchar.c:1832
static int pg_sjis_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1490
static int pg_johab_mblen(const unsigned char *s)
Definition: wchar.c:402
unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
Definition: wchar.c:484
static int pg_johab_dsplen(const unsigned char *s)
Definition: wchar.c:408
static int pg_big5_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1539
#define CR2
Definition: wchar.c:1835
static int pg_mule_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1400
static int pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:158
static int pg_latin1_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1456
static int pg_latin1_mblen(const unsigned char *s)
Definition: wchar.c:916
static int pg_ascii_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1103
static int pg_ascii_mblen(const unsigned char *s)
Definition: wchar.c:64
static int mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
Definition: wchar.c:593
static int pg_big5_dsplen(const unsigned char *s)
Definition: wchar.c:974
#define pg_euccn_verifystr
Definition: wchar.c:1265
int pg_utf_mblen(const unsigned char *s)
Definition: wchar.c:550
static int pg_eucjp_mblen(const unsigned char *s)
Definition: wchar.c:164
static int pg_gbk_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1568
static int pg_big5_mblen(const unsigned char *s)
Definition: wchar.c:962
static int pg_euccn_dsplen(const unsigned char *s)
Definition: wchar.c:262
static int pg_euctw_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1268
static int pg_euckr_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1206
static int pg_euctw_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1318
static int pg_gbk_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1588
static int pg_gb18030_dsplen(const unsigned char *s)
Definition: wchar.c:1069
#define ERR
Definition: wchar.c:1804
static int pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:420
int pg_mule_mblen(const unsigned char *s)
Definition: wchar.c:833
static int pg_euccn_mblen(const unsigned char *s)
Definition: wchar.c:250
#define ASC
Definition: wchar.c:1822
static int pg_gbk_mblen(const unsigned char *s)
Definition: wchar.c:989
static int pg_eucjp_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1177
static int pg_johab_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1371
static int pg_euc_dsplen(const unsigned char *s)
Definition: wchar.c:139
static int pg_gb18030_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1697
static int pg_euckr_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1235
static int pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:767
static int pg_sjis_mblen(const unsigned char *s)
Definition: wchar.c:931
#define IS_EUC_RANGE_VALID(c)
Definition: wchar.c:1119
pg_wchar utf8_to_unicode(const unsigned char *c)
Definition: wchar.c:680
static int pg_uhc_dsplen(const unsigned char *s)
Definition: wchar.c:1028
static int pg_eucjp_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1122
static int pg_big5_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1519
static int pg_gb18030_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1666
static int pg_mule_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1421
static int pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:356
#define L3C
Definition: wchar.c:1828
static int pg_utf8_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1726
#define CR1
Definition: wchar.c:1834
static int pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:901
static int pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:519
static int pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:211
static int pg_gb18030_mblen(const unsigned char *s)
Definition: wchar.c:1055
int pg_encoding_dsplen(int encoding, const char *mbstr)
Definition: wchar.c:2151
static void utf8_advance(const unsigned char *s, uint32 *state, int len)
Definition: wchar.c:1898
static int pg_euctw_dsplen(const unsigned char *s)
Definition: wchar.c:334
static int pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:879
static int pg_uhc_mblen(const unsigned char *s)
Definition: wchar.c:1016
static int pg_euc_mblen(const unsigned char *s)
Definition: wchar.c:123
static int pg_mule_dsplen(const unsigned char *s)
Definition: wchar.c:851
#define L3A
Definition: wchar.c:1826
#define L4B
Definition: wchar.c:1831
int pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
Definition: wchar.c:2177
static int pg_utf8_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1916
static int pg_euckr_mblen(const unsigned char *s)
Definition: wchar.c:195
const pg_wchar_tbl pg_wchar_table[]
Definition: wchar.c:2076
static int pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:714
#define BGN
Definition: wchar.c:1806
int pg_encoding_max_length(int encoding)
Definition: wchar.c:2188
int pg_encoding_mblen(int encoding, const char *mbstr)
Definition: wchar.c:2130
static int pg_johab_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1347
#define ILL
Definition: wchar.c:1838
#define STRIDE_LENGTH
#define L4A
Definition: wchar.c:1830
static int pg_ascii_dsplen(const unsigned char *s)
Definition: wchar.c:70
int pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
Definition: wchar.c:2164
static int ucs_wcwidth(pg_wchar ucs)
Definition: wchar.c:640
static int pg_utf_dsplen(const unsigned char *s)
Definition: wchar.c:702