PostgreSQL Source Code  git master
wchar.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * wchar.c
4  * Functions for working with multibyte characters in various encodings.
5  *
6  * Portions Copyright (c) 1998-2021, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  * src/common/wchar.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "c.h"
14 
15 #include "mb/pg_wchar.h"
16 
17 
/*
 * Operations on multi-byte encodings are driven by a table of helper
 * functions.
 *
 * To add support for an encoding, define mblen(), dsplen(), verifychar() and
 * verifystr() for the encoding.  For server encodings, also define the
 * mb2wchar() and wchar2mb() conversion functions.
 *
 * These functions generally assume that their input is validly formed.
 * The "verifier" functions, further down in the file, have to be more
 * paranoid.
 *
 * We expect that mblen() does not need to examine more than the first byte
 * of the character to discover the correct length.  GB18030 is an exception
 * to that rule, though, as it also looks at the second byte.  But even that
 * behaves in a predictable way if you only pass the first byte: it will
 * treat 4-byte encoded characters as two 2-byte encoded characters, which is
 * good enough for all current uses.
 *
 * Note: for the display output of psql to work properly, the return values
 * of the dsplen functions must conform to the Unicode standard.  In
 * particular, the NUL character is zero width and control characters are
 * generally width -1.  It is recommended that non-ASCII encodings refer
 * their ASCII subset to the ASCII routines to ensure consistency.
 */
43 
44 /*
45  * SQL/ASCII
46  */
47 static int
48 pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
49 {
50  int cnt = 0;
51 
52  while (len > 0 && *from)
53  {
54  *to++ = *from++;
55  len--;
56  cnt++;
57  }
58  *to = 0;
59  return cnt;
60 }
61 
/* Byte length of a character: always 1; the input byte is not examined. */
static int
pg_ascii_mblen(const unsigned char *s)
{
	(void) s;

	return 1;
}
67 
/*
 * Display width of a single-byte character: 0 for NUL, -1 for bytes below
 * 0x20 and for 0x7f (DEL), and 1 for everything else.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
	const unsigned char ch = *s;

	if (ch == '\0')
		return 0;

	return (ch < 0x20 || ch == 0x7f) ? -1 : 1;
}
78 
79 /*
80  * EUC
81  */
82 static int
83 pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
84 {
85  int cnt = 0;
86 
87  while (len > 0 && *from)
88  {
89  if (*from == SS2 && len >= 2) /* JIS X 0201 (so called "1 byte
90  * KANA") */
91  {
92  from++;
93  *to = (SS2 << 8) | *from++;
94  len -= 2;
95  }
96  else if (*from == SS3 && len >= 3) /* JIS X 0212 KANJI */
97  {
98  from++;
99  *to = (SS3 << 16) | (*from++ << 8);
100  *to |= *from++;
101  len -= 3;
102  }
103  else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
104  {
105  *to = *from++ << 8;
106  *to |= *from++;
107  len -= 2;
108  }
109  else /* must be ASCII */
110  {
111  *to = *from++;
112  len--;
113  }
114  to++;
115  cnt++;
116  }
117  *to = 0;
118  return cnt;
119 }
120 
121 static inline int
122 pg_euc_mblen(const unsigned char *s)
123 {
124  int len;
125 
126  if (*s == SS2)
127  len = 2;
128  else if (*s == SS3)
129  len = 3;
130  else if (IS_HIGHBIT_SET(*s))
131  len = 2;
132  else
133  len = 1;
134  return len;
135 }
136 
137 static inline int
138 pg_euc_dsplen(const unsigned char *s)
139 {
140  int len;
141 
142  if (*s == SS2)
143  len = 2;
144  else if (*s == SS3)
145  len = 2;
146  else if (IS_HIGHBIT_SET(*s))
147  len = 2;
148  else
149  len = pg_ascii_dsplen(s);
150  return len;
151 }
152 
153 /*
154  * EUC_JP
155  */
156 static int
157 pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
158 {
159  return pg_euc2wchar_with_len(from, to, len);
160 }
161 
/* Byte length of an EUC_JP character: same rules as generic EUC. */
static int
pg_eucjp_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
167 
168 static int
169 pg_eucjp_dsplen(const unsigned char *s)
170 {
171  int len;
172 
173  if (*s == SS2)
174  len = 1;
175  else if (*s == SS3)
176  len = 2;
177  else if (IS_HIGHBIT_SET(*s))
178  len = 2;
179  else
180  len = pg_ascii_dsplen(s);
181  return len;
182 }
183 
184 /*
185  * EUC_KR
186  */
187 static int
188 pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
189 {
190  return pg_euc2wchar_with_len(from, to, len);
191 }
192 
/* Byte length of an EUC_KR character: same rules as generic EUC. */
static int
pg_euckr_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
198 
/* Display width of an EUC_KR character: same rules as generic EUC. */
static int
pg_euckr_dsplen(const unsigned char *s)
{
	int			width = pg_euc_dsplen(s);

	return width;
}
204 
205 /*
206  * EUC_CN
207  *
208  */
209 static int
210 pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
211 {
212  int cnt = 0;
213 
214  while (len > 0 && *from)
215  {
216  if (*from == SS2 && len >= 3) /* code set 2 (unused?) */
217  {
218  from++;
219  *to = (SS2 << 16) | (*from++ << 8);
220  *to |= *from++;
221  len -= 3;
222  }
223  else if (*from == SS3 && len >= 3) /* code set 3 (unused ?) */
224  {
225  from++;
226  *to = (SS3 << 16) | (*from++ << 8);
227  *to |= *from++;
228  len -= 3;
229  }
230  else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
231  {
232  *to = *from++ << 8;
233  *to |= *from++;
234  len -= 2;
235  }
236  else
237  {
238  *to = *from++;
239  len--;
240  }
241  to++;
242  cnt++;
243  }
244  *to = 0;
245  return cnt;
246 }
247 
/* Byte length of an EUC_CN character: 2 if the high bit is set, else 1. */
static int
pg_euccn_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
259 
/*
 * Display width of an EUC_CN character: 2 columns if the high bit is set,
 * otherwise the ASCII rules apply.
 */
static int
pg_euccn_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
271 
272 /*
273  * EUC_TW
274  *
275  */
276 static int
277 pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
278 {
279  int cnt = 0;
280 
281  while (len > 0 && *from)
282  {
283  if (*from == SS2 && len >= 4) /* code set 2 */
284  {
285  from++;
286  *to = (((uint32) SS2) << 24) | (*from++ << 16);
287  *to |= *from++ << 8;
288  *to |= *from++;
289  len -= 4;
290  }
291  else if (*from == SS3 && len >= 3) /* code set 3 (unused?) */
292  {
293  from++;
294  *to = (SS3 << 16) | (*from++ << 8);
295  *to |= *from++;
296  len -= 3;
297  }
298  else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
299  {
300  *to = *from++ << 8;
301  *to |= *from++;
302  len -= 2;
303  }
304  else
305  {
306  *to = *from++;
307  len--;
308  }
309  to++;
310  cnt++;
311  }
312  *to = 0;
313  return cnt;
314 }
315 
316 static int
317 pg_euctw_mblen(const unsigned char *s)
318 {
319  int len;
320 
321  if (*s == SS2)
322  len = 4;
323  else if (*s == SS3)
324  len = 3;
325  else if (IS_HIGHBIT_SET(*s))
326  len = 2;
327  else
328  len = 1;
329  return len;
330 }
331 
332 static int
333 pg_euctw_dsplen(const unsigned char *s)
334 {
335  int len;
336 
337  if (*s == SS2)
338  len = 2;
339  else if (*s == SS3)
340  len = 2;
341  else if (IS_HIGHBIT_SET(*s))
342  len = 2;
343  else
344  len = pg_ascii_dsplen(s);
345  return len;
346 }
347 
348 /*
349  * Convert pg_wchar to EUC_* encoding.
350  * caller must allocate enough space for "to", including a trailing zero!
351  * len: length of from.
352  * "from" not necessarily null terminated.
353  */
354 static int
355 pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
356 {
357  int cnt = 0;
358 
359  while (len > 0 && *from)
360  {
361  unsigned char c;
362 
363  if ((c = (*from >> 24)))
364  {
365  *to++ = c;
366  *to++ = (*from >> 16) & 0xff;
367  *to++ = (*from >> 8) & 0xff;
368  *to++ = *from & 0xff;
369  cnt += 4;
370  }
371  else if ((c = (*from >> 16)))
372  {
373  *to++ = c;
374  *to++ = (*from >> 8) & 0xff;
375  *to++ = *from & 0xff;
376  cnt += 3;
377  }
378  else if ((c = (*from >> 8)))
379  {
380  *to++ = c;
381  *to++ = *from & 0xff;
382  cnt += 2;
383  }
384  else
385  {
386  *to++ = *from;
387  cnt++;
388  }
389  from++;
390  len--;
391  }
392  *to = 0;
393  return cnt;
394 }
395 
396 
397 /*
398  * JOHAB
399  */
/* Byte length of a JOHAB character: uses the generic EUC rules. */
static int
pg_johab_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
405 
/* Display width of a JOHAB character: uses the generic EUC rules. */
static int
pg_johab_dsplen(const unsigned char *s)
{
	int			width = pg_euc_dsplen(s);

	return width;
}
411 
412 /*
413  * convert UTF8 string to pg_wchar (UCS-4)
414  * caller must allocate enough space for "to", including a trailing zero!
415  * len: length of from.
416  * "from" not necessarily null terminated.
417  */
418 static int
419 pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
420 {
421  int cnt = 0;
422  uint32 c1,
423  c2,
424  c3,
425  c4;
426 
427  while (len > 0 && *from)
428  {
429  if ((*from & 0x80) == 0)
430  {
431  *to = *from++;
432  len--;
433  }
434  else if ((*from & 0xe0) == 0xc0)
435  {
436  if (len < 2)
437  break; /* drop trailing incomplete char */
438  c1 = *from++ & 0x1f;
439  c2 = *from++ & 0x3f;
440  *to = (c1 << 6) | c2;
441  len -= 2;
442  }
443  else if ((*from & 0xf0) == 0xe0)
444  {
445  if (len < 3)
446  break; /* drop trailing incomplete char */
447  c1 = *from++ & 0x0f;
448  c2 = *from++ & 0x3f;
449  c3 = *from++ & 0x3f;
450  *to = (c1 << 12) | (c2 << 6) | c3;
451  len -= 3;
452  }
453  else if ((*from & 0xf8) == 0xf0)
454  {
455  if (len < 4)
456  break; /* drop trailing incomplete char */
457  c1 = *from++ & 0x07;
458  c2 = *from++ & 0x3f;
459  c3 = *from++ & 0x3f;
460  c4 = *from++ & 0x3f;
461  *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
462  len -= 4;
463  }
464  else
465  {
466  /* treat a bogus char as length 1; not ours to raise error */
467  *to = *from++;
468  len--;
469  }
470  to++;
471  cnt++;
472  }
473  *to = 0;
474  return cnt;
475 }
476 
477 
478 /*
479  * Map a Unicode code point to UTF-8. utf8string must have 4 bytes of
480  * space allocated.
481  */
482 unsigned char *
483 unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
484 {
485  if (c <= 0x7F)
486  {
487  utf8string[0] = c;
488  }
489  else if (c <= 0x7FF)
490  {
491  utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
492  utf8string[1] = 0x80 | (c & 0x3F);
493  }
494  else if (c <= 0xFFFF)
495  {
496  utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
497  utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
498  utf8string[2] = 0x80 | (c & 0x3F);
499  }
500  else
501  {
502  utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
503  utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
504  utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
505  utf8string[3] = 0x80 | (c & 0x3F);
506  }
507 
508  return utf8string;
509 }
510 
511 /*
512  * Trivial conversion from pg_wchar to UTF-8.
513  * caller should allocate enough space for "to"
514  * len: length of from.
515  * "from" not necessarily null terminated.
516  */
517 static int
518 pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
519 {
520  int cnt = 0;
521 
522  while (len > 0 && *from)
523  {
524  int char_len;
525 
526  unicode_to_utf8(*from, to);
527  char_len = pg_utf_mblen(to);
528  cnt += char_len;
529  to += char_len;
530  from++;
531  len--;
532  }
533  *to = 0;
534  return cnt;
535 }
536 
/*
 * Return the byte length of the UTF8 character pointed to by s.
 *
 * Note: the current implementation does not support UTF8 sequences of more
 * than 4 bytes, so this must NOT return a value larger than 4.  Any leading
 * byte that is either flat-out illegal or indicates a longer sequence than
 * we support is reported as length 1.
 *
 * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
 * other places would need to be fixed to change this.
 */
int
pg_utf_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if ((lead & 0x80) == 0)
		return 1;
	if ((lead & 0xe0) == 0xc0)
		return 2;
	if ((lead & 0xf0) == 0xe0)
		return 3;
	if ((lead & 0xf8) == 0xf0)
		return 4;
#ifdef NOT_USED
	if ((lead & 0xfc) == 0xf8)
		return 5;
	if ((lead & 0xfe) == 0xfc)
		return 6;
#endif

	return 1;
}
571 
572 /*
573  * This is an implementation of wcwidth() and wcswidth() as defined in
574  * "The Single UNIX Specification, Version 2, The Open Group, 1997"
575  * <http://www.unix.org/online.html>
576  *
577  * Markus Kuhn -- 2001-09-08 -- public domain
578  *
579  * customised for PostgreSQL
580  *
581  * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
582  */
583 
/*
 * One inclusive [first, last] range of code points, as stored in the
 * interval tables searched by mbbisearch().
 */
struct mbinterval
{
	unsigned short first;
	unsigned short last;
};
589 
590 /* auxiliary function for binary search in interval table */
591 static int
592 mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
593 {
594  int min = 0;
595  int mid;
596 
597  if (ucs < table[0].first || ucs > table[max].last)
598  return 0;
599  while (max >= min)
600  {
601  mid = (min + max) / 2;
602  if (ucs > table[mid].last)
603  min = mid + 1;
604  else if (ucs < table[mid].first)
605  max = mid - 1;
606  else
607  return 1;
608  }
609 
610  return 0;
611 }
612 
613 
614 /* The following functions define the column width of an ISO 10646
615  * character as follows:
616  *
617  * - The null character (U+0000) has a column width of 0.
618  *
619  * - Other C0/C1 control characters and DEL will lead to a return
620  * value of -1.
621  *
622  * - Non-spacing and enclosing combining characters (general
623  * category code Mn or Me in the Unicode database) have a
624  * column width of 0.
625  *
626  * - Other format characters (general category code Cf in the Unicode
627  * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
628  *
629  * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
630  * have a column width of 0.
631  *
632  * - Spacing characters in the East Asian Wide (W) or East Asian
633  * FullWidth (F) category as defined in Unicode Technical
634  * Report #11 have a column width of 2.
635  *
636  * - All remaining characters (including all printable
637  * ISO 8859-1 and WGL4 characters, Unicode control characters,
638  * etc.) have a column width of 1.
639  *
640  * This implementation assumes that wchar_t characters are encoded
641  * in ISO 10646.
642  */
643 
644 static int
646 {
648 
649  /* test for 8-bit control characters */
650  if (ucs == 0)
651  return 0;
652 
653  if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
654  return -1;
655 
656  /* binary search in table of non-spacing characters */
657  if (mbbisearch(ucs, combining,
658  sizeof(combining) / sizeof(struct mbinterval) - 1))
659  return 0;
660 
661  /*
662  * if we arrive here, ucs is not a combining or C0/C1 control character
663  */
664 
665  return 1 +
666  (ucs >= 0x1100 &&
667  (ucs <= 0x115f || /* Hangul Jamo init. consonants */
668  (ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a &&
669  ucs != 0x303f) || /* CJK ... Yi */
670  (ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */
671  (ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility
672  * Ideographs */
673  (ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */
674  (ucs >= 0xff00 && ucs <= 0xff5f) || /* Fullwidth Forms */
675  (ucs >= 0xffe0 && ucs <= 0xffe6) ||
676  (ucs >= 0x20000 && ucs <= 0x2ffff)));
677 }
678 
679 /*
680  * Convert a UTF-8 character to a Unicode code point.
681  * This is a one-character version of pg_utf2wchar_with_len.
682  *
683  * No error checks here, c must point to a long-enough string.
684  */
685 pg_wchar
686 utf8_to_unicode(const unsigned char *c)
687 {
688  if ((*c & 0x80) == 0)
689  return (pg_wchar) c[0];
690  else if ((*c & 0xe0) == 0xc0)
691  return (pg_wchar) (((c[0] & 0x1f) << 6) |
692  (c[1] & 0x3f));
693  else if ((*c & 0xf0) == 0xe0)
694  return (pg_wchar) (((c[0] & 0x0f) << 12) |
695  ((c[1] & 0x3f) << 6) |
696  (c[2] & 0x3f));
697  else if ((*c & 0xf8) == 0xf0)
698  return (pg_wchar) (((c[0] & 0x07) << 18) |
699  ((c[1] & 0x3f) << 12) |
700  ((c[2] & 0x3f) << 6) |
701  (c[3] & 0x3f));
702  else
703  /* that is an invalid code on purpose */
704  return 0xffffffff;
705 }
706 
707 static int
708 pg_utf_dsplen(const unsigned char *s)
709 {
710  return ucs_wcwidth(utf8_to_unicode(s));
711 }
712 
713 /*
714  * convert mule internal code to pg_wchar
715  * caller should allocate enough space for "to"
716  * len: length of from.
717  * "from" not necessarily null terminated.
718  */
719 static int
720 pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
721 {
722  int cnt = 0;
723 
724  while (len > 0 && *from)
725  {
726  if (IS_LC1(*from) && len >= 2)
727  {
728  *to = *from++ << 16;
729  *to |= *from++;
730  len -= 2;
731  }
732  else if (IS_LCPRV1(*from) && len >= 3)
733  {
734  from++;
735  *to = *from++ << 16;
736  *to |= *from++;
737  len -= 3;
738  }
739  else if (IS_LC2(*from) && len >= 3)
740  {
741  *to = *from++ << 16;
742  *to |= *from++ << 8;
743  *to |= *from++;
744  len -= 3;
745  }
746  else if (IS_LCPRV2(*from) && len >= 4)
747  {
748  from++;
749  *to = *from++ << 16;
750  *to |= *from++ << 8;
751  *to |= *from++;
752  len -= 4;
753  }
754  else
755  { /* assume ASCII */
756  *to = (unsigned char) *from++;
757  len--;
758  }
759  to++;
760  cnt++;
761  }
762  *to = 0;
763  return cnt;
764 }
765 
766 /*
767  * convert pg_wchar to mule internal code
768  * caller should allocate enough space for "to"
769  * len: length of from.
770  * "from" not necessarily null terminated.
771  */
772 static int
773 pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
774 {
775  int cnt = 0;
776 
777  while (len > 0 && *from)
778  {
779  unsigned char lb;
780 
781  lb = (*from >> 16) & 0xff;
782  if (IS_LC1(lb))
783  {
784  *to++ = lb;
785  *to++ = *from & 0xff;
786  cnt += 2;
787  }
788  else if (IS_LC2(lb))
789  {
790  *to++ = lb;
791  *to++ = (*from >> 8) & 0xff;
792  *to++ = *from & 0xff;
793  cnt += 3;
794  }
795  else if (IS_LCPRV1_A_RANGE(lb))
796  {
797  *to++ = LCPRV1_A;
798  *to++ = lb;
799  *to++ = *from & 0xff;
800  cnt += 3;
801  }
802  else if (IS_LCPRV1_B_RANGE(lb))
803  {
804  *to++ = LCPRV1_B;
805  *to++ = lb;
806  *to++ = *from & 0xff;
807  cnt += 3;
808  }
809  else if (IS_LCPRV2_A_RANGE(lb))
810  {
811  *to++ = LCPRV2_A;
812  *to++ = lb;
813  *to++ = (*from >> 8) & 0xff;
814  *to++ = *from & 0xff;
815  cnt += 4;
816  }
817  else if (IS_LCPRV2_B_RANGE(lb))
818  {
819  *to++ = LCPRV2_B;
820  *to++ = lb;
821  *to++ = (*from >> 8) & 0xff;
822  *to++ = *from & 0xff;
823  cnt += 4;
824  }
825  else
826  {
827  *to++ = *from & 0xff;
828  cnt += 1;
829  }
830  from++;
831  len--;
832  }
833  *to = 0;
834  return cnt;
835 }
836 
/*
 * Byte length of a mule-internal-code character, decided from its first
 * byte: 2 for LC1, 3 for LCPRV1 or LC2, 4 for LCPRV2, otherwise 1.
 *
 * Exported for direct use by conv.c.
 */
int
pg_mule_mblen(const unsigned char *s)
{
	if (IS_LC1(*s))
		return 2;
	if (IS_LCPRV1(*s))
		return 3;
	if (IS_LC2(*s))
		return 3;
	if (IS_LCPRV2(*s))
		return 4;

	return 1;					/* assume ASCII */
}
855 
/*
 * Display width of a mule-internal-code character: 1 column for LC1 and
 * LCPRV1, 2 columns for LC2 and LCPRV2, and 1 for anything else.
 *
 * Note: it's not really appropriate to assume that all multibyte charsets
 * are double-wide on screen.  But this seems an okay approximation for
 * the MULE charsets we currently support.
 */
static int
pg_mule_dsplen(const unsigned char *s)
{
	if (IS_LC1(*s) || IS_LCPRV1(*s))
		return 1;
	if (IS_LC2(*s) || IS_LCPRV2(*s))
		return 2;

	return 1;					/* assume ASCII */
}
880 
881 /*
882  * ISO8859-1
883  */
884 static int
885 pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
886 {
887  int cnt = 0;
888 
889  while (len > 0 && *from)
890  {
891  *to++ = *from++;
892  len--;
893  cnt++;
894  }
895  *to = 0;
896  return cnt;
897 }
898 
899 /*
900  * Trivial conversion from pg_wchar to single byte encoding. Just ignores
901  * high bits.
902  * caller should allocate enough space for "to"
903  * len: length of from.
904  * "from" not necessarily null terminated.
905  */
906 static int
907 pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
908 {
909  int cnt = 0;
910 
911  while (len > 0 && *from)
912  {
913  *to++ = *from++;
914  len--;
915  cnt++;
916  }
917  *to = 0;
918  return cnt;
919 }
920 
/* Byte length of a character: always 1; the input byte is not examined. */
static int
pg_latin1_mblen(const unsigned char *s)
{
	(void) s;

	return 1;
}
926 
/* Display width of a single-byte character: same as the ASCII rules. */
static int
pg_latin1_dsplen(const unsigned char *s)
{
	int			width = pg_ascii_dsplen(s);

	return width;
}
932 
933 /*
934  * SJIS
935  */
/*
 * Byte length of an SJIS character, decided from its first byte: bytes in
 * 0xa1..0xdf (1 byte kana?) and bytes without the high bit are 1 byte long;
 * any other high-bit byte (kanji?) starts a 2-byte character.
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
	const unsigned char ch = *s;
	const int	is_kana = (ch >= 0xa1 && ch <= 0xdf);

	if (!is_kana && IS_HIGHBIT_SET(ch))
		return 2;

	return 1;
}
949 
/*
 * Display width of an SJIS character: 1 column for bytes in 0xa1..0xdf
 * (1 byte kana?), 2 columns for any other high-bit lead byte (kanji?),
 * otherwise the ASCII rules apply.
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
	if (*s >= 0xa1 && *s <= 0xdf)
		return 1;
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
963 
964 /*
965  * Big5
966  */
/* Byte length of a Big5 character: 2 if the high bit is set, else 1. */
static int
pg_big5_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
978 
/*
 * Display width of a Big5 character: 2 columns if the high bit is set,
 * otherwise the ASCII rules apply.
 */
static int
pg_big5_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
990 
991 /*
992  * GBK
993  */
/* Byte length of a GBK character: 2 if the high bit is set, else 1. */
static int
pg_gbk_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
1005 
/*
 * Display width of a GBK character: 2 columns if the high bit is set,
 * otherwise the ASCII rules apply.
 */
static int
pg_gbk_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
1017 
1018 /*
1019  * UHC
1020  */
/* Byte length of a UHC character: 2 if the high bit is set, else 1. */
static int
pg_uhc_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
1032 
/*
 * Display width of a UHC character: 2 columns if the high bit is set,
 * otherwise the ASCII rules apply.
 */
static int
pg_uhc_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
1044 
1045 /*
1046  * GB18030
1047  * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
1048  */
1049 
/*
 * Byte length of a GB18030 character.
 *
 * Unlike all other mblen() functions, this also looks at the second byte of
 * the input.  However, if you only pass the first byte of a multi-byte
 * string, and \0 as the second byte, this still works in a predictable way:
 * a 4-byte character will be reported as two 2-byte characters.  That's
 * enough for all current uses, as a client-only encoding.  It works that
 * way because, in any valid 4-byte GB18030-encoded character, the third and
 * fourth bytes look like a 2-byte encoded character when looked at
 * separately.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;				/* ASCII */

	/* a second byte in 0x30..0x39 marks the 4-byte form */
	return (s[1] >= 0x30 && s[1] <= 0x39) ? 4 : 2;
}
1073 
/*
 * Display width of a GB18030 character: 2 columns if the high bit is set,
 * otherwise the ASCII rules apply.
 */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* ASCII */
}
1085 
1086 /*
1087  *-------------------------------------------------------------------
1088  * multibyte sequence validators
1089  *
1090  * The verifychar functions accept "s", a pointer to the first byte of a
1091  * string, and "len", the remaining length of the string. If there is a
1092  * validly encoded character beginning at *s, return its length in bytes;
1093  * else return -1.
1094  *
1095  * The verifystr functions also accept "s", a pointer to a string and "len",
1096  * the length of the string. They verify the whole string, and return the
1097  * number of input bytes (<= len) that are valid. In other words, if the
1098  * whole string is valid, verifystr returns "len", otherwise it returns the
1099  * byte offset of the first invalid character. The verifystr functions must
1100  * test for and reject zeroes in the input.
1101  *
1102  * The verifychar functions can assume that len > 0 and that *s != '\0', but
1103  * they must test for and reject zeroes in any additional bytes of a
1104  * multibyte character. Note that this definition allows the function for a
1105  * single-byte encoding to be just "return 1".
1106  *-------------------------------------------------------------------
1107  */
/*
 * Verify one character: always reports a valid 1-byte character.  Neither
 * argument is examined.
 */
static int
pg_ascii_verifychar(const unsigned char *s, int len)
{
	(void) s;
	(void) len;

	return 1;
}
1113 
/*
 * Verify a string of "len" bytes: only an embedded zero byte is rejected.
 * Returns "len" if there is none, else the offset of the first zero byte.
 */
static int
pg_ascii_verifystr(const unsigned char *s, int len)
{
	const unsigned char *zero = memchr(s, 0, len);

	if (zero != NULL)
		return zero - s;

	return len;
}
1124 
/* True if byte value "c" lies in 0xa1..0xfe; may evaluate "c" twice. */
#define IS_EUC_RANGE_VALID(c)	(!((c) < 0xa1 || (c) > 0xfe))
1126 
1127 static int
1128 pg_eucjp_verifychar(const unsigned char *s, int len)
1129 {
1130  int l;
1131  unsigned char c1,
1132  c2;
1133 
1134  c1 = *s++;
1135 
1136  switch (c1)
1137  {
1138  case SS2: /* JIS X 0201 */
1139  l = 2;
1140  if (l > len)
1141  return -1;
1142  c2 = *s++;
1143  if (c2 < 0xa1 || c2 > 0xdf)
1144  return -1;
1145  break;
1146 
1147  case SS3: /* JIS X 0212 */
1148  l = 3;
1149  if (l > len)
1150  return -1;
1151  c2 = *s++;
1152  if (!IS_EUC_RANGE_VALID(c2))
1153  return -1;
1154  c2 = *s++;
1155  if (!IS_EUC_RANGE_VALID(c2))
1156  return -1;
1157  break;
1158 
1159  default:
1160  if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1161  {
1162  l = 2;
1163  if (l > len)
1164  return -1;
1165  if (!IS_EUC_RANGE_VALID(c1))
1166  return -1;
1167  c2 = *s++;
1168  if (!IS_EUC_RANGE_VALID(c2))
1169  return -1;
1170  }
1171  else
1172  /* must be ASCII */
1173  {
1174  l = 1;
1175  }
1176  break;
1177  }
1178 
1179  return l;
1180 }
1181 
/*
 * Verify an EUC_JP string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if the whole string is valid, else the offset
 * of the first invalid character or zero byte.
 */
static int
pg_eucjp_verifystr(const unsigned char *s, int len)
{
	int			pos = 0;

	while (pos < len)
	{
		const unsigned char *p = s + pos;
		int			l;

		if (IS_HIGHBIT_SET(*p))
		{
			l = pg_eucjp_verifychar(p, len - pos);
			if (l == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* embedded zero is never valid */
		else
			l = 1;				/* fast path for ASCII-subset characters */

		pos += l;
	}

	return pos;
}
1210 
/*
 * Verify one EUC_KR character starting at "s", with "len" bytes remaining.
 * Returns 1 for a byte without the high bit, 2 for a pair of bytes that are
 * both in the EUC range, and -1 otherwise.
 */
static int
pg_euckr_verifychar(const unsigned char *s, int len)
{
	const unsigned char c1 = s[0];

	/* must be ASCII */
	if (!IS_HIGHBIT_SET(c1))
		return 1;

	if (len < 2)
		return -1;
	if (!IS_EUC_RANGE_VALID(c1))
		return -1;
	if (!IS_EUC_RANGE_VALID(s[1]))
		return -1;

	return 2;
}
1239 
/*
 * Verify an EUC_KR string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if the whole string is valid, else the offset
 * of the first invalid character or zero byte.
 */
static int
pg_euckr_verifystr(const unsigned char *s, int len)
{
	int			pos = 0;

	while (pos < len)
	{
		const unsigned char *p = s + pos;
		int			l;

		if (IS_HIGHBIT_SET(*p))
		{
			l = pg_euckr_verifychar(p, len - pos);
			if (l == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* embedded zero is never valid */
		else
			l = 1;				/* fast path for ASCII-subset characters */

		pos += l;
	}

	return pos;
}
1268 
/*
 * EUC-CN byte sequences are exactly the same as EUC-KR, so the EUC-CN
 * verifiers are plain aliases for the EUC-KR ones.
 */
#define pg_euccn_verifychar		pg_euckr_verifychar
#define pg_euccn_verifystr		pg_euckr_verifystr
1272 
1273 static int
1274 pg_euctw_verifychar(const unsigned char *s, int len)
1275 {
1276  int l;
1277  unsigned char c1,
1278  c2;
1279 
1280  c1 = *s++;
1281 
1282  switch (c1)
1283  {
1284  case SS2: /* CNS 11643 Plane 1-7 */
1285  l = 4;
1286  if (l > len)
1287  return -1;
1288  c2 = *s++;
1289  if (c2 < 0xa1 || c2 > 0xa7)
1290  return -1;
1291  c2 = *s++;
1292  if (!IS_EUC_RANGE_VALID(c2))
1293  return -1;
1294  c2 = *s++;
1295  if (!IS_EUC_RANGE_VALID(c2))
1296  return -1;
1297  break;
1298 
1299  case SS3: /* unused */
1300  return -1;
1301 
1302  default:
1303  if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1304  {
1305  l = 2;
1306  if (l > len)
1307  return -1;
1308  /* no further range check on c1? */
1309  c2 = *s++;
1310  if (!IS_EUC_RANGE_VALID(c2))
1311  return -1;
1312  }
1313  else
1314  /* must be ASCII */
1315  {
1316  l = 1;
1317  }
1318  break;
1319  }
1320  return l;
1321 }
1322 
/*
 * Verify an EUC_TW string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if the whole string is valid, else the offset
 * of the first invalid character or zero byte.
 */
static int
pg_euctw_verifystr(const unsigned char *s, int len)
{
	int			pos = 0;

	while (pos < len)
	{
		const unsigned char *p = s + pos;
		int			l;

		if (IS_HIGHBIT_SET(*p))
		{
			l = pg_euctw_verifychar(p, len - pos);
			if (l == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* embedded zero is never valid */
		else
			l = 1;				/* fast path for ASCII-subset characters */

		pos += l;
	}

	return pos;
}
1351 
/*
 * Verify one JOHAB character starting at "s", with "len" bytes remaining.
 * The length comes from pg_johab_mblen(); for a high-bit lead byte every
 * trailing byte must be in the EUC range.  Returns the byte length, or -1
 * if the character is truncated or a trailing byte is out of range.
 */
static int
pg_johab_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_johab_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	/* single-byte characters need no further checking */
	if (!IS_HIGHBIT_SET(*s))
		return mbl;

	for (i = 1; i < mbl; i++)
	{
		const unsigned char c = s[i];

		if (!IS_EUC_RANGE_VALID(c))
			return -1;
	}

	return mbl;
}
1375 
/*
 * Verify a JOHAB string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if the whole string is valid, else the offset
 * of the first invalid character or zero byte.
 */
static int
pg_johab_verifystr(const unsigned char *s, int len)
{
	int			pos = 0;

	while (pos < len)
	{
		const unsigned char *p = s + pos;
		int			l;

		if (IS_HIGHBIT_SET(*p))
		{
			l = pg_johab_verifychar(p, len - pos);
			if (l == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* embedded zero is never valid */
		else
			l = 1;				/* fast path for ASCII-subset characters */

		pos += l;
	}

	return pos;
}
1404 
/*
 * Verify one mule-internal-code character starting at "s", with "len" bytes
 * remaining.  The length comes from pg_mule_mblen(); every trailing byte
 * must have its high bit set.  Returns the byte length, or -1 if the
 * character is truncated or a trailing byte lacks the high bit.
 */
static int
pg_mule_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_mule_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		const unsigned char c = s[i];

		if (!IS_HIGHBIT_SET(c))
			return -1;
	}

	return mbl;
}
1425 
/*
 * Verify a mule-internal-code string of "len" bytes.  Returns the number of
 * leading bytes that are valid: "len" if the whole string is valid, else
 * the offset of the first invalid character or zero byte.
 */
static int
pg_mule_verifystr(const unsigned char *s, int len)
{
	int			pos = 0;

	while (pos < len)
	{
		const unsigned char *p = s + pos;
		int			l;

		if (IS_HIGHBIT_SET(*p))
		{
			l = pg_mule_verifychar(p, len - pos);
			if (l == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* embedded zero is never valid */
		else
			l = 1;				/* fast path for ASCII-subset characters */

		pos += l;
	}

	return pos;
}
1454 
/*
 * Verify one character: always reports a valid 1-byte character.  Neither
 * argument is examined.
 */
static int
pg_latin1_verifychar(const unsigned char *s, int len)
{
	(void) s;
	(void) len;

	return 1;
}
1460 
/*
 * Verify a string of "len" bytes: only an embedded zero byte is rejected.
 * Returns "len" if there is none, else the offset of the first zero byte.
 */
static int
pg_latin1_verifystr(const unsigned char *s, int len)
{
	const unsigned char *zero = memchr(s, 0, len);

	if (zero != NULL)
		return zero - s;

	return len;
}
1471 
/*
 * Verify one SJIS character starting at "s", with "len" bytes remaining.
 * The length comes from pg_sjis_mblen(); a 2-byte character must pass the
 * ISSJISHEAD test on its first byte and ISSJISTAIL on its second.  Returns
 * the byte length, or -1 if the character is truncated or fails those tests.
 */
static int
pg_sjis_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_sjis_mblen(s);
	unsigned char head;
	unsigned char tail;

	if (len < mbl)
		return -1;

	/* pg_sjis_mblen already verified single-byte characters */
	if (mbl == 1)
		return mbl;

	head = s[0];
	tail = s[1];

	return (ISSJISHEAD(head) && ISSJISTAIL(tail)) ? mbl : -1;
}
1494 
/*
 * Verify an SJIS string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if the whole string is valid, else the offset
 * of the first invalid character or zero byte.
 */
static int
pg_sjis_verifystr(const unsigned char *s, int len)
{
	int			pos = 0;

	while (pos < len)
	{
		const unsigned char *p = s + pos;
		int			l;

		if (IS_HIGHBIT_SET(*p))
		{
			l = pg_sjis_verifychar(p, len - pos);
			if (l == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* embedded zero is never valid */
		else
			l = 1;				/* fast path for ASCII-subset characters */

		pos += l;
	}

	return pos;
}
1523 
/*
 * Verify a single Big5 character.
 *
 * The expected byte length comes from pg_big5_mblen().  The character is
 * rejected if it does not fit within "len" bytes or if any byte after the
 * first one is NUL.
 *
 * Returns the character's byte length, or -1 if it is invalid.
 */
static int
pg_big5_verifychar(const unsigned char *s, int len)
{
	const int	chlen = pg_big5_mblen(s);
	int			i;

	/* not enough input left to hold the whole character */
	if (chlen > len)
		return -1;

	/* a NUL may not appear inside a multibyte character */
	for (i = 1; i < chlen; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return chlen;
}
1543 
/*
 * Verify a Big5 string of at most "len" bytes.
 *
 * Returns the number of leading bytes that verified successfully.  Scanning
 * stops at the first NUL byte or at the first character rejected by
 * pg_big5_verifychar().
 */
static int
pg_big5_verifystr(const unsigned char *s, int len)
{
	int			valid = 0;		/* bytes accepted so far */

	while (valid < len)
	{
		const unsigned char *p = s + valid;
		int			chlen;

		if (IS_HIGHBIT_SET(*p))
		{
			/* high-bit byte: hand it to the per-character verifier */
			chlen = pg_big5_verifychar(p, len - valid);
			if (chlen == -1)
				break;
		}
		else if (*p != '\0')
			chlen = 1;			/* ASCII-subset fast path */
		else
			break;				/* embedded NUL ends the valid prefix */

		valid += chlen;
	}

	return valid;
}
1572 
/*
 * Verify a single GBK character.
 *
 * The expected byte length comes from pg_gbk_mblen().  The character is
 * rejected if it does not fit within "len" bytes or if any byte after the
 * first one is NUL.
 *
 * Returns the character's byte length, or -1 if it is invalid.
 */
static int
pg_gbk_verifychar(const unsigned char *s, int len)
{
	const int	chlen = pg_gbk_mblen(s);
	int			i;

	/* not enough input left to hold the whole character */
	if (chlen > len)
		return -1;

	/* a NUL may not appear inside a multibyte character */
	for (i = 1; i < chlen; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return chlen;
}
1592 
/*
 * Verify a GBK string of at most "len" bytes.
 *
 * Returns the number of leading bytes that verified successfully.  Scanning
 * stops at the first NUL byte or at the first character rejected by
 * pg_gbk_verifychar().
 */
static int
pg_gbk_verifystr(const unsigned char *s, int len)
{
	int			valid = 0;		/* bytes accepted so far */

	while (valid < len)
	{
		const unsigned char *p = s + valid;
		int			chlen;

		if (IS_HIGHBIT_SET(*p))
		{
			/* high-bit byte: hand it to the per-character verifier */
			chlen = pg_gbk_verifychar(p, len - valid);
			if (chlen == -1)
				break;
		}
		else if (*p != '\0')
			chlen = 1;			/* ASCII-subset fast path */
		else
			break;				/* embedded NUL ends the valid prefix */

		valid += chlen;
	}

	return valid;
}
1621 
/*
 * Verify a single UHC character.
 *
 * The expected byte length comes from pg_uhc_mblen().  The character is
 * rejected if it does not fit within "len" bytes or if any byte after the
 * first one is NUL.
 *
 * Returns the character's byte length, or -1 if it is invalid.
 */
static int
pg_uhc_verifychar(const unsigned char *s, int len)
{
	const int	chlen = pg_uhc_mblen(s);
	int			i;

	/* not enough input left to hold the whole character */
	if (chlen > len)
		return -1;

	/* a NUL may not appear inside a multibyte character */
	for (i = 1; i < chlen; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return chlen;
}
1641 
/*
 * Verify a UHC string of at most "len" bytes.
 *
 * Returns the number of leading bytes that verified successfully.  Scanning
 * stops at the first NUL byte or at the first character rejected by
 * pg_uhc_verifychar().
 */
static int
pg_uhc_verifystr(const unsigned char *s, int len)
{
	int			valid = 0;		/* bytes accepted so far */

	while (valid < len)
	{
		const unsigned char *p = s + valid;
		int			chlen;

		if (IS_HIGHBIT_SET(*p))
		{
			/* high-bit byte: hand it to the per-character verifier */
			chlen = pg_uhc_verifychar(p, len - valid);
			if (chlen == -1)
				break;
		}
		else if (*p != '\0')
			chlen = 1;			/* ASCII-subset fast path */
		else
			break;				/* embedded NUL ends the valid prefix */

		valid += chlen;
	}

	return valid;
}
1670 
/*
 * Verify a single GB18030 character.
 *
 * Accepted forms, as checked below:
 *	 1 byte:  first byte has the high bit clear
 *	 4 bytes: 0x81-0xfe, 0x30-0x39, 0x81-0xfe, 0x30-0x39
 *	 2 bytes: 0x81-0xfe, then 0x40-0x7e or 0x80-0xfe
 *
 * The second byte decides between the 4-byte and 2-byte forms, and is only
 * examined when "len" says it is present.
 *
 * Returns the character's byte length, or -1 if it is invalid.
 */
static int
pg_gb18030_verifychar(const unsigned char *s, int len)
{
	const unsigned char b0 = s[0];

	if (!IS_HIGHBIT_SET(b0))
		return 1;				/* ASCII */

	if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
	{
		/* Should be 4-byte, validate remaining bytes */
		if (b0 >= 0x81 && b0 <= 0xfe &&
			s[2] >= 0x81 && s[2] <= 0xfe &&
			s[3] >= 0x30 && s[3] <= 0x39)
			return 4;

		return -1;
	}

	if (len >= 2 && b0 >= 0x81 && b0 <= 0xfe)
	{
		/* Should be 2-byte, validate the trail byte */
		const unsigned char b1 = s[1];

		if ((b1 >= 0x40 && b1 <= 0x7e) ||
			(b1 >= 0x80 && b1 <= 0xfe))
			return 2;

		return -1;
	}

	/* bad lead byte, or input too short for any multibyte form */
	return -1;
}
1701 
/*
 * Verify a GB18030 string of at most "len" bytes.
 *
 * Returns the number of leading bytes that verified successfully.  Scanning
 * stops at the first NUL byte or at the first character rejected by
 * pg_gb18030_verifychar().
 */
static int
pg_gb18030_verifystr(const unsigned char *s, int len)
{
	int			valid = 0;		/* bytes accepted so far */

	while (valid < len)
	{
		const unsigned char *p = s + valid;
		int			chlen;

		if (IS_HIGHBIT_SET(*p))
		{
			/* high-bit byte: hand it to the per-character verifier */
			chlen = pg_gb18030_verifychar(p, len - valid);
			if (chlen == -1)
				break;
		}
		else if (*p != '\0')
			chlen = 1;			/* ASCII-subset fast path */
		else
			break;				/* embedded NUL ends the valid prefix */

		valid += chlen;
	}

	return valid;
}
1730 
/*
 * Verify a single UTF-8 character.
 *
 * The sequence length is derived from the lead byte, checked against the
 * "len" bytes available, and the bytes are then validated by
 * pg_utf8_islegal().  A NUL byte is rejected.
 *
 * Returns the character's byte length, or -1 if it is invalid.
 */
static int
pg_utf8_verifychar(const unsigned char *s, int len)
{
	const unsigned char lead = *s;
	int			chlen;

	/* single-byte (ASCII) case; NUL is not an acceptable character */
	if ((lead & 0x80) == 0)
		return (lead == '\0') ? -1 : 1;

	/*
	 * Classify the lead byte by range.  Continuation bytes (0x80-0xbf) and
	 * 0xf8 and up get length 1, which pg_utf8_islegal() then rejects.
	 */
	if (lead >= 0xf8)
		chlen = 1;
	else if (lead >= 0xf0)
		chlen = 4;
	else if (lead >= 0xe0)
		chlen = 3;
	else if (lead >= 0xc0)
		chlen = 2;
	else
		chlen = 1;

	if (chlen > len || !pg_utf8_islegal(s, chlen))
		return -1;

	return chlen;
}
1759 
/*
 * Verify a UTF-8 string of at most "len" bytes.
 *
 * Returns the number of leading bytes that verified successfully.  Scanning
 * stops at the first NUL byte or at the first character rejected by
 * pg_utf8_verifychar().
 */
static int
pg_utf8_verifystr(const unsigned char *s, int len)
{
	int			valid = 0;		/* bytes accepted so far */

	while (valid < len)
	{
		const unsigned char *p = s + valid;
		int			chlen;

		if (IS_HIGHBIT_SET(*p))
		{
			/* multibyte candidate: hand it to the per-character verifier */
			chlen = pg_utf8_verifychar(p, len - valid);
			if (chlen == -1)
				break;
		}
		else if (*p != '\0')
			chlen = 1;			/* ASCII-subset fast path */
		else
			break;				/* embedded NUL ends the valid prefix */

		valid += chlen;
	}

	return valid;
}
1788 
/*
 * Check for validity of a single UTF-8 encoded character
 *
 * This implements the byte-sequence rules of RFC3629.  The second byte is
 * held to a narrower range after certain lead bytes so that each Unicode
 * code point has exactly one encoding: a longer-than-necessary sequence
 * with high-order zero bits must not be accepted for a character that fits
 * in fewer bytes.  Accepting such sequences would be a security hazard
 * (eg, an apparent non-ASCII character that decodes to plain ASCII).
 *
 * length is assumed to have been obtained by pg_utf_mblen(), and the
 * caller must have checked that that many bytes are present in the buffer.
 * Lengths outside 1..4 are always rejected.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char lead;
	unsigned char lo = 0x80;	/* allowed range for the second byte */
	unsigned char hi = 0xBF;
	int			i;

	/* reject lengths 5 and 6 (and anything else out of range) for now */
	if (length < 1 || length > 4)
		return false;

	lead = source[0];

	/* third and fourth bytes, when present, are plain continuation bytes */
	for (i = length - 1; i >= 2; i--)
	{
		if (source[i] < 0x80 || source[i] > 0xBF)
			return false;
	}

	if (length >= 2)
	{
		/* some lead bytes narrow the range allowed for the second byte */
		switch (lead)
		{
			case 0xE0:
				lo = 0xA0;		/* no overlong 3-byte forms */
				break;
			case 0xED:
				hi = 0x9F;		/* stay below the surrogate range */
				break;
			case 0xF0:
				lo = 0x90;		/* no overlong 4-byte forms */
				break;
			case 0xF4:
				hi = 0x8F;		/* stay at or below U+10FFFF */
				break;
			default:
				break;
		}

		if (source[1] < lo || source[1] > hi)
			return false;
	}

	/* finally the lead byte itself: no 0x80..0xC1, nothing above 0xF4 */
	if (lead >= 0x80 && lead < 0xC2)
		return false;
	if (lead > 0xF4)
		return false;

	return true;
}
1859 
1860 
1861 /*
1862  *-------------------------------------------------------------------
1863  * encoding info table
1864  * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
1865  *-------------------------------------------------------------------
1866  */
1905  {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2}, /* PG_GBK */
1906  {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2}, /* PG_UHC */
1909  {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2} /* PG_SHIFT_JIS_2004 */
1910 };
1911 
1912 /*
1913  * Returns the byte length of a multibyte character.
1914  *
1915  * Caution: when dealing with text that is not certainly valid in the
1916  * specified encoding, the result may exceed the actual remaining
1917  * string length. Callers that are not prepared to deal with that
1918  * should use pg_encoding_mblen_bounded() instead.
1919  */
1920 int
1921 pg_encoding_mblen(int encoding, const char *mbstr)
1922 {
1923  return (PG_VALID_ENCODING(encoding) ?
1924  pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
1925  pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
1926 }
1927 
/*
 * Returns the byte length of a multibyte character, clamped so that it
 * never reaches past the terminating NUL of mbstr.
 */
int
pg_encoding_mblen_bounded(int encoding, const char *mbstr)
{
	/* length the encoding claims, judging from the leading byte(s) */
	const int	claimed = pg_encoding_mblen(encoding, mbstr);

	return strnlen(mbstr, claimed);
}
1937 
1938 /*
1939  * Returns the display length of a multibyte character.
1940  */
1941 int
1942 pg_encoding_dsplen(int encoding, const char *mbstr)
1943 {
1944  return (PG_VALID_ENCODING(encoding) ?
1945  pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
1946  pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
1947 }
1948 
1949 /*
1950  * Verify the first multibyte character of the given string.
1951  * Return its byte length if good, -1 if bad. (See comments above for
1952  * full details of the mbverifychar API.)
1953  */
1954 int
1955 pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
1956 {
1957  return (PG_VALID_ENCODING(encoding) ?
1958  pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
1959  pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
1960 }
1961 
1962 /*
1963  * Verify that a string is valid for the given encoding.
1964  * Returns the number of input bytes (<= len) that form a valid string.
1965  * (See comments above for full details of the mbverifystr API.)
1966  */
1967 int
1968 pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
1969 {
1970  return (PG_VALID_ENCODING(encoding) ?
1971  pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
1972  pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
1973 }
1974 
1975 /*
1976  * fetch maximum length of a given encoding
1977  */
1978 int
1980 {
1981  Assert(PG_VALID_ENCODING(encoding));
1982 
1983  return pg_wchar_table[encoding].maxmblen;
1984 }
static int pg_sjis_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1496
static int pg_utf8_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1761
static int pg_euctw_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1324
static int pg_ascii_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1109
static int pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:210
static int ucs_wcwidth(pg_wchar ucs)
Definition: wchar.c:645
static int pg_euctw_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1274
static int pg_euccn_dsplen(const unsigned char *s)
Definition: wchar.c:261
pg_wchar utf8_to_unicode(const unsigned char *c)
Definition: wchar.c:686
static int pg_latin1_dsplen(const unsigned char *s)
Definition: wchar.c:928
#define IS_LC2(c)
Definition: pg_wchar.h:144
static int pg_ascii_dsplen(const unsigned char *s)
Definition: wchar.c:69
bool pg_utf8_islegal(const unsigned char *source, int length)
Definition: wchar.c:1804
unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
Definition: wchar.c:483
static int pg_latin1_mblen(const unsigned char *s)
Definition: wchar.c:922
static int pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:885
unsigned short last
Definition: wchar.c:587
static int mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
Definition: wchar.c:592
static int pg_euc_mblen(const unsigned char *s)
Definition: wchar.c:122
int pg_mule_mblen(const unsigned char *s)
Definition: wchar.c:839
static int pg_gb18030_dsplen(const unsigned char *s)
Definition: wchar.c:1075
int pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
Definition: wchar.c:1968
static int pg_gbk_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1594
static int pg_utf_dsplen(const unsigned char *s)
Definition: wchar.c:708
#define ISSJISTAIL(c)
Definition: pg_wchar.h:42
static int pg_sjis_mblen(const unsigned char *s)
Definition: wchar.c:937
unsigned short first
Definition: wchar.c:586
static int pg_gbk_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1574
static int pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:720
static int pg_big5_dsplen(const unsigned char *s)
Definition: wchar.c:980
static int pg_utf8_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1732
#define LCPRV1_A
Definition: pg_wchar.h:150
static int pg_uhc_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1643
#define SS3
Definition: pg_wchar.h:36
static int pg_gbk_dsplen(const unsigned char *s)
Definition: wchar.c:1007
#define LCPRV1_B
Definition: pg_wchar.h:151
int maxmblen
Definition: pg_wchar.h:406
static int pg_ascii_mblen(const unsigned char *s)
Definition: wchar.c:63
#define IS_LCPRV2(c)
Definition: pg_wchar.h:164
static int pg_ascii_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1115
static int pg_gb18030_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1703
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1156
static int pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:907
#define IS_LCPRV1(c)
Definition: pg_wchar.h:152
static int pg_euccn_mblen(const unsigned char *s)
Definition: wchar.c:249
static int pg_big5_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1545
static int pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:773
static int pg_johab_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1353
static int pg_euckr_mblen(const unsigned char *s)
Definition: wchar.c:194
static int pg_sjis_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1473
static int pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:48
static int pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:157
static int pg_mule_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1427
int pg_encoding_max_length(int encoding)
Definition: wchar.c:1979
char * c
#define PG_VALID_ENCODING(_enc)
Definition: pg_wchar.h:301
static int pg_johab_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1377
static int pg_johab_mblen(const unsigned char *s)
Definition: wchar.c:401
int pg_encoding_mblen(int encoding, const char *mbstr)
Definition: wchar.c:1921
int pg_encoding_dsplen(int encoding, const char *mbstr)
Definition: wchar.c:1942
unsigned int uint32
Definition: c.h:441
static int pg_big5_mblen(const unsigned char *s)
Definition: wchar.c:968
static int pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:355
unsigned int pg_wchar
Definition: mbprint.c:31
static int pg_euctw_mblen(const unsigned char *s)
Definition: wchar.c:317
static int pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:83
static int pg_gb18030_mblen(const unsigned char *s)
Definition: wchar.c:1061
static int pg_uhc_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1623
static int pg_eucjp_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1183
static int pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:419
int pg_encoding_mblen_bounded(int encoding, const char *mbstr)
Definition: wchar.c:1933
static int pg_euctw_dsplen(const unsigned char *s)
Definition: wchar.c:333
#define IS_LCPRV2_A_RANGE(c)
Definition: pg_wchar.h:165
static int pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:277
#define LCPRV2_B
Definition: pg_wchar.h:163
static int pg_mule_dsplen(const unsigned char *s)
Definition: wchar.c:857
#define LCPRV2_A
Definition: pg_wchar.h:162
static const struct mbinterval combining[]
#define Assert(condition)
Definition: c.h:804
static int pg_uhc_mblen(const unsigned char *s)
Definition: wchar.c:1022
#define ISSJISHEAD(c)
Definition: pg_wchar.h:41
static int pg_eucjp_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1128
static int pg_euckr_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1212
#define IS_LCPRV2_B_RANGE(c)
Definition: pg_wchar.h:167
static int pg_gbk_mblen(const unsigned char *s)
Definition: wchar.c:995
static rewind_source * source
Definition: pg_rewind.c:79
static int pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:518
#define pg_euccn_verifystr
Definition: wchar.c:1271
size_t strnlen(const char *str, size_t maxlen)
Definition: strnlen.c:26
int32 encoding
Definition: pg_database.h:41
static int pg_eucjp_dsplen(const unsigned char *s)
Definition: wchar.c:169
static int pg_sjis_dsplen(const unsigned char *s)
Definition: wchar.c:951
int pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
Definition: wchar.c:1955
int pg_utf_mblen(const unsigned char *s)
Definition: wchar.c:549
static int pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:188
#define IS_LC1(c)
Definition: pg_wchar.h:123
static int pg_johab_dsplen(const unsigned char *s)
Definition: wchar.c:407
const pg_wchar_tbl pg_wchar_table[]
Definition: wchar.c:1867
static int pg_latin1_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1456
static int pg_euc_dsplen(const unsigned char *s)
Definition: wchar.c:138
static int pg_big5_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1525
static int pg_mule_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1406
#define IS_EUC_RANGE_VALID(c)
Definition: wchar.c:1125
#define IS_LCPRV1_A_RANGE(c)
Definition: pg_wchar.h:153
static int pg_euckr_dsplen(const unsigned char *s)
Definition: wchar.c:200
static int pg_eucjp_mblen(const unsigned char *s)
Definition: wchar.c:163
#define pg_euccn_verifychar
Definition: wchar.c:1270
#define IS_LCPRV1_B_RANGE(c)
Definition: pg_wchar.h:155
#define SS2
Definition: pg_wchar.h:35
static int pg_uhc_dsplen(const unsigned char *s)
Definition: wchar.c:1034
static int pg_latin1_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1462
static int pg_euckr_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1241
static int pg_gb18030_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1672