PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
wchar.c
Go to the documentation of this file.
1 /*
2  * conversion functions between pg_wchar and multibyte streams.
3  * Tatsuo Ishii
4  * src/backend/utils/mb/wchar.c
5  *
6  */
7 /* can be used in either frontend or backend */
8 #ifdef FRONTEND
9 #include "postgres_fe.h"
10 #else
11 #include "postgres.h"
12 #endif
13 
14 #include "mb/pg_wchar.h"
15 
16 
/*
 * Conversion to pg_wchar is table driven.
 * To add support for an encoding, define mb2wchar_with_len(), mblen() and
 * dsplen() for that particular encoding. Note that if the encoding is only
 * supported in the client, you don't need to define the
 * mb2wchar_with_len() function (SJIS is such a case).
 *
 * These functions generally assume that their input is validly formed.
 * The "verifier" functions, further down in the file, have to be more
 * paranoid. We expect that mblen() does not need to examine more than
 * the first byte of the character to discover the correct length.
 *
 * Note: for the display output of psql to work properly, the return values
 * of the dsplen functions must conform to the Unicode standard. In particular
 * the NUL character is zero width and control characters are generally
 * width -1. It is recommended that non-ASCII encodings refer their ASCII
 * subset to the ASCII routines to ensure consistency.
 */
35 
36 /*
37  * SQL/ASCII
38  */
39 static int
40 pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
41 {
42  int cnt = 0;
43 
44  while (len > 0 && *from)
45  {
46  *to++ = *from++;
47  len--;
48  cnt++;
49  }
50  *to = 0;
51  return cnt;
52 }
53 
/*
 * Byte length of the character at "s": always 1 for SQL_ASCII, so the
 * byte itself is never inspected.
 */
static int
pg_ascii_mblen(const unsigned char *s)
{
	(void) s;					/* length does not depend on the byte value */
	return 1;
}
59 
/*
 * Display width of the single-byte character at "s":
 *	 0	for NUL,
 *	-1	for bytes below 0x20 and for 0x7f (DEL),
 *	 1	for everything else.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
	const unsigned char ch = *s;

	if (ch == '\0')
		return 0;

	return (ch < 0x20 || ch == 0x7f) ? -1 : 1;
}
70 
71 /*
72  * EUC
73  */
74 static int
75 pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
76 {
77  int cnt = 0;
78 
79  while (len > 0 && *from)
80  {
81  if (*from == SS2 && len >= 2) /* JIS X 0201 (so called "1 byte
82  * KANA") */
83  {
84  from++;
85  *to = (SS2 << 8) | *from++;
86  len -= 2;
87  }
88  else if (*from == SS3 && len >= 3) /* JIS X 0212 KANJI */
89  {
90  from++;
91  *to = (SS3 << 16) | (*from++ << 8);
92  *to |= *from++;
93  len -= 3;
94  }
95  else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
96  {
97  *to = *from++ << 8;
98  *to |= *from++;
99  len -= 2;
100  }
101  else /* must be ASCII */
102  {
103  *to = *from++;
104  len--;
105  }
106  to++;
107  cnt++;
108  }
109  *to = 0;
110  return cnt;
111 }
112 
113 static inline int
114 pg_euc_mblen(const unsigned char *s)
115 {
116  int len;
117 
118  if (*s == SS2)
119  len = 2;
120  else if (*s == SS3)
121  len = 3;
122  else if (IS_HIGHBIT_SET(*s))
123  len = 2;
124  else
125  len = 1;
126  return len;
127 }
128 
129 static inline int
130 pg_euc_dsplen(const unsigned char *s)
131 {
132  int len;
133 
134  if (*s == SS2)
135  len = 2;
136  else if (*s == SS3)
137  len = 2;
138  else if (IS_HIGHBIT_SET(*s))
139  len = 2;
140  else
141  len = pg_ascii_dsplen(s);
142  return len;
143 }
144 
145 /*
146  * EUC_JP
147  */
148 static int
149 pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
150 {
151  return pg_euc2wchar_with_len(from, to, len);
152 }
153 
/* EUC_JP byte length: same rules as generic EUC. */
static int
pg_eucjp_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
159 
160 static int
161 pg_eucjp_dsplen(const unsigned char *s)
162 {
163  int len;
164 
165  if (*s == SS2)
166  len = 1;
167  else if (*s == SS3)
168  len = 2;
169  else if (IS_HIGHBIT_SET(*s))
170  len = 2;
171  else
172  len = pg_ascii_dsplen(s);
173  return len;
174 }
175 
176 /*
177  * EUC_KR
178  */
179 static int
180 pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
181 {
182  return pg_euc2wchar_with_len(from, to, len);
183 }
184 
/* EUC_KR byte length: same rules as generic EUC. */
static int
pg_euckr_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
190 
/* EUC_KR display width: same rules as generic EUC. */
static int
pg_euckr_dsplen(const unsigned char *s)
{
	int			width = pg_euc_dsplen(s);

	return width;
}
196 
197 /*
198  * EUC_CN
199  *
200  */
201 static int
202 pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
203 {
204  int cnt = 0;
205 
206  while (len > 0 && *from)
207  {
208  if (*from == SS2 && len >= 3) /* code set 2 (unused?) */
209  {
210  from++;
211  *to = (SS2 << 16) | (*from++ << 8);
212  *to |= *from++;
213  len -= 3;
214  }
215  else if (*from == SS3 && len >= 3) /* code set 3 (unused ?) */
216  {
217  from++;
218  *to = (SS3 << 16) | (*from++ << 8);
219  *to |= *from++;
220  len -= 3;
221  }
222  else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
223  {
224  *to = *from++ << 8;
225  *to |= *from++;
226  len -= 2;
227  }
228  else
229  {
230  *to = *from++;
231  len--;
232  }
233  to++;
234  cnt++;
235  }
236  *to = 0;
237  return cnt;
238 }
239 
/*
 * Byte length of the EUC_CN character starting at "s": 2 when the first
 * byte has its high bit set, otherwise 1.
 */
static int
pg_euccn_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
251 
/*
 * Display width of the EUC_CN character starting at "s": two columns for a
 * high-bit lead byte, ASCII rules otherwise.
 */
static int
pg_euccn_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
263 
264 /*
265  * EUC_TW
266  *
267  */
268 static int
269 pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
270 {
271  int cnt = 0;
272 
273  while (len > 0 && *from)
274  {
275  if (*from == SS2 && len >= 4) /* code set 2 */
276  {
277  from++;
278  *to = (((uint32) SS2) << 24) | (*from++ << 16);
279  *to |= *from++ << 8;
280  *to |= *from++;
281  len -= 4;
282  }
283  else if (*from == SS3 && len >= 3) /* code set 3 (unused?) */
284  {
285  from++;
286  *to = (SS3 << 16) | (*from++ << 8);
287  *to |= *from++;
288  len -= 3;
289  }
290  else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
291  {
292  *to = *from++ << 8;
293  *to |= *from++;
294  len -= 2;
295  }
296  else
297  {
298  *to = *from++;
299  len--;
300  }
301  to++;
302  cnt++;
303  }
304  *to = 0;
305  return cnt;
306 }
307 
308 static int
309 pg_euctw_mblen(const unsigned char *s)
310 {
311  int len;
312 
313  if (*s == SS2)
314  len = 4;
315  else if (*s == SS3)
316  len = 3;
317  else if (IS_HIGHBIT_SET(*s))
318  len = 2;
319  else
320  len = 1;
321  return len;
322 }
323 
324 static int
325 pg_euctw_dsplen(const unsigned char *s)
326 {
327  int len;
328 
329  if (*s == SS2)
330  len = 2;
331  else if (*s == SS3)
332  len = 2;
333  else if (IS_HIGHBIT_SET(*s))
334  len = 2;
335  else
336  len = pg_ascii_dsplen(s);
337  return len;
338 }
339 
340 /*
341  * Convert pg_wchar to EUC_* encoding.
342  * caller must allocate enough space for "to", including a trailing zero!
343  * len: length of from.
344  * "from" not necessarily null terminated.
345  */
346 static int
347 pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
348 {
349  int cnt = 0;
350 
351  while (len > 0 && *from)
352  {
353  unsigned char c;
354 
355  if ((c = (*from >> 24)))
356  {
357  *to++ = c;
358  *to++ = (*from >> 16) & 0xff;
359  *to++ = (*from >> 8) & 0xff;
360  *to++ = *from & 0xff;
361  cnt += 4;
362  }
363  else if ((c = (*from >> 16)))
364  {
365  *to++ = c;
366  *to++ = (*from >> 8) & 0xff;
367  *to++ = *from & 0xff;
368  cnt += 3;
369  }
370  else if ((c = (*from >> 8)))
371  {
372  *to++ = c;
373  *to++ = *from & 0xff;
374  cnt += 2;
375  }
376  else
377  {
378  *to++ = *from;
379  cnt++;
380  }
381  from++;
382  len--;
383  }
384  *to = 0;
385  return cnt;
386 }
387 
388 
389 /*
390  * JOHAB
391  */
/* JOHAB byte length: uses the generic EUC first-byte rules. */
static int
pg_johab_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
397 
/* JOHAB display width: uses the generic EUC rules. */
static int
pg_johab_dsplen(const unsigned char *s)
{
	int			width = pg_euc_dsplen(s);

	return width;
}
403 
404 /*
405  * convert UTF8 string to pg_wchar (UCS-4)
406  * caller must allocate enough space for "to", including a trailing zero!
407  * len: length of from.
408  * "from" not necessarily null terminated.
409  */
410 static int
411 pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
412 {
413  int cnt = 0;
414  uint32 c1,
415  c2,
416  c3,
417  c4;
418 
419  while (len > 0 && *from)
420  {
421  if ((*from & 0x80) == 0)
422  {
423  *to = *from++;
424  len--;
425  }
426  else if ((*from & 0xe0) == 0xc0)
427  {
428  if (len < 2)
429  break; /* drop trailing incomplete char */
430  c1 = *from++ & 0x1f;
431  c2 = *from++ & 0x3f;
432  *to = (c1 << 6) | c2;
433  len -= 2;
434  }
435  else if ((*from & 0xf0) == 0xe0)
436  {
437  if (len < 3)
438  break; /* drop trailing incomplete char */
439  c1 = *from++ & 0x0f;
440  c2 = *from++ & 0x3f;
441  c3 = *from++ & 0x3f;
442  *to = (c1 << 12) | (c2 << 6) | c3;
443  len -= 3;
444  }
445  else if ((*from & 0xf8) == 0xf0)
446  {
447  if (len < 4)
448  break; /* drop trailing incomplete char */
449  c1 = *from++ & 0x07;
450  c2 = *from++ & 0x3f;
451  c3 = *from++ & 0x3f;
452  c4 = *from++ & 0x3f;
453  *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
454  len -= 4;
455  }
456  else
457  {
458  /* treat a bogus char as length 1; not ours to raise error */
459  *to = *from++;
460  len--;
461  }
462  to++;
463  cnt++;
464  }
465  *to = 0;
466  return cnt;
467 }
468 
469 
470 /*
471  * Map a Unicode code point to UTF-8. utf8string must have 4 bytes of
472  * space allocated.
473  */
474 unsigned char *
475 unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
476 {
477  if (c <= 0x7F)
478  {
479  utf8string[0] = c;
480  }
481  else if (c <= 0x7FF)
482  {
483  utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
484  utf8string[1] = 0x80 | (c & 0x3F);
485  }
486  else if (c <= 0xFFFF)
487  {
488  utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
489  utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
490  utf8string[2] = 0x80 | (c & 0x3F);
491  }
492  else
493  {
494  utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
495  utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
496  utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
497  utf8string[3] = 0x80 | (c & 0x3F);
498  }
499 
500  return utf8string;
501 }
502 
503 /*
504  * Trivial conversion from pg_wchar to UTF-8.
505  * caller should allocate enough space for "to"
506  * len: length of from.
507  * "from" not necessarily null terminated.
508  */
509 static int
510 pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
511 {
512  int cnt = 0;
513 
514  while (len > 0 && *from)
515  {
516  int char_len;
517 
518  unicode_to_utf8(*from, to);
519  char_len = pg_utf_mblen(to);
520  cnt += char_len;
521  to += char_len;
522  from++;
523  len--;
524  }
525  *to = 0;
526  return cnt;
527 }
528 
529 /*
530  * Return the byte length of a UTF8 character pointed to by s
531  *
532  * Note: in the current implementation we do not support UTF8 sequences
533  * of more than 4 bytes; hence do NOT return a value larger than 4.
534  * We return "1" for any leading byte that is either flat-out illegal or
535  * indicates a length larger than we support.
536  *
537  * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
538  * other places would need to be fixed to change this.
539  */
int
pg_utf_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	/* the length is determined by the lead byte's high-order bit pattern */
	if ((lead & 0x80) == 0)
		return 1;
	if ((lead & 0xe0) == 0xc0)
		return 2;
	if ((lead & 0xf0) == 0xe0)
		return 3;
	if ((lead & 0xf8) == 0xf0)
		return 4;
#ifdef NOT_USED
	if ((lead & 0xfc) == 0xf8)
		return 5;
	if ((lead & 0xfe) == 0xfc)
		return 6;
#endif

	/* illegal lead byte, or a length we do not support */
	return 1;
}
563 
564 /*
565  * This is an implementation of wcwidth() and wcswidth() as defined in
566  * "The Single UNIX Specification, Version 2, The Open Group, 1997"
567  * <http://www.UNIX-systems.org/online.html>
568  *
569  * Markus Kuhn -- 2001-09-08 -- public domain
570  *
571  * customised for PostgreSQL
572  *
573  * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
574  */
575 
/*
 * One closed interval [first, last] of code points, used as an entry in the
 * sorted interval tables searched by mbbisearch().  The fields are unsigned
 * short, so an entry can only describe code points up to 0xFFFF.
 */
struct mbinterval
{
	unsigned short first;		/* first code point in the interval */
	unsigned short last;		/* last code point in the interval (inclusive) */
};
581 
582 /* auxiliary function for binary search in interval table */
583 static int
584 mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
585 {
586  int min = 0;
587  int mid;
588 
589  if (ucs < table[0].first || ucs > table[max].last)
590  return 0;
591  while (max >= min)
592  {
593  mid = (min + max) / 2;
594  if (ucs > table[mid].last)
595  min = mid + 1;
596  else if (ucs < table[mid].first)
597  max = mid - 1;
598  else
599  return 1;
600  }
601 
602  return 0;
603 }
604 
605 
606 /* The following functions define the column width of an ISO 10646
607  * character as follows:
608  *
609  * - The null character (U+0000) has a column width of 0.
610  *
611  * - Other C0/C1 control characters and DEL will lead to a return
612  * value of -1.
613  *
614  * - Non-spacing and enclosing combining characters (general
615  * category code Mn or Me in the Unicode database) have a
616  * column width of 0.
617  *
618  * - Other format characters (general category code Cf in the Unicode
619  * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
620  *
621  * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
622  * have a column width of 0.
623  *
624  * - Spacing characters in the East Asian Wide (W) or East Asian
625  * FullWidth (F) category as defined in Unicode Technical
626  * Report #11 have a column width of 2.
627  *
628  * - All remaining characters (including all printable
629  * ISO 8859-1 and WGL4 characters, Unicode control characters,
630  * etc.) have a column width of 1.
631  *
632  * This implementation assumes that wchar_t characters are encoded
633  * in ISO 10646.
634  */
635 
636 static int
638 {
639  /* sorted list of non-overlapping intervals of non-spacing characters */
640  static const struct mbinterval combining[] = {
641  {0x0300, 0x034E}, {0x0360, 0x0362}, {0x0483, 0x0486},
642  {0x0488, 0x0489}, {0x0591, 0x05A1}, {0x05A3, 0x05B9},
643  {0x05BB, 0x05BD}, {0x05BF, 0x05BF}, {0x05C1, 0x05C2},
644  {0x05C4, 0x05C4}, {0x064B, 0x0655}, {0x0670, 0x0670},
645  {0x06D6, 0x06E4}, {0x06E7, 0x06E8}, {0x06EA, 0x06ED},
646  {0x070F, 0x070F}, {0x0711, 0x0711}, {0x0730, 0x074A},
647  {0x07A6, 0x07B0}, {0x0901, 0x0902}, {0x093C, 0x093C},
648  {0x0941, 0x0948}, {0x094D, 0x094D}, {0x0951, 0x0954},
649  {0x0962, 0x0963}, {0x0981, 0x0981}, {0x09BC, 0x09BC},
650  {0x09C1, 0x09C4}, {0x09CD, 0x09CD}, {0x09E2, 0x09E3},
651  {0x0A02, 0x0A02}, {0x0A3C, 0x0A3C}, {0x0A41, 0x0A42},
652  {0x0A47, 0x0A48}, {0x0A4B, 0x0A4D}, {0x0A70, 0x0A71},
653  {0x0A81, 0x0A82}, {0x0ABC, 0x0ABC}, {0x0AC1, 0x0AC5},
654  {0x0AC7, 0x0AC8}, {0x0ACD, 0x0ACD}, {0x0B01, 0x0B01},
655  {0x0B3C, 0x0B3C}, {0x0B3F, 0x0B3F}, {0x0B41, 0x0B43},
656  {0x0B4D, 0x0B4D}, {0x0B56, 0x0B56}, {0x0B82, 0x0B82},
657  {0x0BC0, 0x0BC0}, {0x0BCD, 0x0BCD}, {0x0C3E, 0x0C40},
658  {0x0C46, 0x0C48}, {0x0C4A, 0x0C4D}, {0x0C55, 0x0C56},
659  {0x0CBF, 0x0CBF}, {0x0CC6, 0x0CC6}, {0x0CCC, 0x0CCD},
660  {0x0D41, 0x0D43}, {0x0D4D, 0x0D4D}, {0x0DCA, 0x0DCA},
661  {0x0DD2, 0x0DD4}, {0x0DD6, 0x0DD6}, {0x0E31, 0x0E31},
662  {0x0E34, 0x0E3A}, {0x0E47, 0x0E4E}, {0x0EB1, 0x0EB1},
663  {0x0EB4, 0x0EB9}, {0x0EBB, 0x0EBC}, {0x0EC8, 0x0ECD},
664  {0x0F18, 0x0F19}, {0x0F35, 0x0F35}, {0x0F37, 0x0F37},
665  {0x0F39, 0x0F39}, {0x0F71, 0x0F7E}, {0x0F80, 0x0F84},
666  {0x0F86, 0x0F87}, {0x0F90, 0x0F97}, {0x0F99, 0x0FBC},
667  {0x0FC6, 0x0FC6}, {0x102D, 0x1030}, {0x1032, 0x1032},
668  {0x1036, 0x1037}, {0x1039, 0x1039}, {0x1058, 0x1059},
669  {0x1160, 0x11FF}, {0x17B7, 0x17BD}, {0x17C6, 0x17C6},
670  {0x17C9, 0x17D3}, {0x180B, 0x180E}, {0x18A9, 0x18A9},
671  {0x200B, 0x200F}, {0x202A, 0x202E}, {0x206A, 0x206F},
672  {0x20D0, 0x20E3}, {0x302A, 0x302F}, {0x3099, 0x309A},
673  {0xFB1E, 0xFB1E}, {0xFE20, 0xFE23}, {0xFEFF, 0xFEFF},
674  {0xFFF9, 0xFFFB}
675  };
676 
677  /* test for 8-bit control characters */
678  if (ucs == 0)
679  return 0;
680 
681  if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
682  return -1;
683 
684  /* binary search in table of non-spacing characters */
685  if (mbbisearch(ucs, combining,
686  sizeof(combining) / sizeof(struct mbinterval) - 1))
687  return 0;
688 
689  /*
690  * if we arrive here, ucs is not a combining or C0/C1 control character
691  */
692 
693  return 1 +
694  (ucs >= 0x1100 &&
695  (ucs <= 0x115f || /* Hangul Jamo init. consonants */
696  (ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a &&
697  ucs != 0x303f) || /* CJK ... Yi */
698  (ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */
699  (ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility
700  * Ideographs */
701  (ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */
702  (ucs >= 0xff00 && ucs <= 0xff5f) || /* Fullwidth Forms */
703  (ucs >= 0xffe0 && ucs <= 0xffe6) ||
704  (ucs >= 0x20000 && ucs <= 0x2ffff)));
705 }
706 
707 /*
708  * Convert a UTF-8 character to a Unicode code point.
709  * This is a one-character version of pg_utf2wchar_with_len.
710  *
711  * No error checks here, c must point to a long-enough string.
712  */
713 pg_wchar
714 utf8_to_unicode(const unsigned char *c)
715 {
716  if ((*c & 0x80) == 0)
717  return (pg_wchar) c[0];
718  else if ((*c & 0xe0) == 0xc0)
719  return (pg_wchar) (((c[0] & 0x1f) << 6) |
720  (c[1] & 0x3f));
721  else if ((*c & 0xf0) == 0xe0)
722  return (pg_wchar) (((c[0] & 0x0f) << 12) |
723  ((c[1] & 0x3f) << 6) |
724  (c[2] & 0x3f));
725  else if ((*c & 0xf8) == 0xf0)
726  return (pg_wchar) (((c[0] & 0x07) << 18) |
727  ((c[1] & 0x3f) << 12) |
728  ((c[2] & 0x3f) << 6) |
729  (c[3] & 0x3f));
730  else
731  /* that is an invalid code on purpose */
732  return 0xffffffff;
733 }
734 
735 static int
736 pg_utf_dsplen(const unsigned char *s)
737 {
738  return ucs_wcwidth(utf8_to_unicode(s));
739 }
740 
741 /*
742  * convert mule internal code to pg_wchar
743  * caller should allocate enough space for "to"
744  * len: length of from.
745  * "from" not necessarily null terminated.
746  */
747 static int
748 pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
749 {
750  int cnt = 0;
751 
752  while (len > 0 && *from)
753  {
754  if (IS_LC1(*from) && len >= 2)
755  {
756  *to = *from++ << 16;
757  *to |= *from++;
758  len -= 2;
759  }
760  else if (IS_LCPRV1(*from) && len >= 3)
761  {
762  from++;
763  *to = *from++ << 16;
764  *to |= *from++;
765  len -= 3;
766  }
767  else if (IS_LC2(*from) && len >= 3)
768  {
769  *to = *from++ << 16;
770  *to |= *from++ << 8;
771  *to |= *from++;
772  len -= 3;
773  }
774  else if (IS_LCPRV2(*from) && len >= 4)
775  {
776  from++;
777  *to = *from++ << 16;
778  *to |= *from++ << 8;
779  *to |= *from++;
780  len -= 4;
781  }
782  else
783  { /* assume ASCII */
784  *to = (unsigned char) *from++;
785  len--;
786  }
787  to++;
788  cnt++;
789  }
790  *to = 0;
791  return cnt;
792 }
793 
794 /*
795  * convert pg_wchar to mule internal code
796  * caller should allocate enough space for "to"
797  * len: length of from.
798  * "from" not necessarily null terminated.
799  */
800 static int
801 pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
802 {
803  int cnt = 0;
804 
805  while (len > 0 && *from)
806  {
807  unsigned char lb;
808 
809  lb = (*from >> 16) & 0xff;
810  if (IS_LC1(lb))
811  {
812  *to++ = lb;
813  *to++ = *from & 0xff;
814  cnt += 2;
815  }
816  else if (IS_LC2(lb))
817  {
818  *to++ = lb;
819  *to++ = (*from >> 8) & 0xff;
820  *to++ = *from & 0xff;
821  cnt += 3;
822  }
823  else if (IS_LCPRV1_A_RANGE(lb))
824  {
825  *to++ = LCPRV1_A;
826  *to++ = lb;
827  *to++ = *from & 0xff;
828  cnt += 3;
829  }
830  else if (IS_LCPRV1_B_RANGE(lb))
831  {
832  *to++ = LCPRV1_B;
833  *to++ = lb;
834  *to++ = *from & 0xff;
835  cnt += 3;
836  }
837  else if (IS_LCPRV2_A_RANGE(lb))
838  {
839  *to++ = LCPRV2_A;
840  *to++ = lb;
841  *to++ = (*from >> 8) & 0xff;
842  *to++ = *from & 0xff;
843  cnt += 4;
844  }
845  else if (IS_LCPRV2_B_RANGE(lb))
846  {
847  *to++ = LCPRV2_B;
848  *to++ = lb;
849  *to++ = (*from >> 8) & 0xff;
850  *to++ = *from & 0xff;
851  cnt += 4;
852  }
853  else
854  {
855  *to++ = *from & 0xff;
856  cnt += 1;
857  }
858  from++;
859  len--;
860  }
861  *to = 0;
862  return cnt;
863 }
864 
/*
 * Byte length of the MULE internal code character starting at "s", decided
 * from the first byte only: LC1 -> 2, LCPRV1 or LC2 -> 3, LCPRV2 -> 4,
 * anything else -> 1.
 */
int
pg_mule_mblen(const unsigned char *s)
{
	if (IS_LC1(*s))
		return 2;
	if (IS_LCPRV1(*s) || IS_LC2(*s))
		return 3;
	if (IS_LCPRV2(*s))
		return 4;

	return 1;					/* assume ASCII */
}
882 
/*
 * Display width of the MULE internal code character starting at "s".
 *
 * Single-byte charsets (LC1, LCPRV1) are one column wide and double-byte
 * charsets (LC2, LCPRV2) two columns.  Any other byte is handed to
 * pg_ascii_dsplen(), so that NUL yields 0 and control characters yield -1,
 * consistent with the other dsplen routines in this file.
 */
static int
pg_mule_dsplen(const unsigned char *s)
{
	int			len;

	/*
	 * Note: it's not really appropriate to assume that all multibyte charsets
	 * are double-wide on screen. But this seems an okay approximation for
	 * the MULE charsets we currently support.
	 */

	if (IS_LC1(*s))
		len = 1;
	else if (IS_LCPRV1(*s))
		len = 1;
	else if (IS_LC2(*s))
		len = 2;
	else if (IS_LCPRV2(*s))
		len = 2;
	else
		len = pg_ascii_dsplen(s);	/* assume ASCII */

	return len;
}
907 
908 /*
909  * ISO8859-1
910  */
911 static int
912 pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
913 {
914  int cnt = 0;
915 
916  while (len > 0 && *from)
917  {
918  *to++ = *from++;
919  len--;
920  cnt++;
921  }
922  *to = 0;
923  return cnt;
924 }
925 
926 /*
927  * Trivial conversion from pg_wchar to single byte encoding. Just ignores
928  * high bits.
929  * caller should allocate enough space for "to"
930  * len: length of from.
931  * "from" not necessarily null terminated.
932  */
933 static int
934 pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
935 {
936  int cnt = 0;
937 
938  while (len > 0 && *from)
939  {
940  *to++ = *from++;
941  len--;
942  cnt++;
943  }
944  *to = 0;
945  return cnt;
946 }
947 
/*
 * Byte length of the character at "s": always 1 for single-byte encodings,
 * so the byte itself is never inspected.
 */
static int
pg_latin1_mblen(const unsigned char *s)
{
	(void) s;					/* length does not depend on the byte value */
	return 1;
}
953 
/* Display width for single-byte encodings: same rules as ASCII. */
static int
pg_latin1_dsplen(const unsigned char *s)
{
	int			width = pg_ascii_dsplen(s);

	return width;
}
959 
960 /*
961  * SJIS
962  */
/*
 * Byte length of the SJIS character starting at "s": bytes 0xa1..0xdf are
 * one-byte kana, any other high-bit byte starts a two-byte character, and
 * everything else is a single byte.
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if (lead >= 0xa1 && lead <= 0xdf)
		return 1;				/* 1 byte kana? */

	/* kanji if the high bit is set, otherwise should be ASCII */
	return IS_HIGHBIT_SET(lead) ? 2 : 1;
}
976 
/*
 * Display width of the SJIS character starting at "s": one column for
 * bytes 0xa1..0xdf, two for any other high-bit lead byte, ASCII rules
 * otherwise.
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if (lead >= 0xa1 && lead <= 0xdf)
		return 1;				/* 1 byte kana? */
	if (IS_HIGHBIT_SET(lead))
		return 2;				/* kanji? */

	return pg_ascii_dsplen(s);	/* should be ASCII */
}
990 
991 /*
992  * Big5
993  */
/*
 * Byte length of the Big5 character starting at "s": 2 when the first byte
 * has its high bit set, otherwise 1.
 */
static int
pg_big5_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
1005 
/*
 * Display width of the Big5 character starting at "s": two columns for a
 * high-bit lead byte, ASCII rules otherwise.
 */
static int
pg_big5_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
1017 
1018 /*
1019  * GBK
1020  */
/*
 * Byte length of the GBK character starting at "s": 2 when the first byte
 * has its high bit set, otherwise 1.
 */
static int
pg_gbk_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
1032 
/*
 * Display width of the GBK character starting at "s": two columns for a
 * high-bit lead byte, ASCII rules otherwise.
 */
static int
pg_gbk_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
1044 
1045 /*
1046  * UHC
1047  */
/*
 * Byte length of the UHC character starting at "s": 2 when the first byte
 * has its high bit set, otherwise 1.
 */
static int
pg_uhc_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
1059 
/*
 * Display width of the UHC character starting at "s": two columns for a
 * high-bit lead byte, ASCII rules otherwise.
 */
static int
pg_uhc_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
1071 
1072 /*
1073  * GB18030
1074  * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
1075  */
/*
 * Byte length of the GB18030 character starting at "s": 1 for a byte
 * without the high bit; otherwise 4 when the second byte is in 0x30..0x39,
 * else 2.  Note that this inspects the second byte, so for a high-bit lead
 * byte s[1] must be readable.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;				/* ASCII */

	return (s[1] >= 0x30 && s[1] <= 0x39) ? 4 : 2;
}
1089 
/*
 * Display width of the GB18030 character starting at "s": two columns for
 * a high-bit lead byte, ASCII rules otherwise.
 */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* ASCII */
}
1101 
1102 /*
1103  *-------------------------------------------------------------------
1104  * multibyte sequence validators
1105  *
1106  * These functions accept "s", a pointer to the first byte of a string,
1107  * and "len", the remaining length of the string. If there is a validly
1108  * encoded character beginning at *s, return its length in bytes; else
1109  * return -1.
1110  *
1111  * The functions can assume that len > 0 and that *s != '\0', but they must
1112  * test for and reject zeroes in any additional bytes of a multibyte character.
1113  *
1114  * Note that this definition allows the function for a single-byte
1115  * encoding to be just "return 1".
1116  *-------------------------------------------------------------------
1117  */
1118 
/*
 * Verifier for SQL_ASCII: every byte is accepted as a one-byte character,
 * so neither argument is inspected.
 */
static int
pg_ascii_verifier(const unsigned char *s, int len)
{
	(void) s;
	(void) len;
	return 1;
}
1124 
/* True when byte "c" lies in 0xa1..0xfe inclusive; "c" may be evaluated twice. */
#define IS_EUC_RANGE_VALID(c)	(!((c) < 0xa1 || (c) > 0xfe))
1126 
1127 static int
1128 pg_eucjp_verifier(const unsigned char *s, int len)
1129 {
1130  int l;
1131  unsigned char c1,
1132  c2;
1133 
1134  c1 = *s++;
1135 
1136  switch (c1)
1137  {
1138  case SS2: /* JIS X 0201 */
1139  l = 2;
1140  if (l > len)
1141  return -1;
1142  c2 = *s++;
1143  if (c2 < 0xa1 || c2 > 0xdf)
1144  return -1;
1145  break;
1146 
1147  case SS3: /* JIS X 0212 */
1148  l = 3;
1149  if (l > len)
1150  return -1;
1151  c2 = *s++;
1152  if (!IS_EUC_RANGE_VALID(c2))
1153  return -1;
1154  c2 = *s++;
1155  if (!IS_EUC_RANGE_VALID(c2))
1156  return -1;
1157  break;
1158 
1159  default:
1160  if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1161  {
1162  l = 2;
1163  if (l > len)
1164  return -1;
1165  if (!IS_EUC_RANGE_VALID(c1))
1166  return -1;
1167  c2 = *s++;
1168  if (!IS_EUC_RANGE_VALID(c2))
1169  return -1;
1170  }
1171  else
1172  /* must be ASCII */
1173  {
1174  l = 1;
1175  }
1176  break;
1177  }
1178 
1179  return l;
1180 }
1181 
/*
 * Verify one EUC_KR character at "s" with "len" bytes remaining.
 * A high-bit lead byte starts a two-byte character whose bytes must both be
 * in the EUC range; anything else is a one-byte character.  Returns the
 * byte length, or -1 if the character is truncated or out of range.
 */
static int
pg_euckr_verifier(const unsigned char *s, int len)
{
	const unsigned char lead = s[0];

	if (!IS_HIGHBIT_SET(lead))
		return 1;				/* must be ASCII */

	if (len < 2)
		return -1;
	if (!IS_EUC_RANGE_VALID(lead) || !IS_EUC_RANGE_VALID(s[1]))
		return -1;

	return 2;
}
1210 
/* EUC-CN byte sequences are exactly the same as EUC-KR */
1212 #define pg_euccn_verifier pg_euckr_verifier
1213 
1214 static int
1215 pg_euctw_verifier(const unsigned char *s, int len)
1216 {
1217  int l;
1218  unsigned char c1,
1219  c2;
1220 
1221  c1 = *s++;
1222 
1223  switch (c1)
1224  {
1225  case SS2: /* CNS 11643 Plane 1-7 */
1226  l = 4;
1227  if (l > len)
1228  return -1;
1229  c2 = *s++;
1230  if (c2 < 0xa1 || c2 > 0xa7)
1231  return -1;
1232  c2 = *s++;
1233  if (!IS_EUC_RANGE_VALID(c2))
1234  return -1;
1235  c2 = *s++;
1236  if (!IS_EUC_RANGE_VALID(c2))
1237  return -1;
1238  break;
1239 
1240  case SS3: /* unused */
1241  return -1;
1242 
1243  default:
1244  if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1245  {
1246  l = 2;
1247  if (l > len)
1248  return -1;
1249  /* no further range check on c1? */
1250  c2 = *s++;
1251  if (!IS_EUC_RANGE_VALID(c2))
1252  return -1;
1253  }
1254  else
1255  /* must be ASCII */
1256  {
1257  l = 1;
1258  }
1259  break;
1260  }
1261  return l;
1262 }
1263 
/*
 * Verify one JOHAB character at "s" with "len" bytes remaining.
 * The length comes from pg_johab_mblen(); for a high-bit lead byte every
 * following byte must be in the EUC range.  Returns the byte length, or -1
 * if the character is truncated or a trailing byte is out of range.
 */
static int
pg_johab_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_johab_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	if (!IS_HIGHBIT_SET(*s))
		return mbl;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_EUC_RANGE_VALID(s[i]))
			return -1;
	}

	return mbl;
}
1287 
/*
 * Verify one MULE internal code character at "s" with "len" bytes
 * remaining.  The length comes from pg_mule_mblen(); every byte after the
 * first must have its high bit set.  Returns the byte length, or -1 if the
 * character is truncated or a trailing byte lacks the high bit.
 */
static int
pg_mule_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_mule_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_HIGHBIT_SET(s[i]))
			return -1;
	}

	return mbl;
}
1308 
/*
 * Verifier for single-byte encodings: every byte is accepted as a one-byte
 * character, so neither argument is inspected.
 */
static int
pg_latin1_verifier(const unsigned char *s, int len)
{
	(void) s;
	(void) len;
	return 1;
}
1314 
/*
 * Verify one SJIS character at "s" with "len" bytes remaining.
 * Returns the byte length from pg_sjis_mblen(), or -1 if the character is
 * truncated or a two-byte character fails the head/tail byte checks.
 */
static int
pg_sjis_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_sjis_mblen(s);

	if (len < mbl)
		return -1;

	if (mbl == 1)				/* pg_sjis_mblen already verified it */
		return mbl;

	if (ISSJISHEAD(s[0]) && ISSJISTAIL(s[1]))
		return mbl;

	return -1;
}
1337 
/*
 * Verify one Big5 character at "s" with "len" bytes remaining.
 * Returns the byte length from pg_big5_mblen(), or -1 if the character is
 * truncated or any byte after the first is zero.
 */
static int
pg_big5_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_big5_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1357 
/*
 * Verify one GBK character at "s" with "len" bytes remaining.
 * Returns the byte length from pg_gbk_mblen(), or -1 if the character is
 * truncated or any byte after the first is zero.
 */
static int
pg_gbk_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_gbk_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1377 
/*
 * Verify one UHC character at "s" with "len" bytes remaining.
 * Returns the byte length from pg_uhc_mblen(), or -1 if the character is
 * truncated or any byte after the first is zero.
 */
static int
pg_uhc_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_uhc_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1397 
/*
 * Verify one GB18030 character at "s" with "len" bytes remaining.
 * Returns 1, 2 or 4 for a valid character, or -1 if the character is
 * truncated or any byte is outside the range required for its position.
 */
static int
pg_gb18030_verifier(const unsigned char *s, int len)
{
	if (!IS_HIGHBIT_SET(*s))
		return 1;				/* ASCII */

	if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
	{
		/* Should be 4-byte, validate remaining bytes */
		if (s[0] >= 0x81 && s[0] <= 0xfe &&
			s[2] >= 0x81 && s[2] <= 0xfe &&
			s[3] >= 0x30 && s[3] <= 0x39)
			return 4;
		return -1;
	}

	if (len >= 2 && s[0] >= 0x81 && s[0] <= 0xfe)
	{
		/* Should be 2-byte, validate */
		if ((s[1] >= 0x40 && s[1] <= 0x7e) ||
			(s[1] >= 0x80 && s[1] <= 0xfe))
			return 2;
		return -1;
	}

	return -1;
}
1428 
/*
 * Verify one UTF-8 character at "s" with "len" bytes remaining.
 * Returns the byte length from pg_utf_mblen(), or -1 if the character is
 * truncated or pg_utf8_islegal() rejects it.
 */
static int
pg_utf8_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_utf_mblen(s);

	if (len < mbl || !pg_utf8_islegal(s, mbl))
		return -1;

	return mbl;
}
1442 
/*
 * Check for validity of a single UTF-8 encoded character
 *
 * The checks follow RFC3629.  Continuation bytes must lie in 0x80..0xBF,
 * and the second byte is restricted further for certain lead bytes so that
 * a code point cannot be written with a longer-than-necessary sequence
 * (which would be a security hazard, e.g. a multibyte form that decodes to
 * plain ASCII).  Lead bytes 0x80..0xC1 and anything above 0xF4 are
 * rejected, as are lengths outside 1..4.
 *
 * length is assumed to have been obtained by pg_utf_mblen(), and the
 * caller must have checked that that many bytes are present in the buffer.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char lead;
	int			idx;

	/* reject lengths 5 and 6 for now (and anything nonsensical) */
	if (length < 1 || length > 4)
		return false;

	lead = source[0];

	/* third and fourth bytes, if any, are ordinary continuation bytes */
	for (idx = length - 1; idx >= 2; idx--)
	{
		if (source[idx] < 0x80 || source[idx] > 0xBF)
			return false;
	}

	/* the allowed range of the second byte depends on the lead byte */
	if (length >= 2)
	{
		unsigned char lo = 0x80;
		unsigned char hi = 0xBF;

		if (lead == 0xE0)
			lo = 0xA0;
		else if (lead == 0xED)
			hi = 0x9F;
		else if (lead == 0xF0)
			lo = 0x90;
		else if (lead == 0xF4)
			hi = 0x8F;

		if (source[1] < lo || source[1] > hi)
			return false;
	}

	/* finally, the lead byte itself */
	if ((lead >= 0x80 && lead < 0xC2) || lead > 0xF4)
		return false;

	return true;
}
1513 
1514 #ifndef FRONTEND
1515 
1516 /*
1517  * Generic character incrementer function.
1518  *
1519  * Not knowing anything about the properties of the encoding in use, we just
1520  * keep incrementing the last byte until we get a validly-encoded result,
1521  * or we run out of values to try. We don't bother to try incrementing
1522  * higher-order bytes, so there's no growth in runtime for wider characters.
1523  * (If we did try to do that, we'd need to consider the likelihood that 255
1524  * is not a valid final byte in the encoding.)
1525  */
1526 static bool
1527 pg_generic_charinc(unsigned char *charptr, int len)
1528 {
1529  unsigned char *lastbyte = charptr + len - 1;
1530  mbverifier mbverify;
1531 
1532  /* We can just invoke the character verifier directly. */
1534 
1535  while (*lastbyte < (unsigned char) 255)
1536  {
1537  (*lastbyte)++;
1538  if ((*mbverify) (charptr, len) == len)
1539  return true;
1540  }
1541 
1542  return false;
1543 }
1544 
/*
 * UTF-8 character incrementer function.
 *
 * Working from the last byte towards the first, bump the first byte found
 * that is still below its ceiling and stop.  Third and fourth bytes have a
 * ceiling of 0xBF.  The second byte's ceiling is 0xBF as well, except after
 * lead byte 0xED (ceiling 0x9F) or 0xF4 (ceiling 0x8F); these special cases
 * skip the regions that must not occur in valid UTF-8.  If every trailing
 * byte is at its ceiling the lead byte is bumped, unless it is 0x7F, 0xDF,
 * 0xEF or 0xF4, in which case we fail.
 *
 * Note that we don't reset lower-order bytes back to their minimums, since
 * we can't afford to make an exhaustive search (see make_greater_string).
 */
static bool
pg_utf8_increment(unsigned char *charptr, int length)
{
	int			idx;

	/* reject lengths 5 and 6 for now (and anything nonsensical) */
	if (length < 1 || length > 4)
		return false;

	/* fourth and third bytes: plain continuation bytes */
	for (idx = length - 1; idx >= 2; idx--)
	{
		if (charptr[idx] < 0xBF)
		{
			charptr[idx]++;
			return true;
		}
	}

	/* second byte: ceiling depends on the lead byte */
	if (length >= 2)
	{
		unsigned char ceiling = 0xBF;

		if (charptr[0] == 0xED)
			ceiling = 0x9F;
		else if (charptr[0] == 0xF4)
			ceiling = 0x8F;

		if (charptr[1] < ceiling)
		{
			charptr[1]++;
			return true;
		}
	}

	/* lead byte: these values are the last ones usable for their length */
	switch (charptr[0])
	{
		case 0x7F:
		case 0xDF:
		case 0xEF:
		case 0xF4:
			return false;
		default:
			break;
	}

	charptr[0]++;
	return true;
}
1617 
1618 /*
1619  * EUC-JP character incrementer function.
1620  *
1621  * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
1622  * representing JIS X 0201 characters with the second byte ranging between
1623  * 0xa1 and 0xdf. We just increment the last byte if it's less than 0xdf,
1624  * and otherwise rewrite the whole sequence to 0xa1 0xa1.
1625  *
1626  * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
1627  * in which the last two bytes range between 0xa1 and 0xfe. The last byte
1628  * is incremented if possible, otherwise the second-to-last byte.
1629  *
1630  * If the sequence starts with a value other than the above and its MSB
1631  * is set, it must be a two-byte sequence representing JIS X 0208 characters
1632  * with both bytes ranging between 0xa1 and 0xfe. The last byte is
1633  * incremented if possible, otherwise the second-to-last byte.
1634  *
1635  * Otherwise, the sequence is a single-byte ASCII character. It is
1636  * incremented up to 0x7f.
1637  */
1638 static bool
1639 pg_eucjp_increment(unsigned char *charptr, int length)
1640 {
1641  unsigned char c1,
1642  c2;
1643  int i;
1644 
1645  c1 = *charptr;
1646 
1647  switch (c1)
1648  {
1649  case SS2: /* JIS X 0201 */
1650  if (length != 2)
1651  return false;
1652 
1653  c2 = charptr[1];
1654 
1655  if (c2 >= 0xdf)
1656  charptr[0] = charptr[1] = 0xa1;
1657  else if (c2 < 0xa1)
1658  charptr[1] = 0xa1;
1659  else
1660  charptr[1]++;
1661  break;
1662 
1663  case SS3: /* JIS X 0212 */
1664  if (length != 3)
1665  return false;
1666 
1667  for (i = 2; i > 0; i--)
1668  {
1669  c2 = charptr[i];
1670  if (c2 < 0xa1)
1671  {
1672  charptr[i] = 0xa1;
1673  return true;
1674  }
1675  else if (c2 < 0xfe)
1676  {
1677  charptr[i]++;
1678  return true;
1679  }
1680  }
1681 
1682  /* Out of 3-byte code region */
1683  return false;
1684 
1685  default:
1686  if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1687  {
1688  if (length != 2)
1689  return false;
1690 
1691  for (i = 1; i >= 0; i--)
1692  {
1693  c2 = charptr[i];
1694  if (c2 < 0xa1)
1695  {
1696  charptr[i] = 0xa1;
1697  return true;
1698  }
1699  else if (c2 < 0xfe)
1700  {
1701  charptr[i]++;
1702  return true;
1703  }
1704  }
1705 
1706  /* Out of 2 byte code region */
1707  return false;
1708  }
1709  else
1710  { /* ASCII, single byte */
1711  if (c1 > 0x7e)
1712  return false;
1713  (*charptr)++;
1714  }
1715  break;
1716  }
1717 
1718  return true;
1719 }
1720 #endif /* !FRONTEND */
1721 
1722 
1723 /*
1724  *-------------------------------------------------------------------
1725  * encoding info table
1726  * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
1727  *-------------------------------------------------------------------
1728  */
1765  {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* PG_SJIS */
1766  {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* PG_BIG5 */
1767  {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2}, /* PG_GBK */
1768  {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2}, /* PG_UHC */
1769  {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4}, /* PG_GB18030 */
1770  {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3}, /* PG_JOHAB */
1771  {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2} /* PG_SHIFT_JIS_2004 */
1772 };
1773 
/*
 * Byte length of one character in mule internal code; this simply
 * forwards to pg_mule_mblen().
 */
int
pg_mic_mblen(const unsigned char *mbstr)
{
	const int	nbytes = pg_mule_mblen(mbstr);

	return nbytes;
}
1780 
1781 /*
1782  * Returns the byte length of a multibyte character.
1783  */
1784 int
1785 pg_encoding_mblen(int encoding, const char *mbstr)
1786 {
1787  return (PG_VALID_ENCODING(encoding) ?
1788  pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
1789  pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
1790 }
1791 
1792 /*
1793  * Returns the display length of a multibyte character.
1794  */
1795 int
1796 pg_encoding_dsplen(int encoding, const char *mbstr)
1797 {
1798  return (PG_VALID_ENCODING(encoding) ?
1799  pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
1800  pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
1801 }
1802 
1803 /*
1804  * Verify the first multibyte character of the given string.
1805  * Return its byte length if good, -1 if bad. (See comments above for
1806  * full details of the mbverify API.)
1807  */
1808 int
1809 pg_encoding_verifymb(int encoding, const char *mbstr, int len)
1810 {
1811  return (PG_VALID_ENCODING(encoding) ?
1812  pg_wchar_table[encoding].mbverify((const unsigned char *) mbstr, len) :
1813  pg_wchar_table[PG_SQL_ASCII].mbverify((const unsigned char *) mbstr, len));
1814 }
1815 
1816 /*
1817  * fetch maximum length of a given encoding
1818  */
1819 int
1821 {
1822  Assert(PG_VALID_ENCODING(encoding));
1823 
1824  return pg_wchar_table[encoding].maxmblen;
1825 }
1826 
1827 #ifndef FRONTEND
1828 
1829 /*
1830  * fetch maximum length of the encoding for the current database
1831  */
1832 int
1834 {
1835  return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
1836 }
1837 
1838 /*
1839  * get the character incrementer for the encoding for the current database
1840  */
1843 {
1844  /*
1845  * Eventually it might be best to add a field to pg_wchar_table[], but for
1846  * now we just use a switch.
1847  */
1848  switch (GetDatabaseEncoding())
1849  {
1850  case PG_UTF8:
1851  return pg_utf8_increment;
1852 
1853  case PG_EUC_JP:
1854  return pg_eucjp_increment;
1855 
1856  default:
1857  return pg_generic_charinc;
1858  }
1859 }
1860 
/*
 * Verify mbstr to make sure that it is validly encoded in the current
 * database encoding.  Otherwise same as pg_verify_mbstr().
 */
bool
pg_verifymbstr(const char *mbstr, int len, bool noError)
{
	const int	nchars = pg_verify_mbstr_len(GetDatabaseEncoding(),
											 mbstr, len, noError);

	/* a negative count is how pg_verify_mbstr_len() signals failure */
	return nchars >= 0;
}
1871 
/*
 * Verify mbstr to make sure that it is validly encoded in the specified
 * encoding.
 *
 * Returns true when pg_verify_mbstr_len() reports a non-negative
 * character count for the string.
 */
bool
pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
{
	const int	nchars = pg_verify_mbstr_len(encoding, mbstr, len, noError);

	return nchars >= 0;
}
1881 
1882 /*
1883  * Verify mbstr to make sure that it is validly encoded in the specified
1884  * encoding.
1885  *
1886  * mbstr is not necessarily zero terminated; length of mbstr is
1887  * specified by len.
1888  *
1889  * If OK, return length of string in the encoding.
1890  * If a problem is found, return -1 when noError is
1891  * true; when noError is false, ereport() a descriptive message.
1892  */
1893 int
1894 pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
1895 {
1896  mbverifier mbverify;
1897  int mb_len;
1898 
1899  Assert(PG_VALID_ENCODING(encoding));
1900 
1901  /*
1902  * In single-byte encodings, we need only reject nulls (\0).
1903  */
1904  if (pg_encoding_max_length(encoding) <= 1)
1905  {
1906  const char *nullpos = memchr(mbstr, 0, len);
1907 
1908  if (nullpos == NULL)
1909  return len;
1910  if (noError)
1911  return -1;
1912  report_invalid_encoding(encoding, nullpos, 1);
1913  }
1914 
1915  /* fetch function pointer just once */
1916  mbverify = pg_wchar_table[encoding].mbverify;
1917 
1918  mb_len = 0;
1919 
1920  while (len > 0)
1921  {
1922  int l;
1923 
1924  /* fast path for ASCII-subset characters */
1925  if (!IS_HIGHBIT_SET(*mbstr))
1926  {
1927  if (*mbstr != '\0')
1928  {
1929  mb_len++;
1930  mbstr++;
1931  len--;
1932  continue;
1933  }
1934  if (noError)
1935  return -1;
1936  report_invalid_encoding(encoding, mbstr, len);
1937  }
1938 
1939  l = (*mbverify) ((const unsigned char *) mbstr, len);
1940 
1941  if (l < 0)
1942  {
1943  if (noError)
1944  return -1;
1945  report_invalid_encoding(encoding, mbstr, len);
1946  }
1947 
1948  mbstr += l;
1949  len -= l;
1950  mb_len++;
1951  }
1952  return mb_len;
1953 }
1954 
1955 /*
1956  * check_encoding_conversion_args: check arguments of a conversion function
1957  *
1958  * "expected" arguments can be either an encoding ID or -1 to indicate that
1959  * the caller will check whether it accepts the ID.
1960  *
1961  * Note: the errors here are not really user-facing, so elog instead of
1962  * ereport seems sufficient. Also, we trust that the "expected" encoding
1963  * arguments are valid encoding IDs, but we don't trust the actuals.
1964  */
1965 void
1967  int dest_encoding,
1968  int len,
1969  int expected_src_encoding,
1970  int expected_dest_encoding)
1971 {
1972  if (!PG_VALID_ENCODING(src_encoding))
1973  elog(ERROR, "invalid source encoding ID: %d", src_encoding);
1974  if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
1975  elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
1976  pg_enc2name_tbl[expected_src_encoding].name,
1977  pg_enc2name_tbl[src_encoding].name);
1978  if (!PG_VALID_ENCODING(dest_encoding))
1979  elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
1980  if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
1981  elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
1982  pg_enc2name_tbl[expected_dest_encoding].name,
1983  pg_enc2name_tbl[dest_encoding].name);
1984  if (len < 0)
1985  elog(ERROR, "encoding conversion length must not be negative");
1986 }
1987 
1988 /*
1989  * report_invalid_encoding: complain about invalid multibyte character
1990  *
1991  * note: len is remaining length of string, not length of character;
1992  * len must be greater than zero, as we always examine the first byte.
1993  */
1994 void
1995 report_invalid_encoding(int encoding, const char *mbstr, int len)
1996 {
1997  int l = pg_encoding_mblen(encoding, mbstr);
1998  char buf[8 * 5 + 1];
1999  char *p = buf;
2000  int j,
2001  jlimit;
2002 
2003  jlimit = Min(l, len);
2004  jlimit = Min(jlimit, 8); /* prevent buffer overrun */
2005 
2006  for (j = 0; j < jlimit; j++)
2007  {
2008  p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
2009  if (j < jlimit - 1)
2010  p += sprintf(p, " ");
2011  }
2012 
2013  ereport(ERROR,
2014  (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
2015  errmsg("invalid byte sequence for encoding \"%s\": %s",
2016  pg_enc2name_tbl[encoding].name,
2017  buf)));
2018 }
2019 
2020 /*
2021  * report_untranslatable_char: complain about untranslatable character
2022  *
2023  * note: len is remaining length of string, not length of character;
2024  * len must be greater than zero, as we always examine the first byte.
2025  */
2026 void
2027 report_untranslatable_char(int src_encoding, int dest_encoding,
2028  const char *mbstr, int len)
2029 {
2030  int l = pg_encoding_mblen(src_encoding, mbstr);
2031  char buf[8 * 5 + 1];
2032  char *p = buf;
2033  int j,
2034  jlimit;
2035 
2036  jlimit = Min(l, len);
2037  jlimit = Min(jlimit, 8); /* prevent buffer overrun */
2038 
2039  for (j = 0; j < jlimit; j++)
2040  {
2041  p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
2042  if (j < jlimit - 1)
2043  p += sprintf(p, " ");
2044  }
2045 
2046  ereport(ERROR,
2047  (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
2048  errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
2049  buf,
2050  pg_enc2name_tbl[src_encoding].name,
2051  pg_enc2name_tbl[dest_encoding].name)));
2052 }
2053 
2054 #endif /* !FRONTEND */
static bool pg_eucjp_increment(unsigned char *charptr, int length)
Definition: wchar.c:1639
int length(const List *list)
Definition: list.c:1271
static int pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:202
static int pg_big5_verifier(const unsigned char *s, int len)
Definition: wchar.c:1339
static int ucs_wcwidth(pg_wchar ucs)
Definition: wchar.c:637
static int pg_euccn_dsplen(const unsigned char *s)
Definition: wchar.c:253
pg_wchar utf8_to_unicode(const unsigned char *c)
Definition: wchar.c:714
static int pg_latin1_dsplen(const unsigned char *s)
Definition: wchar.c:955
#define IS_LC2(c)
Definition: pg_wchar.h:144
static bool pg_generic_charinc(unsigned char *charptr, int len)
Definition: wchar.c:1527
static int pg_ascii_dsplen(const unsigned char *s)
Definition: wchar.c:61
bool pg_utf8_islegal(const unsigned char *source, int length)
Definition: wchar.c:1458
unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
Definition: wchar.c:475
static int pg_latin1_mblen(const unsigned char *s)
Definition: wchar.c:949
static int pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:912
unsigned short last
Definition: wchar.c:579
static int mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
Definition: wchar.c:584
static int pg_euc_mblen(const unsigned char *s)
Definition: wchar.c:114
int pg_mule_mblen(const unsigned char *s)
Definition: wchar.c:866
static int pg_gb18030_dsplen(const unsigned char *s)
Definition: wchar.c:1091
int pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
Definition: wchar.c:1894
static int pg_utf_dsplen(const unsigned char *s)
Definition: wchar.c:736
#define ISSJISTAIL(c)
Definition: pg_wchar.h:42
static int pg_sjis_mblen(const unsigned char *s)
Definition: wchar.c:964
#define Min(x, y)
Definition: c.h:795
unsigned short first
Definition: wchar.c:578
static int pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:748
static int pg_big5_dsplen(const unsigned char *s)
Definition: wchar.c:1007
#define LCPRV1_A
Definition: pg_wchar.h:150
int errcode(int sqlerrcode)
Definition: elog.c:575
static int pg_eucjp_verifier(const unsigned char *s, int len)
Definition: wchar.c:1128
int pg_mic_mblen(const unsigned char *mbstr)
Definition: wchar.c:1776
#define SS3
Definition: pg_wchar.h:36
static int pg_gbk_dsplen(const unsigned char *s)
Definition: wchar.c:1034
static int pg_euckr_verifier(const unsigned char *s, int len)
Definition: wchar.c:1183
int pg_encoding_verifymb(int encoding, const char *mbstr, int len)
Definition: wchar.c:1809
#define LCPRV1_B
Definition: pg_wchar.h:151
int maxmblen
Definition: pg_wchar.h:371
static int pg_ascii_mblen(const unsigned char *s)
Definition: wchar.c:55
static int pg_gb18030_verifier(const unsigned char *s, int len)
Definition: wchar.c:1399
#define IS_LCPRV2(c)
Definition: pg_wchar.h:164
#define IS_HIGHBIT_SET(ch)
Definition: c.h:962
#define ERROR
Definition: elog.h:43
static int pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:934
static int pg_ascii_verifier(const unsigned char *s, int len)
Definition: wchar.c:1120
#define IS_LCPRV1(c)
Definition: pg_wchar.h:152
static bool pg_utf8_increment(unsigned char *charptr, int length)
Definition: wchar.c:1561
void report_untranslatable_char(int src_encoding, int dest_encoding, const char *mbstr, int len)
Definition: wchar.c:2027
static int pg_euccn_mblen(const unsigned char *s)
Definition: wchar.c:241
static int pg_sjis_verifier(const unsigned char *s, int len)
Definition: wchar.c:1316
static int pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:801
static int pg_euckr_mblen(const unsigned char *s)
Definition: wchar.c:186
static int pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:40
static int pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:149
int pg_encoding_max_length(int encoding)
Definition: wchar.c:1820
char * c
#define PG_VALID_ENCODING(_enc)
Definition: pg_wchar.h:301
static char * buf
Definition: pg_test_fsync.c:67
const pg_enc2name pg_enc2name_tbl[]
Definition: encnames.c:307
int pg_database_encoding_max_length(void)
Definition: wchar.c:1833
static int pg_johab_mblen(const unsigned char *s)
Definition: wchar.c:393
int pg_encoding_mblen(int encoding, const char *mbstr)
Definition: wchar.c:1785
int pg_encoding_dsplen(int encoding, const char *mbstr)
Definition: wchar.c:1796
unsigned int uint32
Definition: c.h:258
static int pg_big5_mblen(const unsigned char *s)
Definition: wchar.c:995
static int pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:347
bool pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
Definition: wchar.c:1877
static int pg_mule_verifier(const unsigned char *s, int len)
Definition: wchar.c:1289
#define ereport(elevel, rest)
Definition: elog.h:122
unsigned int pg_wchar
Definition: mbprint.c:31
#define pg_euccn_verifier
Definition: wchar.c:1212
static int pg_latin1_verifier(const unsigned char *s, int len)
Definition: wchar.c:1310
static int pg_utf8_verifier(const unsigned char *s, int len)
Definition: wchar.c:1430
static int pg_euctw_mblen(const unsigned char *s)
Definition: wchar.c:309
static int pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:75
static int pg_gb18030_mblen(const unsigned char *s)
Definition: wchar.c:1077
int GetDatabaseEncoding(void)
Definition: mbutils.c:1015
static int pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:411
static int pg_euctw_dsplen(const unsigned char *s)
Definition: wchar.c:325
#define IS_LCPRV2_A_RANGE(c)
Definition: pg_wchar.h:165
static int pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:269
#define LCPRV2_B
Definition: pg_wchar.h:163
static int pg_mule_dsplen(const unsigned char *s)
Definition: wchar.c:884
static char * encoding
Definition: initdb.c:123
#define LCPRV2_A
Definition: pg_wchar.h:162
bool(* mbcharacter_incrementer)(unsigned char *mbstr, int len)
Definition: pg_wchar.h:358
#define Assert(condition)
Definition: c.h:664
static int pg_uhc_mblen(const unsigned char *s)
Definition: wchar.c:1049
#define ISSJISHEAD(c)
Definition: pg_wchar.h:41
#define IS_LCPRV2_B_RANGE(c)
Definition: pg_wchar.h:167
static int pg_gbk_mblen(const unsigned char *s)
Definition: wchar.c:1022
static int pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:510
static int pg_gbk_verifier(const unsigned char *s, int len)
Definition: wchar.c:1359
void report_invalid_encoding(int encoding, const char *mbstr, int len)
Definition: wchar.c:1995
static int pg_eucjp_dsplen(const unsigned char *s)
Definition: wchar.c:161
const char * name
Definition: encode.c:521
static int pg_sjis_dsplen(const unsigned char *s)
Definition: wchar.c:978
int pg_utf_mblen(const unsigned char *s)
Definition: wchar.c:541
static int pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:180
void check_encoding_conversion_args(int src_encoding, int dest_encoding, int len, int expected_src_encoding, int expected_dest_encoding)
Definition: wchar.c:1966
#define IS_LC1(c)
Definition: pg_wchar.h:123
int errmsg(const char *fmt,...)
Definition: elog.c:797
static int pg_johab_dsplen(const unsigned char *s)
Definition: wchar.c:399
const pg_wchar_tbl pg_wchar_table[]
Definition: wchar.c:1729
static int pg_euctw_verifier(const unsigned char *s, int len)
Definition: wchar.c:1215
static int pg_euc_dsplen(const unsigned char *s)
Definition: wchar.c:130
static int pg_uhc_verifier(const unsigned char *s, int len)
Definition: wchar.c:1379
int i
mbcharacter_incrementer pg_database_encoding_character_incrementer(void)
Definition: wchar.c:1842
int(* mbverifier)(const unsigned char *mbstr, int len)
Definition: pg_wchar.h:360
bool pg_verifymbstr(const char *mbstr, int len, bool noError)
Definition: wchar.c:1866
#define elog
Definition: elog.h:219
#define IS_EUC_RANGE_VALID(c)
Definition: wchar.c:1125
#define IS_LCPRV1_A_RANGE(c)
Definition: pg_wchar.h:153
mbverifier mbverify
Definition: pg_wchar.h:370
static int pg_euckr_dsplen(const unsigned char *s)
Definition: wchar.c:192
static int pg_johab_verifier(const unsigned char *s, int len)
Definition: wchar.c:1265
static int pg_eucjp_mblen(const unsigned char *s)
Definition: wchar.c:155
#define IS_LCPRV1_B_RANGE(c)
Definition: pg_wchar.h:155
#define SS2
Definition: pg_wchar.h:35
static int pg_uhc_dsplen(const unsigned char *s)
Definition: wchar.c:1061