PostgreSQL Source Code  git master
utf8_and_gb18030.c
Go to the documentation of this file.
/*-------------------------------------------------------------------------
 *
 *	  GB18030 <--> UTF8
 *
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c
 *
 *-------------------------------------------------------------------------
 */
13 
14 #include "postgres.h"
15 #include "fmgr.h"
16 #include "mb/pg_wchar.h"
17 #include "../../Unicode/gb18030_to_utf8.map"
18 #include "../../Unicode/utf8_to_gb18030.map"

/* Marks this shared library as a loadable PostgreSQL module. */
PG_MODULE_MAGIC;

/* Register both conversion procedures as version-1 fmgr functions. */
PG_FUNCTION_INFO_V1(gb18030_to_utf8);
PG_FUNCTION_INFO_V1(utf8_to_gb18030);

/*
 * Convert 4-byte GB18030 characters to and from a linear code space
 *
 * The first and third bytes can range from 0x81 to 0xfe (126 values),
 * while the second and fourth bytes can range from 0x30 to 0x39 (10 values).
 */

/*
 * gb_linear
 *
 * Map a 4-byte GB18030 code, packed into a uint32 with the first byte in
 * bits 31..24, to its zero-based position in the linear code space.  The
 * lowest 4-byte code, 0x81308130, maps to 0.
 *
 * No validation is done: the caller must pass a code whose bytes lie in the
 * ranges described above.
 */
static inline uint32
gb_linear(uint32 gb)
{
	uint32		b0 = (gb & 0xff000000) >> 24;
	uint32		b1 = (gb & 0x00ff0000) >> 16;
	uint32		b2 = (gb & 0x0000ff00) >> 8;
	uint32		b3 = (gb & 0x000000ff);

	/*
	 * Positional weights: 10 (fourth byte values), 10 * 126 = 1260 and
	 * 10 * 126 * 10 = 12600.  Subtracting the weighted minimum byte values
	 * makes the result zero-based.
	 */
	return b0 * 12600 + b1 * 1260 + b2 * 10 + b3 -
		(0x81 * 12600 + 0x30 * 1260 + 0x81 * 10 + 0x30);
}
42 
/*
 * gb_unlinear
 *
 * Inverse of the linear mapping: turn a zero-based linear index back into a
 * 4-byte GB18030 code packed into a uint32 (first byte in bits 31..24).
 * Index 0 yields 0x81308130.
 *
 * The first byte is not range-limited here; callers are expected to pass
 * only indexes that correspond to valid codes.
 */
static inline uint32
gb_unlinear(uint32 lin)
{
	uint32		r0 = 0x81 + lin / 12600;
	uint32		r1 = 0x30 + (lin / 1260) % 10;
	uint32		r2 = 0x81 + (lin / 10) % 126;
	uint32		r3 = 0x30 + lin % 10;

	return (r0 << 24) | (r1 << 16) | (r2 << 8) | r3;
}
53 
/*
 * Convert word-formatted UTF8 to and from Unicode code points
 *
 * Probably this should be somewhere else ...
 */

/*
 * unicode_to_utf8word
 *
 * Encode the code point "c" as UTF-8 and return the encoded bytes packed
 * into a uint32, with the last byte of the sequence in the least
 * significant byte.  A 1-byte sequence therefore occupies bits 7..0, a
 * 4-byte sequence the whole word.
 *
 * Any value above 0xFFFF takes the 4-byte branch, and only the low 21 bits
 * of "c" are encoded there.
 */
static inline uint32
unicode_to_utf8word(uint32 c)
{
	uint32		word;

	if (c <= 0x7F)
	{
		/* 1 byte: 0xxxxxxx */
		word = c;
	}
	else if (c <= 0x7FF)
	{
		/* 2 bytes: 110xxxxx 10xxxxxx */
		word = (0xC0 | ((c >> 6) & 0x1F)) << 8;
		word |= 0x80 | (c & 0x3F);
	}
	else if (c <= 0xFFFF)
	{
		/* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
		word = (0xE0 | ((c >> 12) & 0x0F)) << 16;
		word |= (0x80 | ((c >> 6) & 0x3F)) << 8;
		word |= 0x80 | (c & 0x3F);
	}
	else
	{
		/* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		word = (0xF0 | ((c >> 18) & 0x07)) << 24;
		word |= (0x80 | ((c >> 12) & 0x3F)) << 16;
		word |= (0x80 | ((c >> 6) & 0x3F)) << 8;
		word |= 0x80 | (c & 0x3F);
	}

	return word;
}
89 
/*
 * utf8word_to_unicode
 *
 * Decode a UTF-8 sequence that has been packed into a uint32 (last byte of
 * the sequence in the least significant byte) and return its code point.
 *
 * The sequence length is inferred from the magnitude of the packed word,
 * not from the lead byte, and the bytes are not validated.
 */
static inline uint32
utf8word_to_unicode(uint32 c)
{
	uint32		ucs;

	if (c <= 0x7F)
	{
		/* 1 byte */
		ucs = c;
	}
	else if (c <= 0xFFFF)
	{
		/* 2 bytes: 5 payload bits + 6 payload bits */
		ucs = ((c >> 8) & 0x1F) << 6;
		ucs |= c & 0x3F;
	}
	else if (c <= 0xFFFFFF)
	{
		/* 3 bytes: 4 + 6 + 6 payload bits */
		ucs = ((c >> 16) & 0x0F) << 12;
		ucs |= ((c >> 8) & 0x3F) << 6;
		ucs |= c & 0x3F;
	}
	else
	{
		/* 4 bytes: 3 + 6 + 6 + 6 payload bits */
		ucs = ((c >> 24) & 0x07) << 18;
		ucs |= ((c >> 16) & 0x3F) << 12;
		ucs |= ((c >> 8) & 0x3F) << 6;
		ucs |= c & 0x3F;
	}

	return ucs;
}
120 
121 /*
122  * Perform mapping of GB18030 ranges to UTF8
123  *
124  * The ranges we need to convert are specified in gb-18030-2000.xml.
125  * All are ranges of 4-byte GB18030 codes.
126  */
127 static uint32
129 {
130 #define conv18030(minunicode, mincode, maxcode) \
131  if (code >= mincode && code <= maxcode) \
132  return unicode_to_utf8word(gb_linear(code) - gb_linear(mincode) + minunicode)
133 
134  conv18030(0x0452, 0x8130D330, 0x8136A531);
135  conv18030(0x2643, 0x8137A839, 0x8138FD38);
136  conv18030(0x361B, 0x8230A633, 0x8230F237);
137  conv18030(0x3CE1, 0x8231D438, 0x8232AF32);
138  conv18030(0x4160, 0x8232C937, 0x8232F837);
139  conv18030(0x44D7, 0x8233A339, 0x8233C931);
140  conv18030(0x478E, 0x8233E838, 0x82349638);
141  conv18030(0x49B8, 0x8234A131, 0x8234E733);
142  conv18030(0x9FA6, 0x82358F33, 0x8336C738);
143  conv18030(0xE865, 0x8336D030, 0x84308534);
144  conv18030(0xFA2A, 0x84309C38, 0x84318537);
145  conv18030(0xFFE6, 0x8431A234, 0x8431A439);
146  conv18030(0x10000, 0x90308130, 0xE3329A35);
147  /* No mapping exists */
148  return 0;
149 }
150 
151 /*
152  * Perform mapping of UTF8 ranges to GB18030
153  */
154 static uint32
156 {
157  uint32 ucs = utf8word_to_unicode(code);
158 
159 #define convutf8(minunicode, maxunicode, mincode) \
160  if (ucs >= minunicode && ucs <= maxunicode) \
161  return gb_unlinear(ucs - minunicode + gb_linear(mincode))
162 
163  convutf8(0x0452, 0x200F, 0x8130D330);
164  convutf8(0x2643, 0x2E80, 0x8137A839);
165  convutf8(0x361B, 0x3917, 0x8230A633);
166  convutf8(0x3CE1, 0x4055, 0x8231D438);
167  convutf8(0x4160, 0x4336, 0x8232C937);
168  convutf8(0x44D7, 0x464B, 0x8233A339);
169  convutf8(0x478E, 0x4946, 0x8233E838);
170  convutf8(0x49B8, 0x4C76, 0x8234A131);
171  convutf8(0x9FA6, 0xD7FF, 0x82358F33);
172  convutf8(0xE865, 0xF92B, 0x8336D030);
173  convutf8(0xFA2A, 0xFE2F, 0x84309C38);
174  convutf8(0xFFE6, 0xFFFF, 0x8431A234);
175  convutf8(0x10000, 0x10FFFF, 0x90308130);
176  /* No mapping exists */
177  return 0;
178 }
179 
180 /* ----------
181  * conv_proc(
182  * INTEGER, -- source encoding id
183  * INTEGER, -- destination encoding id
184  * CSTRING, -- source string (null terminated C string)
185  * CSTRING, -- destination string (null terminated C string)
186  * INTEGER -- source string length
187  * ) returns VOID;
188  * ----------
189  */
190 Datum
192 {
193  unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
194  unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
195  int len = PG_GETARG_INT32(4);
196 
198 
199  LocalToUtf(src, len, dest,
200  &gb18030_to_unicode_tree,
201  NULL, 0,
203  PG_GB18030);
204 
205  PG_RETURN_VOID();
206 }
207 
208 Datum
210 {
211  unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
212  unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
213  int len = PG_GETARG_INT32(4);
214 
216 
217  UtfToLocal(src, len, dest,
218  &gb18030_from_unicode_tree,
219  NULL, 0,
221  PG_GB18030);
222 
223  PG_RETURN_VOID();
224 }
void UtfToLocal(const unsigned char *utf, int len, unsigned char *iso, const pg_mb_radix_tree *map, const pg_utf_to_local_combined *cmap, int cmapsize, utf_local_conversion_func conv_func, int encoding)
Definition: conv.c:474
#define PG_GETARG_INT32(n)
Definition: fmgr.h:234
static uint32 conv_18030_to_utf8(uint32 code)
static uint32 gb_unlinear(uint32 lin)
static uint32 conv_utf8_to_18030(uint32 code)
Datum gb18030_to_utf8(PG_FUNCTION_ARGS)
PG_MODULE_MAGIC
Datum utf8_to_gb18030(PG_FUNCTION_ARGS)
static uint32 unicode_to_utf8word(uint32 c)
char * c
void LocalToUtf(const unsigned char *iso, int len, unsigned char *utf, const pg_mb_radix_tree *map, const pg_local_to_utf_combined *cmap, int cmapsize, utf_local_conversion_func conv_func, int encoding)
Definition: conv.c:666
static uint32 gb_linear(uint32 gb)
unsigned int uint32
Definition: c.h:306
PG_FUNCTION_INFO_V1(gb18030_to_utf8)
#define convutf8(minunicode, maxunicode, mincode)
#define conv18030(minunicode, mincode, maxcode)
uintptr_t Datum
Definition: postgres.h:372
#define PG_RETURN_VOID()
Definition: fmgr.h:309
#define CHECK_ENCODING_CONVERSION_ARGS(srcencoding, destencoding)
Definition: pg_wchar.h:503
static uint32 utf8word_to_unicode(uint32 c)
static void word(struct vars *, int, struct state *, struct state *)
Definition: regcomp.c:1243
#define PG_GETARG_CSTRING(n)
Definition: fmgr.h:242
#define PG_FUNCTION_ARGS
Definition: fmgr.h:158