PostgreSQL Source Code git master
Loading...
Searching...
No Matches
conv.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * Utility functions for conversion procs.
4 *
5 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
6 * Portions Copyright (c) 1994, Regents of the University of California
7 *
8 * IDENTIFICATION
9 * src/backend/utils/mb/conv.c
10 *
11 *-------------------------------------------------------------------------
12 */
13#include "postgres.h"
14#include "mb/pg_wchar.h"
15
16
17/*
18 * local2local: a generic single byte charset encoding
19 * conversion between two ASCII-superset encodings.
20 *
21 * l points to the source string of length len
22 * p is the output area (must be large enough!)
23 * src_encoding is the PG identifier for the source encoding
24 * dest_encoding is the PG identifier for the target encoding
25 * tab holds conversion entries for the source charset
26 * starting from 128 (0x80). each entry in the table holds the corresponding
27 * code point for the target charset, or 0 if there is no equivalent code.
28 *
29 * Returns the number of input bytes consumed. If noError is true, this can
30 * be less than 'len'.
31 */
32int
33local2local(const unsigned char *l,
34 unsigned char *p,
35 int len,
36 int src_encoding,
37 int dest_encoding,
38 const unsigned char *tab,
39 bool noError)
40{
41 const unsigned char *start = l;
42 unsigned char c1,
43 c2;
44
45 while (len > 0)
46 {
47 c1 = *l;
48 if (c1 == 0)
49 {
50 if (noError)
51 break;
52 report_invalid_encoding(src_encoding, (const char *) l, len);
53 }
54 if (!IS_HIGHBIT_SET(c1))
55 *p++ = c1;
56 else
57 {
58 c2 = tab[c1 - HIGHBIT];
59 if (c2)
60 *p++ = c2;
61 else
62 {
63 if (noError)
64 break;
66 (const char *) l, len);
67 }
68 }
69 l++;
70 len--;
71 }
72 *p = '\0';
73
74 return l - start;
75}
76
77/*
78 * comparison routine for bsearch()
79 * this routine is intended for combined UTF8 -> local code
80 */
81static int
82compare3(const void *p1, const void *p2)
83{
84 uint32 s1,
85 s2,
86 d1,
87 d2;
88
89 s1 = *(const uint32 *) p1;
90 s2 = *((const uint32 *) p1 + 1);
91 d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
92 d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
93 return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
94}
95
96/*
97 * comparison routine for bsearch()
98 * this routine is intended for local code -> combined UTF8
99 */
100static int
101compare4(const void *p1, const void *p2)
102{
103 uint32 v1,
104 v2;
105
106 v1 = *(const uint32 *) p1;
107 v2 = ((const pg_local_to_utf_combined *) p2)->code;
108 return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
109}
110
111/*
112 * store 32bit character representation into multibyte stream
113 */
114static inline unsigned char *
115store_coded_char(unsigned char *dest, uint32 code)
116{
117 if (code & 0xff000000)
118 *dest++ = code >> 24;
119 if (code & 0x00ff0000)
120 *dest++ = code >> 16;
121 if (code & 0x0000ff00)
122 *dest++ = code >> 8;
123 if (code & 0x000000ff)
124 *dest++ = code;
125 return dest;
126}
127
128/*
129 * Convert a character using a conversion radix tree.
130 *
131 * 'l' is the length of the input character in bytes, and b1-b4 are
132 * the input character's bytes.
133 */
134static inline uint32
136 int l,
137 unsigned char b1,
138 unsigned char b2,
139 unsigned char b3,
140 unsigned char b4)
141{
142 if (l == 4)
143 {
144 /* 4-byte code */
145
146 /* check code validity */
147 if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
148 b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
149 b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
150 b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
151 return 0;
152
153 /* perform lookup */
154 if (rt->chars32)
155 {
156 uint32 idx = rt->b4root;
157
158 idx = rt->chars32[b1 + idx - rt->b4_1_lower];
159 idx = rt->chars32[b2 + idx - rt->b4_2_lower];
160 idx = rt->chars32[b3 + idx - rt->b4_3_lower];
161 return rt->chars32[b4 + idx - rt->b4_4_lower];
162 }
163 else
164 {
165 uint16 idx = rt->b4root;
166
167 idx = rt->chars16[b1 + idx - rt->b4_1_lower];
168 idx = rt->chars16[b2 + idx - rt->b4_2_lower];
169 idx = rt->chars16[b3 + idx - rt->b4_3_lower];
170 return rt->chars16[b4 + idx - rt->b4_4_lower];
171 }
172 }
173 else if (l == 3)
174 {
175 /* 3-byte code */
176
177 /* check code validity */
178 if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
179 b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
180 b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
181 return 0;
182
183 /* perform lookup */
184 if (rt->chars32)
185 {
186 uint32 idx = rt->b3root;
187
188 idx = rt->chars32[b2 + idx - rt->b3_1_lower];
189 idx = rt->chars32[b3 + idx - rt->b3_2_lower];
190 return rt->chars32[b4 + idx - rt->b3_3_lower];
191 }
192 else
193 {
194 uint16 idx = rt->b3root;
195
196 idx = rt->chars16[b2 + idx - rt->b3_1_lower];
197 idx = rt->chars16[b3 + idx - rt->b3_2_lower];
198 return rt->chars16[b4 + idx - rt->b3_3_lower];
199 }
200 }
201 else if (l == 2)
202 {
203 /* 2-byte code */
204
205 /* check code validity - first byte */
206 if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
207 b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
208 return 0;
209
210 /* perform lookup */
211 if (rt->chars32)
212 {
213 uint32 idx = rt->b2root;
214
215 idx = rt->chars32[b3 + idx - rt->b2_1_lower];
216 return rt->chars32[b4 + idx - rt->b2_2_lower];
217 }
218 else
219 {
220 uint16 idx = rt->b2root;
221
222 idx = rt->chars16[b3 + idx - rt->b2_1_lower];
223 return rt->chars16[b4 + idx - rt->b2_2_lower];
224 }
225 }
226 else if (l == 1)
227 {
228 /* 1-byte code */
229
230 /* check code validity - first byte */
231 if (b4 < rt->b1_lower || b4 > rt->b1_upper)
232 return 0;
233
234 /* perform lookup */
235 if (rt->chars32)
236 return rt->chars32[b4 + rt->b1root - rt->b1_lower];
237 else
238 return rt->chars16[b4 + rt->b1root - rt->b1_lower];
239 }
240 return 0; /* shouldn't happen */
241}
242
243/*
244 * UTF8 ---> local code
245 *
246 * utf: input string in UTF8 encoding (need not be null-terminated)
247 * len: length of input string (in bytes)
248 * iso: pointer to the output area (must be large enough!)
249 * (output string will be null-terminated)
250 * map: conversion map for single characters
251 * cmap: conversion map for combined characters
252 * (optional, pass NULL if none)
253 * cmapsize: number of entries in the conversion map for combined characters
254 * (optional, pass 0 if none)
255 * conv_func: algorithmic encoding conversion function
256 * (optional, pass NULL if none)
257 * encoding: PG identifier for the local encoding
258 *
259 * For each character, the cmap (if provided) is consulted first; if no match,
260 * the map is consulted next; if still no match, the conv_func (if provided)
261 * is applied. An error is raised if no match is found.
262 *
263 * See pg_wchar.h for more details about the data structures used here.
264 *
265 * Returns the number of input bytes consumed. If noError is true, this can
266 * be less than 'len'.
267 */
268int
269UtfToLocal(const unsigned char *utf, int len,
270 unsigned char *iso,
271 const pg_mb_radix_tree *map,
272 const pg_utf_to_local_combined *cmap, int cmapsize,
274 int encoding, bool noError)
275{
276 uint32 iutf;
277 int l;
279 const unsigned char *start = utf;
280
284 errmsg("invalid encoding number: %d", encoding)));
285
286 for (; len > 0; len -= l)
287 {
288 unsigned char b1 = 0;
289 unsigned char b2 = 0;
290 unsigned char b3 = 0;
291 unsigned char b4 = 0;
292
293 /* "break" cases all represent errors */
294 if (*utf == '\0')
295 break;
296
297 l = pg_utf_mblen(utf);
298 if (len < l)
299 break;
300
301 if (!pg_utf8_islegal(utf, l))
302 break;
303
304 if (l == 1)
305 {
306 /* ASCII case is easy, assume it's one-to-one conversion */
307 *iso++ = *utf++;
308 continue;
309 }
310
311 /* collect coded char of length l */
312 if (l == 2)
313 {
314 b3 = *utf++;
315 b4 = *utf++;
316 }
317 else if (l == 3)
318 {
319 b2 = *utf++;
320 b3 = *utf++;
321 b4 = *utf++;
322 }
323 else if (l == 4)
324 {
325 b1 = *utf++;
326 b2 = *utf++;
327 b3 = *utf++;
328 b4 = *utf++;
329 }
330 else
331 {
332 elog(ERROR, "unsupported character length %d", l);
333 iutf = 0; /* keep compiler quiet */
334 }
335 iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
336
337 /* First, try with combined map if possible */
338 if (cmap && len > l)
339 {
340 const unsigned char *utf_save = utf;
341 int len_save = len;
342 int l_save = l;
343
344 /* collect next character, same as above */
345 len -= l;
346
347 l = pg_utf_mblen(utf);
348 if (len < l)
349 {
350 /* need more data to decide if this is a combined char */
351 utf -= l_save;
352 break;
353 }
354
355 if (!pg_utf8_islegal(utf, l))
356 {
357 if (!noError)
358 report_invalid_encoding(PG_UTF8, (const char *) utf, len);
359 utf -= l_save;
360 break;
361 }
362
363 /* We assume ASCII character cannot be in combined map */
364 if (l > 1)
365 {
367 uint32 cutf[2];
368
369 if (l == 2)
370 {
371 iutf2 = *utf++ << 8;
372 iutf2 |= *utf++;
373 }
374 else if (l == 3)
375 {
376 iutf2 = *utf++ << 16;
377 iutf2 |= *utf++ << 8;
378 iutf2 |= *utf++;
379 }
380 else if (l == 4)
381 {
382 iutf2 = *utf++ << 24;
383 iutf2 |= *utf++ << 16;
384 iutf2 |= *utf++ << 8;
385 iutf2 |= *utf++;
386 }
387 else
388 {
389 elog(ERROR, "unsupported character length %d", l);
390 iutf2 = 0; /* keep compiler quiet */
391 }
392
393 cutf[0] = iutf;
394 cutf[1] = iutf2;
395
396 cp = bsearch(cutf, cmap, cmapsize,
398
399 if (cp)
400 {
401 iso = store_coded_char(iso, cp->code);
402 continue;
403 }
404 }
405
406 /* fail, so back up to reprocess second character next time */
407 utf = utf_save;
408 len = len_save;
409 l = l_save;
410 }
411
412 /* Now check ordinary map */
413 if (map)
414 {
415 uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
416
417 if (converted)
418 {
420 continue;
421 }
422 }
423
424 /* if there's a conversion function, try that */
425 if (conv_func)
426 {
427 uint32 converted = (*conv_func) (iutf);
428
429 if (converted)
430 {
432 continue;
433 }
434 }
435
436 /* failed to translate this character */
437 utf -= l;
438 if (noError)
439 break;
441 (const char *) utf, len);
442 }
443
444 /* if we broke out of loop early, must be invalid input */
445 if (len > 0 && !noError)
446 report_invalid_encoding(PG_UTF8, (const char *) utf, len);
447
448 *iso = '\0';
449
450 return utf - start;
451}
452
453/*
454 * local code ---> UTF8
455 *
456 * iso: input string in local encoding (need not be null-terminated)
457 * len: length of input string (in bytes)
458 * utf: pointer to the output area (must be large enough!)
459 * (output string will be null-terminated)
460 * map: conversion map for single characters
461 * cmap: conversion map for combined characters
462 * (optional, pass NULL if none)
463 * cmapsize: number of entries in the conversion map for combined characters
464 * (optional, pass 0 if none)
465 * conv_func: algorithmic encoding conversion function
466 * (optional, pass NULL if none)
467 * encoding: PG identifier for the local encoding
468 *
469 * For each character, the map is consulted first; if no match, the cmap
470 * (if provided) is consulted next; if still no match, the conv_func
471 * (if provided) is applied. An error is raised if no match is found.
472 *
473 * See pg_wchar.h for more details about the data structures used here.
474 *
475 * Returns the number of input bytes consumed. If noError is true, this can
476 * be less than 'len'.
477 */
478int
479LocalToUtf(const unsigned char *iso, int len,
480 unsigned char *utf,
481 const pg_mb_radix_tree *map,
482 const pg_local_to_utf_combined *cmap, int cmapsize,
484 int encoding,
485 bool noError)
486{
487 uint32 iiso;
488 int l;
490 const unsigned char *start = iso;
491
495 errmsg("invalid encoding number: %d", encoding)));
496
497 for (; len > 0; len -= l)
498 {
499 unsigned char b1 = 0;
500 unsigned char b2 = 0;
501 unsigned char b3 = 0;
502 unsigned char b4 = 0;
503
504 /* "break" cases all represent errors */
505 if (*iso == '\0')
506 break;
507
508 if (!IS_HIGHBIT_SET(*iso))
509 {
510 /* ASCII case is easy, assume it's one-to-one conversion */
511 *utf++ = *iso++;
512 l = 1;
513 continue;
514 }
515
516 l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
517 if (l < 0)
518 break;
519
520 /* collect coded char of length l */
521 if (l == 1)
522 b4 = *iso++;
523 else if (l == 2)
524 {
525 b3 = *iso++;
526 b4 = *iso++;
527 }
528 else if (l == 3)
529 {
530 b2 = *iso++;
531 b3 = *iso++;
532 b4 = *iso++;
533 }
534 else if (l == 4)
535 {
536 b1 = *iso++;
537 b2 = *iso++;
538 b3 = *iso++;
539 b4 = *iso++;
540 }
541 else
542 {
543 elog(ERROR, "unsupported character length %d", l);
544 iiso = 0; /* keep compiler quiet */
545 }
546 iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
547
548 if (map)
549 {
550 uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
551
552 if (converted)
553 {
555 continue;
556 }
557
558 /* If there's a combined character map, try that */
559 if (cmap)
560 {
561 cp = bsearch(&iiso, cmap, cmapsize,
563
564 if (cp)
565 {
566 utf = store_coded_char(utf, cp->utf1);
567 utf = store_coded_char(utf, cp->utf2);
568 continue;
569 }
570 }
571 }
572
573 /* if there's a conversion function, try that */
574 if (conv_func)
575 {
576 uint32 converted = (*conv_func) (iiso);
577
578 if (converted)
579 {
581 continue;
582 }
583 }
584
585 /* failed to translate this character */
586 iso -= l;
587 if (noError)
588 break;
590 (const char *) iso, len);
591 }
592
593 /* if we broke out of loop early, must be invalid input */
594 if (len > 0 && !noError)
595 report_invalid_encoding(encoding, (const char *) iso, len);
596
597 *utf = '\0';
598
599 return iso - start;
600}
Datum idx(PG_FUNCTION_ARGS)
Definition _int_op.c:262
#define IS_HIGHBIT_SET(ch)
Definition c.h:1244
uint16_t uint16
Definition c.h:623
uint32_t uint32
Definition c.h:624
#define HIGHBIT
Definition c.h:1243
int UtfToLocal(const unsigned char *utf, int len, unsigned char *iso, const pg_mb_radix_tree *map, const pg_utf_to_local_combined *cmap, int cmapsize, utf_local_conversion_func conv_func, int encoding, bool noError)
Definition conv.c:269
static int compare3(const void *p1, const void *p2)
Definition conv.c:82
static unsigned char * store_coded_char(unsigned char *dest, uint32 code)
Definition conv.c:115
static uint32 pg_mb_radix_conv(const pg_mb_radix_tree *rt, int l, unsigned char b1, unsigned char b2, unsigned char b3, unsigned char b4)
Definition conv.c:135
int LocalToUtf(const unsigned char *iso, int len, unsigned char *utf, const pg_mb_radix_tree *map, const pg_local_to_utf_combined *cmap, int cmapsize, utf_local_conversion_func conv_func, int encoding, bool noError)
Definition conv.c:479
static int compare4(const void *p1, const void *p2)
Definition conv.c:101
int local2local(const unsigned char *l, unsigned char *p, int len, int src_encoding, int dest_encoding, const unsigned char *tab, bool noError)
Definition conv.c:33
int errcode(int sqlerrcode)
Definition elog.c:874
#define ERROR
Definition elog.h:40
#define elog(elevel,...)
Definition elog.h:228
#define ereport(elevel,...)
Definition elog.h:152
return str start
static char * encoding
Definition initdb.c:139
#define PG_UTF8
Definition mbprint.c:43
void report_untranslatable_char(int src_encoding, int dest_encoding, const char *mbstr, int len)
Definition mbutils.c:1869
void report_invalid_encoding(int encoding, const char *mbstr, int len)
Definition mbutils.c:1824
static char * errmsg
const void size_t len
#define pg_utf_mblen
Definition pg_wchar.h:486
uint32(* utf_local_conversion_func)(uint32 code)
Definition pg_wchar.h:352
#define PG_VALID_ENCODING(_enc)
Definition pg_wchar.h:140
static int fb(int x)
char * s1
char * s2
bool pg_utf8_islegal(const unsigned char *source, int length)
Definition wchar.c:1789
int pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
Definition wchar.c:1988