PostgreSQL Source Code git master
encode.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * encode.c
4 * Various data encoding/decoding things.
5 *
6 * Copyright (c) 2001-2025, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/utils/adt/encode.c
11 *
12 *-------------------------------------------------------------------------
13 */
14#include "postgres.h"
15
16#include <ctype.h>
17
18#include "mb/pg_wchar.h"
19#include "utils/builtins.h"
20#include "utils/memutils.h"
21#include "varatt.h"
22
23
24/*
25 * Encoding conversion API.
26 * encode_len() and decode_len() compute the amount of space needed, while
27 * encode() and decode() perform the actual conversions. It is okay for
28 * the _len functions to return an overestimate, but not an underestimate.
29 * (Having said that, large overestimates could cause unnecessary errors,
30 * so it's better to get it right.) The conversion routines write to the
31 * buffer at *res and return the true length of their output.
32 */
34{
35 uint64 (*encode_len) (const char *data, size_t dlen);
36 uint64 (*decode_len) (const char *data, size_t dlen);
37 uint64 (*encode) (const char *data, size_t dlen, char *res);
38 uint64 (*decode) (const char *data, size_t dlen, char *res);
39};
40
/*
 * Forward declaration; the definition appears later in this file.  Returns
 * the handler set for the named encoding, or NULL if no entry matches.
 */
41static const struct pg_encoding *pg_find_encoding(const char *name);
42
43/*
44 * SQL functions.
45 */
46
49{
52 text *result;
53 char *namebuf;
54 char *dataptr;
55 size_t datalen;
56 uint64 resultlen;
57 uint64 res;
58 const struct pg_encoding *enc;
59
60 namebuf = TextDatumGetCString(name);
61
62 enc = pg_find_encoding(namebuf);
63 if (enc == NULL)
65 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
66 errmsg("unrecognized encoding: \"%s\"", namebuf)));
67
68 dataptr = VARDATA_ANY(data);
69 datalen = VARSIZE_ANY_EXHDR(data);
70
71 resultlen = enc->encode_len(dataptr, datalen);
72
73 /*
74 * resultlen possibly overflows uint32, therefore on 32-bit machines it's
75 * unsafe to rely on palloc's internal check.
76 */
77 if (resultlen > MaxAllocSize - VARHDRSZ)
79 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
80 errmsg("result of encoding conversion is too large")));
81
82 result = palloc(VARHDRSZ + resultlen);
83
84 res = enc->encode(dataptr, datalen, VARDATA(result));
85
86 /* Make this FATAL 'cause we've trodden on memory ... */
87 if (res > resultlen)
88 elog(FATAL, "overflow - encode estimate too small");
89
90 SET_VARSIZE(result, VARHDRSZ + res);
91
92 PG_RETURN_TEXT_P(result);
93}
94
97{
100 bytea *result;
101 char *namebuf;
102 char *dataptr;
103 size_t datalen;
104 uint64 resultlen;
105 uint64 res;
106 const struct pg_encoding *enc;
107
108 namebuf = TextDatumGetCString(name);
109
110 enc = pg_find_encoding(namebuf);
111 if (enc == NULL)
113 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
114 errmsg("unrecognized encoding: \"%s\"", namebuf)));
115
116 dataptr = VARDATA_ANY(data);
117 datalen = VARSIZE_ANY_EXHDR(data);
118
119 resultlen = enc->decode_len(dataptr, datalen);
120
121 /*
122 * resultlen possibly overflows uint32, therefore on 32-bit machines it's
123 * unsafe to rely on palloc's internal check.
124 */
125 if (resultlen > MaxAllocSize - VARHDRSZ)
127 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
128 errmsg("result of decoding conversion is too large")));
129
130 result = palloc(VARHDRSZ + resultlen);
131
132 res = enc->decode(dataptr, datalen, VARDATA(result));
133
134 /* Make this FATAL 'cause we've trodden on memory ... */
135 if (res > resultlen)
136 elog(FATAL, "overflow - decode estimate too small");
137
138 SET_VARSIZE(result, VARHDRSZ + res);
139
140 PG_RETURN_BYTEA_P(result);
141}
142
143
144/*
145 * HEX
146 */
147
148/*
149 * The hex expansion of each possible byte value (two chars per value).
150 */
/*
 * Layout note: the two lowercase hex characters for byte value b sit at
 * hextbl[2 * b] and hextbl[2 * b + 1].  The array is declared with exactly
 * 512 elements and the literal is 512 characters long, so no terminating
 * NUL is stored; always copy pairs by explicit length.
 */
151static const char hextbl[512] =
152"000102030405060708090a0b0c0d0e0f"
153"101112131415161718191a1b1c1d1e1f"
154"202122232425262728292a2b2c2d2e2f"
155"303132333435363738393a3b3c3d3e3f"
156"404142434445464748494a4b4c4d4e4f"
157"505152535455565758595a5b5c5d5e5f"
158"606162636465666768696a6b6c6d6e6f"
159"707172737475767778797a7b7c7d7e7f"
160"808182838485868788898a8b8c8d8e8f"
161"909192939495969798999a9b9c9d9e9f"
162"a0a1a2a3a4a5a6a7a8a9aaabacadaeaf"
163"b0b1b2b3b4b5b6b7b8b9babbbcbdbebf"
164"c0c1c2c3c4c5c6c7c8c9cacbcccdcecf"
165"d0d1d2d3d4d5d6d7d8d9dadbdcdddedf"
166"e0e1e2e3e4e5e6e7e8e9eaebecedeeef"
167"f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff";
168
/*
 * Reverse table for hex decoding, indexed by a 7-bit character code (0..127).
 * '0'-'9' map to 0-9, 'A'-'F' and 'a'-'f' map to 10-15; every other entry is
 * -1, meaning "not a hexadecimal digit".  Callers must range-check the index
 * before looking up, since the table has only 128 entries.
 */
169static const int8 hexlookup[128] = {
170 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
171 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
172 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
173 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1,
174 -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
175 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
176 -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
177 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
178};
179
180uint64
181hex_encode(const char *src, size_t len, char *dst)
182{
183 const char *end = src + len;
184
185 while (src < end)
186 {
187 unsigned char usrc = *((const unsigned char *) src);
188
189 memcpy(dst, &hextbl[2 * usrc], 2);
190 src++;
191 dst += 2;
192 }
193 return (uint64) len * 2;
194}
195
196static inline bool
197get_hex(const char *cp, char *out)
198{
199 unsigned char c = (unsigned char) *cp;
200 int res = -1;
201
202 if (c < 127)
203 res = hexlookup[c];
204
205 *out = (char) res;
206
207 return (res >= 0);
208}
209
/*
 * hex_decode
 *      Decode len characters of hex text at src into bytes at dst.
 *
 * Thin wrapper: forwards to hex_decode_safe() with a NULL escontext and
 * returns whatever that call returns.
 */
210uint64
211hex_decode(const char *src, size_t len, char *dst)
212{
213 return hex_decode_safe(src, len, dst, NULL);
214}
215
216uint64
217hex_decode_safe(const char *src, size_t len, char *dst, Node *escontext)
218{
219 const char *s,
220 *srcend;
221 char v1,
222 v2,
223 *p;
224
225 srcend = src + len;
226 s = src;
227 p = dst;
228 while (s < srcend)
229 {
230 if (*s == ' ' || *s == '\n' || *s == '\t' || *s == '\r')
231 {
232 s++;
233 continue;
234 }
235 if (!get_hex(s, &v1))
236 ereturn(escontext, 0,
237 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
238 errmsg("invalid hexadecimal digit: \"%.*s\"",
239 pg_mblen(s), s)));
240 s++;
241 if (s >= srcend)
242 ereturn(escontext, 0,
243 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
244 errmsg("invalid hexadecimal data: odd number of digits")));
245 if (!get_hex(s, &v2))
246 ereturn(escontext, 0,
247 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
248 errmsg("invalid hexadecimal digit: \"%.*s\"",
249 pg_mblen(s), s)));
250 s++;
251 *p++ = (v1 << 4) | v2;
252 }
253
254 return p - dst;
255}
256
257static uint64
258hex_enc_len(const char *src, size_t srclen)
259{
260 return (uint64) srclen << 1;
261}
262
263static uint64
264hex_dec_len(const char *src, size_t srclen)
265{
266 return (uint64) srclen >> 1;
267}
268
269/*
270 * BASE64
271 */
272
/*
 * The base64 alphabet: element i is the output character for 6-bit value i
 * (A-Z, a-z, 0-9, '+', '/').
 */
273static const char _base64[] =
274"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
275
/*
 * Reverse of _base64, indexed by a 7-bit character code (0..127): each
 * alphabet character maps to its 6-bit value ('A'-'Z' 0-25, 'a'-'z' 26-51,
 * '0'-'9' 52-61, '+' 62, '/' 63) and every other entry is -1.  Callers must
 * range-check the index before looking up.
 */
276static const int8 b64lookup[128] = {
277 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
278 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
279 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
280 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
281 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
282 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
283 -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
284 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,
285};
286
287static uint64
288pg_base64_encode(const char *src, size_t len, char *dst)
289{
290 char *p,
291 *lend = dst + 76;
292 const char *s,
293 *end = src + len;
294 int pos = 2;
295 uint32 buf = 0;
296
297 s = src;
298 p = dst;
299
300 while (s < end)
301 {
302 buf |= (unsigned char) *s << (pos << 3);
303 pos--;
304 s++;
305
306 /* write it out */
307 if (pos < 0)
308 {
309 *p++ = _base64[(buf >> 18) & 0x3f];
310 *p++ = _base64[(buf >> 12) & 0x3f];
311 *p++ = _base64[(buf >> 6) & 0x3f];
312 *p++ = _base64[buf & 0x3f];
313
314 pos = 2;
315 buf = 0;
316 }
317 if (p >= lend)
318 {
319 *p++ = '\n';
320 lend = p + 76;
321 }
322 }
323 if (pos != 2)
324 {
325 *p++ = _base64[(buf >> 18) & 0x3f];
326 *p++ = _base64[(buf >> 12) & 0x3f];
327 *p++ = (pos == 0) ? _base64[(buf >> 6) & 0x3f] : '=';
328 *p++ = '=';
329 }
330
331 return p - dst;
332}
333
334static uint64
335pg_base64_decode(const char *src, size_t len, char *dst)
336{
337 const char *srcend = src + len,
338 *s = src;
339 char *p = dst;
340 char c;
341 int b = 0;
342 uint32 buf = 0;
343 int pos = 0,
344 end = 0;
345
346 while (s < srcend)
347 {
348 c = *s++;
349
350 if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
351 continue;
352
353 if (c == '=')
354 {
355 /* end sequence */
356 if (!end)
357 {
358 if (pos == 2)
359 end = 1;
360 else if (pos == 3)
361 end = 2;
362 else
364 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
365 errmsg("unexpected \"=\" while decoding base64 sequence")));
366 }
367 b = 0;
368 }
369 else
370 {
371 b = -1;
372 if (c > 0 && c < 127)
373 b = b64lookup[(unsigned char) c];
374 if (b < 0)
376 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
377 errmsg("invalid symbol \"%.*s\" found while decoding base64 sequence",
378 pg_mblen(s - 1), s - 1)));
379 }
380 /* add it to buffer */
381 buf = (buf << 6) + b;
382 pos++;
383 if (pos == 4)
384 {
385 *p++ = (buf >> 16) & 255;
386 if (end == 0 || end > 1)
387 *p++ = (buf >> 8) & 255;
388 if (end == 0 || end > 2)
389 *p++ = buf & 255;
390 buf = 0;
391 pos = 0;
392 }
393 }
394
395 if (pos != 0)
397 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
398 errmsg("invalid base64 end sequence"),
399 errhint("Input data is missing padding, is truncated, or is otherwise corrupted.")));
400
401 return p - dst;
402}
403
404
405static uint64
406pg_base64_enc_len(const char *src, size_t srclen)
407{
408 /* 3 bytes will be converted to 4, linefeed after 76 chars */
409 return ((uint64) srclen + 2) / 3 * 4 + (uint64) srclen / (76 * 3 / 4);
410}
411
412static uint64
413pg_base64_dec_len(const char *src, size_t srclen)
414{
415 return ((uint64) srclen * 3) >> 2;
416}
417
418/*
419 * Escape
420 * Minimally escape bytea to text.
421 * De-escape text to bytea.
422 *
423 * We must escape zero bytes and high-bit-set bytes to avoid generating
424 * text that might be invalid in the current encoding, or that might
425 * change to something else if passed through an encoding conversion
426 * (leading to failing to de-escape to the original bytea value).
427 * Also of course backslash itself has to be escaped.
428 *
429 * De-escaping processes \\ and any \### octal
430 */
431
/* Numeric value of an ASCII digit character, and the reverse mapping. */
#define VAL(ch) ((ch) - '0')
#define DIG(val) ('0' + (val))
434
435static uint64
436esc_encode(const char *src, size_t srclen, char *dst)
437{
438 const char *end = src + srclen;
439 char *rp = dst;
440 uint64 len = 0;
441
442 while (src < end)
443 {
444 unsigned char c = (unsigned char) *src;
445
446 if (c == '\0' || IS_HIGHBIT_SET(c))
447 {
448 rp[0] = '\\';
449 rp[1] = DIG(c >> 6);
450 rp[2] = DIG((c >> 3) & 7);
451 rp[3] = DIG(c & 7);
452 rp += 4;
453 len += 4;
454 }
455 else if (c == '\\')
456 {
457 rp[0] = '\\';
458 rp[1] = '\\';
459 rp += 2;
460 len += 2;
461 }
462 else
463 {
464 *rp++ = c;
465 len++;
466 }
467
468 src++;
469 }
470
471 return len;
472}
473
474static uint64
475esc_decode(const char *src, size_t srclen, char *dst)
476{
477 const char *end = src + srclen;
478 char *rp = dst;
479 uint64 len = 0;
480
481 while (src < end)
482 {
483 if (src[0] != '\\')
484 *rp++ = *src++;
485 else if (src + 3 < end &&
486 (src[1] >= '0' && src[1] <= '3') &&
487 (src[2] >= '0' && src[2] <= '7') &&
488 (src[3] >= '0' && src[3] <= '7'))
489 {
490 int val;
491
492 val = VAL(src[1]);
493 val <<= 3;
494 val += VAL(src[2]);
495 val <<= 3;
496 *rp++ = val + VAL(src[3]);
497 src += 4;
498 }
499 else if (src + 1 < end &&
500 (src[1] == '\\'))
501 {
502 *rp++ = '\\';
503 src += 2;
504 }
505 else
506 {
507 /*
508 * One backslash, not followed by ### valid octal. Should never
509 * get here, since esc_dec_len does same check.
510 */
512 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
513 errmsg("invalid input syntax for type %s", "bytea")));
514 }
515
516 len++;
517 }
518
519 return len;
520}
521
522static uint64
523esc_enc_len(const char *src, size_t srclen)
524{
525 const char *end = src + srclen;
526 uint64 len = 0;
527
528 while (src < end)
529 {
530 if (*src == '\0' || IS_HIGHBIT_SET(*src))
531 len += 4;
532 else if (*src == '\\')
533 len += 2;
534 else
535 len++;
536
537 src++;
538 }
539
540 return len;
541}
542
543static uint64
544esc_dec_len(const char *src, size_t srclen)
545{
546 const char *end = src + srclen;
547 uint64 len = 0;
548
549 while (src < end)
550 {
551 if (src[0] != '\\')
552 src++;
553 else if (src + 3 < end &&
554 (src[1] >= '0' && src[1] <= '3') &&
555 (src[2] >= '0' && src[2] <= '7') &&
556 (src[3] >= '0' && src[3] <= '7'))
557 {
558 /*
559 * backslash + valid octal
560 */
561 src += 4;
562 }
563 else if (src + 1 < end &&
564 (src[1] == '\\'))
565 {
566 /*
567 * two backslashes = backslash
568 */
569 src += 2;
570 }
571 else
572 {
573 /*
574 * one backslash, not followed by ### valid octal
575 */
577 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
578 errmsg("invalid input syntax for type %s", "bytea")));
579 }
580
581 len++;
582 }
583 return len;
584}
585
586/*
587 * Common
588 */
589
590static const struct
591{
592 const char *name;
594} enclist[] =
595
596{
597 {
598 "hex",
599 {
601 }
602 },
603 {
604 "base64",
605 {
607 }
608 },
609 {
610 "escape",
611 {
613 }
614 },
615 {
616 NULL,
617 {
618 NULL, NULL, NULL, NULL
619 }
620 }
622
623static const struct pg_encoding *
625{
626 int i;
627
628 for (i = 0; enclist[i].name; i++)
629 if (pg_strcasecmp(enclist[i].name, name) == 0)
630 return &enclist[i].enc;
631
632 return NULL;
633}
#define TextDatumGetCString(d)
Definition: builtins.h:98
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1112
#define VARHDRSZ
Definition: c.h:649
int8_t int8
Definition: c.h:482
uint64_t uint64
Definition: c.h:489
uint32_t uint32
Definition: c.h:488
int errhint(const char *fmt,...)
Definition: elog.c:1317
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define ereturn(context, dummy_value,...)
Definition: elog.h:277
#define FATAL
Definition: elog.h:41
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
static uint64 pg_base64_decode(const char *src, size_t len, char *dst)
Definition: encode.c:335
static const struct @22 enclist[]
#define DIG(VAL)
Definition: encode.c:433
static bool get_hex(const char *cp, char *out)
Definition: encode.c:197
static uint64 hex_dec_len(const char *src, size_t srclen)
Definition: encode.c:264
static const struct pg_encoding * pg_find_encoding(const char *name)
Definition: encode.c:624
static uint64 pg_base64_encode(const char *src, size_t len, char *dst)
Definition: encode.c:288
static uint64 esc_encode(const char *src, size_t srclen, char *dst)
Definition: encode.c:436
static uint64 hex_enc_len(const char *src, size_t srclen)
Definition: encode.c:258
Datum binary_decode(PG_FUNCTION_ARGS)
Definition: encode.c:96
static const char hextbl[512]
Definition: encode.c:151
const char * name
Definition: encode.c:592
uint64 hex_decode_safe(const char *src, size_t len, char *dst, Node *escontext)
Definition: encode.c:217
uint64 hex_encode(const char *src, size_t len, char *dst)
Definition: encode.c:181
struct pg_encoding enc
Definition: encode.c:593
static uint64 esc_enc_len(const char *src, size_t srclen)
Definition: encode.c:523
static uint64 pg_base64_enc_len(const char *src, size_t srclen)
Definition: encode.c:406
static const char _base64[]
Definition: encode.c:273
static uint64 esc_decode(const char *src, size_t srclen, char *dst)
Definition: encode.c:475
static uint64 esc_dec_len(const char *src, size_t srclen)
Definition: encode.c:544
Datum binary_encode(PG_FUNCTION_ARGS)
Definition: encode.c:48
#define VAL(CH)
Definition: encode.c:432
uint64 hex_decode(const char *src, size_t len, char *dst)
Definition: encode.c:211
static const int8 b64lookup[128]
Definition: encode.c:276
static const int8 hexlookup[128]
Definition: encode.c:169
static uint64 pg_base64_dec_len(const char *src, size_t srclen)
Definition: encode.c:413
#define MaxAllocSize
Definition: fe_memutils.h:22
#define PG_GETARG_BYTEA_PP(n)
Definition: fmgr.h:308
#define PG_GETARG_TEXT_PP(n)
Definition: fmgr.h:309
#define PG_RETURN_BYTEA_P(x)
Definition: fmgr.h:371
#define PG_GETARG_DATUM(n)
Definition: fmgr.h:268
#define PG_RETURN_TEXT_P(x)
Definition: fmgr.h:372
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
long val
Definition: informix.c:689
int b
Definition: isn.c:69
int i
Definition: isn.c:72
int pg_mblen(const char *mbstr)
Definition: mbutils.c:1023
void * palloc(Size size)
Definition: mcxt.c:1317
const void size_t len
const void * data
static char * buf
Definition: pg_test_fsync.c:72
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
uintptr_t Datum
Definition: postgres.h:69
char * c
Definition: nodes.h:129
uint64(* encode_len)(const char *data, size_t dlen)
Definition: encode.c:35
uint64(* decode_len)(const char *data, size_t dlen)
Definition: encode.c:36
uint64(* decode)(const char *data, size_t dlen, char *res)
Definition: encode.c:38
uint64(* encode)(const char *data, size_t dlen, char *res)
Definition: encode.c:37
Definition: c.h:644
#define VARDATA(PTR)
Definition: varatt.h:278
#define VARDATA_ANY(PTR)
Definition: varatt.h:324
#define SET_VARSIZE(PTR, len)
Definition: varatt.h:305
#define VARSIZE_ANY_EXHDR(PTR)
Definition: varatt.h:317