PostgreSQL Source Code  git master
encode.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * encode.c
4  * Various data encoding/decoding things.
5  *
6  * Copyright (c) 2001-2024, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  * src/backend/utils/adt/encode.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15 
16 #include <ctype.h>
17 
18 #include "mb/pg_wchar.h"
19 #include "utils/builtins.h"
20 #include "utils/memutils.h"
21 #include "varatt.h"
22 
23 
24 /*
25  * Encoding conversion API.
26  * encode_len() and decode_len() compute the amount of space needed, while
27  * encode() and decode() perform the actual conversions. It is okay for
28  * the _len functions to return an overestimate, but not an underestimate.
29  * (Having said that, large overestimates could cause unnecessary errors,
30  * so it's better to get it right.) The conversion routines write to the
31  * buffer at *res and return the true length of their output.
32  */
34 {
35  uint64 (*encode_len) (const char *data, size_t dlen);
36  uint64 (*decode_len) (const char *data, size_t dlen);
37  uint64 (*encode) (const char *data, size_t dlen, char *res);
38  uint64 (*decode) (const char *data, size_t dlen, char *res);
39 };
40 
41 static const struct pg_encoding *pg_find_encoding(const char *name);
42 
43 /*
44  * SQL functions.
45  */
46 
47 Datum
49 {
52  text *result;
53  char *namebuf;
54  char *dataptr;
55  size_t datalen;
56  uint64 resultlen;
57  uint64 res;
58  const struct pg_encoding *enc;
59 
60  namebuf = TextDatumGetCString(name);
61 
62  enc = pg_find_encoding(namebuf);
63  if (enc == NULL)
64  ereport(ERROR,
65  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
66  errmsg("unrecognized encoding: \"%s\"", namebuf)));
67 
68  dataptr = VARDATA_ANY(data);
69  datalen = VARSIZE_ANY_EXHDR(data);
70 
71  resultlen = enc->encode_len(dataptr, datalen);
72 
73  /*
74  * resultlen possibly overflows uint32, therefore on 32-bit machines it's
75  * unsafe to rely on palloc's internal check.
76  */
77  if (resultlen > MaxAllocSize - VARHDRSZ)
78  ereport(ERROR,
79  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
80  errmsg("result of encoding conversion is too large")));
81 
82  result = palloc(VARHDRSZ + resultlen);
83 
84  res = enc->encode(dataptr, datalen, VARDATA(result));
85 
86  /* Make this FATAL 'cause we've trodden on memory ... */
87  if (res > resultlen)
88  elog(FATAL, "overflow - encode estimate too small");
89 
90  SET_VARSIZE(result, VARHDRSZ + res);
91 
92  PG_RETURN_TEXT_P(result);
93 }
94 
95 Datum
97 {
100  bytea *result;
101  char *namebuf;
102  char *dataptr;
103  size_t datalen;
104  uint64 resultlen;
105  uint64 res;
106  const struct pg_encoding *enc;
107 
108  namebuf = TextDatumGetCString(name);
109 
110  enc = pg_find_encoding(namebuf);
111  if (enc == NULL)
112  ereport(ERROR,
113  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
114  errmsg("unrecognized encoding: \"%s\"", namebuf)));
115 
116  dataptr = VARDATA_ANY(data);
117  datalen = VARSIZE_ANY_EXHDR(data);
118 
119  resultlen = enc->decode_len(dataptr, datalen);
120 
121  /*
122  * resultlen possibly overflows uint32, therefore on 32-bit machines it's
123  * unsafe to rely on palloc's internal check.
124  */
125  if (resultlen > MaxAllocSize - VARHDRSZ)
126  ereport(ERROR,
127  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
128  errmsg("result of decoding conversion is too large")));
129 
130  result = palloc(VARHDRSZ + resultlen);
131 
132  res = enc->decode(dataptr, datalen, VARDATA(result));
133 
134  /* Make this FATAL 'cause we've trodden on memory ... */
135  if (res > resultlen)
136  elog(FATAL, "overflow - decode estimate too small");
137 
138  SET_VARSIZE(result, VARHDRSZ + res);
139 
140  PG_RETURN_BYTEA_P(result);
141 }
142 
143 
144 /*
145  * HEX
146  */
147 
148 static const char hextbl[] = "0123456789abcdef";
149 
150 static const int8 hexlookup[128] = {
151  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
152  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
153  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
154  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1,
155  -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
156  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
157  -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
158  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
159 };
160 
161 uint64
162 hex_encode(const char *src, size_t len, char *dst)
163 {
164  const char *end = src + len;
165 
166  while (src < end)
167  {
168  *dst++ = hextbl[(*src >> 4) & 0xF];
169  *dst++ = hextbl[*src & 0xF];
170  src++;
171  }
172  return (uint64) len * 2;
173 }
174 
175 static inline bool
176 get_hex(const char *cp, char *out)
177 {
178  unsigned char c = (unsigned char) *cp;
179  int res = -1;
180 
181  if (c < 127)
182  res = hexlookup[c];
183 
184  *out = (char) res;
185 
186  return (res >= 0);
187 }
188 
189 uint64
190 hex_decode(const char *src, size_t len, char *dst)
191 {
192  return hex_decode_safe(src, len, dst, NULL);
193 }
194 
195 uint64
196 hex_decode_safe(const char *src, size_t len, char *dst, Node *escontext)
197 {
198  const char *s,
199  *srcend;
200  char v1,
201  v2,
202  *p;
203 
204  srcend = src + len;
205  s = src;
206  p = dst;
207  while (s < srcend)
208  {
209  if (*s == ' ' || *s == '\n' || *s == '\t' || *s == '\r')
210  {
211  s++;
212  continue;
213  }
214  if (!get_hex(s, &v1))
215  ereturn(escontext, 0,
216  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
217  errmsg("invalid hexadecimal digit: \"%.*s\"",
218  pg_mblen(s), s)));
219  s++;
220  if (s >= srcend)
221  ereturn(escontext, 0,
222  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
223  errmsg("invalid hexadecimal data: odd number of digits")));
224  if (!get_hex(s, &v2))
225  ereturn(escontext, 0,
226  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
227  errmsg("invalid hexadecimal digit: \"%.*s\"",
228  pg_mblen(s), s)));
229  s++;
230  *p++ = (v1 << 4) | v2;
231  }
232 
233  return p - dst;
234 }
235 
236 static uint64
237 hex_enc_len(const char *src, size_t srclen)
238 {
239  return (uint64) srclen << 1;
240 }
241 
242 static uint64
243 hex_dec_len(const char *src, size_t srclen)
244 {
245  return (uint64) srclen >> 1;
246 }
247 
248 /*
249  * BASE64
250  */
251 
252 static const char _base64[] =
253 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
254 
255 static const int8 b64lookup[128] = {
256  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
257  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
258  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
259  52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
260  -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
261  15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
262  -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
263  41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,
264 };
265 
266 static uint64
267 pg_base64_encode(const char *src, size_t len, char *dst)
268 {
269  char *p,
270  *lend = dst + 76;
271  const char *s,
272  *end = src + len;
273  int pos = 2;
274  uint32 buf = 0;
275 
276  s = src;
277  p = dst;
278 
279  while (s < end)
280  {
281  buf |= (unsigned char) *s << (pos << 3);
282  pos--;
283  s++;
284 
285  /* write it out */
286  if (pos < 0)
287  {
288  *p++ = _base64[(buf >> 18) & 0x3f];
289  *p++ = _base64[(buf >> 12) & 0x3f];
290  *p++ = _base64[(buf >> 6) & 0x3f];
291  *p++ = _base64[buf & 0x3f];
292 
293  pos = 2;
294  buf = 0;
295  }
296  if (p >= lend)
297  {
298  *p++ = '\n';
299  lend = p + 76;
300  }
301  }
302  if (pos != 2)
303  {
304  *p++ = _base64[(buf >> 18) & 0x3f];
305  *p++ = _base64[(buf >> 12) & 0x3f];
306  *p++ = (pos == 0) ? _base64[(buf >> 6) & 0x3f] : '=';
307  *p++ = '=';
308  }
309 
310  return p - dst;
311 }
312 
313 static uint64
314 pg_base64_decode(const char *src, size_t len, char *dst)
315 {
316  const char *srcend = src + len,
317  *s = src;
318  char *p = dst;
319  char c;
320  int b = 0;
321  uint32 buf = 0;
322  int pos = 0,
323  end = 0;
324 
325  while (s < srcend)
326  {
327  c = *s++;
328 
329  if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
330  continue;
331 
332  if (c == '=')
333  {
334  /* end sequence */
335  if (!end)
336  {
337  if (pos == 2)
338  end = 1;
339  else if (pos == 3)
340  end = 2;
341  else
342  ereport(ERROR,
343  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
344  errmsg("unexpected \"=\" while decoding base64 sequence")));
345  }
346  b = 0;
347  }
348  else
349  {
350  b = -1;
351  if (c > 0 && c < 127)
352  b = b64lookup[(unsigned char) c];
353  if (b < 0)
354  ereport(ERROR,
355  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
356  errmsg("invalid symbol \"%.*s\" found while decoding base64 sequence",
357  pg_mblen(s - 1), s - 1)));
358  }
359  /* add it to buffer */
360  buf = (buf << 6) + b;
361  pos++;
362  if (pos == 4)
363  {
364  *p++ = (buf >> 16) & 255;
365  if (end == 0 || end > 1)
366  *p++ = (buf >> 8) & 255;
367  if (end == 0 || end > 2)
368  *p++ = buf & 255;
369  buf = 0;
370  pos = 0;
371  }
372  }
373 
374  if (pos != 0)
375  ereport(ERROR,
376  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
377  errmsg("invalid base64 end sequence"),
378  errhint("Input data is missing padding, is truncated, or is otherwise corrupted.")));
379 
380  return p - dst;
381 }
382 
383 
384 static uint64
385 pg_base64_enc_len(const char *src, size_t srclen)
386 {
387  /* 3 bytes will be converted to 4, linefeed after 76 chars */
388  return ((uint64) srclen + 2) / 3 * 4 + (uint64) srclen / (76 * 3 / 4);
389 }
390 
391 static uint64
392 pg_base64_dec_len(const char *src, size_t srclen)
393 {
394  return ((uint64) srclen * 3) >> 2;
395 }
396 
397 /*
398  * Escape
399  * Minimally escape bytea to text.
400  * De-escape text to bytea.
401  *
402  * We must escape zero bytes and high-bit-set bytes to avoid generating
403  * text that might be invalid in the current encoding, or that might
404  * change to something else if passed through an encoding conversion
405  * (leading to failing to de-escape to the original bytea value).
406  * Also of course backslash itself has to be escaped.
407  *
408  * De-escaping processes \\ (a literal backslash) and any \### three-digit octal escape.
409  */
410 
/* VAL: numeric value of a digit character.  DIG: digit character for a value. */
#define VAL(ch) ((ch) - '0')
#define DIG(val) ((val) + '0')
413 
414 static uint64
415 esc_encode(const char *src, size_t srclen, char *dst)
416 {
417  const char *end = src + srclen;
418  char *rp = dst;
419  uint64 len = 0;
420 
421  while (src < end)
422  {
423  unsigned char c = (unsigned char) *src;
424 
425  if (c == '\0' || IS_HIGHBIT_SET(c))
426  {
427  rp[0] = '\\';
428  rp[1] = DIG(c >> 6);
429  rp[2] = DIG((c >> 3) & 7);
430  rp[3] = DIG(c & 7);
431  rp += 4;
432  len += 4;
433  }
434  else if (c == '\\')
435  {
436  rp[0] = '\\';
437  rp[1] = '\\';
438  rp += 2;
439  len += 2;
440  }
441  else
442  {
443  *rp++ = c;
444  len++;
445  }
446 
447  src++;
448  }
449 
450  return len;
451 }
452 
453 static uint64
454 esc_decode(const char *src, size_t srclen, char *dst)
455 {
456  const char *end = src + srclen;
457  char *rp = dst;
458  uint64 len = 0;
459 
460  while (src < end)
461  {
462  if (src[0] != '\\')
463  *rp++ = *src++;
464  else if (src + 3 < end &&
465  (src[1] >= '0' && src[1] <= '3') &&
466  (src[2] >= '0' && src[2] <= '7') &&
467  (src[3] >= '0' && src[3] <= '7'))
468  {
469  int val;
470 
471  val = VAL(src[1]);
472  val <<= 3;
473  val += VAL(src[2]);
474  val <<= 3;
475  *rp++ = val + VAL(src[3]);
476  src += 4;
477  }
478  else if (src + 1 < end &&
479  (src[1] == '\\'))
480  {
481  *rp++ = '\\';
482  src += 2;
483  }
484  else
485  {
486  /*
487  * One backslash, not followed by ### valid octal. Should never
488  * get here, since esc_dec_len does same check.
489  */
490  ereport(ERROR,
491  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
492  errmsg("invalid input syntax for type %s", "bytea")));
493  }
494 
495  len++;
496  }
497 
498  return len;
499 }
500 
501 static uint64
502 esc_enc_len(const char *src, size_t srclen)
503 {
504  const char *end = src + srclen;
505  uint64 len = 0;
506 
507  while (src < end)
508  {
509  if (*src == '\0' || IS_HIGHBIT_SET(*src))
510  len += 4;
511  else if (*src == '\\')
512  len += 2;
513  else
514  len++;
515 
516  src++;
517  }
518 
519  return len;
520 }
521 
522 static uint64
523 esc_dec_len(const char *src, size_t srclen)
524 {
525  const char *end = src + srclen;
526  uint64 len = 0;
527 
528  while (src < end)
529  {
530  if (src[0] != '\\')
531  src++;
532  else if (src + 3 < end &&
533  (src[1] >= '0' && src[1] <= '3') &&
534  (src[2] >= '0' && src[2] <= '7') &&
535  (src[3] >= '0' && src[3] <= '7'))
536  {
537  /*
538  * backslash + valid octal
539  */
540  src += 4;
541  }
542  else if (src + 1 < end &&
543  (src[1] == '\\'))
544  {
545  /*
546  * two backslashes = backslash
547  */
548  src += 2;
549  }
550  else
551  {
552  /*
553  * one backslash, not followed by ### valid octal
554  */
555  ereport(ERROR,
556  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
557  errmsg("invalid input syntax for type %s", "bytea")));
558  }
559 
560  len++;
561  }
562  return len;
563 }
564 
565 /*
566  * Common
567  */
568 
569 static const struct
570 {
571  const char *name;
572  struct pg_encoding enc;
573 } enclist[] =
574 
575 {
576  {
577  "hex",
578  {
580  }
581  },
582  {
583  "base64",
584  {
586  }
587  },
588  {
589  "escape",
590  {
592  }
593  },
594  {
595  NULL,
596  {
597  NULL, NULL, NULL, NULL
598  }
599  }
600 };
601 
602 static const struct pg_encoding *
603 pg_find_encoding(const char *name)
604 {
605  int i;
606 
607  for (i = 0; enclist[i].name; i++)
608  if (pg_strcasecmp(enclist[i].name, name) == 0)
609  return &enclist[i].enc;
610 
611  return NULL;
612 }
#define TextDatumGetCString(d)
Definition: builtins.h:98
unsigned int uint32
Definition: c.h:509
signed char int8
Definition: c.h:495
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1158
#define VARHDRSZ
Definition: c.h:695
int errhint(const char *fmt,...)
Definition: elog.c:1317
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define ereturn(context, dummy_value,...)
Definition: elog.h:277
#define FATAL
Definition: elog.h:41
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
static uint64 pg_base64_decode(const char *src, size_t len, char *dst)
Definition: encode.c:314
#define DIG(VAL)
Definition: encode.c:412
static bool get_hex(const char *cp, char *out)
Definition: encode.c:176
static uint64 hex_dec_len(const char *src, size_t srclen)
Definition: encode.c:243
static const struct pg_encoding * pg_find_encoding(const char *name)
Definition: encode.c:603
static uint64 pg_base64_encode(const char *src, size_t len, char *dst)
Definition: encode.c:267
static uint64 esc_encode(const char *src, size_t srclen, char *dst)
Definition: encode.c:415
static uint64 hex_enc_len(const char *src, size_t srclen)
Definition: encode.c:237
Datum binary_decode(PG_FUNCTION_ARGS)
Definition: encode.c:96
static const struct @24 enclist[]
const char * name
Definition: encode.c:571
uint64 hex_decode_safe(const char *src, size_t len, char *dst, Node *escontext)
Definition: encode.c:196
static const char hextbl[]
Definition: encode.c:148
uint64 hex_encode(const char *src, size_t len, char *dst)
Definition: encode.c:162
struct pg_encoding enc
Definition: encode.c:572
static uint64 esc_enc_len(const char *src, size_t srclen)
Definition: encode.c:502
static uint64 pg_base64_enc_len(const char *src, size_t srclen)
Definition: encode.c:385
static const char _base64[]
Definition: encode.c:252
static uint64 esc_decode(const char *src, size_t srclen, char *dst)
Definition: encode.c:454
static uint64 esc_dec_len(const char *src, size_t srclen)
Definition: encode.c:523
Datum binary_encode(PG_FUNCTION_ARGS)
Definition: encode.c:48
#define VAL(CH)
Definition: encode.c:411
uint64 hex_decode(const char *src, size_t len, char *dst)
Definition: encode.c:190
static const int8 b64lookup[128]
Definition: encode.c:255
static const int8 hexlookup[128]
Definition: encode.c:150
static uint64 pg_base64_dec_len(const char *src, size_t srclen)
Definition: encode.c:392
#define PG_GETARG_BYTEA_PP(n)
Definition: fmgr.h:308
#define PG_GETARG_TEXT_PP(n)
Definition: fmgr.h:309
#define PG_RETURN_BYTEA_P(x)
Definition: fmgr.h:371
#define PG_GETARG_DATUM(n)
Definition: fmgr.h:268
#define PG_RETURN_TEXT_P(x)
Definition: fmgr.h:372
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
long val
Definition: informix.c:689
int b
Definition: isn.c:70
int i
Definition: isn.c:73
int pg_mblen(const char *mbstr)
Definition: mbutils.c:1023
void * palloc(Size size)
Definition: mcxt.c:1317
#define MaxAllocSize
Definition: memutils.h:40
const void size_t len
const void * data
static char * buf
Definition: pg_test_fsync.c:73
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
uintptr_t Datum
Definition: postgres.h:64
char * c
Definition: nodes.h:129
uint64(* encode_len)(const char *data, size_t dlen)
Definition: encode.c:35
uint64(* decode_len)(const char *data, size_t dlen)
Definition: encode.c:36
uint64(* decode)(const char *data, size_t dlen, char *res)
Definition: encode.c:38
uint64(* encode)(const char *data, size_t dlen, char *res)
Definition: encode.c:37
Definition: c.h:690
#define VARDATA(PTR)
Definition: varatt.h:278
#define VARDATA_ANY(PTR)
Definition: varatt.h:324
#define SET_VARSIZE(PTR, len)
Definition: varatt.h:305
#define VARSIZE_ANY_EXHDR(PTR)
Definition: varatt.h:317