PostgreSQL Source Code  git master
encode.c
Go to the documentation of this file.
/*-------------------------------------------------------------------------
 *
 * encode.c
 *	  Various data encoding/decoding things.
 *
 * Copyright (c) 2001-2020, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  src/backend/utils/adt/encode.c
 *
 *-------------------------------------------------------------------------
 */
14 #include "postgres.h"
15 
16 #include <ctype.h>
17 
18 #include "utils/builtins.h"
19 #include "utils/memutils.h"
20 
21 
/*
 * Encoding conversion API.
 * encode_len() and decode_len() compute the amount of space needed, while
 * encode() and decode() perform the actual conversions.  It is okay for
 * the _len functions to return an overestimate, but not an underestimate.
 * (Having said that, large overestimates could cause unnecessary errors,
 * so it's better to get it right.)  The conversion routines write to the
 * buffer at *res and return the true length of their output.
 */
32 {
33  uint64 (*encode_len) (const char *data, size_t dlen);
34  uint64 (*decode_len) (const char *data, size_t dlen);
35  uint64 (*encode) (const char *data, size_t dlen, char *res);
36  uint64 (*decode) (const char *data, size_t dlen, char *res);
37 };
38 
39 static const struct pg_encoding *pg_find_encoding(const char *name);
40 
/*
 * SQL functions.
 */
44 
45 Datum
47 {
48  bytea *data = PG_GETARG_BYTEA_PP(0);
50  text *result;
51  char *namebuf;
52  char *dataptr;
53  size_t datalen;
54  uint64 resultlen;
55  uint64 res;
56  const struct pg_encoding *enc;
57 
58  namebuf = TextDatumGetCString(name);
59 
60  enc = pg_find_encoding(namebuf);
61  if (enc == NULL)
62  ereport(ERROR,
63  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
64  errmsg("unrecognized encoding: \"%s\"", namebuf)));
65 
66  dataptr = VARDATA_ANY(data);
67  datalen = VARSIZE_ANY_EXHDR(data);
68 
69  resultlen = enc->encode_len(dataptr, datalen);
70 
71  /*
72  * resultlen possibly overflows uint32, therefore on 32-bit machines it's
73  * unsafe to rely on palloc's internal check.
74  */
75  if (resultlen > MaxAllocSize - VARHDRSZ)
76  ereport(ERROR,
77  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
78  errmsg("result of encoding conversion is too large")));
79 
80  result = palloc(VARHDRSZ + resultlen);
81 
82  res = enc->encode(dataptr, datalen, VARDATA(result));
83 
84  /* Make this FATAL 'cause we've trodden on memory ... */
85  if (res > resultlen)
86  elog(FATAL, "overflow - encode estimate too small");
87 
88  SET_VARSIZE(result, VARHDRSZ + res);
89 
90  PG_RETURN_TEXT_P(result);
91 }
92 
93 Datum
95 {
96  text *data = PG_GETARG_TEXT_PP(0);
98  bytea *result;
99  char *namebuf;
100  char *dataptr;
101  size_t datalen;
102  uint64 resultlen;
103  uint64 res;
104  const struct pg_encoding *enc;
105 
106  namebuf = TextDatumGetCString(name);
107 
108  enc = pg_find_encoding(namebuf);
109  if (enc == NULL)
110  ereport(ERROR,
111  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
112  errmsg("unrecognized encoding: \"%s\"", namebuf)));
113 
114  dataptr = VARDATA_ANY(data);
115  datalen = VARSIZE_ANY_EXHDR(data);
116 
117  resultlen = enc->decode_len(dataptr, datalen);
118 
119  /*
120  * resultlen possibly overflows uint32, therefore on 32-bit machines it's
121  * unsafe to rely on palloc's internal check.
122  */
123  if (resultlen > MaxAllocSize - VARHDRSZ)
124  ereport(ERROR,
125  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
126  errmsg("result of decoding conversion is too large")));
127 
128  result = palloc(VARHDRSZ + resultlen);
129 
130  res = enc->decode(dataptr, datalen, VARDATA(result));
131 
132  /* Make this FATAL 'cause we've trodden on memory ... */
133  if (res > resultlen)
134  elog(FATAL, "overflow - decode estimate too small");
135 
136  SET_VARSIZE(result, VARHDRSZ + res);
137 
138  PG_RETURN_BYTEA_P(result);
139 }
140 
141 
/*
 * HEX
 */
145 
/* Output alphabet for hex encoding, indexed by nibble value (lower case). */
static const char hextbl[] = "0123456789" "abcdef";
147 
148 static const int8 hexlookup[128] = {
149  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
150  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
151  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
152  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1,
153  -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
154  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
155  -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
156  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
157 };
158 
159 uint64
160 hex_encode(const char *src, size_t len, char *dst)
161 {
162  const char *end = src + len;
163 
164  while (src < end)
165  {
166  *dst++ = hextbl[(*src >> 4) & 0xF];
167  *dst++ = hextbl[*src & 0xF];
168  src++;
169  }
170  return (uint64) len * 2;
171 }
172 
173 static inline char
174 get_hex(char c)
175 {
176  int res = -1;
177 
178  if (c > 0 && c < 127)
179  res = hexlookup[(unsigned char) c];
180 
181  if (res < 0)
182  ereport(ERROR,
183  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
184  errmsg("invalid hexadecimal digit: \"%c\"", c)));
185 
186  return (char) res;
187 }
188 
189 uint64
190 hex_decode(const char *src, size_t len, char *dst)
191 {
192  const char *s,
193  *srcend;
194  char v1,
195  v2,
196  *p;
197 
198  srcend = src + len;
199  s = src;
200  p = dst;
201  while (s < srcend)
202  {
203  if (*s == ' ' || *s == '\n' || *s == '\t' || *s == '\r')
204  {
205  s++;
206  continue;
207  }
208  v1 = get_hex(*s++) << 4;
209  if (s >= srcend)
210  ereport(ERROR,
211  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
212  errmsg("invalid hexadecimal data: odd number of digits")));
213 
214  v2 = get_hex(*s++);
215  *p++ = v1 | v2;
216  }
217 
218  return p - dst;
219 }
220 
221 static uint64
222 hex_enc_len(const char *src, size_t srclen)
223 {
224  return (uint64) srclen << 1;
225 }
226 
227 static uint64
228 hex_dec_len(const char *src, size_t srclen)
229 {
230  return (uint64) srclen >> 1;
231 }
232 
/*
 * BASE64
 */
236 
/* Output alphabet for base64 encoding, indexed by 6-bit value. */
static const char _base64[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789"
"+/";
239 
240 static const int8 b64lookup[128] = {
241  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
242  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
243  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
244  52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
245  -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
246  15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
247  -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
248  41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,
249 };
250 
251 static uint64
252 pg_base64_encode(const char *src, size_t len, char *dst)
253 {
254  char *p,
255  *lend = dst + 76;
256  const char *s,
257  *end = src + len;
258  int pos = 2;
259  uint32 buf = 0;
260 
261  s = src;
262  p = dst;
263 
264  while (s < end)
265  {
266  buf |= (unsigned char) *s << (pos << 3);
267  pos--;
268  s++;
269 
270  /* write it out */
271  if (pos < 0)
272  {
273  *p++ = _base64[(buf >> 18) & 0x3f];
274  *p++ = _base64[(buf >> 12) & 0x3f];
275  *p++ = _base64[(buf >> 6) & 0x3f];
276  *p++ = _base64[buf & 0x3f];
277 
278  pos = 2;
279  buf = 0;
280  }
281  if (p >= lend)
282  {
283  *p++ = '\n';
284  lend = p + 76;
285  }
286  }
287  if (pos != 2)
288  {
289  *p++ = _base64[(buf >> 18) & 0x3f];
290  *p++ = _base64[(buf >> 12) & 0x3f];
291  *p++ = (pos == 0) ? _base64[(buf >> 6) & 0x3f] : '=';
292  *p++ = '=';
293  }
294 
295  return p - dst;
296 }
297 
298 static uint64
299 pg_base64_decode(const char *src, size_t len, char *dst)
300 {
301  const char *srcend = src + len,
302  *s = src;
303  char *p = dst;
304  char c;
305  int b = 0;
306  uint32 buf = 0;
307  int pos = 0,
308  end = 0;
309 
310  while (s < srcend)
311  {
312  c = *s++;
313 
314  if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
315  continue;
316 
317  if (c == '=')
318  {
319  /* end sequence */
320  if (!end)
321  {
322  if (pos == 2)
323  end = 1;
324  else if (pos == 3)
325  end = 2;
326  else
327  ereport(ERROR,
328  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
329  errmsg("unexpected \"=\" while decoding base64 sequence")));
330  }
331  b = 0;
332  }
333  else
334  {
335  b = -1;
336  if (c > 0 && c < 127)
337  b = b64lookup[(unsigned char) c];
338  if (b < 0)
339  ereport(ERROR,
340  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
341  errmsg("invalid symbol \"%c\" while decoding base64 sequence", (int) c)));
342  }
343  /* add it to buffer */
344  buf = (buf << 6) + b;
345  pos++;
346  if (pos == 4)
347  {
348  *p++ = (buf >> 16) & 255;
349  if (end == 0 || end > 1)
350  *p++ = (buf >> 8) & 255;
351  if (end == 0 || end > 2)
352  *p++ = buf & 255;
353  buf = 0;
354  pos = 0;
355  }
356  }
357 
358  if (pos != 0)
359  ereport(ERROR,
360  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
361  errmsg("invalid base64 end sequence"),
362  errhint("Input data is missing padding, is truncated, or is otherwise corrupted.")));
363 
364  return p - dst;
365 }
366 
367 
368 static uint64
369 pg_base64_enc_len(const char *src, size_t srclen)
370 {
371  /* 3 bytes will be converted to 4, linefeed after 76 chars */
372  return ((uint64) srclen + 2) * 4 / 3 + (uint64) srclen / (76 * 3 / 4);
373 }
374 
375 static uint64
376 pg_base64_dec_len(const char *src, size_t srclen)
377 {
378  return ((uint64) srclen * 3) >> 2;
379 }
380 
/*
 * Escape
 * Minimally escape bytea to text.
 * De-escape text to bytea.
 *
 * We must escape zero bytes and high-bit-set bytes to avoid generating
 * text that might be invalid in the current encoding, or that might
 * change to something else if passed through an encoding conversion
 * (leading to failing to de-escape to the original bytea value).
 * Also of course backslash itself has to be escaped.
 *
 * De-escaping processes \\ and any \### octal
 */
394 
/* Convert the character for a digit into its numeric value. */
#define VAL(ch) ((ch) - '0')
/* Convert a small numeric value (0..9) into the character for that digit. */
#define DIG(num) ((num) + '0')
397 
398 static uint64
399 esc_encode(const char *src, size_t srclen, char *dst)
400 {
401  const char *end = src + srclen;
402  char *rp = dst;
403  uint64 len = 0;
404 
405  while (src < end)
406  {
407  unsigned char c = (unsigned char) *src;
408 
409  if (c == '\0' || IS_HIGHBIT_SET(c))
410  {
411  rp[0] = '\\';
412  rp[1] = DIG(c >> 6);
413  rp[2] = DIG((c >> 3) & 7);
414  rp[3] = DIG(c & 7);
415  rp += 4;
416  len += 4;
417  }
418  else if (c == '\\')
419  {
420  rp[0] = '\\';
421  rp[1] = '\\';
422  rp += 2;
423  len += 2;
424  }
425  else
426  {
427  *rp++ = c;
428  len++;
429  }
430 
431  src++;
432  }
433 
434  return len;
435 }
436 
437 static uint64
438 esc_decode(const char *src, size_t srclen, char *dst)
439 {
440  const char *end = src + srclen;
441  char *rp = dst;
442  uint64 len = 0;
443 
444  while (src < end)
445  {
446  if (src[0] != '\\')
447  *rp++ = *src++;
448  else if (src + 3 < end &&
449  (src[1] >= '0' && src[1] <= '3') &&
450  (src[2] >= '0' && src[2] <= '7') &&
451  (src[3] >= '0' && src[3] <= '7'))
452  {
453  int val;
454 
455  val = VAL(src[1]);
456  val <<= 3;
457  val += VAL(src[2]);
458  val <<= 3;
459  *rp++ = val + VAL(src[3]);
460  src += 4;
461  }
462  else if (src + 1 < end &&
463  (src[1] == '\\'))
464  {
465  *rp++ = '\\';
466  src += 2;
467  }
468  else
469  {
470  /*
471  * One backslash, not followed by ### valid octal. Should never
472  * get here, since esc_dec_len does same check.
473  */
474  ereport(ERROR,
475  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
476  errmsg("invalid input syntax for type %s", "bytea")));
477  }
478 
479  len++;
480  }
481 
482  return len;
483 }
484 
485 static uint64
486 esc_enc_len(const char *src, size_t srclen)
487 {
488  const char *end = src + srclen;
489  uint64 len = 0;
490 
491  while (src < end)
492  {
493  if (*src == '\0' || IS_HIGHBIT_SET(*src))
494  len += 4;
495  else if (*src == '\\')
496  len += 2;
497  else
498  len++;
499 
500  src++;
501  }
502 
503  return len;
504 }
505 
506 static uint64
507 esc_dec_len(const char *src, size_t srclen)
508 {
509  const char *end = src + srclen;
510  uint64 len = 0;
511 
512  while (src < end)
513  {
514  if (src[0] != '\\')
515  src++;
516  else if (src + 3 < end &&
517  (src[1] >= '0' && src[1] <= '3') &&
518  (src[2] >= '0' && src[2] <= '7') &&
519  (src[3] >= '0' && src[3] <= '7'))
520  {
521  /*
522  * backslash + valid octal
523  */
524  src += 4;
525  }
526  else if (src + 1 < end &&
527  (src[1] == '\\'))
528  {
529  /*
530  * two backslashes = backslash
531  */
532  src += 2;
533  }
534  else
535  {
536  /*
537  * one backslash, not followed by ### valid octal
538  */
539  ereport(ERROR,
540  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
541  errmsg("invalid input syntax for type %s", "bytea")));
542  }
543 
544  len++;
545  }
546  return len;
547 }
548 
/*
 * Common
 */
552 
553 static const struct
554 {
555  const char *name;
556  struct pg_encoding enc;
557 } enclist[] =
558 
559 {
560  {
561  "hex",
562  {
564  }
565  },
566  {
567  "base64",
568  {
570  }
571  },
572  {
573  "escape",
574  {
576  }
577  },
578  {
579  NULL,
580  {
581  NULL, NULL, NULL, NULL
582  }
583  }
584 };
585 
586 static const struct pg_encoding *
587 pg_find_encoding(const char *name)
588 {
589  int i;
590 
591  for (i = 0; enclist[i].name; i++)
592  if (pg_strcasecmp(enclist[i].name, name) == 0)
593  return &enclist[i].enc;
594 
595  return NULL;
596 }
uint64(* decode)(const char *data, size_t dlen, char *res)
Definition: encode.c:36
int errhint(const char *fmt,...)
Definition: elog.c:1071
#define VARDATA_ANY(PTR)
Definition: postgres.h:348
#define VARDATA(PTR)
Definition: postgres.h:302
#define PG_GETARG_DATUM(n)
Definition: fmgr.h:268
#define VARHDRSZ
Definition: c.h:561
int errcode(int sqlerrcode)
Definition: elog.c:610
static uint64 esc_dec_len(const char *src, size_t srclen)
Definition: encode.c:507
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
#define PG_RETURN_BYTEA_P(x)
Definition: fmgr.h:369
static uint64 esc_encode(const char *src, size_t srclen, char *dst)
Definition: encode.c:399
static const char _base64[]
Definition: encode.c:237
static const int8 hexlookup[128]
Definition: encode.c:148
static uint64 pg_base64_enc_len(const char *src, size_t srclen)
Definition: encode.c:369
Datum binary_encode(PG_FUNCTION_ARGS)
Definition: encode.c:46
#define PG_GETARG_TEXT_PP(n)
Definition: fmgr.h:308
static uint64 pg_base64_encode(const char *src, size_t len, char *dst)
Definition: encode.c:252
uint64(* encode_len)(const char *data, size_t dlen)
Definition: encode.c:33
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1119
#define ERROR
Definition: elog.h:43
struct pg_encoding enc
Definition: encode.c:556
#define FATAL
Definition: elog.h:52
char * c
static char * buf
Definition: pg_test_fsync.c:67
static const char hextbl[]
Definition: encode.c:146
static uint64 hex_dec_len(const char *src, size_t srclen)
Definition: encode.c:228
uint64(* decode_len)(const char *data, size_t dlen)
Definition: encode.c:34
unsigned int uint32
Definition: c.h:367
static const struct pg_encoding * pg_find_encoding(const char *name)
Definition: encode.c:587
uint64 hex_encode(const char *src, size_t len, char *dst)
Definition: encode.c:160
static const struct @26 enclist[]
#define MaxAllocSize
Definition: memutils.h:40
signed char int8
Definition: c.h:353
uint64 hex_decode(const char *src, size_t len, char *dst)
Definition: encode.c:190
#define TextDatumGetCString(d)
Definition: builtins.h:88
uintptr_t Datum
Definition: postgres.h:367
static char get_hex(char c)
Definition: encode.c:174
#define VAL(CH)
Definition: encode.c:395
static uint64 pg_base64_decode(const char *src, size_t len, char *dst)
Definition: encode.c:299
#define ereport(elevel,...)
Definition: elog.h:144
#define PG_RETURN_TEXT_P(x)
Definition: fmgr.h:370
uint64(* encode)(const char *data, size_t dlen, char *res)
Definition: encode.c:35
static uint64 esc_enc_len(const char *src, size_t srclen)
Definition: encode.c:486
static uint64 hex_enc_len(const char *src, size_t srclen)
Definition: encode.c:222
#define PG_GETARG_BYTEA_PP(n)
Definition: fmgr.h:307
const char * name
Definition: encode.c:555
#define VARSIZE_ANY_EXHDR(PTR)
Definition: postgres.h:341
void * palloc(Size size)
Definition: mcxt.c:949
int errmsg(const char *fmt,...)
Definition: elog.c:824
#define DIG(VAL)
Definition: encode.c:396
#define elog(elevel,...)
Definition: elog.h:214
int i
Definition: c.h:555
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
static uint64 esc_decode(const char *src, size_t srclen, char *dst)
Definition: encode.c:438
Datum binary_decode(PG_FUNCTION_ARGS)
Definition: encode.c:94
#define SET_VARSIZE(PTR, len)
Definition: postgres.h:329
static uint64 pg_base64_dec_len(const char *src, size_t srclen)
Definition: encode.c:376
static const int8 b64lookup[128]
Definition: encode.c:240
long val
Definition: informix.c:664