PostgreSQL Source Code  git master
encode.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * encode.c
4  * Various data encoding/decoding things.
5  *
6  * Copyright (c) 2001-2021, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  * src/backend/utils/adt/encode.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15 
16 #include <ctype.h>
17 
18 #include "common/hex.h"
19 #include "mb/pg_wchar.h"
20 #include "utils/builtins.h"
21 #include "utils/memutils.h"
22 
23 
24 /*
25  * Encoding conversion API.
26  * encode_len() and decode_len() compute the amount of space needed, while
27  * encode() and decode() perform the actual conversions. It is okay for
28  * the _len functions to return an overestimate, but not an underestimate.
29  * (Having said that, large overestimates could cause unnecessary errors,
30  * so it's better to get it right.) The conversion routines write to the
31  * buffer at *res and return the true length of their output.
32  */
34 {
35  uint64 (*encode_len) (const char *src, size_t srclen);
36  uint64 (*decode_len) (const char *src, size_t srclen);
37  uint64 (*encode) (const char *src, size_t srclen,
38  char *dst, size_t dstlen);
39  uint64 (*decode) (const char *src, size_t srclen,
40  char *dst, size_t dstlen);
41 };
42 
43 static const struct pg_encoding *pg_find_encoding(const char *name);
44 
45 /*
46  * SQL functions.
47  */
48 
49 Datum
51 {
52  bytea *data = PG_GETARG_BYTEA_PP(0);
54  text *result;
55  char *namebuf;
56  char *dataptr;
57  size_t datalen;
58  uint64 resultlen;
59  uint64 res;
60  const struct pg_encoding *enc;
61 
62  namebuf = TextDatumGetCString(name);
63 
64  enc = pg_find_encoding(namebuf);
65  if (enc == NULL)
66  ereport(ERROR,
67  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
68  errmsg("unrecognized encoding: \"%s\"", namebuf)));
69 
70  dataptr = VARDATA_ANY(data);
71  datalen = VARSIZE_ANY_EXHDR(data);
72 
73  resultlen = enc->encode_len(dataptr, datalen);
74 
75  /*
76  * resultlen possibly overflows uint32, therefore on 32-bit machines it's
77  * unsafe to rely on palloc's internal check.
78  */
79  if (resultlen > MaxAllocSize - VARHDRSZ)
80  ereport(ERROR,
81  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
82  errmsg("result of encoding conversion is too large")));
83 
84  result = palloc(VARHDRSZ + resultlen);
85 
86  res = enc->encode(dataptr, datalen, VARDATA(result), resultlen);
87 
88  SET_VARSIZE(result, VARHDRSZ + res);
89 
90  PG_RETURN_TEXT_P(result);
91 }
92 
93 Datum
95 {
96  text *data = PG_GETARG_TEXT_PP(0);
98  bytea *result;
99  char *namebuf;
100  char *dataptr;
101  size_t datalen;
102  uint64 resultlen;
103  uint64 res;
104  const struct pg_encoding *enc;
105 
106  namebuf = TextDatumGetCString(name);
107 
108  enc = pg_find_encoding(namebuf);
109  if (enc == NULL)
110  ereport(ERROR,
111  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
112  errmsg("unrecognized encoding: \"%s\"", namebuf)));
113 
114  dataptr = VARDATA_ANY(data);
115  datalen = VARSIZE_ANY_EXHDR(data);
116 
117  resultlen = enc->decode_len(dataptr, datalen);
118 
119  /*
120  * resultlen possibly overflows uint32, therefore on 32-bit machines it's
121  * unsafe to rely on palloc's internal check.
122  */
123  if (resultlen > MaxAllocSize - VARHDRSZ)
124  ereport(ERROR,
125  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
126  errmsg("result of decoding conversion is too large")));
127 
128  result = palloc(VARHDRSZ + resultlen);
129 
130  res = enc->decode(dataptr, datalen, VARDATA(result), resultlen);
131 
132  SET_VARSIZE(result, VARHDRSZ + res);
133 
134  PG_RETURN_BYTEA_P(result);
135 }
136 
137 
138 /*
139  * HEX
140  */
141 
142 /*
143  * Those two wrappers are still needed to match with the layer of
144  * src/common/.
145  */
146 static uint64
147 hex_enc_len(const char *src, size_t srclen)
148 {
149  return pg_hex_enc_len(srclen);
150 }
151 
152 static uint64
153 hex_dec_len(const char *src, size_t srclen)
154 {
155  return pg_hex_dec_len(srclen);
156 }
157 
158 /*
159  * BASE64
160  */
161 
162 static const char _base64[] =
163 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
164 
165 static const int8 b64lookup[128] = {
166  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
167  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
168  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
169  52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
170  -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
171  15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
172  -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
173  41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,
174 };
175 
176 static uint64
177 pg_base64_encode(const char *src, size_t srclen, char *dst, size_t dstlen)
178 {
179  char *p,
180  *lend = dst + 76;
181  const char *s,
182  *end = src + srclen;
183  int pos = 2;
184  uint32 buf = 0;
185 
186  s = src;
187  p = dst;
188 
189  while (s < end)
190  {
191  buf |= (unsigned char) *s << (pos << 3);
192  pos--;
193  s++;
194 
195  /* write it out */
196  if (pos < 0)
197  {
198  if ((p - dst + 4) > dstlen)
199  elog(ERROR, "overflow of destination buffer in base64 encoding");
200  *p++ = _base64[(buf >> 18) & 0x3f];
201  *p++ = _base64[(buf >> 12) & 0x3f];
202  *p++ = _base64[(buf >> 6) & 0x3f];
203  *p++ = _base64[buf & 0x3f];
204 
205  pos = 2;
206  buf = 0;
207  }
208  if (p >= lend)
209  {
210  if ((p - dst + 1) > dstlen)
211  elog(ERROR, "overflow of destination buffer in base64 encoding");
212  *p++ = '\n';
213  lend = p + 76;
214  }
215  }
216  if (pos != 2)
217  {
218  if ((p - dst + 4) > dstlen)
219  elog(ERROR, "overflow of destination buffer in base64 encoding");
220  *p++ = _base64[(buf >> 18) & 0x3f];
221  *p++ = _base64[(buf >> 12) & 0x3f];
222  *p++ = (pos == 0) ? _base64[(buf >> 6) & 0x3f] : '=';
223  *p++ = '=';
224  }
225 
226  Assert((p - dst) <= dstlen);
227  return p - dst;
228 }
229 
230 static uint64
231 pg_base64_decode(const char *src, size_t srclen, char *dst, size_t dstlen)
232 {
233  const char *srcend = src + srclen,
234  *s = src;
235  char *p = dst;
236  char c;
237  int b = 0;
238  uint32 buf = 0;
239  int pos = 0,
240  end = 0;
241 
242  while (s < srcend)
243  {
244  c = *s++;
245 
246  if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
247  continue;
248 
249  if (c == '=')
250  {
251  /* end sequence */
252  if (!end)
253  {
254  if (pos == 2)
255  end = 1;
256  else if (pos == 3)
257  end = 2;
258  else
259  ereport(ERROR,
260  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
261  errmsg("unexpected \"=\" while decoding base64 sequence")));
262  }
263  b = 0;
264  }
265  else
266  {
267  b = -1;
268  if (c > 0 && c < 127)
269  b = b64lookup[(unsigned char) c];
270  if (b < 0)
271  ereport(ERROR,
272  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
273  errmsg("invalid symbol \"%.*s\" found while decoding base64 sequence",
274  pg_mblen(s - 1), s - 1)));
275  }
276  /* add it to buffer */
277  buf = (buf << 6) + b;
278  pos++;
279  if (pos == 4)
280  {
281  if ((p - dst + 1) > dstlen)
282  elog(ERROR, "overflow of destination buffer in base64 decoding");
283  *p++ = (buf >> 16) & 255;
284  if (end == 0 || end > 1)
285  {
286  if ((p - dst + 1) > dstlen)
287  elog(ERROR, "overflow of destination buffer in base64 decoding");
288  *p++ = (buf >> 8) & 255;
289  }
290  if (end == 0 || end > 2)
291  {
292  if ((p - dst + 1) > dstlen)
293  elog(ERROR, "overflow of destination buffer in base64 decoding");
294  *p++ = buf & 255;
295  }
296  buf = 0;
297  pos = 0;
298  }
299  }
300 
301  if (pos != 0)
302  ereport(ERROR,
303  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
304  errmsg("invalid base64 end sequence"),
305  errhint("Input data is missing padding, is truncated, or is otherwise corrupted.")));
306 
307  Assert((p - dst) <= dstlen);
308  return p - dst;
309 }
310 
311 
312 static uint64
313 pg_base64_enc_len(const char *src, size_t srclen)
314 {
315  /* 3 bytes will be converted to 4, linefeed after 76 chars */
316  return ((uint64) srclen + 2) * 4 / 3 + (uint64) srclen / (76 * 3 / 4);
317 }
318 
319 static uint64
320 pg_base64_dec_len(const char *src, size_t srclen)
321 {
322  return ((uint64) srclen * 3) >> 2;
323 }
324 
325 /*
326  * Escape
327  * Minimally escape bytea to text.
328  * De-escape text to bytea.
329  *
330  * We must escape zero bytes and high-bit-set bytes to avoid generating
331  * text that might be invalid in the current encoding, or that might
332  * change to something else if passed through an encoding conversion
333  * (leading to failing to de-escape to the original bytea value).
334  * Also of course backslash itself has to be escaped.
335  *
336  * De-escaping processes \\ and any \### octal
337  */
338 
/* VAL: subtract '0' from CH, giving the numeric value of a digit character. */
339 #define VAL(CH) ((CH) - '0')
/* DIG: add '0' to VAL, giving the digit character for a small integer. */
340 #define DIG(VAL) ((VAL) + '0')
341 
342 static uint64
343 esc_encode(const char *src, size_t srclen, char *dst, size_t dstlen)
344 {
345  const char *end = src + srclen;
346  char *rp = dst;
347  uint64 len = 0;
348 
349  while (src < end)
350  {
351  unsigned char c = (unsigned char) *src;
352 
353  if (c == '\0' || IS_HIGHBIT_SET(c))
354  {
355  if ((rp - dst + 4) > dstlen)
356  elog(ERROR, "overflow of destination buffer in escape encoding");
357  rp[0] = '\\';
358  rp[1] = DIG(c >> 6);
359  rp[2] = DIG((c >> 3) & 7);
360  rp[3] = DIG(c & 7);
361  rp += 4;
362  len += 4;
363  }
364  else if (c == '\\')
365  {
366  if ((rp - dst + 2) > dstlen)
367  elog(ERROR, "overflow of destination buffer in escape encoding");
368  rp[0] = '\\';
369  rp[1] = '\\';
370  rp += 2;
371  len += 2;
372  }
373  else
374  {
375  if ((rp - dst + 1) > dstlen)
376  elog(ERROR, "overflow of destination buffer in escape encoding");
377  *rp++ = c;
378  len++;
379  }
380 
381  src++;
382  }
383 
384  Assert((rp - dst) <= dstlen);
385  return len;
386 }
387 
388 static uint64
389 esc_decode(const char *src, size_t srclen, char *dst, size_t dstlen)
390 {
391  const char *end = src + srclen;
392  char *rp = dst;
393  uint64 len = 0;
394 
395  while (src < end)
396  {
397  if (src[0] != '\\')
398  {
399  if ((rp - dst + 1) > dstlen)
400  elog(ERROR, "overflow of destination buffer in escape decoding");
401  *rp++ = *src++;
402  }
403  else if (src + 3 < end &&
404  (src[1] >= '0' && src[1] <= '3') &&
405  (src[2] >= '0' && src[2] <= '7') &&
406  (src[3] >= '0' && src[3] <= '7'))
407  {
408  int val;
409 
410  val = VAL(src[1]);
411  val <<= 3;
412  val += VAL(src[2]);
413  val <<= 3;
414  if ((rp - dst + 1) > dstlen)
415  elog(ERROR, "overflow of destination buffer in escape decoding");
416  *rp++ = val + VAL(src[3]);
417  src += 4;
418  }
419  else if (src + 1 < end &&
420  (src[1] == '\\'))
421  {
422  if ((rp - dst + 1) > dstlen)
423  elog(ERROR, "overflow of destination buffer in escape decoding");
424  *rp++ = '\\';
425  src += 2;
426  }
427  else
428  {
429  /*
430  * One backslash, not followed by ### valid octal. Should never
431  * get here, since esc_dec_len does same check.
432  */
433  ereport(ERROR,
434  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
435  errmsg("invalid input syntax for type %s", "bytea")));
436  }
437 
438  len++;
439  }
440 
441  Assert((rp - dst) <= dstlen);
442  return len;
443 }
444 
445 static uint64
446 esc_enc_len(const char *src, size_t srclen)
447 {
448  const char *end = src + srclen;
449  uint64 len = 0;
450 
451  while (src < end)
452  {
453  if (*src == '\0' || IS_HIGHBIT_SET(*src))
454  len += 4;
455  else if (*src == '\\')
456  len += 2;
457  else
458  len++;
459 
460  src++;
461  }
462 
463  return len;
464 }
465 
466 static uint64
467 esc_dec_len(const char *src, size_t srclen)
468 {
469  const char *end = src + srclen;
470  uint64 len = 0;
471 
472  while (src < end)
473  {
474  if (src[0] != '\\')
475  src++;
476  else if (src + 3 < end &&
477  (src[1] >= '0' && src[1] <= '3') &&
478  (src[2] >= '0' && src[2] <= '7') &&
479  (src[3] >= '0' && src[3] <= '7'))
480  {
481  /*
482  * backslash + valid octal
483  */
484  src += 4;
485  }
486  else if (src + 1 < end &&
487  (src[1] == '\\'))
488  {
489  /*
490  * two backslashes = backslash
491  */
492  src += 2;
493  }
494  else
495  {
496  /*
497  * one backslash, not followed by ### valid octal
498  */
499  ereport(ERROR,
500  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
501  errmsg("invalid input syntax for type %s", "bytea")));
502  }
503 
504  len++;
505  }
506  return len;
507 }
508 
509 /*
510  * Common
511  */
512 
513 static const struct
514 {
515  const char *name;
516  struct pg_encoding enc;
517 } enclist[] =
518 
519 {
520  {
521  "hex",
522  {
524  }
525  },
526  {
527  "base64",
528  {
530  }
531  },
532  {
533  "escape",
534  {
536  }
537  },
538  {
539  NULL,
540  {
541  NULL, NULL, NULL, NULL
542  }
543  }
544 };
545 
546 static const struct pg_encoding *
547 pg_find_encoding(const char *name)
548 {
549  int i;
550 
551  for (i = 0; enclist[i].name; i++)
552  if (pg_strcasecmp(enclist[i].name, name) == 0)
553  return &enclist[i].enc;
554 
555  return NULL;
556 }
int errhint(const char *fmt,...)
Definition: elog.c:1162
#define VARDATA_ANY(PTR)
Definition: postgres.h:348
#define VARDATA(PTR)
Definition: postgres.h:302
uint64 pg_hex_enc_len(size_t srclen)
Definition: hex.c:175
#define PG_GETARG_DATUM(n)
Definition: fmgr.h:268
#define VARHDRSZ
Definition: c.h:615
int errcode(int sqlerrcode)
Definition: elog.c:704
static uint64 esc_dec_len(const char *src, size_t srclen)
Definition: encode.c:467
static uint64 pg_base64_decode(const char *src, size_t srclen, char *dst, size_t dstlen)
Definition: encode.c:231
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
#define PG_RETURN_BYTEA_P(x)
Definition: fmgr.h:371
static const char _base64[]
Definition: encode.c:162
static uint64 pg_base64_enc_len(const char *src, size_t srclen)
Definition: encode.c:313
uint64 pg_hex_dec_len(size_t srclen)
Definition: hex.c:189
Datum binary_encode(PG_FUNCTION_ARGS)
Definition: encode.c:50
#define PG_GETARG_TEXT_PP(n)
Definition: fmgr.h:309
uint64(* decode_len)(const char *src, size_t srclen)
Definition: encode.c:36
uint64(* encode_len)(const char *src, size_t srclen)
Definition: encode.c:35
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1144
#define ERROR
Definition: elog.h:45
struct pg_encoding enc
Definition: encode.c:516
char * c
static char * buf
Definition: pg_test_fsync.c:68
static uint64 hex_dec_len(const char *src, size_t srclen)
Definition: encode.c:153
unsigned int uint32
Definition: c.h:429
static const struct pg_encoding * pg_find_encoding(const char *name)
Definition: encode.c:547
#define MaxAllocSize
Definition: memutils.h:40
uint64 pg_hex_encode(const char *src, size_t srclen, char *dst, size_t dstlen)
Definition: hex.c:74
signed char int8
Definition: c.h:415
uint64(* decode)(const char *src, size_t srclen, char *dst, size_t dstlen)
Definition: encode.c:39
static uint64 esc_decode(const char *src, size_t srclen, char *dst, size_t dstlen)
Definition: encode.c:389
#define TextDatumGetCString(d)
Definition: builtins.h:83
uintptr_t Datum
Definition: postgres.h:367
#define VAL(CH)
Definition: encode.c:339
#define ereport(elevel,...)
Definition: elog.h:155
static uint64 pg_base64_encode(const char *src, size_t srclen, char *dst, size_t dstlen)
Definition: encode.c:177
#define PG_RETURN_TEXT_P(x)
Definition: fmgr.h:372
#define Assert(condition)
Definition: c.h:792
static uint64 esc_enc_len(const char *src, size_t srclen)
Definition: encode.c:446
uint64 pg_hex_decode(const char *src, size_t srclen, char *dst, size_t dstlen)
Definition: hex.c:112
static uint64 hex_enc_len(const char *src, size_t srclen)
Definition: encode.c:147
#define PG_GETARG_BYTEA_PP(n)
Definition: fmgr.h:308
int pg_mblen(const char *mbstr)
Definition: mbutils.c:907
const char * name
Definition: encode.c:515
uint64(* encode)(const char *src, size_t srclen, char *dst, size_t dstlen)
Definition: encode.c:37
#define VARSIZE_ANY_EXHDR(PTR)
Definition: postgres.h:341
void * palloc(Size size)
Definition: mcxt.c:950
int errmsg(const char *fmt,...)
Definition: elog.c:915
#define DIG(VAL)
Definition: encode.c:340
#define elog(elevel,...)
Definition: elog.h:228
int i
static const struct @24 enclist[]
Definition: c.h:609
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
Datum binary_decode(PG_FUNCTION_ARGS)
Definition: encode.c:94
#define SET_VARSIZE(PTR, len)
Definition: postgres.h:329
static uint64 esc_encode(const char *src, size_t srclen, char *dst, size_t dstlen)
Definition: encode.c:343
static uint64 pg_base64_dec_len(const char *src, size_t srclen)
Definition: encode.c:320
static const int8 b64lookup[128]
Definition: encode.c:165
long val
Definition: informix.c:664