PostgreSQL Source Code  git master
mbutils.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * mbutils.c
4  * This file contains functions for encoding conversion.
5  *
6  * The string-conversion functions in this file share some API quirks.
7  * Note the following:
8  *
9  * The functions return a palloc'd, null-terminated string if conversion
10  * is required. However, if no conversion is performed, the given source
11  * string pointer is returned as-is.
12  *
13  * Although the presence of a length argument means that callers can pass
14  * non-null-terminated strings, care is required because the same string
15  * will be passed back if no conversion occurs. Such callers *must* check
16  * whether result == src and handle that case differently.
17  *
18  * If the source and destination encodings are the same, the source string
19  * is returned without any verification; it's assumed to be valid data.
20  * If that might not be the case, the caller is responsible for validating
21  * the string using a separate call to pg_verify_mbstr(). Whenever the
22  * source and destination encodings are different, the functions ensure that
23  * the result is validly encoded according to the destination encoding.
24  *
25  *
26  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
27  * Portions Copyright (c) 1994, Regents of the University of California
28  *
29  *
30  * IDENTIFICATION
31  * src/backend/utils/mb/mbutils.c
32  *
33  *-------------------------------------------------------------------------
34  */
35 #include "postgres.h"
36 
37 #include "access/xact.h"
38 #include "catalog/namespace.h"
39 #include "mb/pg_wchar.h"
40 #include "utils/builtins.h"
41 #include "utils/memutils.h"
42 #include "utils/syscache.h"
43 #include "varatt.h"
44 
45 /*
46  * We maintain a simple linked list caching the fmgr lookup info for the
47  * currently selected conversion functions, as well as any that have been
48  * selected previously in the current session. (We remember previous
49  * settings because we must be able to restore a previous setting during
50  * transaction rollback, without doing any fresh catalog accesses.)
51  *
52  * Since we'll never release this data, we just keep it in TopMemoryContext.
53  */
54 typedef struct ConvProcInfo
55 {
56  int s_encoding; /* server and client encoding IDs */
58  FmgrInfo to_server_info; /* lookup info for conversion procs */
61 
62 static List *ConvProcList = NIL; /* List of ConvProcInfo */
63 
64 /*
65  * These variables point to the currently active conversion functions,
66  * or are NULL when no conversion is needed.
67  */
68 static FmgrInfo *ToServerConvProc = NULL;
69 static FmgrInfo *ToClientConvProc = NULL;
70 
71 /*
72  * This variable stores the conversion function to convert from UTF-8
73  * to the server encoding. It's NULL if the server encoding *is* UTF-8,
74  * or if we lack a conversion function for this.
75  */
77 
78 /*
79  * These variables track the currently-selected encodings.
80  */
84 
85 /*
86  * During backend startup we can't set client encoding because we (a)
87  * can't look up the conversion functions, and (b) may not know the database
88  * encoding yet either. So SetClientEncoding() just accepts anything and
89  * remembers it for InitializeClientEncoding() to apply later.
90  */
91 static bool backend_startup_complete = false;
93 
94 
95 /* Internal functions */
96 static char *perform_default_encoding_conversion(const char *src,
97  int len, bool is_client_to_server);
98 static int cliplen(const char *str, int len, int limit);
99 
100 
101 /*
102  * Prepare for a future call to SetClientEncoding. Success should mean
103  * that SetClientEncoding is guaranteed to succeed for this encoding request.
104  *
105  * (But note that success before backend_startup_complete does not guarantee
106  * success after ...)
107  *
108  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
109  */
110 int
112 {
113  int current_server_encoding;
114  ListCell *lc;
115 
117  return -1;
118 
119  /* Can't do anything during startup, per notes above */
121  return 0;
122 
123  current_server_encoding = GetDatabaseEncoding();
124 
125  /*
126  * Check for cases that require no conversion function.
127  */
128  if (current_server_encoding == encoding ||
129  current_server_encoding == PG_SQL_ASCII ||
131  return 0;
132 
133  if (IsTransactionState())
134  {
135  /*
136  * If we're in a live transaction, it's safe to access the catalogs,
137  * so look up the functions. We repeat the lookup even if the info is
138  * already cached, so that we can react to changes in the contents of
139  * pg_conversion.
140  */
141  Oid to_server_proc,
142  to_client_proc;
143  ConvProcInfo *convinfo;
144  MemoryContext oldcontext;
145 
146  to_server_proc = FindDefaultConversionProc(encoding,
147  current_server_encoding);
148  if (!OidIsValid(to_server_proc))
149  return -1;
150  to_client_proc = FindDefaultConversionProc(current_server_encoding,
151  encoding);
152  if (!OidIsValid(to_client_proc))
153  return -1;
154 
155  /*
156  * Load the fmgr info into TopMemoryContext (could still fail here)
157  */
159  sizeof(ConvProcInfo));
160  convinfo->s_encoding = current_server_encoding;
161  convinfo->c_encoding = encoding;
162  fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
164  fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
166 
167  /* Attach new info to head of list */
169  ConvProcList = lcons(convinfo, ConvProcList);
170  MemoryContextSwitchTo(oldcontext);
171 
172  /*
173  * We cannot yet remove any older entry for the same encoding pair,
174  * since it could still be in use. SetClientEncoding will clean up.
175  */
176 
177  return 0; /* success */
178  }
179  else
180  {
181  /*
182  * If we're not in a live transaction, the only thing we can do is
183  * restore a previous setting using the cache. This covers all
184  * transaction-rollback cases. The only case it might not work for is
185  * trying to change client_encoding on the fly by editing
186  * postgresql.conf and SIGHUP'ing. Which would probably be a stupid
187  * thing to do anyway.
188  */
189  foreach(lc, ConvProcList)
190  {
191  ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
192 
193  if (oldinfo->s_encoding == current_server_encoding &&
194  oldinfo->c_encoding == encoding)
195  return 0;
196  }
197 
198  return -1; /* it's not cached, so fail */
199  }
200 }
201 
202 /*
203  * Set the active client encoding and set up the conversion-function pointers.
204  * PrepareClientEncoding should have been called previously for this encoding.
205  *
206  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
207  */
208 int
210 {
211  int current_server_encoding;
212  bool found;
213  ListCell *lc;
214 
216  return -1;
217 
218  /* Can't do anything during startup, per notes above */
220  {
222  return 0;
223  }
224 
225  current_server_encoding = GetDatabaseEncoding();
226 
227  /*
228  * Check for cases that require no conversion function.
229  */
230  if (current_server_encoding == encoding ||
231  current_server_encoding == PG_SQL_ASCII ||
233  {
235  ToServerConvProc = NULL;
236  ToClientConvProc = NULL;
237  return 0;
238  }
239 
240  /*
241  * Search the cache for the entry previously prepared by
242  * PrepareClientEncoding; if there isn't one, we lose. While at it,
243  * release any duplicate entries so that repeated Prepare/Set cycles don't
244  * leak memory.
245  */
246  found = false;
247  foreach(lc, ConvProcList)
248  {
249  ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
250 
251  if (convinfo->s_encoding == current_server_encoding &&
252  convinfo->c_encoding == encoding)
253  {
254  if (!found)
255  {
256  /* Found newest entry, so set up */
258  ToServerConvProc = &convinfo->to_server_info;
259  ToClientConvProc = &convinfo->to_client_info;
260  found = true;
261  }
262  else
263  {
264  /* Duplicate entry, release it */
266  pfree(convinfo);
267  }
268  }
269  }
270 
271  if (found)
272  return 0; /* success */
273  else
274  return -1; /* it's not cached, so fail */
275 }
276 
277 /*
278  * Initialize client encoding conversions.
279  * Called from InitPostgres() once during backend startup.
280  */
281 void
283 {
284  int current_server_encoding;
285 
288 
291  {
292  /*
293  * Oops, the requested conversion is not available. We couldn't fail
294  * before, but we can now.
295  */
296  ereport(FATAL,
297  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
298  errmsg("conversion between %s and %s is not supported",
301  }
302 
303  /*
304  * Also look up the UTF8-to-server conversion function if needed. Since
305  * the server encoding is fixed within any one backend process, we don't
306  * have to do this more than once.
307  */
308  current_server_encoding = GetDatabaseEncoding();
309  if (current_server_encoding != PG_UTF8 &&
310  current_server_encoding != PG_SQL_ASCII)
311  {
312  Oid utf8_to_server_proc;
313 
315  utf8_to_server_proc =
317  current_server_encoding);
318  /* If there's no such conversion, just leave the pointer as NULL */
319  if (OidIsValid(utf8_to_server_proc))
320  {
321  FmgrInfo *finfo;
322 
324  sizeof(FmgrInfo));
325  fmgr_info_cxt(utf8_to_server_proc, finfo,
327  /* Set Utf8ToServerConvProc only after data is fully valid */
328  Utf8ToServerConvProc = finfo;
329  }
330  }
331 }
332 
333 /*
334  * returns the current client encoding
335  */
336 int
338 {
339  return ClientEncoding->encoding;
340 }
341 
342 /*
343  * returns the current client encoding name
344  */
345 const char *
347 {
348  return ClientEncoding->name;
349 }
350 
351 /*
352  * Convert src string to another encoding (general case).
353  *
354  * See the notes about string conversion functions at the top of this file.
355  */
356 unsigned char *
357 pg_do_encoding_conversion(unsigned char *src, int len,
358  int src_encoding, int dest_encoding)
359 {
360  unsigned char *result;
361  Oid proc;
362 
363  if (len <= 0)
364  return src; /* empty string is always valid */
365 
366  if (src_encoding == dest_encoding)
367  return src; /* no conversion required, assume valid */
368 
369  if (dest_encoding == PG_SQL_ASCII)
370  return src; /* any string is valid in SQL_ASCII */
371 
372  if (src_encoding == PG_SQL_ASCII)
373  {
374  /* No conversion is possible, but we must validate the result */
375  (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
376  return src;
377  }
378 
379  if (!IsTransactionState()) /* shouldn't happen */
380  elog(ERROR, "cannot perform encoding conversion outside a transaction");
381 
382  proc = FindDefaultConversionProc(src_encoding, dest_encoding);
383  if (!OidIsValid(proc))
384  ereport(ERROR,
385  (errcode(ERRCODE_UNDEFINED_FUNCTION),
386  errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
387  pg_encoding_to_char(src_encoding),
388  pg_encoding_to_char(dest_encoding))));
389 
390  /*
391  * Allocate space for conversion result, being wary of integer overflow.
392  *
393  * len * MAX_CONVERSION_GROWTH is typically a vast overestimate of the
394  * required space, so it might exceed MaxAllocSize even though the result
395  * would actually fit. We do not want to hand back a result string that
396  * exceeds MaxAllocSize, because callers might not cope gracefully --- but
397  * if we just allocate more than that, and don't use it, that's fine.
398  */
400  ereport(ERROR,
401  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
402  errmsg("out of memory"),
403  errdetail("String of %d bytes is too long for encoding conversion.",
404  len)));
405 
406  result = (unsigned char *)
408  (Size) len * MAX_CONVERSION_GROWTH + 1);
409 
410  (void) OidFunctionCall6(proc,
411  Int32GetDatum(src_encoding),
412  Int32GetDatum(dest_encoding),
413  CStringGetDatum((char *) src),
414  CStringGetDatum((char *) result),
416  BoolGetDatum(false));
417 
418  /*
419  * If the result is large, it's worth repalloc'ing to release any extra
420  * space we asked for. The cutoff here is somewhat arbitrary, but we
421  * *must* check when len * MAX_CONVERSION_GROWTH exceeds MaxAllocSize.
422  */
423  if (len > 1000000)
424  {
425  Size resultlen = strlen((char *) result);
426 
427  if (resultlen >= MaxAllocSize)
428  ereport(ERROR,
429  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
430  errmsg("out of memory"),
431  errdetail("String of %d bytes is too long for encoding conversion.",
432  len)));
433 
434  result = (unsigned char *) repalloc(result, resultlen + 1);
435  }
436 
437  return result;
438 }
439 
440 /*
441  * Convert src string to another encoding.
442  *
443  * This function has a different API than the other conversion functions.
444  * The caller should've looked up the conversion function using
445  * FindDefaultConversionProc(). Unlike the other functions, the converted
446  * result is not palloc'd. It is written to the caller-supplied buffer
447  * instead.
448  *
449  * src_encoding - encoding to convert from
450  * dest_encoding - encoding to convert to
451  * src, srclen - input buffer and its length in bytes
452  * dest, destlen - destination buffer and its size in bytes
453  *
454  * The output is null-terminated.
455  *
456  * If destlen < srclen * MAX_CONVERSION_GROWTH + 1, the converted output
457  * wouldn't necessarily fit in the output buffer, and the function will not
458  * convert the whole input.
459  *
460  * TODO: The conversion function interface is not great. Firstly, it
461  * would be nice to pass through the destination buffer size to the
462  * conversion function, so that if you pass a shorter destination buffer, it
463  * could still continue to fill up the whole buffer. Currently, we have to
464  * assume worst case expansion and stop the conversion short, even if there
465  * is in fact space left in the destination buffer. Secondly, it would be
466  * nice to return the number of bytes written to the caller, to avoid a call
467  * to strlen().
468  */
469 int
471  int src_encoding,
472  int dest_encoding,
473  unsigned char *src, int srclen,
474  unsigned char *dest, int destlen,
475  bool noError)
476 {
477  Datum result;
478 
479  /*
480  * If the destination buffer is not large enough to hold the result in the
481  * worst case, limit the input size passed to the conversion function.
482  */
483  if ((Size) srclen >= ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH))
484  srclen = ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH);
485 
486  result = OidFunctionCall6(proc,
487  Int32GetDatum(src_encoding),
488  Int32GetDatum(dest_encoding),
489  CStringGetDatum((char *) src),
490  CStringGetDatum((char *) dest),
491  Int32GetDatum(srclen),
492  BoolGetDatum(noError));
493  return DatumGetInt32(result);
494 }
495 
496 /*
497  * Convert string to encoding encoding_name. The source
498  * encoding is the DB encoding.
499  *
500  * BYTEA convert_to(TEXT string, NAME encoding_name) */
501 Datum
503 {
504  Datum string = PG_GETARG_DATUM(0);
505  Datum dest_encoding_name = PG_GETARG_DATUM(1);
506  Datum src_encoding_name = DirectFunctionCall1(namein,
508  Datum result;
509 
510  /*
511  * pg_convert expects a bytea as its first argument. We're passing it a
512  * text argument here, relying on the fact that they are both in fact
513  * varlena types, and thus structurally identical.
514  */
515  result = DirectFunctionCall3(pg_convert, string,
516  src_encoding_name, dest_encoding_name);
517 
518  PG_RETURN_DATUM(result);
519 }
520 
521 /*
522  * Convert string from encoding encoding_name. The destination
523  * encoding is the DB encoding.
524  *
525  * TEXT convert_from(BYTEA string, NAME encoding_name) */
526 Datum
528 {
529  Datum string = PG_GETARG_DATUM(0);
530  Datum src_encoding_name = PG_GETARG_DATUM(1);
531  Datum dest_encoding_name = DirectFunctionCall1(namein,
533  Datum result;
534 
535  result = DirectFunctionCall3(pg_convert, string,
536  src_encoding_name, dest_encoding_name);
537 
538  /*
539  * pg_convert returns a bytea, which we in turn return as text, relying on
540  * the fact that they are both in fact varlena types, and thus
541  * structurally identical. Although not all bytea values are valid text,
542  * in this case it will be because we've told pg_convert to return one
543  * that is valid as text in the current database encoding.
544  */
545  PG_RETURN_DATUM(result);
546 }
547 
548 /*
549  * Convert string between two arbitrary encodings.
550  *
551  * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
552  */
553 Datum
555 {
556  bytea *string = PG_GETARG_BYTEA_PP(0);
557  char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
558  int src_encoding = pg_char_to_encoding(src_encoding_name);
559  char *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
560  int dest_encoding = pg_char_to_encoding(dest_encoding_name);
561  const char *src_str;
562  char *dest_str;
563  bytea *retval;
564  int len;
565 
566  if (src_encoding < 0)
567  ereport(ERROR,
568  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
569  errmsg("invalid source encoding name \"%s\"",
570  src_encoding_name)));
571  if (dest_encoding < 0)
572  ereport(ERROR,
573  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
574  errmsg("invalid destination encoding name \"%s\"",
575  dest_encoding_name)));
576 
577  /* make sure that source string is valid */
578  len = VARSIZE_ANY_EXHDR(string);
579  src_str = VARDATA_ANY(string);
580  (void) pg_verify_mbstr(src_encoding, src_str, len, false);
581 
582  /* perform conversion */
583  dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str),
584  len,
585  src_encoding,
586  dest_encoding);
587 
588  /* update len if conversion actually happened */
589  if (dest_str != src_str)
590  len = strlen(dest_str);
591 
592  /*
593  * build bytea data type structure.
594  */
595  retval = (bytea *) palloc(len + VARHDRSZ);
596  SET_VARSIZE(retval, len + VARHDRSZ);
597  memcpy(VARDATA(retval), dest_str, len);
598 
599  if (dest_str != src_str)
600  pfree(dest_str);
601 
602  /* free memory if allocated by the toaster */
603  PG_FREE_IF_COPY(string, 0);
604 
605  PG_RETURN_BYTEA_P(retval);
606 }
607 
608 /*
609  * get the length of the string considered as text in the specified
610  * encoding. Raises an error if the data is not valid in that
611  * encoding.
612  *
613  * INT4 length (BYTEA string, NAME src_encoding_name)
614  */
615 Datum
617 {
618  bytea *string = PG_GETARG_BYTEA_PP(0);
619  char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
620  int src_encoding = pg_char_to_encoding(src_encoding_name);
621  const char *src_str;
622  int len;
623  int retval;
624 
625  if (src_encoding < 0)
626  ereport(ERROR,
627  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
628  errmsg("invalid encoding name \"%s\"",
629  src_encoding_name)));
630 
631  len = VARSIZE_ANY_EXHDR(string);
632  src_str = VARDATA_ANY(string);
633 
634  retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
635 
636  PG_RETURN_INT32(retval);
637 }
638 
639 /*
640  * Get maximum multibyte character length in the specified encoding.
641  *
642  * Note encoding is specified numerically, not by name as above.
643  */
644 Datum
646 {
647  int encoding = PG_GETARG_INT32(0);
648 
651  else
652  PG_RETURN_NULL();
653 }
654 
655 /*
656  * Convert client encoding to server encoding.
657  *
658  * See the notes about string conversion functions at the top of this file.
659  */
660 char *
661 pg_client_to_server(const char *s, int len)
662 {
664 }
665 
666 /*
667  * Convert any encoding to server encoding.
668  *
669  * See the notes about string conversion functions at the top of this file.
670  *
671  * Unlike the other string conversion functions, this will apply validation
672  * even if encoding == DatabaseEncoding->encoding. This is because this is
673  * used to process data coming in from outside the database, and we never
674  * want to just assume validity.
675  */
676 char *
677 pg_any_to_server(const char *s, int len, int encoding)
678 {
679  if (len <= 0)
680  return unconstify(char *, s); /* empty string is always valid */
681 
684  {
685  /*
686  * No conversion is needed, but we must still validate the data.
687  */
688  (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
689  return unconstify(char *, s);
690  }
691 
693  {
694  /*
695  * No conversion is possible, but we must still validate the data,
696  * because the client-side code might have done string escaping using
697  * the selected client_encoding. If the client encoding is ASCII-safe
698  * then we just do a straight validation under that encoding. For an
699  * ASCII-unsafe encoding we have a problem: we dare not pass such data
700  * to the parser but we have no way to convert it. We compromise by
701  * rejecting the data if it contains any non-ASCII characters.
702  */
704  (void) pg_verify_mbstr(encoding, s, len, false);
705  else
706  {
707  int i;
708 
709  for (i = 0; i < len; i++)
710  {
711  if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
712  ereport(ERROR,
713  (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
714  errmsg("invalid byte value for encoding \"%s\": 0x%02x",
716  (unsigned char) s[i])));
717  }
718  }
719  return unconstify(char *, s);
720  }
721 
722  /* Fast path if we can use cached conversion function */
724  return perform_default_encoding_conversion(s, len, true);
725 
726  /* General case ... will not work outside transactions */
727  return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
728  len,
729  encoding,
731 }
732 
733 /*
734  * Convert server encoding to client encoding.
735  *
736  * See the notes about string conversion functions at the top of this file.
737  */
738 char *
739 pg_server_to_client(const char *s, int len)
740 {
742 }
743 
744 /*
745  * Convert server encoding to any encoding.
746  *
747  * See the notes about string conversion functions at the top of this file.
748  */
749 char *
750 pg_server_to_any(const char *s, int len, int encoding)
751 {
752  if (len <= 0)
753  return unconstify(char *, s); /* empty string is always valid */
754 
757  return unconstify(char *, s); /* assume data is valid */
758 
760  {
761  /* No conversion is possible, but we must validate the result */
762  (void) pg_verify_mbstr(encoding, s, len, false);
763  return unconstify(char *, s);
764  }
765 
766  /* Fast path if we can use cached conversion function */
768  return perform_default_encoding_conversion(s, len, false);
769 
770  /* General case ... will not work outside transactions */
771  return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
772  len,
774  encoding);
775 }
776 
777 /*
778  * Perform default encoding conversion using cached FmgrInfo. Since
779  * this function does not access database at all, it is safe to call
780  * outside transactions. If the conversion has not been set up by
781  * SetClientEncoding(), no conversion is performed.
782  */
783 static char *
785  bool is_client_to_server)
786 {
787  char *result;
788  int src_encoding,
789  dest_encoding;
790  FmgrInfo *flinfo;
791 
792  if (is_client_to_server)
793  {
794  src_encoding = ClientEncoding->encoding;
795  dest_encoding = DatabaseEncoding->encoding;
796  flinfo = ToServerConvProc;
797  }
798  else
799  {
800  src_encoding = DatabaseEncoding->encoding;
801  dest_encoding = ClientEncoding->encoding;
802  flinfo = ToClientConvProc;
803  }
804 
805  if (flinfo == NULL)
806  return unconstify(char *, src);
807 
808  /*
809  * Allocate space for conversion result, being wary of integer overflow.
810  * See comments in pg_do_encoding_conversion.
811  */
813  ereport(ERROR,
814  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
815  errmsg("out of memory"),
816  errdetail("String of %d bytes is too long for encoding conversion.",
817  len)));
818 
819  result = (char *)
821  (Size) len * MAX_CONVERSION_GROWTH + 1);
822 
823  FunctionCall6(flinfo,
824  Int32GetDatum(src_encoding),
825  Int32GetDatum(dest_encoding),
826  CStringGetDatum(src),
827  CStringGetDatum(result),
829  BoolGetDatum(false));
830 
831  /*
832  * Release extra space if there might be a lot --- see comments in
833  * pg_do_encoding_conversion.
834  */
835  if (len > 1000000)
836  {
837  Size resultlen = strlen(result);
838 
839  if (resultlen >= MaxAllocSize)
840  ereport(ERROR,
841  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
842  errmsg("out of memory"),
843  errdetail("String of %d bytes is too long for encoding conversion.",
844  len)));
845 
846  result = (char *) repalloc(result, resultlen + 1);
847  }
848 
849  return result;
850 }
851 
852 /*
853  * Convert a single Unicode code point into a string in the server encoding.
854  *
855  * The code point given by "c" is converted and stored at *s, which must
856  * have at least MAX_UNICODE_EQUIVALENT_STRING+1 bytes available.
857  * The output will have a trailing '\0'. Throws error if the conversion
858  * cannot be performed.
859  *
860  * Note that this relies on having previously looked up any required
861  * conversion function. That's partly for speed but mostly because the parser
862  * may call this outside any transaction, or in an aborted transaction.
863  */
864 void
865 pg_unicode_to_server(pg_wchar c, unsigned char *s)
866 {
867  unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
868  int c_as_utf8_len;
869  int server_encoding;
870 
871  /*
872  * Complain if invalid Unicode code point. The choice of errcode here is
873  * debatable, but really our caller should have checked this anyway.
874  */
876  ereport(ERROR,
877  (errcode(ERRCODE_SYNTAX_ERROR),
878  errmsg("invalid Unicode code point")));
879 
880  /* Otherwise, if it's in ASCII range, conversion is trivial */
881  if (c <= 0x7F)
882  {
883  s[0] = (unsigned char) c;
884  s[1] = '\0';
885  return;
886  }
887 
888  /* If the server encoding is UTF-8, we just need to reformat the code */
889  server_encoding = GetDatabaseEncoding();
890  if (server_encoding == PG_UTF8)
891  {
892  unicode_to_utf8(c, s);
893  s[pg_utf_mblen(s)] = '\0';
894  return;
895  }
896 
897  /* For all other cases, we must have a conversion function available */
898  if (Utf8ToServerConvProc == NULL)
899  ereport(ERROR,
900  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
901  errmsg("conversion between %s and %s is not supported",
904 
905  /* Construct UTF-8 source string */
906  unicode_to_utf8(c, c_as_utf8);
907  c_as_utf8_len = pg_utf_mblen(c_as_utf8);
908  c_as_utf8[c_as_utf8_len] = '\0';
909 
910  /* Convert, or throw error if we can't */
913  Int32GetDatum(server_encoding),
914  CStringGetDatum((char *) c_as_utf8),
915  CStringGetDatum((char *) s),
916  Int32GetDatum(c_as_utf8_len),
917  BoolGetDatum(false));
918 }
919 
920 /*
921  * Convert a single Unicode code point into a string in the server encoding.
922  *
923  * Same as pg_unicode_to_server(), except that we don't throw errors,
924  * but simply return false on conversion failure.
925  */
926 bool
928 {
929  unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
930  int c_as_utf8_len;
931  int converted_len;
932  int server_encoding;
933 
934  /* Fail if invalid Unicode code point */
936  return false;
937 
938  /* Otherwise, if it's in ASCII range, conversion is trivial */
939  if (c <= 0x7F)
940  {
941  s[0] = (unsigned char) c;
942  s[1] = '\0';
943  return true;
944  }
945 
946  /* If the server encoding is UTF-8, we just need to reformat the code */
947  server_encoding = GetDatabaseEncoding();
948  if (server_encoding == PG_UTF8)
949  {
950  unicode_to_utf8(c, s);
951  s[pg_utf_mblen(s)] = '\0';
952  return true;
953  }
954 
955  /* For all other cases, we must have a conversion function available */
956  if (Utf8ToServerConvProc == NULL)
957  return false;
958 
959  /* Construct UTF-8 source string */
960  unicode_to_utf8(c, c_as_utf8);
961  c_as_utf8_len = pg_utf_mblen(c_as_utf8);
962  c_as_utf8[c_as_utf8_len] = '\0';
963 
964  /* Convert, but without throwing error if we can't */
967  Int32GetDatum(server_encoding),
968  CStringGetDatum((char *) c_as_utf8),
969  CStringGetDatum((char *) s),
970  Int32GetDatum(c_as_utf8_len),
971  BoolGetDatum(true)));
972 
973  /* Conversion was successful iff it consumed the whole input */
974  return (converted_len == c_as_utf8_len);
975 }
976 
977 
978 /* convert a multibyte string to a wchar */
979 int
980 pg_mb2wchar(const char *from, pg_wchar *to)
981 {
982  return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, strlen(from));
983 }
984 
985 /* convert a multibyte string to a wchar with a limited length */
986 int
987 pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
988 {
989  return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
990 }
991 
992 /* same, with any encoding */
993 int
995  const char *from, pg_wchar *to, int len)
996 {
997  return pg_wchar_table[encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
998 }
999 
1000 /* convert a wchar string to a multibyte */
1001 int
1002 pg_wchar2mb(const pg_wchar *from, char *to)
1003 {
1004  return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, pg_wchar_strlen(from));
1005 }
1006 
1007 /* convert a wchar string to a multibyte with a limited length */
1008 int
1009 pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
1010 {
1011  return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
1012 }
1013 
1014 /* same, with any encoding */
1015 int
1017  const pg_wchar *from, char *to, int len)
1018 {
1019  return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
1020 }
1021 
1022 /* returns the byte length of a multibyte character */
1023 int
1024 pg_mblen(const char *mbstr)
1025 {
1026  return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
1027 }
1028 
1029 /* returns the display length of a multibyte character */
1030 int
1031 pg_dsplen(const char *mbstr)
1032 {
1033  return pg_wchar_table[DatabaseEncoding->encoding].dsplen((const unsigned char *) mbstr);
1034 }
1035 
1036 /* returns the length (counted in wchars) of a multibyte string */
1037 int
1038 pg_mbstrlen(const char *mbstr)
1039 {
1040  int len = 0;
1041 
1042  /* optimization for single byte encoding */
1044  return strlen(mbstr);
1045 
1046  while (*mbstr)
1047  {
1048  mbstr += pg_mblen(mbstr);
1049  len++;
1050  }
1051  return len;
1052 }
1053 
1054 /* returns the length (counted in wchars) of a multibyte string
1055  * (not necessarily NUL-terminated)
1056  */
1057 int
1058 pg_mbstrlen_with_len(const char *mbstr, int limit)
1059 {
1060  int len = 0;
1061 
1062  /* optimization for single byte encoding */
1064  return limit;
1065 
1066  while (limit > 0 && *mbstr)
1067  {
1068  int l = pg_mblen(mbstr);
1069 
1070  limit -= l;
1071  mbstr += l;
1072  len++;
1073  }
1074  return len;
1075 }
1076 
1077 /*
1078  * Returns the byte length of the longest prefix of a multibyte string
1079  * (not necessarily NUL-terminated) that is no longer than limit bytes.
1080  * The prefix always ends on a character boundary; a multibyte
1081  * character is never split.
1082  */
1083 int
1084 pg_mbcliplen(const char *mbstr, int len, int limit)
1085 {
1087  len, limit);
1088 }
1089 
1090 /*
1091  * pg_mbcliplen with specified encoding
1092  */
1093 int
1094 pg_encoding_mbcliplen(int encoding, const char *mbstr,
1095  int len, int limit)
1096 {
1097  mblen_converter mblen_fn;
1098  int clen = 0;
1099  int l;
1100 
1101  /* optimization for single byte encoding */
1102  if (pg_encoding_max_length(encoding) == 1)
1103  return cliplen(mbstr, len, limit);
1104 
1105  mblen_fn = pg_wchar_table[encoding].mblen;
1106 
1107  while (len > 0 && *mbstr)
1108  {
1109  l = (*mblen_fn) ((const unsigned char *) mbstr);
1110  if ((clen + l) > limit)
1111  break;
1112  clen += l;
1113  if (clen == limit)
1114  break;
1115  len -= l;
1116  mbstr += l;
1117  }
1118  return clen;
1119 }
1120 
/*
 * Similar to pg_mbcliplen except the limit parameter specifies the
 * character length, not the byte length.
 *
 * Returns the number of bytes occupied by at most "limit" leading characters
 * of mbstr, examining no more than len bytes and stopping at a zero byte.
 */
int
pg_mbcharcliplen(const char *mbstr, int len, int limit)
{
	int			clen = 0;
	int			nch = 0;
	int			l;

	/*
	 * Optimization for single byte encoding: characters and bytes coincide,
	 * so the byte-oriented helper gives the same answer.
	 */
	if (pg_database_encoding_max_length() == 1)
		return cliplen(mbstr, len, limit);

	while (len > 0 && *mbstr)
	{
		l = pg_mblen(mbstr);
		nch++;
		/* the character just measured would exceed the limit */
		if (nch > limit)
			break;
		clen += l;
		len -= l;
		mbstr += l;
	}
	return clen;
}
1148 
/*
 * mbcliplen for any single-byte encoding: count bytes of str up to the
 * smaller of len and limit, stopping early at a zero byte.
 */
static int
cliplen(const char *str, int len, int limit)
{
	int			maxlen = (len < limit) ? len : limit;
	int			n;

	for (n = 0; n < maxlen; n++)
	{
		if (str[n] == '\0')
			break;
	}
	return n;
}
1160 
1161 void
1163 {
1165  elog(ERROR, "invalid database encoding: %d", encoding);
1166 
1169 }
1170 
1171 void
1173 {
1174  /* Some calls happen before we can elog()! */
1176 
1179 }
1180 
1181 #ifdef ENABLE_NLS
1182 /*
1183  * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
1184  * codeset. Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
1185  * fail for gettext-internal causes like out-of-memory.
1186  */
1187 static bool
1188 raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
1189 {
1190  bool elog_ok = (CurrentMemoryContext != NULL);
1191  int i;
1192 
1193  for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++)
1194  {
1196  {
1197  if (bind_textdomain_codeset(domainname,
1198  pg_enc2gettext_tbl[i].name) != NULL)
1199  return true;
1200 
1201  if (elog_ok)
1202  elog(LOG, "bind_textdomain_codeset failed");
1203  else
1204  write_stderr("bind_textdomain_codeset failed");
1205 
1206  break;
1207  }
1208  }
1209 
1210  return false;
1211 }
1212 
1213 /*
1214  * Bind a gettext message domain to the codeset corresponding to the database
1215  * encoding. For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
1216  * Return the MessageEncoding implied by the new settings.
1217  *
1218  * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
1219  * When that matches the database encoding, we don't need to do anything. In
1220  * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
1221  * database encoding, except for the C locale. (On Windows, we also permit a
1222  * discrepancy under the UTF8 encoding.) For the C locale, explicitly bind
1223  * gettext to the right codeset.
1224  *
1225  * On Windows, gettext defaults to the Windows ANSI code page. This is a
1226  * convenient departure for software that passes the strings to Windows ANSI
1227  * APIs, but we don't do that. Compel gettext to use database encoding or,
1228  * failing that, the LC_CTYPE encoding as it would on other platforms.
1229  *
1230  * This function is called before elog() and palloc() are usable.
1231  */
1232 int
1233 pg_bind_textdomain_codeset(const char *domainname)
1234 {
1235  bool elog_ok = (CurrentMemoryContext != NULL);
1236  int encoding = GetDatabaseEncoding();
1237  int new_msgenc;
1238 
1239 #ifndef WIN32
1240  const char *ctype = setlocale(LC_CTYPE, NULL);
1241 
1242  if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
1243 #endif
1244  if (encoding != PG_SQL_ASCII &&
1245  raw_pg_bind_textdomain_codeset(domainname, encoding))
1246  return encoding;
1247 
1248  new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
1249  if (new_msgenc < 0)
1250  new_msgenc = PG_SQL_ASCII;
1251 
1252 #ifdef WIN32
1253  if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
1254  /* On failure, the old message encoding remains valid. */
1255  return GetMessageEncoding();
1256 #endif
1257 
1258  return new_msgenc;
1259 }
1260 #endif
1261 
1262 /*
1263  * The database encoding, also called the server encoding, represents the
1264  * encoding of data stored in text-like data types. Affected types include
1265  * cstring, text, varchar, name, xml, and json.
1266  */
1267 int
1269 {
1270  return DatabaseEncoding->encoding;
1271 }
1272 
1273 const char *
1275 {
1276  return DatabaseEncoding->name;
1277 }
1278 
1279 Datum
1281 {
1283 }
1284 
1285 Datum
1287 {
1289 }
1290 
1291 Datum
1293 {
1294  Name s = PG_GETARG_NAME(0);
1295 
1297 }
1298 
1299 Datum
1301 {
1303  const char *encoding_name = pg_encoding_to_char(encoding);
1304 
1305  return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
1306 }
1307 
1308 /*
1309  * gettext() returns messages in this encoding. This often matches the
1310  * database encoding, but it differs for SQL_ASCII databases, for processes
1311  * not attached to a database, and under a database encoding lacking iconv
1312  * support (MULE_INTERNAL).
1313  */
1314 int
1316 {
1317  return MessageEncoding->encoding;
1318 }
1319 
1320 
1321 /*
1322  * Generic character incrementer function.
1323  *
1324  * Not knowing anything about the properties of the encoding in use, we just
1325  * keep incrementing the last byte until we get a validly-encoded result,
1326  * or we run out of values to try. We don't bother to try incrementing
1327  * higher-order bytes, so there's no growth in runtime for wider characters.
1328  * (If we did try to do that, we'd need to consider the likelihood that 255
1329  * is not a valid final byte in the encoding.)
1330  */
1331 static bool
1332 pg_generic_charinc(unsigned char *charptr, int len)
1333 {
1334  unsigned char *lastbyte = charptr + len - 1;
1335  mbchar_verifier mbverify;
1336 
1337  /* We can just invoke the character verifier directly. */
1339 
1340  while (*lastbyte < (unsigned char) 255)
1341  {
1342  (*lastbyte)++;
1343  if ((*mbverify) (charptr, len) == len)
1344  return true;
1345  }
1346 
1347  return false;
1348 }
1349 
/*
 * UTF-8 character incrementer function.
 *
 * A one-byte character below 0x7F is simply bumped.  In a multibyte
 * character, continuation bytes range over 0x80..0xBF and the lead byte over
 * 0xC0..0xF4.  We bump the rightmost byte that is still below its maximum
 * and leave the others alone; if no byte can be bumped we fail.  The second
 * byte has a tighter ceiling after lead bytes 0xED (to stay out of the
 * surrogate range) and 0xF4 (to stay within the Unicode code space).
 *
 * Lower-order bytes are deliberately not reset to their minimums, because
 * we can't afford an exhaustive search (see make_greater_string).
 */
static bool
pg_utf8_increment(unsigned char *charptr, int length)
{
	unsigned char lead;

	/* only lengths 1..4 are handled; reject lengths 5 and 6 for now */
	if (length < 1 || length > 4)
		return false;

	/* fourth byte, if present */
	if (length >= 4 && charptr[3] < 0xBF)
	{
		charptr[3]++;
		return true;
	}

	/* third byte, if present */
	if (length >= 3 && charptr[2] < 0xBF)
	{
		charptr[2]++;
		return true;
	}

	/* second byte: its ceiling depends on the lead byte */
	if (length >= 2)
	{
		unsigned char ceiling;

		lead = charptr[0];
		if (lead == 0xED)
			ceiling = 0x9F;
		else if (lead == 0xF4)
			ceiling = 0x8F;
		else
			ceiling = 0xBF;

		if (charptr[1] < ceiling)
		{
			charptr[1]++;
			return true;
		}
	}

	/* lead byte: refuse to step past the last value of each length class */
	lead = charptr[0];
	if (lead == 0x7F || lead == 0xDF || lead == 0xEF || lead == 0xF4)
		return false;

	charptr[0]++;
	return true;
}
1422 
1423 /*
1424  * EUC-JP character incrementer function.
1425  *
1426  * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
1427  * representing JIS X 0201 characters with the second byte ranging between
1428  * 0xa1 and 0xdf. We just increment the last byte if it's less than 0xdf,
1429  * and otherwise rewrite the whole sequence to 0xa1 0xa1.
1430  *
1431  * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
1432  * in which the last two bytes range between 0xa1 and 0xfe. The last byte
1433  * is incremented if possible, otherwise the second-to-last byte.
1434  *
1435  * If the sequence starts with a value other than the above and its MSB
1436  * is set, it must be a two-byte sequence representing JIS X 0208 characters
1437  * with both bytes ranging between 0xa1 and 0xfe. The last byte is
1438  * incremented if possible, otherwise the second-to-last byte.
1439  *
1440  * Otherwise, the sequence is a single-byte ASCII character. It is
1441  * incremented up to 0x7f.
1442  */
1443 static bool
1444 pg_eucjp_increment(unsigned char *charptr, int length)
1445 {
1446  unsigned char c1,
1447  c2;
1448  int i;
1449 
1450  c1 = *charptr;
1451 
1452  switch (c1)
1453  {
1454  case SS2: /* JIS X 0201 */
1455  if (length != 2)
1456  return false;
1457 
1458  c2 = charptr[1];
1459 
1460  if (c2 >= 0xdf)
1461  charptr[0] = charptr[1] = 0xa1;
1462  else if (c2 < 0xa1)
1463  charptr[1] = 0xa1;
1464  else
1465  charptr[1]++;
1466  break;
1467 
1468  case SS3: /* JIS X 0212 */
1469  if (length != 3)
1470  return false;
1471 
1472  for (i = 2; i > 0; i--)
1473  {
1474  c2 = charptr[i];
1475  if (c2 < 0xa1)
1476  {
1477  charptr[i] = 0xa1;
1478  return true;
1479  }
1480  else if (c2 < 0xfe)
1481  {
1482  charptr[i]++;
1483  return true;
1484  }
1485  }
1486 
1487  /* Out of 3-byte code region */
1488  return false;
1489 
1490  default:
1491  if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1492  {
1493  if (length != 2)
1494  return false;
1495 
1496  for (i = 1; i >= 0; i--)
1497  {
1498  c2 = charptr[i];
1499  if (c2 < 0xa1)
1500  {
1501  charptr[i] = 0xa1;
1502  return true;
1503  }
1504  else if (c2 < 0xfe)
1505  {
1506  charptr[i]++;
1507  return true;
1508  }
1509  }
1510 
1511  /* Out of 2 byte code region */
1512  return false;
1513  }
1514  else
1515  { /* ASCII, single byte */
1516  if (c1 > 0x7e)
1517  return false;
1518  (*charptr)++;
1519  }
1520  break;
1521  }
1522 
1523  return true;
1524 }
1525 
1526 /*
1527  * get the character incrementer for the encoding for the current database
1528  */
1531 {
1532  /*
1533  * Eventually it might be best to add a field to pg_wchar_table[], but for
1534  * now we just use a switch.
1535  */
1536  switch (GetDatabaseEncoding())
1537  {
1538  case PG_UTF8:
1539  return pg_utf8_increment;
1540 
1541  case PG_EUC_JP:
1542  return pg_eucjp_increment;
1543 
1544  default:
1545  return pg_generic_charinc;
1546  }
1547 }
1548 
1549 /*
1550  * fetch maximum length of the encoding for the current database
1551  */
1552 int
1554 {
1556 }
1557 
/*
 * Check that mbstr (len bytes) is validly encoded in the current database
 * encoding.  Apart from supplying the encoding, this behaves exactly like
 * pg_verify_mbstr().
 */
bool
pg_verifymbstr(const char *mbstr, int len, bool noError)
{
	int			dbenc = GetDatabaseEncoding();

	return pg_verify_mbstr(dbenc, mbstr, len, noError);
}
1567 
1568 /*
1569  * Verify mbstr to make sure that it is validly encoded in the specified
1570  * encoding.
1571  */
1572 bool
1573 pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
1574 {
1575  int oklen;
1576 
1578 
1579  oklen = pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len);
1580  if (oklen != len)
1581  {
1582  if (noError)
1583  return false;
1584  report_invalid_encoding(encoding, mbstr + oklen, len - oklen);
1585  }
1586  return true;
1587 }
1588 
1589 /*
1590  * Verify mbstr to make sure that it is validly encoded in the specified
1591  * encoding.
1592  *
1593  * mbstr is not necessarily zero terminated; length of mbstr is
1594  * specified by len.
1595  *
1596  * If OK, return length of string in the encoding.
1597  * If a problem is found, return -1 when noError is
1598  * true; when noError is false, ereport() a descriptive message.
1599  *
1600  * Note: We cannot use the faster encoding-specific mbverifystr() function
1601  * here, because we need to count the number of characters in the string.
1602  */
1603 int
1604 pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
1605 {
1606  mbchar_verifier mbverifychar;
1607  int mb_len;
1608 
1610 
1611  /*
1612  * In single-byte encodings, we need only reject nulls (\0).
1613  */
1614  if (pg_encoding_max_length(encoding) <= 1)
1615  {
1616  const char *nullpos = memchr(mbstr, 0, len);
1617 
1618  if (nullpos == NULL)
1619  return len;
1620  if (noError)
1621  return -1;
1622  report_invalid_encoding(encoding, nullpos, 1);
1623  }
1624 
1625  /* fetch function pointer just once */
1626  mbverifychar = pg_wchar_table[encoding].mbverifychar;
1627 
1628  mb_len = 0;
1629 
1630  while (len > 0)
1631  {
1632  int l;
1633 
1634  /* fast path for ASCII-subset characters */
1635  if (!IS_HIGHBIT_SET(*mbstr))
1636  {
1637  if (*mbstr != '\0')
1638  {
1639  mb_len++;
1640  mbstr++;
1641  len--;
1642  continue;
1643  }
1644  if (noError)
1645  return -1;
1647  }
1648 
1649  l = (*mbverifychar) ((const unsigned char *) mbstr, len);
1650 
1651  if (l < 0)
1652  {
1653  if (noError)
1654  return -1;
1656  }
1657 
1658  mbstr += l;
1659  len -= l;
1660  mb_len++;
1661  }
1662  return mb_len;
1663 }
1664 
1665 /*
1666  * check_encoding_conversion_args: check arguments of a conversion function
1667  *
1668  * "expected" arguments can be either an encoding ID or -1 to indicate that
1669  * the caller will check whether it accepts the ID.
1670  *
1671  * Note: the errors here are not really user-facing, so elog instead of
1672  * ereport seems sufficient. Also, we trust that the "expected" encoding
1673  * arguments are valid encoding IDs, but we don't trust the actuals.
1674  */
1675 void
1677  int dest_encoding,
1678  int len,
1679  int expected_src_encoding,
1680  int expected_dest_encoding)
1681 {
1682  if (!PG_VALID_ENCODING(src_encoding))
1683  elog(ERROR, "invalid source encoding ID: %d", src_encoding);
1684  if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
1685  elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
1686  pg_enc2name_tbl[expected_src_encoding].name,
1687  pg_enc2name_tbl[src_encoding].name);
1688  if (!PG_VALID_ENCODING(dest_encoding))
1689  elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
1690  if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
1691  elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
1692  pg_enc2name_tbl[expected_dest_encoding].name,
1693  pg_enc2name_tbl[dest_encoding].name);
1694  if (len < 0)
1695  elog(ERROR, "encoding conversion length must not be negative");
1696 }
1697 
1698 /*
1699  * report_invalid_encoding: complain about invalid multibyte character
1700  *
1701  * note: len is remaining length of string, not length of character;
1702  * len must be greater than zero, as we always examine the first byte.
1703  */
1704 void
1705 report_invalid_encoding(int encoding, const char *mbstr, int len)
1706 {
1707  int l = pg_encoding_mblen(encoding, mbstr);
1708  char buf[8 * 5 + 1];
1709  char *p = buf;
1710  int j,
1711  jlimit;
1712 
1713  jlimit = Min(l, len);
1714  jlimit = Min(jlimit, 8); /* prevent buffer overrun */
1715 
1716  for (j = 0; j < jlimit; j++)
1717  {
1718  p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
1719  if (j < jlimit - 1)
1720  p += sprintf(p, " ");
1721  }
1722 
1723  ereport(ERROR,
1724  (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1725  errmsg("invalid byte sequence for encoding \"%s\": %s",
1727  buf)));
1728 }
1729 
1730 /*
1731  * report_untranslatable_char: complain about untranslatable character
1732  *
1733  * note: len is remaining length of string, not length of character;
1734  * len must be greater than zero, as we always examine the first byte.
1735  */
1736 void
1737 report_untranslatable_char(int src_encoding, int dest_encoding,
1738  const char *mbstr, int len)
1739 {
1740  int l = pg_encoding_mblen(src_encoding, mbstr);
1741  char buf[8 * 5 + 1];
1742  char *p = buf;
1743  int j,
1744  jlimit;
1745 
1746  jlimit = Min(l, len);
1747  jlimit = Min(jlimit, 8); /* prevent buffer overrun */
1748 
1749  for (j = 0; j < jlimit; j++)
1750  {
1751  p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
1752  if (j < jlimit - 1)
1753  p += sprintf(p, " ");
1754  }
1755 
1756  ereport(ERROR,
1757  (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
1758  errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
1759  buf,
1760  pg_enc2name_tbl[src_encoding].name,
1761  pg_enc2name_tbl[dest_encoding].name)));
1762 }
1763 
1764 
1765 #ifdef WIN32
1766 /*
1767  * Convert from MessageEncoding to a palloc'ed, null-terminated utf16
1768  * string. The character length is also passed to utf16len if not
1769  * null. Returns NULL iff failed. Before MessageEncoding initialization, "str"
1770  * should be ASCII-only; this will function as though MessageEncoding is UTF8.
1771  */
1772 WCHAR *
1773 pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
1774 {
1775  int msgenc = GetMessageEncoding();
1776  WCHAR *utf16;
1777  int dstlen;
1778  UINT codepage;
1779 
1780  if (msgenc == PG_SQL_ASCII)
1781  /* No conversion is possible, and SQL_ASCII is never utf16. */
1782  return NULL;
1783 
1784  codepage = pg_enc2name_tbl[msgenc].codepage;
1785 
1786  /*
1787  * Use MultiByteToWideChar directly if there is a corresponding codepage,
1788  * or double conversion through UTF8 if not. Double conversion is needed,
1789  * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
1790  */
1791  if (codepage != 0)
1792  {
1793  utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1794  dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
1795  utf16[dstlen] = (WCHAR) 0;
1796  }
1797  else
1798  {
1799  char *utf8;
1800 
1801  /*
1802  * XXX pg_do_encoding_conversion() requires a transaction. In the
1803  * absence of one, hope for the input to be valid UTF8.
1804  */
1805  if (IsTransactionState())
1806  {
1807  utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
1808  len,
1809  msgenc,
1810  PG_UTF8);
1811  if (utf8 != str)
1812  len = strlen(utf8);
1813  }
1814  else
1815  utf8 = (char *) str;
1816 
1817  utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1818  dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
1819  utf16[dstlen] = (WCHAR) 0;
1820 
1821  if (utf8 != str)
1822  pfree(utf8);
1823  }
1824 
1825  if (dstlen == 0 && len > 0)
1826  {
1827  pfree(utf16);
1828  return NULL; /* error */
1829  }
1830 
1831  if (utf16len)
1832  *utf16len = dstlen;
1833  return utf16;
1834 }
1835 
1836 #endif /* WIN32 */
#define write_stderr(str)
Definition: parallel.c:184
#define NameStr(name)
Definition: c.h:735
#define unconstify(underlying_type, expr)
Definition: c.h:1255
#define Min(x, y)
Definition: c.h:993
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1168
signed int int32
Definition: c.h:483
#define VARHDRSZ
Definition: c.h:681
#define OidIsValid(objectId)
Definition: c.h:764
size_t Size
Definition: c.h:594
int errdetail(const char *fmt,...)
Definition: elog.c:1202
int errcode(int sqlerrcode)
Definition: elog.c:858
int errmsg(const char *fmt,...)
Definition: elog.c:1069
#define LOG
Definition: elog.h:31
#define FATAL
Definition: elog.h:41
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
const pg_enc2gettext pg_enc2gettext_tbl[]
Definition: encnames.c:361
const pg_enc2name pg_enc2name_tbl[]
Definition: encnames.c:309
int pg_char_to_encoding(const char *name)
Definition: encnames.c:550
const char * pg_encoding_to_char(int encoding)
Definition: encnames.c:588
void fmgr_info_cxt(Oid functionId, FmgrInfo *finfo, MemoryContext mcxt)
Definition: fmgr.c:137
#define PG_FREE_IF_COPY(ptr, n)
Definition: fmgr.h:260
#define PG_GETARG_BYTEA_PP(n)
Definition: fmgr.h:308
#define PG_RETURN_BYTEA_P(x)
Definition: fmgr.h:371
#define DirectFunctionCall1(func, arg1)
Definition: fmgr.h:642
#define PG_GETARG_DATUM(n)
Definition: fmgr.h:268
#define PG_RETURN_NULL()
Definition: fmgr.h:345
#define PG_GETARG_NAME(n)
Definition: fmgr.h:278
#define OidFunctionCall6(functionId, arg1, arg2, arg3, arg4, arg5, arg6)
Definition: fmgr.h:690
#define PG_RETURN_INT32(x)
Definition: fmgr.h:354
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
#define PG_RETURN_DATUM(x)
Definition: fmgr.h:353
#define DirectFunctionCall3(func, arg1, arg2, arg3)
Definition: fmgr.h:646
#define FunctionCall6(flinfo, arg1, arg2, arg3, arg4, arg5, arg6)
Definition: fmgr.h:670
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
int a
Definition: isn.c:69
int j
Definition: isn.c:74
int i
Definition: isn.c:73
Assert(fmt[strlen(fmt) - 1] !='\n')
List * lcons(void *datum, List *list)
Definition: list.c:494
unsigned int pg_wchar
Definition: mbprint.c:31
const char * pg_get_client_encoding_name(void)
Definition: mbutils.c:346
unsigned char * pg_do_encoding_conversion(unsigned char *src, int len, int src_encoding, int dest_encoding)
Definition: mbutils.c:357
bool pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s)
Definition: mbutils.c:927
char * pg_any_to_server(const char *s, int len, int encoding)
Definition: mbutils.c:677
int GetDatabaseEncoding(void)
Definition: mbutils.c:1268
Datum pg_convert_to(PG_FUNCTION_ARGS)
Definition: mbutils.c:502
struct ConvProcInfo ConvProcInfo
int pg_encoding_wchar2mb_with_len(int encoding, const pg_wchar *from, char *to, int len)
Definition: mbutils.c:1016
static bool pg_generic_charinc(unsigned char *charptr, int len)
Definition: mbutils.c:1332
static const pg_enc2name * ClientEncoding
Definition: mbutils.c:81
static FmgrInfo * ToServerConvProc
Definition: mbutils.c:68
static FmgrInfo * ToClientConvProc
Definition: mbutils.c:69
int pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
Definition: mbutils.c:1604
void InitializeClientEncoding(void)
Definition: mbutils.c:282
int pg_dsplen(const char *mbstr)
Definition: mbutils.c:1031
int pg_mbstrlen_with_len(const char *mbstr, int limit)
Definition: mbutils.c:1058
mbcharacter_incrementer pg_database_encoding_character_incrementer(void)
Definition: mbutils.c:1530
char * pg_client_to_server(const char *s, int len)
Definition: mbutils.c:661
int pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
Definition: mbutils.c:1009
static FmgrInfo * Utf8ToServerConvProc
Definition: mbutils.c:76
static List * ConvProcList
Definition: mbutils.c:62
int pg_mb2wchar(const char *from, pg_wchar *to)
Definition: mbutils.c:980
int pg_mbcharcliplen(const char *mbstr, int len, int limit)
Definition: mbutils.c:1126
Datum PG_char_to_encoding(PG_FUNCTION_ARGS)
Definition: mbutils.c:1292
static const pg_enc2name * MessageEncoding
Definition: mbutils.c:83
void report_untranslatable_char(int src_encoding, int dest_encoding, const char *mbstr, int len)
Definition: mbutils.c:1737
int pg_wchar2mb(const pg_wchar *from, char *to)
Definition: mbutils.c:1002
int pg_mbstrlen(const char *mbstr)
Definition: mbutils.c:1038
bool pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
Definition: mbutils.c:1573
static char * perform_default_encoding_conversion(const char *src, int len, bool is_client_to_server)
Definition: mbutils.c:784
bool pg_verifymbstr(const char *mbstr, int len, bool noError)
Definition: mbutils.c:1563
int pg_mbcliplen(const char *mbstr, int len, int limit)
Definition: mbutils.c:1084
int GetMessageEncoding(void)
Definition: mbutils.c:1315
Datum pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
Definition: mbutils.c:645
Datum getdatabaseencoding(PG_FUNCTION_ARGS)
Definition: mbutils.c:1280
int pg_do_encoding_conversion_buf(Oid proc, int src_encoding, int dest_encoding, unsigned char *src, int srclen, unsigned char *dest, int destlen, bool noError)
Definition: mbutils.c:470
void report_invalid_encoding(int encoding, const char *mbstr, int len)
Definition: mbutils.c:1705
int SetClientEncoding(int encoding)
Definition: mbutils.c:209
void SetMessageEncoding(int encoding)
Definition: mbutils.c:1172
void pg_unicode_to_server(pg_wchar c, unsigned char *s)
Definition: mbutils.c:865
Datum pg_convert(PG_FUNCTION_ARGS)
Definition: mbutils.c:554
void check_encoding_conversion_args(int src_encoding, int dest_encoding, int len, int expected_src_encoding, int expected_dest_encoding)
Definition: mbutils.c:1676
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1553
int PrepareClientEncoding(int encoding)
Definition: mbutils.c:111
const char * GetDatabaseEncodingName(void)
Definition: mbutils.c:1274
static bool backend_startup_complete
Definition: mbutils.c:91
char * pg_server_to_client(const char *s, int len)
Definition: mbutils.c:739
Datum pg_convert_from(PG_FUNCTION_ARGS)
Definition: mbutils.c:527
int pg_get_client_encoding(void)
Definition: mbutils.c:337
static bool pg_utf8_increment(unsigned char *charptr, int length)
Definition: mbutils.c:1366
Datum length_in_encoding(PG_FUNCTION_ARGS)
Definition: mbutils.c:616
static int cliplen(const char *str, int len, int limit)
Definition: mbutils.c:1151
static int pending_client_encoding
Definition: mbutils.c:92
void SetDatabaseEncoding(int encoding)
Definition: mbutils.c:1162
int pg_encoding_mbcliplen(int encoding, const char *mbstr, int len, int limit)
Definition: mbutils.c:1094
Datum pg_client_encoding(PG_FUNCTION_ARGS)
Definition: mbutils.c:1286
Datum PG_encoding_to_char(PG_FUNCTION_ARGS)
Definition: mbutils.c:1300
int pg_encoding_mb2wchar_with_len(int encoding, const char *from, pg_wchar *to, int len)
Definition: mbutils.c:994
char * pg_server_to_any(const char *s, int len, int encoding)
Definition: mbutils.c:750
static bool pg_eucjp_increment(unsigned char *charptr, int length)
Definition: mbutils.c:1444
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
Definition: mbutils.c:987
static const pg_enc2name * DatabaseEncoding
Definition: mbutils.c:82
int pg_mblen(const char *mbstr)
Definition: mbutils.c:1024
void pfree(void *pointer)
Definition: mcxt.c:1456
MemoryContext TopMemoryContext
Definition: mcxt.c:141
MemoryContext CurrentMemoryContext
Definition: mcxt.c:135
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1476
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1021
void * MemoryContextAllocHuge(MemoryContext context, Size size)
Definition: mcxt.c:1586
void * palloc(Size size)
Definition: mcxt.c:1226
#define MaxAllocHugeSize
Definition: memutils.h:45
#define MaxAllocSize
Definition: memutils.h:40
Datum namein(PG_FUNCTION_ARGS)
Definition: name.c:48
Oid FindDefaultConversionProc(int32 for_encoding, int32 to_encoding)
Definition: namespace.c:3612
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:138
const void size_t len
int32 encoding
Definition: pg_database.h:41
#define lfirst(lc)
Definition: pg_list.h:172
#define NIL
Definition: pg_list.h:68
#define foreach_delete_current(lst, cell)
Definition: pg_list.h:390
static char * buf
Definition: pg_test_fsync.c:67
#define MAX_MULTIBYTE_CHAR_LEN
Definition: pg_wchar.h:32
#define MAX_CONVERSION_GROWTH
Definition: pg_wchar.h:302
@ PG_SQL_ASCII
Definition: pg_wchar.h:226
@ PG_EUC_JP
Definition: pg_wchar.h:227
@ PG_UTF8
Definition: pg_wchar.h:232
#define SS2
Definition: pg_wchar.h:37
bool(* mbcharacter_incrementer)(unsigned char *mbstr, int len)
Definition: pg_wchar.h:376
int(* mbchar_verifier)(const unsigned char *mbstr, int len)
Definition: pg_wchar.h:378
#define PG_VALID_ENCODING(_enc)
Definition: pg_wchar.h:287
#define PG_VALID_FE_ENCODING(_enc)
Definition: pg_wchar.h:291
static bool is_valid_unicode_codepoint(pg_wchar c)
Definition: pg_wchar.h:525
#define PG_VALID_BE_ENCODING(_enc)
Definition: pg_wchar.h:281
#define SS3
Definition: pg_wchar.h:38
int(* mblen_converter)(const unsigned char *mbstr)
Definition: pg_wchar.h:372
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
#define sprintf
Definition: port.h:240
int pg_get_encoding_from_locale(const char *ctype, bool write_message)
Definition: chklocale.c:428
uintptr_t Datum
Definition: postgres.h:64
static Datum BoolGetDatum(bool X)
Definition: postgres.h:102
static Datum CStringGetDatum(const char *X)
Definition: postgres.h:350
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:212
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:202
unsigned int Oid
Definition: postgres_ext.h:31
char * c
int s_encoding
Definition: mbutils.c:56
FmgrInfo to_client_info
Definition: mbutils.c:59
int c_encoding
Definition: mbutils.c:57
FmgrInfo to_server_info
Definition: mbutils.c:58
Definition: fmgr.h:57
Definition: pg_list.h:54
Definition: c.h:730
const char * name
Definition: pg_wchar.h:356
pg_enc encoding
Definition: pg_wchar.h:342
const char * name
Definition: pg_wchar.h:341
mbstr_verifier mbverifystr
Definition: pg_wchar.h:391
wchar2mb_with_len_converter wchar2mb_with_len
Definition: pg_wchar.h:386
mb2wchar_with_len_converter mb2wchar_with_len
Definition: pg_wchar.h:384
mblen_converter mblen
Definition: pg_wchar.h:388
mbdisplaylen_converter dsplen
Definition: pg_wchar.h:389
int maxmblen
Definition: pg_wchar.h:392
mbchar_verifier mbverifychar
Definition: pg_wchar.h:390
Definition: c.h:676
#define VARDATA(PTR)
Definition: varatt.h:278
#define VARDATA_ANY(PTR)
Definition: varatt.h:324
#define SET_VARSIZE(PTR, len)
Definition: varatt.h:305
#define VARSIZE_ANY_EXHDR(PTR)
Definition: varatt.h:317
const char * name
unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
Definition: wchar.c:483
int pg_utf_mblen(const unsigned char *s)
Definition: wchar.c:549
const pg_wchar_tbl pg_wchar_table[]
Definition: wchar.c:2076
int pg_encoding_max_length(int encoding)
Definition: wchar.c:2188
int pg_encoding_mblen(int encoding, const char *mbstr)
Definition: wchar.c:2130
#define setlocale(a, b)
Definition: win32_port.h:467
size_t pg_wchar_strlen(const pg_wchar *str)
Definition: wstrncmp.c:70
bool IsTransactionState(void)
Definition: xact.c:378