PostgreSQL Source Code  git master
mbutils.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * mbutils.c
4  * This file contains functions for encoding conversion.
5  *
6  * The string-conversion functions in this file share some API quirks.
7  * Note the following:
8  *
9  * The functions return a palloc'd, null-terminated string if conversion
10  * is required. However, if no conversion is performed, the given source
11  * string pointer is returned as-is.
12  *
13  * Although the presence of a length argument means that callers can pass
14  * non-null-terminated strings, care is required because the same string
15  * will be passed back if no conversion occurs. Such callers *must* check
16  * whether result == src and handle that case differently.
17  *
18  * If the source and destination encodings are the same, the source string
19  * is returned without any verification; it's assumed to be valid data.
20  * If that might not be the case, the caller is responsible for validating
21  * the string using a separate call to pg_verify_mbstr(). Whenever the
22  * source and destination encodings are different, the functions ensure that
23  * the result is validly encoded according to the destination encoding.
24  *
25  *
26  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
27  * Portions Copyright (c) 1994, Regents of the University of California
28  *
29  *
30  * IDENTIFICATION
31  * src/backend/utils/mb/mbutils.c
32  *
33  *-------------------------------------------------------------------------
34  */
35 #include "postgres.h"
36 
37 #include "access/xact.h"
38 #include "catalog/namespace.h"
39 #include "mb/pg_wchar.h"
40 #include "utils/fmgrprotos.h"
41 #include "utils/memutils.h"
42 #include "varatt.h"
43 
44 /*
45  * We maintain a simple linked list caching the fmgr lookup info for the
46  * currently selected conversion functions, as well as any that have been
47  * selected previously in the current session. (We remember previous
48  * settings because we must be able to restore a previous setting during
49  * transaction rollback, without doing any fresh catalog accesses.)
50  *
51  * Since we'll never release this data, we just keep it in TopMemoryContext.
52  */
53 typedef struct ConvProcInfo
54 {
55  int s_encoding; /* server and client encoding IDs */
57  FmgrInfo to_server_info; /* lookup info for conversion procs */
60 
61 static List *ConvProcList = NIL; /* List of ConvProcInfo */
62 
63 /*
64  * These variables point to the currently active conversion functions,
65  * or are NULL when no conversion is needed.
66  */
67 static FmgrInfo *ToServerConvProc = NULL;
68 static FmgrInfo *ToClientConvProc = NULL;
69 
70 /*
71  * This variable stores the conversion function to convert from UTF-8
72  * to the server encoding. It's NULL if the server encoding *is* UTF-8,
73  * or if we lack a conversion function for this.
74  */
76 
77 /*
78  * These variables track the currently-selected encodings.
79  */
83 
84 /*
85  * During backend startup we can't set client encoding because we (a)
86  * can't look up the conversion functions, and (b) may not know the database
87  * encoding yet either. So SetClientEncoding() just accepts anything and
88  * remembers it for InitializeClientEncoding() to apply later.
89  */
90 static bool backend_startup_complete = false;
92 
93 
94 /* Internal functions */
95 static char *perform_default_encoding_conversion(const char *src,
96  int len, bool is_client_to_server);
97 static int cliplen(const char *str, int len, int limit);
98 
99 
100 /*
101  * Prepare for a future call to SetClientEncoding. Success should mean
102  * that SetClientEncoding is guaranteed to succeed for this encoding request.
103  *
104  * (But note that success before backend_startup_complete does not guarantee
105  * success after ...)
106  *
107  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
108  */
109 int
111 {
112  int current_server_encoding;
113  ListCell *lc;
114 
116  return -1;
117 
118  /* Can't do anything during startup, per notes above */
120  return 0;
121 
122  current_server_encoding = GetDatabaseEncoding();
123 
124  /*
125  * Check for cases that require no conversion function.
126  */
127  if (current_server_encoding == encoding ||
128  current_server_encoding == PG_SQL_ASCII ||
130  return 0;
131 
132  if (IsTransactionState())
133  {
134  /*
135  * If we're in a live transaction, it's safe to access the catalogs,
136  * so look up the functions. We repeat the lookup even if the info is
137  * already cached, so that we can react to changes in the contents of
138  * pg_conversion.
139  */
140  Oid to_server_proc,
141  to_client_proc;
142  ConvProcInfo *convinfo;
143  MemoryContext oldcontext;
144 
145  to_server_proc = FindDefaultConversionProc(encoding,
146  current_server_encoding);
147  if (!OidIsValid(to_server_proc))
148  return -1;
149  to_client_proc = FindDefaultConversionProc(current_server_encoding,
150  encoding);
151  if (!OidIsValid(to_client_proc))
152  return -1;
153 
154  /*
155  * Load the fmgr info into TopMemoryContext (could still fail here)
156  */
158  sizeof(ConvProcInfo));
159  convinfo->s_encoding = current_server_encoding;
160  convinfo->c_encoding = encoding;
161  fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
163  fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
165 
166  /* Attach new info to head of list */
168  ConvProcList = lcons(convinfo, ConvProcList);
169  MemoryContextSwitchTo(oldcontext);
170 
171  /*
172  * We cannot yet remove any older entry for the same encoding pair,
173  * since it could still be in use. SetClientEncoding will clean up.
174  */
175 
176  return 0; /* success */
177  }
178  else
179  {
180  /*
181  * If we're not in a live transaction, the only thing we can do is
182  * restore a previous setting using the cache. This covers all
183  * transaction-rollback cases. The only case it might not work for is
184  * trying to change client_encoding on the fly by editing
185  * postgresql.conf and SIGHUP'ing. Which would probably be a stupid
186  * thing to do anyway.
187  */
188  foreach(lc, ConvProcList)
189  {
190  ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
191 
192  if (oldinfo->s_encoding == current_server_encoding &&
193  oldinfo->c_encoding == encoding)
194  return 0;
195  }
196 
197  return -1; /* it's not cached, so fail */
198  }
199 }
200 
201 /*
202  * Set the active client encoding and set up the conversion-function pointers.
203  * PrepareClientEncoding should have been called previously for this encoding.
204  *
205  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
206  */
207 int
209 {
210  int current_server_encoding;
211  bool found;
212  ListCell *lc;
213 
215  return -1;
216 
217  /* Can't do anything during startup, per notes above */
219  {
221  return 0;
222  }
223 
224  current_server_encoding = GetDatabaseEncoding();
225 
226  /*
227  * Check for cases that require no conversion function.
228  */
229  if (current_server_encoding == encoding ||
230  current_server_encoding == PG_SQL_ASCII ||
232  {
234  ToServerConvProc = NULL;
235  ToClientConvProc = NULL;
236  return 0;
237  }
238 
239  /*
240  * Search the cache for the entry previously prepared by
241  * PrepareClientEncoding; if there isn't one, we lose. While at it,
242  * release any duplicate entries so that repeated Prepare/Set cycles don't
243  * leak memory.
244  */
245  found = false;
246  foreach(lc, ConvProcList)
247  {
248  ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
249 
250  if (convinfo->s_encoding == current_server_encoding &&
251  convinfo->c_encoding == encoding)
252  {
253  if (!found)
254  {
255  /* Found newest entry, so set up */
257  ToServerConvProc = &convinfo->to_server_info;
258  ToClientConvProc = &convinfo->to_client_info;
259  found = true;
260  }
261  else
262  {
263  /* Duplicate entry, release it */
265  pfree(convinfo);
266  }
267  }
268  }
269 
270  if (found)
271  return 0; /* success */
272  else
273  return -1; /* it's not cached, so fail */
274 }
275 
276 /*
277  * Initialize client encoding conversions.
278  * Called from InitPostgres() once during backend startup.
279  */
280 void
282 {
283  int current_server_encoding;
284 
287 
290  {
291  /*
292  * Oops, the requested conversion is not available. We couldn't fail
293  * before, but we can now.
294  */
295  ereport(FATAL,
296  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
297  errmsg("conversion between %s and %s is not supported",
300  }
301 
302  /*
303  * Also look up the UTF8-to-server conversion function if needed. Since
304  * the server encoding is fixed within any one backend process, we don't
305  * have to do this more than once.
306  */
307  current_server_encoding = GetDatabaseEncoding();
308  if (current_server_encoding != PG_UTF8 &&
309  current_server_encoding != PG_SQL_ASCII)
310  {
311  Oid utf8_to_server_proc;
312 
314  utf8_to_server_proc =
316  current_server_encoding);
317  /* If there's no such conversion, just leave the pointer as NULL */
318  if (OidIsValid(utf8_to_server_proc))
319  {
320  FmgrInfo *finfo;
321 
323  sizeof(FmgrInfo));
324  fmgr_info_cxt(utf8_to_server_proc, finfo,
326  /* Set Utf8ToServerConvProc only after data is fully valid */
327  Utf8ToServerConvProc = finfo;
328  }
329  }
330 }
331 
332 /*
333  * returns the current client encoding
334  */
335 int
337 {
338  return ClientEncoding->encoding;
339 }
340 
341 /*
342  * returns the current client encoding name
343  */
344 const char *
346 {
347  return ClientEncoding->name;
348 }
349 
350 /*
351  * Convert src string to another encoding (general case).
352  *
353  * See the notes about string conversion functions at the top of this file.
354  */
355 unsigned char *
356 pg_do_encoding_conversion(unsigned char *src, int len,
357  int src_encoding, int dest_encoding)
358 {
359  unsigned char *result;
360  Oid proc;
361 
362  if (len <= 0)
363  return src; /* empty string is always valid */
364 
365  if (src_encoding == dest_encoding)
366  return src; /* no conversion required, assume valid */
367 
368  if (dest_encoding == PG_SQL_ASCII)
369  return src; /* any string is valid in SQL_ASCII */
370 
371  if (src_encoding == PG_SQL_ASCII)
372  {
373  /* No conversion is possible, but we must validate the result */
374  (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
375  return src;
376  }
377 
378  if (!IsTransactionState()) /* shouldn't happen */
379  elog(ERROR, "cannot perform encoding conversion outside a transaction");
380 
381  proc = FindDefaultConversionProc(src_encoding, dest_encoding);
382  if (!OidIsValid(proc))
383  ereport(ERROR,
384  (errcode(ERRCODE_UNDEFINED_FUNCTION),
385  errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
386  pg_encoding_to_char(src_encoding),
387  pg_encoding_to_char(dest_encoding))));
388 
389  /*
390  * Allocate space for conversion result, being wary of integer overflow.
391  *
392  * len * MAX_CONVERSION_GROWTH is typically a vast overestimate of the
393  * required space, so it might exceed MaxAllocSize even though the result
394  * would actually fit. We do not want to hand back a result string that
395  * exceeds MaxAllocSize, because callers might not cope gracefully --- but
396  * if we just allocate more than that, and don't use it, that's fine.
397  */
399  ereport(ERROR,
400  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
401  errmsg("out of memory"),
402  errdetail("String of %d bytes is too long for encoding conversion.",
403  len)));
404 
405  result = (unsigned char *)
407  (Size) len * MAX_CONVERSION_GROWTH + 1);
408 
409  (void) OidFunctionCall6(proc,
410  Int32GetDatum(src_encoding),
411  Int32GetDatum(dest_encoding),
412  CStringGetDatum((char *) src),
413  CStringGetDatum((char *) result),
415  BoolGetDatum(false));
416 
417  /*
418  * If the result is large, it's worth repalloc'ing to release any extra
419  * space we asked for. The cutoff here is somewhat arbitrary, but we
420  * *must* check when len * MAX_CONVERSION_GROWTH exceeds MaxAllocSize.
421  */
422  if (len > 1000000)
423  {
424  Size resultlen = strlen((char *) result);
425 
426  if (resultlen >= MaxAllocSize)
427  ereport(ERROR,
428  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
429  errmsg("out of memory"),
430  errdetail("String of %d bytes is too long for encoding conversion.",
431  len)));
432 
433  result = (unsigned char *) repalloc(result, resultlen + 1);
434  }
435 
436  return result;
437 }
438 
439 /*
440  * Convert src string to another encoding.
441  *
442  * This function has a different API than the other conversion functions.
443  * The caller should've looked up the conversion function using
444  * FindDefaultConversionProc(). Unlike the other functions, the converted
445  * result is not palloc'd. It is written to the caller-supplied buffer
446  * instead.
447  *
448  * src_encoding - encoding to convert from
449  * dest_encoding - encoding to convert to
450  * src, srclen - input buffer and its length in bytes
451  * dest, destlen - destination buffer and its size in bytes
452  *
453  * The output is null-terminated.
454  *
455  * If destlen < srclen * MAX_CONVERSION_GROWTH + 1, the converted output
456  * wouldn't necessarily fit in the output buffer, so the function limits how
457  * much of the input it converts and may not convert the whole input.
458  *
459  * TODO: The conversion function interface is not great. Firstly, it
460  * would be nice to pass through the destination buffer size to the
461  * conversion function, so that if you pass a shorter destination buffer, it
462  * could still continue to fill up the whole buffer. Currently, we have to
463  * assume worst case expansion and stop the conversion short, even if there
464  * is in fact space left in the destination buffer. Secondly, it would be
465  * nice to return the number of bytes written to the caller, to avoid a call
466  * to strlen().
467  */
468 int
470  int src_encoding,
471  int dest_encoding,
472  unsigned char *src, int srclen,
473  unsigned char *dest, int destlen,
474  bool noError)
475 {
476  Datum result;
477 
478  /*
479  * If the destination buffer is not large enough to hold the result in the
480  * worst case, limit the input size passed to the conversion function.
481  */
482  if ((Size) srclen >= ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH))
483  srclen = ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH);
484 
485  result = OidFunctionCall6(proc,
486  Int32GetDatum(src_encoding),
487  Int32GetDatum(dest_encoding),
488  CStringGetDatum((char *) src),
489  CStringGetDatum((char *) dest),
490  Int32GetDatum(srclen),
491  BoolGetDatum(noError));
492  return DatumGetInt32(result);
493 }
494 
495 /*
496  * Convert string to encoding encoding_name. The source
497  * encoding is the DB encoding.
498  *
499  * BYTEA convert_to(TEXT string, NAME encoding_name) */
500 Datum
502 {
503  Datum string = PG_GETARG_DATUM(0);
504  Datum dest_encoding_name = PG_GETARG_DATUM(1);
505  Datum src_encoding_name = DirectFunctionCall1(namein,
507  Datum result;
508 
509  /*
510  * pg_convert expects a bytea as its first argument. We're passing it a
511  * text argument here, relying on the fact that they are both in fact
512  * varlena types, and thus structurally identical.
513  */
514  result = DirectFunctionCall3(pg_convert, string,
515  src_encoding_name, dest_encoding_name);
516 
517  PG_RETURN_DATUM(result);
518 }
519 
520 /*
521  * Convert string from encoding encoding_name. The destination
522  * encoding is the DB encoding.
523  *
524  * TEXT convert_from(BYTEA string, NAME encoding_name) */
525 Datum
527 {
528  Datum string = PG_GETARG_DATUM(0);
529  Datum src_encoding_name = PG_GETARG_DATUM(1);
530  Datum dest_encoding_name = DirectFunctionCall1(namein,
532  Datum result;
533 
534  result = DirectFunctionCall3(pg_convert, string,
535  src_encoding_name, dest_encoding_name);
536 
537  /*
538  * pg_convert returns a bytea, which we in turn return as text, relying on
539  * the fact that they are both in fact varlena types, and thus
540  * structurally identical. Although not all bytea values are valid text,
541  * in this case it will be because we've told pg_convert to return one
542  * that is valid as text in the current database encoding.
543  */
544  PG_RETURN_DATUM(result);
545 }
546 
547 /*
548  * Convert string between two arbitrary encodings.
549  *
550  * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
551  */
552 Datum
554 {
555  bytea *string = PG_GETARG_BYTEA_PP(0);
556  char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
557  int src_encoding = pg_char_to_encoding(src_encoding_name);
558  char *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
559  int dest_encoding = pg_char_to_encoding(dest_encoding_name);
560  const char *src_str;
561  char *dest_str;
562  bytea *retval;
563  int len;
564 
565  if (src_encoding < 0)
566  ereport(ERROR,
567  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
568  errmsg("invalid source encoding name \"%s\"",
569  src_encoding_name)));
570  if (dest_encoding < 0)
571  ereport(ERROR,
572  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
573  errmsg("invalid destination encoding name \"%s\"",
574  dest_encoding_name)));
575 
576  /* make sure that source string is valid */
577  len = VARSIZE_ANY_EXHDR(string);
578  src_str = VARDATA_ANY(string);
579  (void) pg_verify_mbstr(src_encoding, src_str, len, false);
580 
581  /* perform conversion */
582  dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str),
583  len,
584  src_encoding,
585  dest_encoding);
586 
587 
588  /* return source string if no conversion happened */
589  if (dest_str == src_str)
590  PG_RETURN_BYTEA_P(string);
591 
592  /*
593  * build bytea data type structure.
594  */
595  len = strlen(dest_str);
596  retval = (bytea *) palloc(len + VARHDRSZ);
597  SET_VARSIZE(retval, len + VARHDRSZ);
598  memcpy(VARDATA(retval), dest_str, len);
599  pfree(dest_str);
600 
601  /* free memory if allocated by the toaster */
602  PG_FREE_IF_COPY(string, 0);
603 
604  PG_RETURN_BYTEA_P(retval);
605 }
606 
607 /*
608  * get the length of the string considered as text in the specified
609  * encoding. Raises an error if the data is not valid in that
610  * encoding.
611  *
612  * INT4 length (BYTEA string, NAME src_encoding_name)
613  */
614 Datum
616 {
617  bytea *string = PG_GETARG_BYTEA_PP(0);
618  char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
619  int src_encoding = pg_char_to_encoding(src_encoding_name);
620  const char *src_str;
621  int len;
622  int retval;
623 
624  if (src_encoding < 0)
625  ereport(ERROR,
626  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
627  errmsg("invalid encoding name \"%s\"",
628  src_encoding_name)));
629 
630  len = VARSIZE_ANY_EXHDR(string);
631  src_str = VARDATA_ANY(string);
632 
633  retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
634 
635  PG_RETURN_INT32(retval);
636 }
637 
638 /*
639  * Get maximum multibyte character length in the specified encoding.
640  *
641  * Note encoding is specified numerically, not by name as above.
642  */
643 Datum
645 {
646  int encoding = PG_GETARG_INT32(0);
647 
650  else
651  PG_RETURN_NULL();
652 }
653 
654 /*
655  * Convert client encoding to server encoding.
656  *
657  * See the notes about string conversion functions at the top of this file.
658  */
659 char *
660 pg_client_to_server(const char *s, int len)
661 {
663 }
664 
665 /*
666  * Convert any encoding to server encoding.
667  *
668  * See the notes about string conversion functions at the top of this file.
669  *
670  * Unlike the other string conversion functions, this will apply validation
671  * even if encoding == DatabaseEncoding->encoding. This is because this is
672  * used to process data coming in from outside the database, and we never
673  * want to just assume validity.
674  */
675 char *
676 pg_any_to_server(const char *s, int len, int encoding)
677 {
678  if (len <= 0)
679  return unconstify(char *, s); /* empty string is always valid */
680 
683  {
684  /*
685  * No conversion is needed, but we must still validate the data.
686  */
687  (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
688  return unconstify(char *, s);
689  }
690 
692  {
693  /*
694  * No conversion is possible, but we must still validate the data,
695  * because the client-side code might have done string escaping using
696  * the selected client_encoding. If the client encoding is ASCII-safe
697  * then we just do a straight validation under that encoding. For an
698  * ASCII-unsafe encoding we have a problem: we dare not pass such data
699  * to the parser but we have no way to convert it. We compromise by
700  * rejecting the data if it contains any non-ASCII characters.
701  */
703  (void) pg_verify_mbstr(encoding, s, len, false);
704  else
705  {
706  int i;
707 
708  for (i = 0; i < len; i++)
709  {
710  if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
711  ereport(ERROR,
712  (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
713  errmsg("invalid byte value for encoding \"%s\": 0x%02x",
715  (unsigned char) s[i])));
716  }
717  }
718  return unconstify(char *, s);
719  }
720 
721  /* Fast path if we can use cached conversion function */
723  return perform_default_encoding_conversion(s, len, true);
724 
725  /* General case ... will not work outside transactions */
726  return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
727  len,
728  encoding,
730 }
731 
732 /*
733  * Convert server encoding to client encoding.
734  *
735  * See the notes about string conversion functions at the top of this file.
736  */
737 char *
738 pg_server_to_client(const char *s, int len)
739 {
741 }
742 
743 /*
744  * Convert server encoding to any encoding.
745  *
746  * See the notes about string conversion functions at the top of this file.
747  */
748 char *
749 pg_server_to_any(const char *s, int len, int encoding)
750 {
751  if (len <= 0)
752  return unconstify(char *, s); /* empty string is always valid */
753 
756  return unconstify(char *, s); /* assume data is valid */
757 
759  {
760  /* No conversion is possible, but we must validate the result */
761  (void) pg_verify_mbstr(encoding, s, len, false);
762  return unconstify(char *, s);
763  }
764 
765  /* Fast path if we can use cached conversion function */
767  return perform_default_encoding_conversion(s, len, false);
768 
769  /* General case ... will not work outside transactions */
770  return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
771  len,
773  encoding);
774 }
775 
776 /*
777  * Perform default encoding conversion using cached FmgrInfo. Since
778  * this function does not access database at all, it is safe to call
779  * outside transactions. If the conversion has not been set up by
780  * SetClientEncoding(), no conversion is performed.
781  */
782 static char *
784  bool is_client_to_server)
785 {
786  char *result;
787  int src_encoding,
788  dest_encoding;
789  FmgrInfo *flinfo;
790 
791  if (is_client_to_server)
792  {
793  src_encoding = ClientEncoding->encoding;
794  dest_encoding = DatabaseEncoding->encoding;
795  flinfo = ToServerConvProc;
796  }
797  else
798  {
799  src_encoding = DatabaseEncoding->encoding;
800  dest_encoding = ClientEncoding->encoding;
801  flinfo = ToClientConvProc;
802  }
803 
804  if (flinfo == NULL)
805  return unconstify(char *, src);
806 
807  /*
808  * Allocate space for conversion result, being wary of integer overflow.
809  * See comments in pg_do_encoding_conversion.
810  */
812  ereport(ERROR,
813  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
814  errmsg("out of memory"),
815  errdetail("String of %d bytes is too long for encoding conversion.",
816  len)));
817 
818  result = (char *)
820  (Size) len * MAX_CONVERSION_GROWTH + 1);
821 
822  FunctionCall6(flinfo,
823  Int32GetDatum(src_encoding),
824  Int32GetDatum(dest_encoding),
825  CStringGetDatum(src),
826  CStringGetDatum(result),
828  BoolGetDatum(false));
829 
830  /*
831  * Release extra space if there might be a lot --- see comments in
832  * pg_do_encoding_conversion.
833  */
834  if (len > 1000000)
835  {
836  Size resultlen = strlen(result);
837 
838  if (resultlen >= MaxAllocSize)
839  ereport(ERROR,
840  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
841  errmsg("out of memory"),
842  errdetail("String of %d bytes is too long for encoding conversion.",
843  len)));
844 
845  result = (char *) repalloc(result, resultlen + 1);
846  }
847 
848  return result;
849 }
850 
851 /*
852  * Convert a single Unicode code point into a string in the server encoding.
853  *
854  * The code point given by "c" is converted and stored at *s, which must
855  * have at least MAX_UNICODE_EQUIVALENT_STRING+1 bytes available.
856  * The output will have a trailing '\0'. Throws error if the conversion
857  * cannot be performed.
858  *
859  * Note that this relies on having previously looked up any required
860  * conversion function. That's partly for speed but mostly because the parser
861  * may call this outside any transaction, or in an aborted transaction.
862  */
863 void
864 pg_unicode_to_server(pg_wchar c, unsigned char *s)
865 {
866  unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
867  int c_as_utf8_len;
868  int server_encoding;
869 
870  /*
871  * Complain if invalid Unicode code point. The choice of errcode here is
872  * debatable, but really our caller should have checked this anyway.
873  */
875  ereport(ERROR,
876  (errcode(ERRCODE_SYNTAX_ERROR),
877  errmsg("invalid Unicode code point")));
878 
879  /* Otherwise, if it's in ASCII range, conversion is trivial */
880  if (c <= 0x7F)
881  {
882  s[0] = (unsigned char) c;
883  s[1] = '\0';
884  return;
885  }
886 
887  /* If the server encoding is UTF-8, we just need to reformat the code */
888  server_encoding = GetDatabaseEncoding();
889  if (server_encoding == PG_UTF8)
890  {
891  unicode_to_utf8(c, s);
892  s[pg_utf_mblen(s)] = '\0';
893  return;
894  }
895 
896  /* For all other cases, we must have a conversion function available */
897  if (Utf8ToServerConvProc == NULL)
898  ereport(ERROR,
899  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
900  errmsg("conversion between %s and %s is not supported",
903 
904  /* Construct UTF-8 source string */
905  unicode_to_utf8(c, c_as_utf8);
906  c_as_utf8_len = pg_utf_mblen(c_as_utf8);
907  c_as_utf8[c_as_utf8_len] = '\0';
908 
909  /* Convert, or throw error if we can't */
912  Int32GetDatum(server_encoding),
913  CStringGetDatum((char *) c_as_utf8),
914  CStringGetDatum((char *) s),
915  Int32GetDatum(c_as_utf8_len),
916  BoolGetDatum(false));
917 }
918 
919 /*
920  * Convert a single Unicode code point into a string in the server encoding.
921  *
922  * Same as pg_unicode_to_server(), except that we don't throw errors,
923  * but simply return false on conversion failure.
924  */
925 bool
927 {
928  unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
929  int c_as_utf8_len;
930  int converted_len;
931  int server_encoding;
932 
933  /* Fail if invalid Unicode code point */
935  return false;
936 
937  /* Otherwise, if it's in ASCII range, conversion is trivial */
938  if (c <= 0x7F)
939  {
940  s[0] = (unsigned char) c;
941  s[1] = '\0';
942  return true;
943  }
944 
945  /* If the server encoding is UTF-8, we just need to reformat the code */
946  server_encoding = GetDatabaseEncoding();
947  if (server_encoding == PG_UTF8)
948  {
949  unicode_to_utf8(c, s);
950  s[pg_utf_mblen(s)] = '\0';
951  return true;
952  }
953 
954  /* For all other cases, we must have a conversion function available */
955  if (Utf8ToServerConvProc == NULL)
956  return false;
957 
958  /* Construct UTF-8 source string */
959  unicode_to_utf8(c, c_as_utf8);
960  c_as_utf8_len = pg_utf_mblen(c_as_utf8);
961  c_as_utf8[c_as_utf8_len] = '\0';
962 
963  /* Convert, but without throwing error if we can't */
966  Int32GetDatum(server_encoding),
967  CStringGetDatum((char *) c_as_utf8),
968  CStringGetDatum((char *) s),
969  Int32GetDatum(c_as_utf8_len),
970  BoolGetDatum(true)));
971 
972  /* Conversion was successful iff it consumed the whole input */
973  return (converted_len == c_as_utf8_len);
974 }
975 
976 
977 /* convert a multibyte string to a wchar */
978 int
979 pg_mb2wchar(const char *from, pg_wchar *to)
980 {
981  return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, strlen(from));
982 }
983 
984 /* convert a multibyte string to a wchar with a limited length */
985 int
986 pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
987 {
988  return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
989 }
990 
991 /* same, with any encoding */
992 int
994  const char *from, pg_wchar *to, int len)
995 {
996  return pg_wchar_table[encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
997 }
998 
999 /* convert a wchar string to a multibyte */
1000 int
1001 pg_wchar2mb(const pg_wchar *from, char *to)
1002 {
1003  return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, pg_wchar_strlen(from));
1004 }
1005 
1006 /* convert a wchar string to a multibyte with a limited length */
1007 int
1008 pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
1009 {
1010  return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
1011 }
1012 
1013 /* same, with any encoding */
1014 int
1016  const pg_wchar *from, char *to, int len)
1017 {
1018  return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
1019 }
1020 
1021 /* returns the byte length of a multibyte character */
1022 int
1023 pg_mblen(const char *mbstr)
1024 {
1025  return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
1026 }
1027 
1028 /* returns the display length of a multibyte character */
1029 int
1030 pg_dsplen(const char *mbstr)
1031 {
1032  return pg_wchar_table[DatabaseEncoding->encoding].dsplen((const unsigned char *) mbstr);
1033 }
1034 
1035 /* returns the length (counted in wchars) of a multibyte string */
1036 int
1037 pg_mbstrlen(const char *mbstr)
1038 {
1039  int len = 0;
1040 
1041  /* optimization for single byte encoding */
1043  return strlen(mbstr);
1044 
1045  while (*mbstr)
1046  {
1047  mbstr += pg_mblen(mbstr);
1048  len++;
1049  }
1050  return len;
1051 }
1052 
1053 /* returns the length (counted in wchars) of a multibyte string
1054  * (not necessarily NULL terminated)
1055  */
1056 int
1057 pg_mbstrlen_with_len(const char *mbstr, int limit)
1058 {
1059  int len = 0;
1060 
1061  /* optimization for single byte encoding */
1063  return limit;
1064 
1065  while (limit > 0 && *mbstr)
1066  {
1067  int l = pg_mblen(mbstr);
1068 
1069  limit -= l;
1070  mbstr += l;
1071  len++;
1072  }
1073  return len;
1074 }
1075 
1076 /*
1077  * Returns the byte length of the longest prefix of a multibyte string
1078  * (not necessarily NUL-terminated) that is no longer than limit bytes.
1079  * The prefix always ends on a character boundary: a multibyte character
1080  * is never split.
1081  */
1082 int
1083 pg_mbcliplen(const char *mbstr, int len, int limit)
1084 {
1086  len, limit);
1087 }
1088 
1089 /*
1090  * pg_mbcliplen with specified encoding
1091  */
1092 int
1093 pg_encoding_mbcliplen(int encoding, const char *mbstr,
1094  int len, int limit)
1095 {
1096  mblen_converter mblen_fn;
1097  int clen = 0;
1098  int l;
1099 
1100  /* optimization for single byte encoding */
1101  if (pg_encoding_max_length(encoding) == 1)
1102  return cliplen(mbstr, len, limit);
1103 
1104  mblen_fn = pg_wchar_table[encoding].mblen;
1105 
1106  while (len > 0 && *mbstr)
1107  {
1108  l = (*mblen_fn) ((const unsigned char *) mbstr);
1109  if ((clen + l) > limit)
1110  break;
1111  clen += l;
1112  if (clen == limit)
1113  break;
1114  len -= l;
1115  mbstr += l;
1116  }
1117  return clen;
1118 }
1119 
/*
 * Similar to pg_mbcliplen except the limit parameter specifies the
 * character length, not the byte length.
 *
 * Returns the number of bytes occupied by at most "limit" leading
 * characters of mbstr (which is "len" bytes long and not necessarily
 * null-terminated).
 */
int
pg_mbcharcliplen(const char *mbstr, int len, int limit)
{
	int			clen = 0;
	int			nch = 0;
	int			l;

	/* optimization for single byte encoding */
	if (pg_database_encoding_max_length() == 1)
		return cliplen(mbstr, len, limit);

	while (len > 0 && *mbstr)
	{
		l = pg_mblen(mbstr);

		/* don't count a truncated trailing character past the input end */
		if (l > len)
			break;
		nch++;
		if (nch > limit)
			break;
		clen += l;
		len -= l;
		mbstr += l;
	}
	return clen;
}
1147 
/*
 * mbcliplen for any single-byte encoding
 *
 * Returns the number of bytes before the first zero byte in str, but never
 * more than len or limit.  A non-positive len or limit yields 0.
 */
static int
cliplen(const char *str, int len, int limit)
{
	int			l = 0;

	/* the scan is bounded by whichever of len and limit is smaller */
	if (limit < len)
		len = limit;
	while (l < len && str[l])
		l++;
	return l;
}
1159 
1160 void
1162 {
1164  elog(ERROR, "invalid database encoding: %d", encoding);
1165 
1168 }
1169 
1170 void
1172 {
1173  /* Some calls happen before we can elog()! */
1175 
1178 }
1179 
1180 #ifdef ENABLE_NLS
1181 /*
1182  * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
1183  * codeset. Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
1184  * fail for gettext-internal causes like out-of-memory.
1185  */
1186 static bool
1187 raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
1188 {
1189  bool elog_ok = (CurrentMemoryContext != NULL);
1190 
1192  return false;
1193 
1194  if (bind_textdomain_codeset(domainname,
1195  pg_enc2gettext_tbl[encoding]) != NULL)
1196  return true;
1197 
1198  if (elog_ok)
1199  elog(LOG, "bind_textdomain_codeset failed");
1200  else
1201  write_stderr("bind_textdomain_codeset failed");
1202 
1203  return false;
1204 }
1205 
1206 /*
1207  * Bind a gettext message domain to the codeset corresponding to the database
1208  * encoding. For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
1209  * Return the MessageEncoding implied by the new settings.
1210  *
1211  * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
1212  * When that matches the database encoding, we don't need to do anything. In
1213  * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
1214  * database encoding, except for the C locale. (On Windows, we also permit a
1215  * discrepancy under the UTF8 encoding.) For the C locale, explicitly bind
1216  * gettext to the right codeset.
1217  *
1218  * On Windows, gettext defaults to the Windows ANSI code page. This is a
1219  * convenient departure for software that passes the strings to Windows ANSI
1220  * APIs, but we don't do that. Compel gettext to use database encoding or,
1221  * failing that, the LC_CTYPE encoding as it would on other platforms.
1222  *
1223  * This function is called before elog() and palloc() are usable.
1224  */
1225 int
1226 pg_bind_textdomain_codeset(const char *domainname)
1227 {
1228  bool elog_ok = (CurrentMemoryContext != NULL);
1229  int encoding = GetDatabaseEncoding();
1230  int new_msgenc;
1231 
1232 #ifndef WIN32
1233  const char *ctype = setlocale(LC_CTYPE, NULL);
1234 
1235  if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
1236 #endif
1237  if (encoding != PG_SQL_ASCII &&
1238  raw_pg_bind_textdomain_codeset(domainname, encoding))
1239  return encoding;
1240 
1241  new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
1242  if (new_msgenc < 0)
1243  new_msgenc = PG_SQL_ASCII;
1244 
1245 #ifdef WIN32
1246  if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
1247  /* On failure, the old message encoding remains valid. */
1248  return GetMessageEncoding();
1249 #endif
1250 
1251  return new_msgenc;
1252 }
1253 #endif
1254 
1255 /*
1256  * The database encoding, also called the server encoding, represents the
1257  * encoding of data stored in text-like data types. Affected types include
1258  * cstring, text, varchar, name, xml, and json.
1259  */
1260 int
1262 {
1263  return DatabaseEncoding->encoding;
1264 }
1265 
1266 const char *
1268 {
1269  return DatabaseEncoding->name;
1270 }
1271 
1272 Datum
1274 {
1276 }
1277 
1278 Datum
1280 {
1282 }
1283 
1284 Datum
1286 {
1287  Name s = PG_GETARG_NAME(0);
1288 
1290 }
1291 
1292 Datum
1294 {
1296  const char *encoding_name = pg_encoding_to_char(encoding);
1297 
1298  return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
1299 }
1300 
1301 /*
1302  * gettext() returns messages in this encoding. This often matches the
1303  * database encoding, but it differs for SQL_ASCII databases, for processes
1304  * not attached to a database, and under a database encoding lacking iconv
1305  * support (MULE_INTERNAL).
1306  */
1307 int
1309 {
1310  return MessageEncoding->encoding;
1311 }
1312 
1313 
1314 /*
1315  * Generic character incrementer function.
1316  *
1317  * Not knowing anything about the properties of the encoding in use, we just
1318  * keep incrementing the last byte until we get a validly-encoded result,
1319  * or we run out of values to try. We don't bother to try incrementing
1320  * higher-order bytes, so there's no growth in runtime for wider characters.
1321  * (If we did try to do that, we'd need to consider the likelihood that 255
1322  * is not a valid final byte in the encoding.)
1323  */
1324 static bool
1325 pg_generic_charinc(unsigned char *charptr, int len)
1326 {
1327  unsigned char *lastbyte = charptr + len - 1;
1328  mbchar_verifier mbverify;
1329 
1330  /* We can just invoke the character verifier directly. */
1332 
1333  while (*lastbyte < (unsigned char) 255)
1334  {
1335  (*lastbyte)++;
1336  if ((*mbverify) (charptr, len) == len)
1337  return true;
1338  }
1339 
1340  return false;
1341 }
1342 
/*
 * UTF-8 character incrementer function.
 *
 * For a one-byte character less than 0x7F, we just increment the byte.
 *
 * For a multibyte character, every byte but the first must fall between 0x80
 * and 0xBF; and the first byte must be between 0xC0 and 0xF4.  We increment
 * the last byte that's not already at its maximum value.  If we can't find a
 * byte that's less than the maximum allowable value, we simply fail.  We also
 * need some special-case logic to skip regions used for surrogate pair
 * handling, as those should not occur in valid UTF-8.
 *
 * Note that we don't reset lower-order bytes back to their minimums, since
 * we can't afford to make an exhaustive search (see make_greater_string).
 * The one exception is the second byte when the first byte is bumped to 0xED
 * or 0xF4: those lead bytes allow a narrower second-byte range, so the
 * second byte is clamped to the new maximum to keep the result valid.
 *
 * charptr/length describe one character, modified in place.  Returns true
 * if the character was incremented, false if that was not possible.
 */
static bool
pg_utf8_increment(unsigned char *charptr, int length)
{
	unsigned char a;
	unsigned char limit;

	switch (length)
	{
		default:
			/* reject lengths 5 and 6 for now */
			return false;
		case 4:
			a = charptr[3];
			if (a < 0xBF)
			{
				charptr[3]++;
				break;
			}
			/* FALL THRU */
		case 3:
			a = charptr[2];
			if (a < 0xBF)
			{
				charptr[2]++;
				break;
			}
			/* FALL THRU */
		case 2:
			a = charptr[1];
			/* the second byte's upper bound depends on the lead byte */
			switch (*charptr)
			{
				case 0xED:
					limit = 0x9F;	/* above this lie the surrogates */
					break;
				case 0xF4:
					limit = 0x8F;	/* above this lies > U+10FFFF */
					break;
				default:
					limit = 0xBF;
					break;
			}
			if (a < limit)
			{
				charptr[1]++;
				break;
			}
			/* FALL THRU */
		case 1:
			a = *charptr;
			/* these are the last lead bytes of their length class */
			if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4)
				return false;
			charptr[0]++;

			/*
			 * We get here for a multibyte character only when the second
			 * byte was already at 0xBF.  If the new lead byte is 0xED or
			 * 0xF4, 0xBF is no longer a legal second byte, so clamp it.
			 */
			if (length > 1)
			{
				if (charptr[0] == 0xED && charptr[1] > 0x9F)
					charptr[1] = 0x9F;
				else if (charptr[0] == 0xF4 && charptr[1] > 0x8F)
					charptr[1] = 0x8F;
			}
			break;
	}

	return true;
}
1415 
1416 /*
1417  * EUC-JP character incrementer function.
1418  *
1419  * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
1420  * representing JIS X 0201 characters with the second byte ranging between
1421  * 0xa1 and 0xdf. We just increment the last byte if it's less than 0xdf,
1422  * and otherwise rewrite the whole sequence to 0xa1 0xa1.
1423  *
1424  * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
1425  * in which the last two bytes range between 0xa1 and 0xfe. The last byte
1426  * is incremented if possible, otherwise the second-to-last byte.
1427  *
1428  * If the sequence starts with a value other than the above and its MSB
1429  * is set, it must be a two-byte sequence representing JIS X 0208 characters
1430  * with both bytes ranging between 0xa1 and 0xfe. The last byte is
1431  * incremented if possible, otherwise the second-to-last byte.
1432  *
1433  * Otherwise, the sequence is a single-byte ASCII character. It is
1434  * incremented up to 0x7f.
1435  */
1436 static bool
1437 pg_eucjp_increment(unsigned char *charptr, int length)
1438 {
1439  unsigned char c1,
1440  c2;
1441  int i;
1442 
1443  c1 = *charptr;
1444 
1445  switch (c1)
1446  {
1447  case SS2: /* JIS X 0201 */
1448  if (length != 2)
1449  return false;
1450 
1451  c2 = charptr[1];
1452 
1453  if (c2 >= 0xdf)
1454  charptr[0] = charptr[1] = 0xa1;
1455  else if (c2 < 0xa1)
1456  charptr[1] = 0xa1;
1457  else
1458  charptr[1]++;
1459  break;
1460 
1461  case SS3: /* JIS X 0212 */
1462  if (length != 3)
1463  return false;
1464 
1465  for (i = 2; i > 0; i--)
1466  {
1467  c2 = charptr[i];
1468  if (c2 < 0xa1)
1469  {
1470  charptr[i] = 0xa1;
1471  return true;
1472  }
1473  else if (c2 < 0xfe)
1474  {
1475  charptr[i]++;
1476  return true;
1477  }
1478  }
1479 
1480  /* Out of 3-byte code region */
1481  return false;
1482 
1483  default:
1484  if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1485  {
1486  if (length != 2)
1487  return false;
1488 
1489  for (i = 1; i >= 0; i--)
1490  {
1491  c2 = charptr[i];
1492  if (c2 < 0xa1)
1493  {
1494  charptr[i] = 0xa1;
1495  return true;
1496  }
1497  else if (c2 < 0xfe)
1498  {
1499  charptr[i]++;
1500  return true;
1501  }
1502  }
1503 
1504  /* Out of 2 byte code region */
1505  return false;
1506  }
1507  else
1508  { /* ASCII, single byte */
1509  if (c1 > 0x7e)
1510  return false;
1511  (*charptr)++;
1512  }
1513  break;
1514  }
1515 
1516  return true;
1517 }
1518 
1519 /*
1520  * get the character incrementer for the encoding for the current database
1521  */
1524 {
1525  /*
1526  * Eventually it might be best to add a field to pg_wchar_table[], but for
1527  * now we just use a switch.
1528  */
1529  switch (GetDatabaseEncoding())
1530  {
1531  case PG_UTF8:
1532  return pg_utf8_increment;
1533 
1534  case PG_EUC_JP:
1535  return pg_eucjp_increment;
1536 
1537  default:
1538  return pg_generic_charinc;
1539  }
1540 }
1541 
1542 /*
1543  * fetch maximum length of the encoding for the current database
1544  */
1545 int
1547 {
1549 }
1550 
/*
 * Verify mbstr to make sure that it is validly encoded in the current
 * database encoding.  Otherwise same as pg_verify_mbstr().
 */
bool
pg_verifymbstr(const char *mbstr, int len, bool noError)
{
	return pg_verify_mbstr(GetDatabaseEncoding(), mbstr, len, noError);
}
1560 
1561 /*
1562  * Verify mbstr to make sure that it is validly encoded in the specified
1563  * encoding.
1564  */
1565 bool
1566 pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
1567 {
1568  int oklen;
1569 
1571 
1572  oklen = pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len);
1573  if (oklen != len)
1574  {
1575  if (noError)
1576  return false;
1577  report_invalid_encoding(encoding, mbstr + oklen, len - oklen);
1578  }
1579  return true;
1580 }
1581 
1582 /*
1583  * Verify mbstr to make sure that it is validly encoded in the specified
1584  * encoding.
1585  *
1586  * mbstr is not necessarily zero terminated; length of mbstr is
1587  * specified by len.
1588  *
1589  * If OK, return length of string in the encoding.
1590  * If a problem is found, return -1 when noError is
1591  * true; when noError is false, ereport() a descriptive message.
1592  *
1593  * Note: We cannot use the faster encoding-specific mbverifystr() function
1594  * here, because we need to count the number of characters in the string.
1595  */
1596 int
1597 pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
1598 {
1599  mbchar_verifier mbverifychar;
1600  int mb_len;
1601 
1603 
1604  /*
1605  * In single-byte encodings, we need only reject nulls (\0).
1606  */
1607  if (pg_encoding_max_length(encoding) <= 1)
1608  {
1609  const char *nullpos = memchr(mbstr, 0, len);
1610 
1611  if (nullpos == NULL)
1612  return len;
1613  if (noError)
1614  return -1;
1615  report_invalid_encoding(encoding, nullpos, 1);
1616  }
1617 
1618  /* fetch function pointer just once */
1619  mbverifychar = pg_wchar_table[encoding].mbverifychar;
1620 
1621  mb_len = 0;
1622 
1623  while (len > 0)
1624  {
1625  int l;
1626 
1627  /* fast path for ASCII-subset characters */
1628  if (!IS_HIGHBIT_SET(*mbstr))
1629  {
1630  if (*mbstr != '\0')
1631  {
1632  mb_len++;
1633  mbstr++;
1634  len--;
1635  continue;
1636  }
1637  if (noError)
1638  return -1;
1640  }
1641 
1642  l = (*mbverifychar) ((const unsigned char *) mbstr, len);
1643 
1644  if (l < 0)
1645  {
1646  if (noError)
1647  return -1;
1649  }
1650 
1651  mbstr += l;
1652  len -= l;
1653  mb_len++;
1654  }
1655  return mb_len;
1656 }
1657 
1658 /*
1659  * check_encoding_conversion_args: check arguments of a conversion function
1660  *
1661  * "expected" arguments can be either an encoding ID or -1 to indicate that
1662  * the caller will check whether it accepts the ID.
1663  *
1664  * Note: the errors here are not really user-facing, so elog instead of
1665  * ereport seems sufficient. Also, we trust that the "expected" encoding
1666  * arguments are valid encoding IDs, but we don't trust the actuals.
1667  */
1668 void
1670  int dest_encoding,
1671  int len,
1672  int expected_src_encoding,
1673  int expected_dest_encoding)
1674 {
1675  if (!PG_VALID_ENCODING(src_encoding))
1676  elog(ERROR, "invalid source encoding ID: %d", src_encoding);
1677  if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
1678  elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
1679  pg_enc2name_tbl[expected_src_encoding].name,
1680  pg_enc2name_tbl[src_encoding].name);
1681  if (!PG_VALID_ENCODING(dest_encoding))
1682  elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
1683  if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
1684  elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
1685  pg_enc2name_tbl[expected_dest_encoding].name,
1686  pg_enc2name_tbl[dest_encoding].name);
1687  if (len < 0)
1688  elog(ERROR, "encoding conversion length must not be negative");
1689 }
1690 
1691 /*
1692  * report_invalid_encoding: complain about invalid multibyte character
1693  *
1694  * note: len is remaining length of string, not length of character;
1695  * len must be greater than zero, as we always examine the first byte.
1696  */
1697 void
1698 report_invalid_encoding(int encoding, const char *mbstr, int len)
1699 {
1700  int l = pg_encoding_mblen(encoding, mbstr);
1701  char buf[8 * 5 + 1];
1702  char *p = buf;
1703  int j,
1704  jlimit;
1705 
1706  jlimit = Min(l, len);
1707  jlimit = Min(jlimit, 8); /* prevent buffer overrun */
1708 
1709  for (j = 0; j < jlimit; j++)
1710  {
1711  p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
1712  if (j < jlimit - 1)
1713  p += sprintf(p, " ");
1714  }
1715 
1716  ereport(ERROR,
1717  (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1718  errmsg("invalid byte sequence for encoding \"%s\": %s",
1720  buf)));
1721 }
1722 
1723 /*
1724  * report_untranslatable_char: complain about untranslatable character
1725  *
1726  * note: len is remaining length of string, not length of character;
1727  * len must be greater than zero, as we always examine the first byte.
1728  */
1729 void
1730 report_untranslatable_char(int src_encoding, int dest_encoding,
1731  const char *mbstr, int len)
1732 {
1733  int l = pg_encoding_mblen(src_encoding, mbstr);
1734  char buf[8 * 5 + 1];
1735  char *p = buf;
1736  int j,
1737  jlimit;
1738 
1739  jlimit = Min(l, len);
1740  jlimit = Min(jlimit, 8); /* prevent buffer overrun */
1741 
1742  for (j = 0; j < jlimit; j++)
1743  {
1744  p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
1745  if (j < jlimit - 1)
1746  p += sprintf(p, " ");
1747  }
1748 
1749  ereport(ERROR,
1750  (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
1751  errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
1752  buf,
1753  pg_enc2name_tbl[src_encoding].name,
1754  pg_enc2name_tbl[dest_encoding].name)));
1755 }
1756 
1757 
1758 #ifdef WIN32
1759 /*
1760  * Convert from MessageEncoding to a palloc'ed, null-terminated utf16
1761  * string. The character length is also passed to utf16len if not
1762  * null. Returns NULL iff failed. Before MessageEncoding initialization, "str"
1763  * should be ASCII-only; this will function as though MessageEncoding is UTF8.
1764  */
1765 WCHAR *
1766 pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
1767 {
1768  int msgenc = GetMessageEncoding();
1769  WCHAR *utf16;
1770  int dstlen;
1771  UINT codepage;
1772 
1773  if (msgenc == PG_SQL_ASCII)
1774  /* No conversion is possible, and SQL_ASCII is never utf16. */
1775  return NULL;
1776 
1777  codepage = pg_enc2name_tbl[msgenc].codepage;
1778 
1779  /*
1780  * Use MultiByteToWideChar directly if there is a corresponding codepage,
1781  * or double conversion through UTF8 if not. Double conversion is needed,
1782  * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
1783  */
1784  if (codepage != 0)
1785  {
1786  utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1787  dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
1788  utf16[dstlen] = (WCHAR) 0;
1789  }
1790  else
1791  {
1792  char *utf8;
1793 
1794  /*
1795  * XXX pg_do_encoding_conversion() requires a transaction. In the
1796  * absence of one, hope for the input to be valid UTF8.
1797  */
1798  if (IsTransactionState())
1799  {
1800  utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
1801  len,
1802  msgenc,
1803  PG_UTF8);
1804  if (utf8 != str)
1805  len = strlen(utf8);
1806  }
1807  else
1808  utf8 = (char *) str;
1809 
1810  utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1811  dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
1812  utf16[dstlen] = (WCHAR) 0;
1813 
1814  if (utf8 != str)
1815  pfree(utf8);
1816  }
1817 
1818  if (dstlen == 0 && len > 0)
1819  {
1820  pfree(utf16);
1821  return NULL; /* error */
1822  }
1823 
1824  if (utf16len)
1825  *utf16len = dstlen;
1826  return utf16;
1827 }
1828 
1829 #endif /* WIN32 */
#define write_stderr(str)
Definition: parallel.c:184
#define NameStr(name)
Definition: c.h:733
#define unconstify(underlying_type, expr)
Definition: c.h:1232
#define Min(x, y)
Definition: c.h:991
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1142
signed int int32
Definition: c.h:481
#define VARHDRSZ
Definition: c.h:679
#define OidIsValid(objectId)
Definition: c.h:762
size_t Size
Definition: c.h:592
int errdetail(const char *fmt,...)
Definition: elog.c:1205
int errcode(int sqlerrcode)
Definition: elog.c:859
int errmsg(const char *fmt,...)
Definition: elog.c:1072
#define LOG
Definition: elog.h:31
#define FATAL
Definition: elog.h:41
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:224
#define ereport(elevel,...)
Definition: elog.h:149
const char * pg_enc2gettext_tbl[]
Definition: encnames.c:360
const pg_enc2name pg_enc2name_tbl[]
Definition: encnames.c:308
void fmgr_info_cxt(Oid functionId, FmgrInfo *finfo, MemoryContext mcxt)
Definition: fmgr.c:137
#define PG_FREE_IF_COPY(ptr, n)
Definition: fmgr.h:260
#define PG_GETARG_BYTEA_PP(n)
Definition: fmgr.h:308
#define PG_RETURN_BYTEA_P(x)
Definition: fmgr.h:371
#define DirectFunctionCall1(func, arg1)
Definition: fmgr.h:642
#define PG_GETARG_DATUM(n)
Definition: fmgr.h:268
#define PG_RETURN_NULL()
Definition: fmgr.h:345
#define PG_GETARG_NAME(n)
Definition: fmgr.h:278
#define OidFunctionCall6(functionId, arg1, arg2, arg3, arg4, arg5, arg6)
Definition: fmgr.h:690
#define PG_RETURN_INT32(x)
Definition: fmgr.h:354
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
#define PG_RETURN_DATUM(x)
Definition: fmgr.h:353
#define DirectFunctionCall3(func, arg1, arg2, arg3)
Definition: fmgr.h:646
#define FunctionCall6(flinfo, arg1, arg2, arg3, arg4, arg5, arg6)
Definition: fmgr.h:670
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
int a
Definition: isn.c:69
int j
Definition: isn.c:74
int i
Definition: isn.c:73
Assert(fmt[strlen(fmt) - 1] !='\n')
List * lcons(void *datum, List *list)
Definition: list.c:495
unsigned int pg_wchar
Definition: mbprint.c:31
const char * pg_get_client_encoding_name(void)
Definition: mbutils.c:345
unsigned char * pg_do_encoding_conversion(unsigned char *src, int len, int src_encoding, int dest_encoding)
Definition: mbutils.c:356
bool pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s)
Definition: mbutils.c:926
char * pg_any_to_server(const char *s, int len, int encoding)
Definition: mbutils.c:676
int GetDatabaseEncoding(void)
Definition: mbutils.c:1261
Datum pg_convert_to(PG_FUNCTION_ARGS)
Definition: mbutils.c:501
struct ConvProcInfo ConvProcInfo
int pg_encoding_wchar2mb_with_len(int encoding, const pg_wchar *from, char *to, int len)
Definition: mbutils.c:1015
static bool pg_generic_charinc(unsigned char *charptr, int len)
Definition: mbutils.c:1325
static const pg_enc2name * ClientEncoding
Definition: mbutils.c:80
static FmgrInfo * ToServerConvProc
Definition: mbutils.c:67
static FmgrInfo * ToClientConvProc
Definition: mbutils.c:68
int pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
Definition: mbutils.c:1597
void InitializeClientEncoding(void)
Definition: mbutils.c:281
int pg_dsplen(const char *mbstr)
Definition: mbutils.c:1030
int pg_mbstrlen_with_len(const char *mbstr, int limit)
Definition: mbutils.c:1057
mbcharacter_incrementer pg_database_encoding_character_incrementer(void)
Definition: mbutils.c:1523
char * pg_client_to_server(const char *s, int len)
Definition: mbutils.c:660
int pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
Definition: mbutils.c:1008
static FmgrInfo * Utf8ToServerConvProc
Definition: mbutils.c:75
static List * ConvProcList
Definition: mbutils.c:61
int pg_mb2wchar(const char *from, pg_wchar *to)
Definition: mbutils.c:979
int pg_mbcharcliplen(const char *mbstr, int len, int limit)
Definition: mbutils.c:1125
Datum PG_char_to_encoding(PG_FUNCTION_ARGS)
Definition: mbutils.c:1285
static const pg_enc2name * MessageEncoding
Definition: mbutils.c:82
void report_untranslatable_char(int src_encoding, int dest_encoding, const char *mbstr, int len)
Definition: mbutils.c:1730
int pg_wchar2mb(const pg_wchar *from, char *to)
Definition: mbutils.c:1001
int pg_mbstrlen(const char *mbstr)
Definition: mbutils.c:1037
bool pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
Definition: mbutils.c:1566
static char * perform_default_encoding_conversion(const char *src, int len, bool is_client_to_server)
Definition: mbutils.c:783
bool pg_verifymbstr(const char *mbstr, int len, bool noError)
Definition: mbutils.c:1556
int pg_mbcliplen(const char *mbstr, int len, int limit)
Definition: mbutils.c:1083
int GetMessageEncoding(void)
Definition: mbutils.c:1308
Datum pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
Definition: mbutils.c:644
Datum getdatabaseencoding(PG_FUNCTION_ARGS)
Definition: mbutils.c:1273
int pg_do_encoding_conversion_buf(Oid proc, int src_encoding, int dest_encoding, unsigned char *src, int srclen, unsigned char *dest, int destlen, bool noError)
Definition: mbutils.c:469
void report_invalid_encoding(int encoding, const char *mbstr, int len)
Definition: mbutils.c:1698
int SetClientEncoding(int encoding)
Definition: mbutils.c:208
void SetMessageEncoding(int encoding)
Definition: mbutils.c:1171
void pg_unicode_to_server(pg_wchar c, unsigned char *s)
Definition: mbutils.c:864
Datum pg_convert(PG_FUNCTION_ARGS)
Definition: mbutils.c:553
void check_encoding_conversion_args(int src_encoding, int dest_encoding, int len, int expected_src_encoding, int expected_dest_encoding)
Definition: mbutils.c:1669
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1546
int PrepareClientEncoding(int encoding)
Definition: mbutils.c:110
const char * GetDatabaseEncodingName(void)
Definition: mbutils.c:1267
static bool backend_startup_complete
Definition: mbutils.c:90
char * pg_server_to_client(const char *s, int len)
Definition: mbutils.c:738
Datum pg_convert_from(PG_FUNCTION_ARGS)
Definition: mbutils.c:526
int pg_get_client_encoding(void)
Definition: mbutils.c:336
static bool pg_utf8_increment(unsigned char *charptr, int length)
Definition: mbutils.c:1359
Datum length_in_encoding(PG_FUNCTION_ARGS)
Definition: mbutils.c:615
static int cliplen(const char *str, int len, int limit)
Definition: mbutils.c:1150
static int pending_client_encoding
Definition: mbutils.c:91
void SetDatabaseEncoding(int encoding)
Definition: mbutils.c:1161
int pg_encoding_mbcliplen(int encoding, const char *mbstr, int len, int limit)
Definition: mbutils.c:1093
Datum pg_client_encoding(PG_FUNCTION_ARGS)
Definition: mbutils.c:1279
Datum PG_encoding_to_char(PG_FUNCTION_ARGS)
Definition: mbutils.c:1293
int pg_encoding_mb2wchar_with_len(int encoding, const char *from, pg_wchar *to, int len)
Definition: mbutils.c:993
char * pg_server_to_any(const char *s, int len, int encoding)
Definition: mbutils.c:749
static bool pg_eucjp_increment(unsigned char *charptr, int length)
Definition: mbutils.c:1437
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
Definition: mbutils.c:986
static const pg_enc2name * DatabaseEncoding
Definition: mbutils.c:81
int pg_mblen(const char *mbstr)
Definition: mbutils.c:1023
void pfree(void *pointer)
Definition: mcxt.c:1508
MemoryContext TopMemoryContext
Definition: mcxt.c:137
MemoryContext CurrentMemoryContext
Definition: mcxt.c:131
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1528
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1168
void * MemoryContextAllocHuge(MemoryContext context, Size size)
Definition: mcxt.c:1627
void * palloc(Size size)
Definition: mcxt.c:1304
#define MaxAllocHugeSize
Definition: memutils.h:45
#define MaxAllocSize
Definition: memutils.h:40
Datum namein(PG_FUNCTION_ARGS)
Definition: name.c:48
Oid FindDefaultConversionProc(int32 for_encoding, int32 to_encoding)
Definition: namespace.c:4065
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:124
const void size_t len
int32 encoding
Definition: pg_database.h:41
#define lfirst(lc)
Definition: pg_list.h:172
#define NIL
Definition: pg_list.h:68
#define foreach_delete_current(lst, var_or_cell)
Definition: pg_list.h:391
static char * buf
Definition: pg_test_fsync.c:73
#define MAX_MULTIBYTE_CHAR_LEN
Definition: pg_wchar.h:33
#define MAX_CONVERSION_GROWTH
Definition: pg_wchar.h:302
#define pg_utf_mblen
Definition: pg_wchar.h:572
@ PG_SQL_ASCII
Definition: pg_wchar.h:226
@ PG_EUC_JP
Definition: pg_wchar.h:227
@ PG_UTF8
Definition: pg_wchar.h:232
#define SS2
Definition: pg_wchar.h:38
bool(* mbcharacter_incrementer)(unsigned char *mbstr, int len)
Definition: pg_wchar.h:370
int(* mbchar_verifier)(const unsigned char *mbstr, int len)
Definition: pg_wchar.h:372
#define PG_VALID_ENCODING(_enc)
Definition: pg_wchar.h:287
#define PG_VALID_FE_ENCODING(_enc)
Definition: pg_wchar.h:291
static bool is_valid_unicode_codepoint(pg_wchar c)
Definition: pg_wchar.h:519
#define PG_VALID_BE_ENCODING(_enc)
Definition: pg_wchar.h:281
#define pg_encoding_to_char
Definition: pg_wchar.h:569
#define pg_char_to_encoding
Definition: pg_wchar.h:568
#define SS3
Definition: pg_wchar.h:39
int(* mblen_converter)(const unsigned char *mbstr)
Definition: pg_wchar.h:366
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
#define sprintf
Definition: port.h:240
int pg_get_encoding_from_locale(const char *ctype, bool write_message)
Definition: chklocale.c:428
uintptr_t Datum
Definition: postgres.h:64
static Datum BoolGetDatum(bool X)
Definition: postgres.h:102
static Datum CStringGetDatum(const char *X)
Definition: postgres.h:350
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:212
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:202
unsigned int Oid
Definition: postgres_ext.h:31
char * c
int s_encoding
Definition: mbutils.c:55
FmgrInfo to_client_info
Definition: mbutils.c:58
int c_encoding
Definition: mbutils.c:56
FmgrInfo to_server_info
Definition: mbutils.c:57
Definition: fmgr.h:57
Definition: pg_list.h:54
Definition: c.h:728
pg_enc encoding
Definition: pg_wchar.h:342
const char * name
Definition: pg_wchar.h:341
mbstr_verifier mbverifystr
Definition: pg_wchar.h:385
wchar2mb_with_len_converter wchar2mb_with_len
Definition: pg_wchar.h:380
mb2wchar_with_len_converter mb2wchar_with_len
Definition: pg_wchar.h:378
mblen_converter mblen
Definition: pg_wchar.h:382
mbdisplaylen_converter dsplen
Definition: pg_wchar.h:383
int maxmblen
Definition: pg_wchar.h:386
mbchar_verifier mbverifychar
Definition: pg_wchar.h:384
Definition: c.h:674
#define VARDATA(PTR)
Definition: varatt.h:278
#define VARDATA_ANY(PTR)
Definition: varatt.h:324
#define SET_VARSIZE(PTR, len)
Definition: varatt.h:305
#define VARSIZE_ANY_EXHDR(PTR)
Definition: varatt.h:317
const char * name
unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
Definition: wchar.c:484
const pg_wchar_tbl pg_wchar_table[]
Definition: wchar.c:2076
int pg_encoding_max_length(int encoding)
Definition: wchar.c:2188
int pg_encoding_mblen(int encoding, const char *mbstr)
Definition: wchar.c:2130
#define setlocale(a, b)
Definition: win32_port.h:467
size_t pg_wchar_strlen(const pg_wchar *str)
Definition: wstrncmp.c:70
bool IsTransactionState(void)
Definition: xact.c:379