PostgreSQL Source Code  git master
mbutils.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * mbutils.c
4  * This file contains functions for encoding conversion.
5  *
6  * The string-conversion functions in this file share some API quirks.
7  * Note the following:
8  *
9  * The functions return a palloc'd, null-terminated string if conversion
10  * is required. However, if no conversion is performed, the given source
11  * string pointer is returned as-is.
12  *
13  * Although the presence of a length argument means that callers can pass
14  * non-null-terminated strings, care is required because the same string
15  * will be passed back if no conversion occurs. Such callers *must* check
16  * whether result == src and handle that case differently.
17  *
18  * If the source and destination encodings are the same, the source string
19  * is returned without any verification; it's assumed to be valid data.
20  * If that might not be the case, the caller is responsible for validating
21  * the string using a separate call to pg_verify_mbstr(). Whenever the
22  * source and destination encodings are different, the functions ensure that
23  * the result is validly encoded according to the destination encoding.
24  *
25  *
26  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
27  * Portions Copyright (c) 1994, Regents of the University of California
28  *
29  *
30  * IDENTIFICATION
31  * src/backend/utils/mb/mbutils.c
32  *
33  *-------------------------------------------------------------------------
34  */
35 #include "postgres.h"
36 
37 #include "access/xact.h"
38 #include "catalog/namespace.h"
39 #include "mb/pg_wchar.h"
40 #include "utils/builtins.h"
41 #include "utils/memutils.h"
42 #include "utils/syscache.h"
43 
44 /*
45  * We maintain a simple linked list caching the fmgr lookup info for the
46  * currently selected conversion functions, as well as any that have been
47  * selected previously in the current session. (We remember previous
48  * settings because we must be able to restore a previous setting during
49  * transaction rollback, without doing any fresh catalog accesses.)
50  *
51  * Since we'll never release this data, we just keep it in TopMemoryContext.
52  */
53 typedef struct ConvProcInfo
54 {
55  int s_encoding; /* server and client encoding IDs */
57  FmgrInfo to_server_info; /* lookup info for conversion procs */
59 } ConvProcInfo;
60 
61 static List *ConvProcList = NIL; /* List of ConvProcInfo */
62 
63 /*
64  * These variables point to the currently active conversion functions,
65  * or are NULL when no conversion is needed.
66  */
67 static FmgrInfo *ToServerConvProc = NULL;
68 static FmgrInfo *ToClientConvProc = NULL;
69 
70 /*
71  * These variables track the currently-selected encodings.
72  */
76 
77 /*
78  * During backend startup we can't set client encoding because we (a)
79  * can't look up the conversion functions, and (b) may not know the database
80  * encoding yet either. So SetClientEncoding() just accepts anything and
81  * remembers it for InitializeClientEncoding() to apply later.
82  */
83 static bool backend_startup_complete = false;
85 
86 
87 /* Internal functions */
88 static char *perform_default_encoding_conversion(const char *src,
89  int len, bool is_client_to_server);
90 static int cliplen(const char *str, int len, int limit);
91 
92 
93 /*
94  * Prepare for a future call to SetClientEncoding. Success should mean
95  * that SetClientEncoding is guaranteed to succeed for this encoding request.
96  *
97  * (But note that success before backend_startup_complete does not guarantee
98  * success after ...)
99  *
100  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
101  */
102 int
104 {
105  int current_server_encoding;
106  ListCell *lc;
107 
108  if (!PG_VALID_FE_ENCODING(encoding))
109  return -1;
110 
111  /* Can't do anything during startup, per notes above */
113  return 0;
114 
115  current_server_encoding = GetDatabaseEncoding();
116 
117  /*
118  * Check for cases that require no conversion function.
119  */
120  if (current_server_encoding == encoding ||
121  current_server_encoding == PG_SQL_ASCII ||
122  encoding == PG_SQL_ASCII)
123  return 0;
124 
125  if (IsTransactionState())
126  {
127  /*
128  * If we're in a live transaction, it's safe to access the catalogs,
129  * so look up the functions. We repeat the lookup even if the info is
130  * already cached, so that we can react to changes in the contents of
131  * pg_conversion.
132  */
133  Oid to_server_proc,
134  to_client_proc;
135  ConvProcInfo *convinfo;
136  MemoryContext oldcontext;
137 
138  to_server_proc = FindDefaultConversionProc(encoding,
139  current_server_encoding);
140  if (!OidIsValid(to_server_proc))
141  return -1;
142  to_client_proc = FindDefaultConversionProc(current_server_encoding,
143  encoding);
144  if (!OidIsValid(to_client_proc))
145  return -1;
146 
147  /*
148  * Load the fmgr info into TopMemoryContext (could still fail here)
149  */
151  sizeof(ConvProcInfo));
152  convinfo->s_encoding = current_server_encoding;
153  convinfo->c_encoding = encoding;
154  fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
156  fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
158 
159  /* Attach new info to head of list */
161  ConvProcList = lcons(convinfo, ConvProcList);
162  MemoryContextSwitchTo(oldcontext);
163 
164  /*
165  * We cannot yet remove any older entry for the same encoding pair,
166  * since it could still be in use. SetClientEncoding will clean up.
167  */
168 
169  return 0; /* success */
170  }
171  else
172  {
173  /*
174  * If we're not in a live transaction, the only thing we can do is
175  * restore a previous setting using the cache. This covers all
176  * transaction-rollback cases. The only case it might not work for is
177  * trying to change client_encoding on the fly by editing
178  * postgresql.conf and SIGHUP'ing. Which would probably be a stupid
179  * thing to do anyway.
180  */
181  foreach(lc, ConvProcList)
182  {
183  ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
184 
185  if (oldinfo->s_encoding == current_server_encoding &&
186  oldinfo->c_encoding == encoding)
187  return 0;
188  }
189 
190  return -1; /* it's not cached, so fail */
191  }
192 }
193 
194 /*
195  * Set the active client encoding and set up the conversion-function pointers.
196  * PrepareClientEncoding should have been called previously for this encoding.
197  *
198  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
199  */
200 int
202 {
203  int current_server_encoding;
204  bool found;
205  ListCell *lc;
206 
207  if (!PG_VALID_FE_ENCODING(encoding))
208  return -1;
209 
210  /* Can't do anything during startup, per notes above */
212  {
214  return 0;
215  }
216 
217  current_server_encoding = GetDatabaseEncoding();
218 
219  /*
220  * Check for cases that require no conversion function.
221  */
222  if (current_server_encoding == encoding ||
223  current_server_encoding == PG_SQL_ASCII ||
224  encoding == PG_SQL_ASCII)
225  {
226  ClientEncoding = &pg_enc2name_tbl[encoding];
227  ToServerConvProc = NULL;
228  ToClientConvProc = NULL;
229  return 0;
230  }
231 
232  /*
233  * Search the cache for the entry previously prepared by
234  * PrepareClientEncoding; if there isn't one, we lose. While at it,
235  * release any duplicate entries so that repeated Prepare/Set cycles don't
236  * leak memory.
237  */
238  found = false;
239  foreach(lc, ConvProcList)
240  {
241  ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
242 
243  if (convinfo->s_encoding == current_server_encoding &&
244  convinfo->c_encoding == encoding)
245  {
246  if (!found)
247  {
248  /* Found newest entry, so set up */
249  ClientEncoding = &pg_enc2name_tbl[encoding];
250  ToServerConvProc = &convinfo->to_server_info;
251  ToClientConvProc = &convinfo->to_client_info;
252  found = true;
253  }
254  else
255  {
256  /* Duplicate entry, release it */
257  ConvProcList = foreach_delete_current(ConvProcList, lc);
258  pfree(convinfo);
259  }
260  }
261  }
262 
263  if (found)
264  return 0; /* success */
265  else
266  return -1; /* it's not cached, so fail */
267 }
268 
269 /*
270  * Initialize client encoding conversions.
271  * Called from InitPostgres() once during backend startup.
272  */
273 void
275 {
278 
281  {
282  /*
283  * Oops, the requested conversion is not available. We couldn't fail
284  * before, but we can now.
285  */
286  ereport(FATAL,
287  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
288  errmsg("conversion between %s and %s is not supported",
291  }
292 }
293 
294 /*
295  * returns the current client encoding
296  */
297 int
299 {
300  return ClientEncoding->encoding;
301 }
302 
303 /*
304  * returns the current client encoding name
305  */
306 const char *
308 {
309  return ClientEncoding->name;
310 }
311 
312 /*
313  * Convert src string to another encoding (general case).
314  *
315  * See the notes about string conversion functions at the top of this file.
316  */
317 unsigned char *
318 pg_do_encoding_conversion(unsigned char *src, int len,
319  int src_encoding, int dest_encoding)
320 {
321  unsigned char *result;
322  Oid proc;
323 
324  if (len <= 0)
325  return src; /* empty string is always valid */
326 
327  if (src_encoding == dest_encoding)
328  return src; /* no conversion required, assume valid */
329 
330  if (dest_encoding == PG_SQL_ASCII)
331  return src; /* any string is valid in SQL_ASCII */
332 
333  if (src_encoding == PG_SQL_ASCII)
334  {
335  /* No conversion is possible, but we must validate the result */
336  (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
337  return src;
338  }
339 
340  if (!IsTransactionState()) /* shouldn't happen */
341  elog(ERROR, "cannot perform encoding conversion outside a transaction");
342 
343  proc = FindDefaultConversionProc(src_encoding, dest_encoding);
344  if (!OidIsValid(proc))
345  ereport(ERROR,
346  (errcode(ERRCODE_UNDEFINED_FUNCTION),
347  errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
348  pg_encoding_to_char(src_encoding),
349  pg_encoding_to_char(dest_encoding))));
350 
351  /*
352  * Allocate space for conversion result, being wary of integer overflow
353  */
354  if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
355  ereport(ERROR,
356  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
357  errmsg("out of memory"),
358  errdetail("String of %d bytes is too long for encoding conversion.",
359  len)));
360 
361  result = palloc(len * MAX_CONVERSION_GROWTH + 1);
362 
363  OidFunctionCall5(proc,
364  Int32GetDatum(src_encoding),
365  Int32GetDatum(dest_encoding),
366  CStringGetDatum(src),
367  CStringGetDatum(result),
368  Int32GetDatum(len));
369  return result;
370 }
371 
372 /*
373  * Convert string to encoding encoding_name. The source
374  * encoding is the DB encoding.
375  *
376  * BYTEA convert_to(TEXT string, NAME encoding_name) */
377 Datum
379 {
380  Datum string = PG_GETARG_DATUM(0);
381  Datum dest_encoding_name = PG_GETARG_DATUM(1);
382  Datum src_encoding_name = DirectFunctionCall1(namein,
383  CStringGetDatum(DatabaseEncoding->name));
384  Datum result;
385 
386  /*
387  * pg_convert expects a bytea as its first argument. We're passing it a
388  * text argument here, relying on the fact that they are both in fact
389  * varlena types, and thus structurally identical.
390  */
391  result = DirectFunctionCall3(pg_convert, string,
392  src_encoding_name, dest_encoding_name);
393 
394  PG_RETURN_DATUM(result);
395 }
396 
397 /*
398  * Convert string from encoding encoding_name. The destination
399  * encoding is the DB encoding.
400  *
401  * TEXT convert_from(BYTEA string, NAME encoding_name) */
402 Datum
404 {
405  Datum string = PG_GETARG_DATUM(0);
406  Datum src_encoding_name = PG_GETARG_DATUM(1);
407  Datum dest_encoding_name = DirectFunctionCall1(namein,
408  CStringGetDatum(DatabaseEncoding->name));
409  Datum result;
410 
411  result = DirectFunctionCall3(pg_convert, string,
412  src_encoding_name, dest_encoding_name);
413 
414  /*
415  * pg_convert returns a bytea, which we in turn return as text, relying on
416  * the fact that they are both in fact varlena types, and thus
417  * structurally identical. Although not all bytea values are valid text,
418  * in this case it will be because we've told pg_convert to return one
419  * that is valid as text in the current database encoding.
420  */
421  PG_RETURN_DATUM(result);
422 }
423 
424 /*
425  * Convert string between two arbitrary encodings.
426  *
427  * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
428  */
429 Datum
431 {
432  bytea *string = PG_GETARG_BYTEA_PP(0);
433  char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
434  int src_encoding = pg_char_to_encoding(src_encoding_name);
435  char *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
436  int dest_encoding = pg_char_to_encoding(dest_encoding_name);
437  const char *src_str;
438  char *dest_str;
439  bytea *retval;
440  int len;
441 
442  if (src_encoding < 0)
443  ereport(ERROR,
444  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
445  errmsg("invalid source encoding name \"%s\"",
446  src_encoding_name)));
447  if (dest_encoding < 0)
448  ereport(ERROR,
449  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
450  errmsg("invalid destination encoding name \"%s\"",
451  dest_encoding_name)));
452 
453  /* make sure that source string is valid */
454  len = VARSIZE_ANY_EXHDR(string);
455  src_str = VARDATA_ANY(string);
456  pg_verify_mbstr_len(src_encoding, src_str, len, false);
457 
458  /* perform conversion */
459  dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str),
460  len,
461  src_encoding,
462  dest_encoding);
463 
464  /* update len if conversion actually happened */
465  if (dest_str != src_str)
466  len = strlen(dest_str);
467 
468  /*
469  * build bytea data type structure.
470  */
471  retval = (bytea *) palloc(len + VARHDRSZ);
472  SET_VARSIZE(retval, len + VARHDRSZ);
473  memcpy(VARDATA(retval), dest_str, len);
474 
475  if (dest_str != src_str)
476  pfree(dest_str);
477 
478  /* free memory if allocated by the toaster */
479  PG_FREE_IF_COPY(string, 0);
480 
481  PG_RETURN_BYTEA_P(retval);
482 }
483 
484 /*
485  * get the length of the string considered as text in the specified
486  * encoding. Raises an error if the data is not valid in that
487  * encoding.
488  *
489  * INT4 length (BYTEA string, NAME src_encoding_name)
490  */
491 Datum
493 {
494  bytea *string = PG_GETARG_BYTEA_PP(0);
495  char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
496  int src_encoding = pg_char_to_encoding(src_encoding_name);
497  const char *src_str;
498  int len;
499  int retval;
500 
501  if (src_encoding < 0)
502  ereport(ERROR,
503  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
504  errmsg("invalid encoding name \"%s\"",
505  src_encoding_name)));
506 
507  len = VARSIZE_ANY_EXHDR(string);
508  src_str = VARDATA_ANY(string);
509 
510  retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
511 
512  PG_RETURN_INT32(retval);
513 }
514 
515 /*
516  * Get maximum multibyte character length in the specified encoding.
517  *
518  * Note encoding is specified numerically, not by name as above.
519  */
520 Datum
522 {
523  int encoding = PG_GETARG_INT32(0);
524 
525  if (PG_VALID_ENCODING(encoding))
526  PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
527  else
528  PG_RETURN_NULL();
529 }
530 
531 /*
532  * Convert client encoding to server encoding.
533  *
534  * See the notes about string conversion functions at the top of this file.
535  */
536 char *
537 pg_client_to_server(const char *s, int len)
538 {
539  return pg_any_to_server(s, len, ClientEncoding->encoding);
540 }
541 
542 /*
543  * Convert any encoding to server encoding.
544  *
545  * See the notes about string conversion functions at the top of this file.
546  *
547  * Unlike the other string conversion functions, this will apply validation
548  * even if encoding == DatabaseEncoding->encoding. This is because this is
549  * used to process data coming in from outside the database, and we never
550  * want to just assume validity.
551  */
552 char *
553 pg_any_to_server(const char *s, int len, int encoding)
554 {
555  if (len <= 0)
556  return unconstify(char *, s); /* empty string is always valid */
557 
558  if (encoding == DatabaseEncoding->encoding ||
559  encoding == PG_SQL_ASCII)
560  {
561  /*
562  * No conversion is needed, but we must still validate the data.
563  */
564  (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
565  return unconstify(char *, s);
566  }
567 
568  if (DatabaseEncoding->encoding == PG_SQL_ASCII)
569  {
570  /*
571  * No conversion is possible, but we must still validate the data,
572  * because the client-side code might have done string escaping using
573  * the selected client_encoding. If the client encoding is ASCII-safe
574  * then we just do a straight validation under that encoding. For an
575  * ASCII-unsafe encoding we have a problem: we dare not pass such data
576  * to the parser but we have no way to convert it. We compromise by
577  * rejecting the data if it contains any non-ASCII characters.
578  */
579  if (PG_VALID_BE_ENCODING(encoding))
580  (void) pg_verify_mbstr(encoding, s, len, false);
581  else
582  {
583  int i;
584 
585  for (i = 0; i < len; i++)
586  {
587  if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
588  ereport(ERROR,
589  (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
590  errmsg("invalid byte value for encoding \"%s\": 0x%02x",
592  (unsigned char) s[i])));
593  }
594  }
595  return unconstify(char *, s);
596  }
597 
598  /* Fast path if we can use cached conversion function */
599  if (encoding == ClientEncoding->encoding)
600  return perform_default_encoding_conversion(s, len, true);
601 
602  /* General case ... will not work outside transactions */
603  return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
604  len,
605  encoding,
606  DatabaseEncoding->encoding);
607 }
608 
609 /*
610  * Convert server encoding to client encoding.
611  *
612  * See the notes about string conversion functions at the top of this file.
613  */
614 char *
615 pg_server_to_client(const char *s, int len)
616 {
617  return pg_server_to_any(s, len, ClientEncoding->encoding);
618 }
619 
620 /*
621  * Convert server encoding to any encoding.
622  *
623  * See the notes about string conversion functions at the top of this file.
624  */
625 char *
626 pg_server_to_any(const char *s, int len, int encoding)
627 {
628  if (len <= 0)
629  return unconstify(char *, s); /* empty string is always valid */
630 
631  if (encoding == DatabaseEncoding->encoding ||
632  encoding == PG_SQL_ASCII)
633  return unconstify(char *, s); /* assume data is valid */
634 
635  if (DatabaseEncoding->encoding == PG_SQL_ASCII)
636  {
637  /* No conversion is possible, but we must validate the result */
638  (void) pg_verify_mbstr(encoding, s, len, false);
639  return unconstify(char *, s);
640  }
641 
642  /* Fast path if we can use cached conversion function */
643  if (encoding == ClientEncoding->encoding)
644  return perform_default_encoding_conversion(s, len, false);
645 
646  /* General case ... will not work outside transactions */
647  return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
648  len,
649  DatabaseEncoding->encoding,
650  encoding);
651 }
652 
653 /*
654  * Perform default encoding conversion using cached FmgrInfo. Since
655  * this function does not access database at all, it is safe to call
656  * outside transactions. If the conversion has not been set up by
657  * SetClientEncoding(), no conversion is performed.
658  */
659 static char *
660 perform_default_encoding_conversion(const char *src, int len,
661  bool is_client_to_server)
662 {
663  char *result;
664  int src_encoding,
665  dest_encoding;
666  FmgrInfo *flinfo;
667 
668  if (is_client_to_server)
669  {
670  src_encoding = ClientEncoding->encoding;
671  dest_encoding = DatabaseEncoding->encoding;
672  flinfo = ToServerConvProc;
673  }
674  else
675  {
676  src_encoding = DatabaseEncoding->encoding;
677  dest_encoding = ClientEncoding->encoding;
678  flinfo = ToClientConvProc;
679  }
680 
681  if (flinfo == NULL)
682  return unconstify(char *, src);
683 
684  /*
685  * Allocate space for conversion result, being wary of integer overflow
686  */
687  if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
688  ereport(ERROR,
689  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
690  errmsg("out of memory"),
691  errdetail("String of %d bytes is too long for encoding conversion.",
692  len)));
693 
694  result = palloc(len * MAX_CONVERSION_GROWTH + 1);
695 
696  FunctionCall5(flinfo,
697  Int32GetDatum(src_encoding),
698  Int32GetDatum(dest_encoding),
699  CStringGetDatum(src),
700  CStringGetDatum(result),
701  Int32GetDatum(len));
702  return result;
703 }
704 
705 
706 /* convert a multibyte string to a wchar */
707 int
708 pg_mb2wchar(const char *from, pg_wchar *to)
709 {
710  return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, strlen(from));
711 }
712 
713 /* convert a multibyte string to a wchar with a limited length */
714 int
715 pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
716 {
717  return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
718 }
719 
720 /* same, with any encoding */
721 int
723  const char *from, pg_wchar *to, int len)
724 {
725  return pg_wchar_table[encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
726 }
727 
728 /* convert a wchar string to a multibyte */
729 int
730 pg_wchar2mb(const pg_wchar *from, char *to)
731 {
732  return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, pg_wchar_strlen(from));
733 }
734 
735 /* convert a wchar string to a multibyte with a limited length */
736 int
737 pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
738 {
739  return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
740 }
741 
742 /* same, with any encoding */
743 int
745  const pg_wchar *from, char *to, int len)
746 {
747  return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
748 }
749 
750 /* returns the byte length of a multibyte character */
751 int
752 pg_mblen(const char *mbstr)
753 {
754  return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
755 }
756 
757 /* returns the display length of a multibyte character */
758 int
759 pg_dsplen(const char *mbstr)
760 {
761  return pg_wchar_table[DatabaseEncoding->encoding].dsplen((const unsigned char *) mbstr);
762 }
763 
764 /* returns the length (counted in wchars) of a multibyte string */
765 int
766 pg_mbstrlen(const char *mbstr)
767 {
768  int len = 0;
769 
770  /* optimization for single byte encoding */
772  return strlen(mbstr);
773 
774  while (*mbstr)
775  {
776  mbstr += pg_mblen(mbstr);
777  len++;
778  }
779  return len;
780 }
781 
782 /* returns the length (counted in wchars) of a multibyte string
783  * (not necessarily NULL terminated)
784  */
785 int
786 pg_mbstrlen_with_len(const char *mbstr, int limit)
787 {
788  int len = 0;
789 
790  /* optimization for single byte encoding */
792  return limit;
793 
794  while (limit > 0 && *mbstr)
795  {
796  int l = pg_mblen(mbstr);
797 
798  limit -= l;
799  mbstr += l;
800  len++;
801  }
802  return len;
803 }
804 
805 /*
806  * returns the byte length of a multibyte string
807  * (not necessarily NULL terminated)
808  * that is no longer than limit.
809  * this function does not break multibyte character boundary.
810  */
811 int
812 pg_mbcliplen(const char *mbstr, int len, int limit)
813 {
814  return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
815  len, limit);
816 }
817 
818 /*
819  * pg_mbcliplen with specified encoding
820  */
821 int
822 pg_encoding_mbcliplen(int encoding, const char *mbstr,
823  int len, int limit)
824 {
825  mblen_converter mblen_fn;
826  int clen = 0;
827  int l;
828 
829  /* optimization for single byte encoding */
830  if (pg_encoding_max_length(encoding) == 1)
831  return cliplen(mbstr, len, limit);
832 
833  mblen_fn = pg_wchar_table[encoding].mblen;
834 
835  while (len > 0 && *mbstr)
836  {
837  l = (*mblen_fn) ((const unsigned char *) mbstr);
838  if ((clen + l) > limit)
839  break;
840  clen += l;
841  if (clen == limit)
842  break;
843  len -= l;
844  mbstr += l;
845  }
846  return clen;
847 }
848 
849 /*
850  * Similar to pg_mbcliplen except the limit parameter specifies the
851  * character length, not the byte length.
852  */
853 int
854 pg_mbcharcliplen(const char *mbstr, int len, int limit)
855 {
856  int clen = 0;
857  int nch = 0;
858  int l;
859 
860  /* optimization for single byte encoding */
862  return cliplen(mbstr, len, limit);
863 
864  while (len > 0 && *mbstr)
865  {
866  l = pg_mblen(mbstr);
867  nch++;
868  if (nch > limit)
869  break;
870  clen += l;
871  len -= l;
872  mbstr += l;
873  }
874  return clen;
875 }
876 
/*
 * mbcliplen for any single-byte encoding: the number of bytes of str that
 * precede the first NUL, capped at the smaller of len and limit.
 */
static int
cliplen(const char *str, int len, int limit)
{
	int			bound = (len < limit) ? len : limit;
	int			n;

	for (n = 0; n < bound && str[n] != '\0'; n++)
		;
	return n;
}
888 
889 void
891 {
892  if (!PG_VALID_BE_ENCODING(encoding))
893  elog(ERROR, "invalid database encoding: %d", encoding);
894 
895  DatabaseEncoding = &pg_enc2name_tbl[encoding];
896  Assert(DatabaseEncoding->encoding == encoding);
897 }
898 
899 void
901 {
902  /* Some calls happen before we can elog()! */
903  Assert(PG_VALID_ENCODING(encoding));
904 
905  MessageEncoding = &pg_enc2name_tbl[encoding];
906  Assert(MessageEncoding->encoding == encoding);
907 }
908 
909 #ifdef ENABLE_NLS
910 /*
911  * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
912  * codeset. Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
913  * fail for gettext-internal causes like out-of-memory.
914  */
915 static bool
916 raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
917 {
918  bool elog_ok = (CurrentMemoryContext != NULL);
919  int i;
920 
921  for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++)
922  {
923  if (pg_enc2gettext_tbl[i].encoding == encoding)
924  {
925  if (bind_textdomain_codeset(domainname,
926  pg_enc2gettext_tbl[i].name) != NULL)
927  return true;
928 
929  if (elog_ok)
930  elog(LOG, "bind_textdomain_codeset failed");
931  else
932  write_stderr("bind_textdomain_codeset failed");
933 
934  break;
935  }
936  }
937 
938  return false;
939 }
940 
941 /*
942  * Bind a gettext message domain to the codeset corresponding to the database
943  * encoding. For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
944  * Return the MessageEncoding implied by the new settings.
945  *
946  * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
947  * When that matches the database encoding, we don't need to do anything. In
948  * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
949  * database encoding, except for the C locale. (On Windows, we also permit a
950  * discrepancy under the UTF8 encoding.) For the C locale, explicitly bind
951  * gettext to the right codeset.
952  *
953  * On Windows, gettext defaults to the Windows ANSI code page. This is a
954  * convenient departure for software that passes the strings to Windows ANSI
955  * APIs, but we don't do that. Compel gettext to use database encoding or,
956  * failing that, the LC_CTYPE encoding as it would on other platforms.
957  *
958  * This function is called before elog() and palloc() are usable.
959  */
960 int
961 pg_bind_textdomain_codeset(const char *domainname)
962 {
963  bool elog_ok = (CurrentMemoryContext != NULL);
965  int new_msgenc;
966 
967 #ifndef WIN32
968  const char *ctype = setlocale(LC_CTYPE, NULL);
969 
970  if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
971 #endif
972  if (encoding != PG_SQL_ASCII &&
973  raw_pg_bind_textdomain_codeset(domainname, encoding))
974  return encoding;
975 
976  new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
977  if (new_msgenc < 0)
978  new_msgenc = PG_SQL_ASCII;
979 
980 #ifdef WIN32
981  if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
982  /* On failure, the old message encoding remains valid. */
983  return GetMessageEncoding();
984 #endif
985 
986  return new_msgenc;
987 }
988 #endif
989 
990 /*
991  * The database encoding, also called the server encoding, represents the
992  * encoding of data stored in text-like data types. Affected types include
993  * cstring, text, varchar, name, xml, and json.
994  */
995 int
997 {
998  return DatabaseEncoding->encoding;
999 }
1000 
1001 const char *
1003 {
1004  return DatabaseEncoding->name;
1005 }
1006 
1007 Datum
1009 {
1010  return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
1011 }
1012 
1013 Datum
1015 {
1016  return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
1017 }
1018 
1019 /*
1020  * gettext() returns messages in this encoding. This often matches the
1021  * database encoding, but it differs for SQL_ASCII databases, for processes
1022  * not attached to a database, and under a database encoding lacking iconv
1023  * support (MULE_INTERNAL).
1024  */
1025 int
1027 {
1028  return MessageEncoding->encoding;
1029 }
1030 
1031 #ifdef WIN32
1032 /*
1033  * Convert from MessageEncoding to a palloc'ed, null-terminated utf16
1034  * string. The character length is also passed to utf16len if not
1035  * null. Returns NULL iff failed. Before MessageEncoding initialization, "str"
1036  * should be ASCII-only; this will function as though MessageEncoding is UTF8.
1037  */
1038 WCHAR *
1039 pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
1040 {
1041  int msgenc = GetMessageEncoding();
1042  WCHAR *utf16;
1043  int dstlen;
1044  UINT codepage;
1045 
1046  if (msgenc == PG_SQL_ASCII)
1047  /* No conversion is possible, and SQL_ASCII is never utf16. */
1048  return NULL;
1049 
1050  codepage = pg_enc2name_tbl[msgenc].codepage;
1051 
1052  /*
1053  * Use MultiByteToWideChar directly if there is a corresponding codepage,
1054  * or double conversion through UTF8 if not. Double conversion is needed,
1055  * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
1056  */
1057  if (codepage != 0)
1058  {
1059  utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1060  dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
1061  utf16[dstlen] = (WCHAR) 0;
1062  }
1063  else
1064  {
1065  char *utf8;
1066 
1067  /*
1068  * XXX pg_do_encoding_conversion() requires a transaction. In the
1069  * absence of one, hope for the input to be valid UTF8.
1070  */
1071  if (IsTransactionState())
1072  {
1073  utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
1074  len,
1075  msgenc,
1076  PG_UTF8);
1077  if (utf8 != str)
1078  len = strlen(utf8);
1079  }
1080  else
1081  utf8 = (char *) str;
1082 
1083  utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1084  dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
1085  utf16[dstlen] = (WCHAR) 0;
1086 
1087  if (utf8 != str)
1088  pfree(utf8);
1089  }
1090 
1091  if (dstlen == 0 && len > 0)
1092  {
1093  pfree(utf16);
1094  return NULL; /* error */
1095  }
1096 
1097  if (utf16len)
1098  *utf16len = dstlen;
1099  return utf16;
1100 }
1101 
1102 #endif
#define NIL
Definition: pg_list.h:65
#define PG_GETARG_INT32(n)
Definition: fmgr.h:264
Definition: fmgr.h:56
int pg_mbcharcliplen(const char *mbstr, int len, int limit)
Definition: mbutils.c:854
int(* mblen_converter)(const unsigned char *mbstr)
Definition: pg_wchar.h:365
Datum namein(PG_FUNCTION_ARGS)
Definition: name.c:48
void SetMessageEncoding(int encoding)
Definition: mbutils.c:900
int pg_char_to_encoding(const char *name)
Definition: encnames.c:551
#define VARDATA_ANY(PTR)
Definition: postgres.h:348
#define VARDATA(PTR)
Definition: postgres.h:302
int PrepareClientEncoding(int encoding)
Definition: mbutils.c:103
#define setlocale(a, b)
Definition: win32_port.h:417
Datum getdatabaseencoding(PG_FUNCTION_ARGS)
Definition: mbutils.c:1008
int pg_encoding_mb2wchar_with_len(int encoding, const char *from, pg_wchar *to, int len)
Definition: mbutils.c:722
char * pg_server_to_client(const char *s, int len)
Definition: mbutils.c:615
#define PG_GETARG_DATUM(n)
Definition: fmgr.h:263
#define VARHDRSZ
Definition: c.h:555
int pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
Definition: wchar.c:1942
FmgrInfo to_server_info
Definition: mbutils.c:57
FmgrInfo to_client_info
Definition: mbutils.c:58
static bool backend_startup_complete
Definition: mbutils.c:83
#define Min(x, y)
Definition: c.h:904
static int pending_client_encoding
Definition: mbutils.c:84
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:109
#define PG_RETURN_INT32(x)
Definition: fmgr.h:344
static FmgrInfo * ToServerConvProc
Definition: mbutils.c:67
#define write_stderr(str)
Definition: parallel.c:182
static char * perform_default_encoding_conversion(const char *src, int len, bool is_client_to_server)
Definition: mbutils.c:660
int errcode(int sqlerrcode)
Definition: elog.c:570
const char * name
Definition: pg_wchar.h:343
static const pg_enc2name * ClientEncoding
Definition: mbutils.c:73
size_t pg_wchar_strlen(const pg_wchar *str)
Definition: wstrncmp.c:70
#define DirectFunctionCall1(func, arg1)
Definition: fmgr.h:616
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
#define PG_RETURN_BYTEA_P(x)
Definition: fmgr.h:360
#define LOG
Definition: elog.h:26
unsigned int Oid
Definition: postgres_ext.h:31
unsigned char * pg_do_encoding_conversion(unsigned char *src, int len, int src_encoding, int dest_encoding)
Definition: mbutils.c:318
#define OidIsValid(objectId)
Definition: c.h:638
char * pg_server_to_any(const char *s, int len, int encoding)
Definition: mbutils.c:626
int pg_wchar2mb(const pg_wchar *from, char *to)
Definition: mbutils.c:730
const pg_enc2gettext pg_enc2gettext_tbl[]
Definition: encnames.c:359
int pg_mbstrlen_with_len(const char *mbstr, int limit)
Definition: mbutils.c:786
#define foreach_delete_current(lst, cell)
Definition: pg_list.h:368
mbdisplaylen_converter dsplen
Definition: pg_wchar.h:380
int s_encoding
Definition: mbutils.c:55
Datum pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
Definition: mbutils.c:521
void pfree(void *pointer)
Definition: mcxt.c:1031
char * pg_client_to_server(const char *s, int len)
Definition: mbutils.c:537
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1075
#define ERROR
Definition: elog.h:43
#define FunctionCall5(flinfo, arg1, arg2, arg3, arg4, arg5)
Definition: fmgr.h:642
#define FATAL
Definition: elog.h:52
Datum pg_client_encoding(PG_FUNCTION_ARGS)
Definition: mbutils.c:1014
int pg_mbcliplen(const char *mbstr, int len, int limit)
Definition: mbutils.c:812
static List * ConvProcList
Definition: mbutils.c:61
int pg_encoding_mbcliplen(int encoding, const char *mbstr, int len, int limit)
Definition: mbutils.c:822
int pg_encoding_max_length(int encoding)
Definition: wchar.c:1868
#define PG_VALID_ENCODING(_enc)
Definition: pg_wchar.h:301
const pg_enc2name pg_enc2name_tbl[]
Definition: encnames.c:307
int pg_database_encoding_max_length(void)
Definition: wchar.c:1881
int errdetail(const char *fmt,...)
Definition: elog.c:860
#define CStringGetDatum(X)
Definition: postgres.h:578
int SetClientEncoding(int encoding)
Definition: mbutils.c:201
const char * name
Definition: pg_wchar.h:328
struct ConvProcInfo ConvProcInfo
MemoryContext CurrentMemoryContext
Definition: mcxt.c:38
#define PG_VALID_FE_ENCODING(_enc)
Definition: pg_wchar.h:305
void fmgr_info_cxt(Oid functionId, FmgrInfo *finfo, MemoryContext mcxt)
Definition: fmgr.c:134
bool pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
Definition: wchar.c:1925
int pg_mb2wchar(const char *from, pg_wchar *to)
Definition: mbutils.c:708
#define ereport(elevel, rest)
Definition: elog.h:141
unsigned int pg_wchar
Definition: mbprint.c:31
MemoryContext TopMemoryContext
Definition: mcxt.c:44
int pg_encoding_wchar2mb_with_len(int encoding, const pg_wchar *from, char *to, int len)
Definition: mbutils.c:744
#define DirectFunctionCall3(func, arg1, arg2, arg3)
Definition: fmgr.h:620
#define MaxAllocSize
Definition: memutils.h:40
#define unconstify(underlying_type, expr)
Definition: c.h:1163
Datum length_in_encoding(PG_FUNCTION_ARGS)
Definition: mbutils.c:492
uintptr_t Datum
Definition: postgres.h:367
void SetDatabaseEncoding(int encoding)
Definition: mbutils.c:890
#define PG_RETURN_DATUM(x)
Definition: fmgr.h:343
int GetDatabaseEncoding(void)
Definition: mbutils.c:996
int pg_get_client_encoding(void)
Definition: mbutils.c:298
int pg_mbstrlen(const char *mbstr)
Definition: mbutils.c:766
int pg_get_encoding_from_locale(const char *ctype, bool write_message)
Definition: chklocale.c:433
wchar2mb_with_len_converter wchar2mb_with_len
Definition: pg_wchar.h:377
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
Definition: mbutils.c:715
pg_enc encoding
Definition: pg_wchar.h:329
int pg_dsplen(const char *mbstr)
Definition: mbutils.c:759
const char * pg_encoding_to_char(int encoding)
Definition: encnames.c:607
List * lcons(void *datum, List *list)
Definition: list.c:453
#define PG_VALID_BE_ENCODING(_enc)
Definition: pg_wchar.h:295
#define Assert(condition)
Definition: c.h:732
#define lfirst(lc)
Definition: pg_list.h:190
const char * GetDatabaseEncodingName(void)
Definition: mbutils.c:1002
static int cliplen(const char *str, int len, int limit)
Definition: mbutils.c:879
Datum pg_convert(PG_FUNCTION_ARGS)
Definition: mbutils.c:430
size_t Size
Definition: c.h:466
const char * pg_get_client_encoding_name(void)
Definition: mbutils.c:307
#define PG_GETARG_BYTEA_PP(n)
Definition: fmgr.h:302
Oid FindDefaultConversionProc(int32 for_encoding, int32 to_encoding)
Definition: namespace.c:3673
bool IsTransactionState(void)
Definition: xact.c:356
#define PG_FREE_IF_COPY(ptr, n)
Definition: fmgr.h:255
Datum pg_convert_from(PG_FUNCTION_ARGS)
Definition: mbutils.c:403
int pg_mblen(const char *mbstr)
Definition: mbutils.c:752
int32 encoding
Definition: pg_database.h:41
static const pg_enc2name * MessageEncoding
Definition: mbutils.c:75
void InitializeClientEncoding(void)
Definition: mbutils.c:274
const char * name
Definition: encode.c:521
int GetMessageEncoding(void)
Definition: mbutils.c:1026
#define MAX_CONVERSION_GROWTH
Definition: pg_wchar.h:316
#define Int32GetDatum(X)
Definition: postgres.h:479
static FmgrInfo * ToClientConvProc
Definition: mbutils.c:68
#define VARSIZE_ANY_EXHDR(PTR)
Definition: postgres.h:341
void * palloc(Size size)
Definition: mcxt.c:924
int errmsg(const char *fmt,...)
Definition: elog.c:784
const pg_wchar_tbl pg_wchar_table[]
Definition: wchar.c:1777
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:771
static const pg_enc2name * DatabaseEncoding
Definition: mbutils.c:74
#define elog(elevel,...)
Definition: elog.h:226
int i
#define NameStr(name)
Definition: c.h:609
Definition: c.h:549
#define PG_FUNCTION_ARGS
Definition: fmgr.h:188
mb2wchar_with_len_converter mb2wchar_with_len
Definition: pg_wchar.h:375
#define SET_VARSIZE(PTR, len)
Definition: postgres.h:329
int pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
Definition: mbutils.c:737
char * pg_any_to_server(const char *s, int len, int encoding)
Definition: mbutils.c:553
mblen_converter mblen
Definition: pg_wchar.h:379
int c_encoding
Definition: mbutils.c:56
Definition: pg_list.h:50
Datum pg_convert_to(PG_FUNCTION_ARGS)
Definition: mbutils.c:378
#define PG_RETURN_NULL()
Definition: fmgr.h:335
#define PG_GETARG_NAME(n)
Definition: fmgr.h:273
#define OidFunctionCall5(functionId, arg1, arg2, arg3, arg4, arg5)
Definition: fmgr.h:662