PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
mbutils.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * mbutils.c
4  * This file contains functions for encoding conversion.
5  *
6  * The string-conversion functions in this file share some API quirks.
7  * Note the following:
8  *
9  * The functions return a palloc'd, null-terminated string if conversion
10  * is required. However, if no conversion is performed, the given source
11  * string pointer is returned as-is.
12  *
13  * Although the presence of a length argument means that callers can pass
14  * non-null-terminated strings, care is required because the same string
15  * will be passed back if no conversion occurs. Such callers *must* check
16  * whether result == src and handle that case differently.
17  *
18  * If the source and destination encodings are the same, the source string
19  * is returned without any verification; it's assumed to be valid data.
20  * If that might not be the case, the caller is responsible for validating
21  * the string using a separate call to pg_verify_mbstr(). Whenever the
22  * source and destination encodings are different, the functions ensure that
23  * the result is validly encoded according to the destination encoding.
24  *
25  *
26  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
27  * Portions Copyright (c) 1994, Regents of the University of California
28  *
29  *
30  * IDENTIFICATION
31  * src/backend/utils/mb/mbutils.c
32  *
33  *-------------------------------------------------------------------------
34  */
35 #include "postgres.h"
36 
37 #include "access/xact.h"
38 #include "catalog/namespace.h"
39 #include "mb/pg_wchar.h"
40 #include "utils/builtins.h"
41 #include "utils/memutils.h"
42 #include "utils/syscache.h"
43 
44 /*
45  * When converting strings between different encodings, we assume that space
46  * for converted result is 4-to-1 growth in the worst case. The growth rate
47  * for currently supported encoding pairs is within 3 (SJIS JIS X0201 half-width
48  * kana -> UTF8 is the worst case). So "4" should be enough for the moment.
49  *
50  * Note that this is not the same as the maximum character width in any
51  * particular encoding.
52  */
53 #define MAX_CONVERSION_GROWTH 4
54 
55 /*
56  * We maintain a simple linked list caching the fmgr lookup info for the
57  * currently selected conversion functions, as well as any that have been
58  * selected previously in the current session. (We remember previous
59  * settings because we must be able to restore a previous setting during
60  * transaction rollback, without doing any fresh catalog accesses.)
61  *
62  * Since we'll never release this data, we just keep it in TopMemoryContext.
63  */
64 typedef struct ConvProcInfo
65 {
66  int s_encoding; /* server and client encoding IDs */
68  FmgrInfo to_server_info; /* lookup info for conversion procs */
70 } ConvProcInfo;
71 
72 static List *ConvProcList = NIL; /* List of ConvProcInfo */
73 
74 /*
75  * These variables point to the currently active conversion functions,
76  * or are NULL when no conversion is needed.
77  */
80 
81 /*
82  * These variables track the currently-selected encodings.
83  */
87 
88 /*
89  * During backend startup we can't set client encoding because we (a)
90  * can't look up the conversion functions, and (b) may not know the database
91  * encoding yet either. So SetClientEncoding() just accepts anything and
92  * remembers it for InitializeClientEncoding() to apply later.
93  */
94 static bool backend_startup_complete = false;
96 
97 
98 /* Internal functions */
99 static char *perform_default_encoding_conversion(const char *src,
100  int len, bool is_client_to_server);
101 static int cliplen(const char *str, int len, int limit);
102 
103 
104 /*
105  * Prepare for a future call to SetClientEncoding. Success should mean
106  * that SetClientEncoding is guaranteed to succeed for this encoding request.
107  *
108  * (But note that success before backend_startup_complete does not guarantee
109  * success after ...)
110  *
111  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
112  */
113 int
115 {
116  int current_server_encoding;
117  ListCell *lc;
118 
119  if (!PG_VALID_FE_ENCODING(encoding))
120  return -1;
121 
122  /* Can't do anything during startup, per notes above */
124  return 0;
125 
126  current_server_encoding = GetDatabaseEncoding();
127 
128  /*
129  * Check for cases that require no conversion function.
130  */
131  if (current_server_encoding == encoding ||
132  current_server_encoding == PG_SQL_ASCII ||
133  encoding == PG_SQL_ASCII)
134  return 0;
135 
136  if (IsTransactionState())
137  {
138  /*
139  * If we're in a live transaction, it's safe to access the catalogs,
140  * so look up the functions. We repeat the lookup even if the info is
141  * already cached, so that we can react to changes in the contents of
142  * pg_conversion.
143  */
144  Oid to_server_proc,
145  to_client_proc;
146  ConvProcInfo *convinfo;
147  MemoryContext oldcontext;
148 
149  to_server_proc = FindDefaultConversionProc(encoding,
150  current_server_encoding);
151  if (!OidIsValid(to_server_proc))
152  return -1;
153  to_client_proc = FindDefaultConversionProc(current_server_encoding,
154  encoding);
155  if (!OidIsValid(to_client_proc))
156  return -1;
157 
158  /*
159  * Load the fmgr info into TopMemoryContext (could still fail here)
160  */
162  sizeof(ConvProcInfo));
163  convinfo->s_encoding = current_server_encoding;
164  convinfo->c_encoding = encoding;
165  fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
167  fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
169 
170  /* Attach new info to head of list */
172  ConvProcList = lcons(convinfo, ConvProcList);
173  MemoryContextSwitchTo(oldcontext);
174 
175  /*
176  * We cannot yet remove any older entry for the same encoding pair,
177  * since it could still be in use. SetClientEncoding will clean up.
178  */
179 
180  return 0; /* success */
181  }
182  else
183  {
184  /*
185  * If we're not in a live transaction, the only thing we can do is
186  * restore a previous setting using the cache. This covers all
187  * transaction-rollback cases. The only case it might not work for is
188  * trying to change client_encoding on the fly by editing
189  * postgresql.conf and SIGHUP'ing. Which would probably be a stupid
190  * thing to do anyway.
191  */
192  foreach(lc, ConvProcList)
193  {
194  ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
195 
196  if (oldinfo->s_encoding == current_server_encoding &&
197  oldinfo->c_encoding == encoding)
198  return 0;
199  }
200 
201  return -1; /* it's not cached, so fail */
202  }
203 }
204 
205 /*
206  * Set the active client encoding and set up the conversion-function pointers.
207  * PrepareClientEncoding should have been called previously for this encoding.
208  *
209  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
210  */
211 int
213 {
214  int current_server_encoding;
215  bool found;
216  ListCell *lc;
217  ListCell *prev;
218  ListCell *next;
219 
220  if (!PG_VALID_FE_ENCODING(encoding))
221  return -1;
222 
223  /* Can't do anything during startup, per notes above */
225  {
227  return 0;
228  }
229 
230  current_server_encoding = GetDatabaseEncoding();
231 
232  /*
233  * Check for cases that require no conversion function.
234  */
235  if (current_server_encoding == encoding ||
236  current_server_encoding == PG_SQL_ASCII ||
237  encoding == PG_SQL_ASCII)
238  {
239  ClientEncoding = &pg_enc2name_tbl[encoding];
240  ToServerConvProc = NULL;
241  ToClientConvProc = NULL;
242  return 0;
243  }
244 
245  /*
246  * Search the cache for the entry previously prepared by
247  * PrepareClientEncoding; if there isn't one, we lose. While at it,
248  * release any duplicate entries so that repeated Prepare/Set cycles don't
249  * leak memory.
250  */
251  found = false;
252  prev = NULL;
253  for (lc = list_head(ConvProcList); lc; lc = next)
254  {
255  ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
256 
257  next = lnext(lc);
258 
259  if (convinfo->s_encoding == current_server_encoding &&
260  convinfo->c_encoding == encoding)
261  {
262  if (!found)
263  {
264  /* Found newest entry, so set up */
265  ClientEncoding = &pg_enc2name_tbl[encoding];
266  ToServerConvProc = &convinfo->to_server_info;
267  ToClientConvProc = &convinfo->to_client_info;
268  found = true;
269  }
270  else
271  {
272  /* Duplicate entry, release it */
273  ConvProcList = list_delete_cell(ConvProcList, lc, prev);
274  pfree(convinfo);
275  continue; /* prev mustn't advance */
276  }
277  }
278 
279  prev = lc;
280  }
281 
282  if (found)
283  return 0; /* success */
284  else
285  return -1; /* it's not cached, so fail */
286 }
287 
288 /*
289  * Initialize client encoding conversions.
290  * Called from InitPostgres() once during backend startup.
291  */
292 void
294 {
297 
300  {
301  /*
302  * Oops, the requested conversion is not available. We couldn't fail
303  * before, but we can now.
304  */
305  ereport(FATAL,
306  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
307  errmsg("conversion between %s and %s is not supported",
310  }
311 }
312 
313 /*
314  * returns the current client encoding
315  */
316 int
318 {
319  return ClientEncoding->encoding;
320 }
321 
322 /*
323  * returns the current client encoding name
324  */
325 const char *
327 {
328  return ClientEncoding->name;
329 }
330 
331 /*
332  * Convert src string to another encoding (general case).
333  *
334  * See the notes about string conversion functions at the top of this file.
335  */
336 unsigned char *
337 pg_do_encoding_conversion(unsigned char *src, int len,
338  int src_encoding, int dest_encoding)
339 {
340  unsigned char *result;
341  Oid proc;
342 
343  if (len <= 0)
344  return src; /* empty string is always valid */
345 
346  if (src_encoding == dest_encoding)
347  return src; /* no conversion required, assume valid */
348 
349  if (dest_encoding == PG_SQL_ASCII)
350  return src; /* any string is valid in SQL_ASCII */
351 
352  if (src_encoding == PG_SQL_ASCII)
353  {
354  /* No conversion is possible, but we must validate the result */
355  (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
356  return src;
357  }
358 
359  if (!IsTransactionState()) /* shouldn't happen */
360  elog(ERROR, "cannot perform encoding conversion outside a transaction");
361 
362  proc = FindDefaultConversionProc(src_encoding, dest_encoding);
363  if (!OidIsValid(proc))
364  ereport(ERROR,
365  (errcode(ERRCODE_UNDEFINED_FUNCTION),
366  errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
367  pg_encoding_to_char(src_encoding),
368  pg_encoding_to_char(dest_encoding))));
369 
370  /*
371  * Allocate space for conversion result, being wary of integer overflow
372  */
373  if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
374  ereport(ERROR,
375  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
376  errmsg("out of memory"),
377  errdetail("String of %d bytes is too long for encoding conversion.",
378  len)));
379 
380  result = palloc(len * MAX_CONVERSION_GROWTH + 1);
381 
382  OidFunctionCall5(proc,
383  Int32GetDatum(src_encoding),
384  Int32GetDatum(dest_encoding),
385  CStringGetDatum(src),
386  CStringGetDatum(result),
387  Int32GetDatum(len));
388  return result;
389 }
390 
391 /*
392  * Convert string to encoding encoding_name. The source
393  * encoding is the DB encoding.
394  *
395  * BYTEA convert_to(TEXT string, NAME encoding_name) */
396 Datum
398 {
399  Datum string = PG_GETARG_DATUM(0);
400  Datum dest_encoding_name = PG_GETARG_DATUM(1);
401  Datum src_encoding_name = DirectFunctionCall1(namein,
402  CStringGetDatum(DatabaseEncoding->name));
403  Datum result;
404 
405  /*
406  * pg_convert expects a bytea as its first argument. We're passing it a
407  * text argument here, relying on the fact that they are both in fact
408  * varlena types, and thus structurally identical.
409  */
410  result = DirectFunctionCall3(pg_convert, string,
411  src_encoding_name, dest_encoding_name);
412 
413  PG_RETURN_DATUM(result);
414 }
415 
416 /*
417  * Convert string from encoding encoding_name. The destination
418  * encoding is the DB encoding.
419  *
420  * TEXT convert_from(BYTEA string, NAME encoding_name) */
421 Datum
423 {
424  Datum string = PG_GETARG_DATUM(0);
425  Datum src_encoding_name = PG_GETARG_DATUM(1);
426  Datum dest_encoding_name = DirectFunctionCall1(namein,
427  CStringGetDatum(DatabaseEncoding->name));
428  Datum result;
429 
430  result = DirectFunctionCall3(pg_convert, string,
431  src_encoding_name, dest_encoding_name);
432 
433  /*
434  * pg_convert returns a bytea, which we in turn return as text, relying on
435  * the fact that they are both in fact varlena types, and thus
436  * structurally identical. Although not all bytea values are valid text,
437  * in this case it will be because we've told pg_convert to return one
438  * that is valid as text in the current database encoding.
439  */
440  PG_RETURN_DATUM(result);
441 }
442 
443 /*
444  * Convert string between two arbitrary encodings.
445  *
446  * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
447  */
448 Datum
450 {
451  bytea *string = PG_GETARG_BYTEA_PP(0);
452  char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
453  int src_encoding = pg_char_to_encoding(src_encoding_name);
454  char *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
455  int dest_encoding = pg_char_to_encoding(dest_encoding_name);
456  const char *src_str;
457  char *dest_str;
458  bytea *retval;
459  int len;
460 
461  if (src_encoding < 0)
462  ereport(ERROR,
463  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
464  errmsg("invalid source encoding name \"%s\"",
465  src_encoding_name)));
466  if (dest_encoding < 0)
467  ereport(ERROR,
468  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
469  errmsg("invalid destination encoding name \"%s\"",
470  dest_encoding_name)));
471 
472  /* make sure that source string is valid */
473  len = VARSIZE_ANY_EXHDR(string);
474  src_str = VARDATA_ANY(string);
475  pg_verify_mbstr_len(src_encoding, src_str, len, false);
476 
477  /* perform conversion */
478  dest_str = (char *) pg_do_encoding_conversion((unsigned char *) src_str,
479  len,
480  src_encoding,
481  dest_encoding);
482 
483  /* update len if conversion actually happened */
484  if (dest_str != src_str)
485  len = strlen(dest_str);
486 
487  /*
488  * build bytea data type structure.
489  */
490  retval = (bytea *) palloc(len + VARHDRSZ);
491  SET_VARSIZE(retval, len + VARHDRSZ);
492  memcpy(VARDATA(retval), dest_str, len);
493 
494  if (dest_str != src_str)
495  pfree(dest_str);
496 
497  /* free memory if allocated by the toaster */
498  PG_FREE_IF_COPY(string, 0);
499 
500  PG_RETURN_BYTEA_P(retval);
501 }
502 
503 /*
504  * get the length of the string considered as text in the specified
505  * encoding. Raises an error if the data is not valid in that
506  * encoding.
507  *
508  * INT4 length (BYTEA string, NAME src_encoding_name)
509  */
510 Datum
512 {
513  bytea *string = PG_GETARG_BYTEA_PP(0);
514  char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
515  int src_encoding = pg_char_to_encoding(src_encoding_name);
516  const char *src_str;
517  int len;
518  int retval;
519 
520  if (src_encoding < 0)
521  ereport(ERROR,
522  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
523  errmsg("invalid encoding name \"%s\"",
524  src_encoding_name)));
525 
526  len = VARSIZE_ANY_EXHDR(string);
527  src_str = VARDATA_ANY(string);
528 
529  retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
530 
531  PG_RETURN_INT32(retval);
532 }
533 
534 /*
535  * Get maximum multibyte character length in the specified encoding.
536  *
537  * Note encoding is specified numerically, not by name as above.
538  */
539 Datum
541 {
542  int encoding = PG_GETARG_INT32(0);
543 
544  if (PG_VALID_ENCODING(encoding))
545  PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
546  else
547  PG_RETURN_NULL();
548 }
549 
550 /*
551  * Convert client encoding to server encoding.
552  *
553  * See the notes about string conversion functions at the top of this file.
554  */
555 char *
556 pg_client_to_server(const char *s, int len)
557 {
558  return pg_any_to_server(s, len, ClientEncoding->encoding);
559 }
560 
561 /*
562  * Convert any encoding to server encoding.
563  *
564  * See the notes about string conversion functions at the top of this file.
565  *
566  * Unlike the other string conversion functions, this will apply validation
567  * even if encoding == DatabaseEncoding->encoding. This is because this is
568  * used to process data coming in from outside the database, and we never
569  * want to just assume validity.
570  */
571 char *
572 pg_any_to_server(const char *s, int len, int encoding)
573 {
574  if (len <= 0)
575  return (char *) s; /* empty string is always valid */
576 
577  if (encoding == DatabaseEncoding->encoding ||
578  encoding == PG_SQL_ASCII)
579  {
580  /*
581  * No conversion is needed, but we must still validate the data.
582  */
583  (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
584  return (char *) s;
585  }
586 
587  if (DatabaseEncoding->encoding == PG_SQL_ASCII)
588  {
589  /*
590  * No conversion is possible, but we must still validate the data,
591  * because the client-side code might have done string escaping using
592  * the selected client_encoding. If the client encoding is ASCII-safe
593  * then we just do a straight validation under that encoding. For an
594  * ASCII-unsafe encoding we have a problem: we dare not pass such data
595  * to the parser but we have no way to convert it. We compromise by
596  * rejecting the data if it contains any non-ASCII characters.
597  */
598  if (PG_VALID_BE_ENCODING(encoding))
599  (void) pg_verify_mbstr(encoding, s, len, false);
600  else
601  {
602  int i;
603 
604  for (i = 0; i < len; i++)
605  {
606  if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
607  ereport(ERROR,
608  (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
609  errmsg("invalid byte value for encoding \"%s\": 0x%02x",
611  (unsigned char) s[i])));
612  }
613  }
614  return (char *) s;
615  }
616 
617  /* Fast path if we can use cached conversion function */
618  if (encoding == ClientEncoding->encoding)
619  return perform_default_encoding_conversion(s, len, true);
620 
621  /* General case ... will not work outside transactions */
622  return (char *) pg_do_encoding_conversion((unsigned char *) s,
623  len,
624  encoding,
625  DatabaseEncoding->encoding);
626 }
627 
628 /*
629  * Convert server encoding to client encoding.
630  *
631  * See the notes about string conversion functions at the top of this file.
632  */
633 char *
634 pg_server_to_client(const char *s, int len)
635 {
636  return pg_server_to_any(s, len, ClientEncoding->encoding);
637 }
638 
639 /*
640  * Convert server encoding to any encoding.
641  *
642  * See the notes about string conversion functions at the top of this file.
643  */
644 char *
645 pg_server_to_any(const char *s, int len, int encoding)
646 {
647  if (len <= 0)
648  return (char *) s; /* empty string is always valid */
649 
650  if (encoding == DatabaseEncoding->encoding ||
651  encoding == PG_SQL_ASCII)
652  return (char *) s; /* assume data is valid */
653 
654  if (DatabaseEncoding->encoding == PG_SQL_ASCII)
655  {
656  /* No conversion is possible, but we must validate the result */
657  (void) pg_verify_mbstr(encoding, s, len, false);
658  return (char *) s;
659  }
660 
661  /* Fast path if we can use cached conversion function */
662  if (encoding == ClientEncoding->encoding)
663  return perform_default_encoding_conversion(s, len, false);
664 
665  /* General case ... will not work outside transactions */
666  return (char *) pg_do_encoding_conversion((unsigned char *) s,
667  len,
668  DatabaseEncoding->encoding,
669  encoding);
670 }
671 
672 /*
673  * Perform default encoding conversion using cached FmgrInfo. Since
674  * this function does not access database at all, it is safe to call
675  * outside transactions. If the conversion has not been set up by
676  * SetClientEncoding(), no conversion is performed.
677  */
678 static char *
679 perform_default_encoding_conversion(const char *src, int len,
680  bool is_client_to_server)
681 {
682  char *result;
683  int src_encoding,
684  dest_encoding;
685  FmgrInfo *flinfo;
686 
687  if (is_client_to_server)
688  {
689  src_encoding = ClientEncoding->encoding;
690  dest_encoding = DatabaseEncoding->encoding;
691  flinfo = ToServerConvProc;
692  }
693  else
694  {
695  src_encoding = DatabaseEncoding->encoding;
696  dest_encoding = ClientEncoding->encoding;
697  flinfo = ToClientConvProc;
698  }
699 
700  if (flinfo == NULL)
701  return (char *) src;
702 
703  /*
704  * Allocate space for conversion result, being wary of integer overflow
705  */
706  if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
707  ereport(ERROR,
708  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
709  errmsg("out of memory"),
710  errdetail("String of %d bytes is too long for encoding conversion.",
711  len)));
712 
713  result = palloc(len * MAX_CONVERSION_GROWTH + 1);
714 
715  FunctionCall5(flinfo,
716  Int32GetDatum(src_encoding),
717  Int32GetDatum(dest_encoding),
718  CStringGetDatum(src),
719  CStringGetDatum(result),
720  Int32GetDatum(len));
721  return result;
722 }
723 
724 
725 /* convert a multibyte string to a wchar */
726 int
727 pg_mb2wchar(const char *from, pg_wchar *to)
728 {
729  return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) ((const unsigned char *) from, to, strlen(from));
730 }
731 
732 /* convert a multibyte string to a wchar with a limited length */
733 int
734 pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
735 {
736  return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len);
737 }
738 
739 /* same, with any encoding */
740 int
742  const char *from, pg_wchar *to, int len)
743 {
744  return (*pg_wchar_table[encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len);
745 }
746 
747 /* convert a wchar string to a multibyte */
748 int
749 pg_wchar2mb(const pg_wchar *from, char *to)
750 {
751  return (*pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len) (from, (unsigned char *) to, pg_wchar_strlen(from));
752 }
753 
754 /* convert a wchar string to a multibyte with a limited length */
755 int
756 pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
757 {
758  return (*pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len) (from, (unsigned char *) to, len);
759 }
760 
761 /* same, with any encoding */
762 int
764  const pg_wchar *from, char *to, int len)
765 {
766  return (*pg_wchar_table[encoding].wchar2mb_with_len) (from, (unsigned char *) to, len);
767 }
768 
769 /* returns the byte length of a multibyte character */
770 int
771 pg_mblen(const char *mbstr)
772 {
773  return ((*pg_wchar_table[DatabaseEncoding->encoding].mblen) ((const unsigned char *) mbstr));
774 }
775 
776 /* returns the display length of a multibyte character */
777 int
778 pg_dsplen(const char *mbstr)
779 {
780  return ((*pg_wchar_table[DatabaseEncoding->encoding].dsplen) ((const unsigned char *) mbstr));
781 }
782 
/*
 * returns the length (counted in wchars) of a null-terminated multibyte
 * string in the database encoding
 */
int
pg_mbstrlen(const char *mbstr)
{
	int			len = 0;

	/* optimization for single byte encoding */
	if (pg_database_encoding_max_length() == 1)
		return strlen(mbstr);

	/* step one character at a time until the terminator */
	while (*mbstr)
	{
		mbstr += pg_mblen(mbstr);
		len++;
	}
	return len;
}
800 
/*
 * returns the length (counted in wchars) of a multibyte string
 * (not necessarily NULL terminated)
 *
 * limit is the number of bytes available at mbstr; counting also stops at
 * an embedded zero byte.
 */
int
pg_mbstrlen_with_len(const char *mbstr, int limit)
{
	int			len = 0;

	/* optimization for single byte encoding */
	if (pg_database_encoding_max_length() == 1)
		return limit;

	while (limit > 0 && *mbstr)
	{
		int			l = pg_mblen(mbstr);

		limit -= l;
		mbstr += l;
		len++;
	}
	return len;
}
823 
824 /*
825  * returns the byte length of a multibyte string
826  * (not necessarily NULL terminated)
827  * that is no longer than limit.
828  * this function does not break multibyte character boundary.
829  */
830 int
831 pg_mbcliplen(const char *mbstr, int len, int limit)
832 {
833  return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
834  len, limit);
835 }
836 
837 /*
838  * pg_mbcliplen with specified encoding
839  */
840 int
841 pg_encoding_mbcliplen(int encoding, const char *mbstr,
842  int len, int limit)
843 {
844  mblen_converter mblen_fn;
845  int clen = 0;
846  int l;
847 
848  /* optimization for single byte encoding */
849  if (pg_encoding_max_length(encoding) == 1)
850  return cliplen(mbstr, len, limit);
851 
852  mblen_fn = pg_wchar_table[encoding].mblen;
853 
854  while (len > 0 && *mbstr)
855  {
856  l = (*mblen_fn) ((const unsigned char *) mbstr);
857  if ((clen + l) > limit)
858  break;
859  clen += l;
860  if (clen == limit)
861  break;
862  len -= l;
863  mbstr += l;
864  }
865  return clen;
866 }
867 
/*
 * Similar to pg_mbcliplen except the limit parameter specifies the
 * character length, not the byte length.
 *
 * Returns the byte length of the clipped prefix.
 */
int
pg_mbcharcliplen(const char *mbstr, int len, int limit)
{
	int			clen = 0;
	int			nch = 0;
	int			l;

	/* optimization for single byte encoding */
	if (pg_database_encoding_max_length() == 1)
		return cliplen(mbstr, len, limit);

	while (len > 0 && *mbstr)
	{
		l = pg_mblen(mbstr);
		nch++;
		/* stop before the character that would exceed the character limit */
		if (nch > limit)
			break;
		clen += l;
		len -= l;
		mbstr += l;
	}
	return clen;
}
895 
/*
 * mbcliplen for any single-byte encoding: count bytes of str up to the
 * smaller of len and limit, stopping early at a zero byte.
 */
static int
cliplen(const char *str, int len, int limit)
{
	int			count;

	for (count = 0; count < len && count < limit; count++)
	{
		if (str[count] == '\0')
			break;
	}
	return count;
}
907 
908 void
910 {
911  if (!PG_VALID_BE_ENCODING(encoding))
912  elog(ERROR, "invalid database encoding: %d", encoding);
913 
914  DatabaseEncoding = &pg_enc2name_tbl[encoding];
915  Assert(DatabaseEncoding->encoding == encoding);
916 }
917 
918 void
920 {
921  /* Some calls happen before we can elog()! */
922  Assert(PG_VALID_ENCODING(encoding));
923 
924  MessageEncoding = &pg_enc2name_tbl[encoding];
925  Assert(MessageEncoding->encoding == encoding);
926 }
927 
928 #ifdef ENABLE_NLS
929 /*
930  * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
931  * codeset. Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
932  * fail for gettext-internal causes like out-of-memory.
933  */
934 static bool
935 raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
936 {
937  bool elog_ok = (CurrentMemoryContext != NULL);
938  int i;
939 
940  for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++)
941  {
942  if (pg_enc2gettext_tbl[i].encoding == encoding)
943  {
944  if (bind_textdomain_codeset(domainname,
946  return true;
947 
948  if (elog_ok)
949  elog(LOG, "bind_textdomain_codeset failed");
950  else
951  write_stderr("bind_textdomain_codeset failed");
952 
953  break;
954  }
955  }
956 
957  return false;
958 }
959 
960 /*
961  * Bind a gettext message domain to the codeset corresponding to the database
962  * encoding. For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
963  * Return the MessageEncoding implied by the new settings.
964  *
965  * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
966  * When that matches the database encoding, we don't need to do anything. In
967  * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
968  * database encoding, except for the C locale. (On Windows, we also permit a
969  * discrepancy under the UTF8 encoding.) For the C locale, explicitly bind
970  * gettext to the right codeset.
971  *
972  * On Windows, gettext defaults to the Windows ANSI code page. This is a
973  * convenient departure for software that passes the strings to Windows ANSI
974  * APIs, but we don't do that. Compel gettext to use database encoding or,
975  * failing that, the LC_CTYPE encoding as it would on other platforms.
976  *
977  * This function is called before elog() and palloc() are usable.
978  */
979 int
980 pg_bind_textdomain_codeset(const char *domainname)
981 {
982  bool elog_ok = (CurrentMemoryContext != NULL);
983  int encoding = GetDatabaseEncoding();
984  int new_msgenc;
985 
986 #ifndef WIN32
987  const char *ctype = setlocale(LC_CTYPE, NULL);
988 
989  if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
990 #endif
991  if (encoding != PG_SQL_ASCII &&
992  raw_pg_bind_textdomain_codeset(domainname, encoding))
993  return encoding;
994 
995  new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
996  if (new_msgenc < 0)
997  new_msgenc = PG_SQL_ASCII;
998 
999 #ifdef WIN32
1000  if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
1001  /* On failure, the old message encoding remains valid. */
1002  return GetMessageEncoding();
1003 #endif
1004 
1005  return new_msgenc;
1006 }
1007 #endif
1008 
1009 /*
1010  * The database encoding, also called the server encoding, represents the
1011  * encoding of data stored in text-like data types. Affected types include
1012  * cstring, text, varchar, name, xml, and json.
1013  */
1014 int
1016 {
1017  return DatabaseEncoding->encoding;
1018 }
1019 
1020 const char *
1022 {
1023  return DatabaseEncoding->name;
1024 }
1025 
1026 Datum
1028 {
1029  return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
1030 }
1031 
1032 Datum
1034 {
1035  return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
1036 }
1037 
1038 /*
1039  * gettext() returns messages in this encoding. This often matches the
1040  * database encoding, but it differs for SQL_ASCII databases, for processes
1041  * not attached to a database, and under a database encoding lacking iconv
1042  * support (MULE_INTERNAL).
1043  */
1044 int
1046 {
1047  return MessageEncoding->encoding;
1048 }
1049 
1050 #ifdef WIN32
1051 /*
1052  * Result is palloc'ed null-terminated utf16 string. The character length
1053  * is also passed to utf16len if not null. Returns NULL iff failed.
1054  */
1055 WCHAR *
1056 pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
1057 {
1058  WCHAR *utf16;
1059  int dstlen;
1060  UINT codepage;
1061 
1062  codepage = pg_enc2name_tbl[GetMessageEncoding()].codepage;
1063 
1064  /*
1065  * Use MultiByteToWideChar directly if there is a corresponding codepage,
1066  * or double conversion through UTF8 if not. Double conversion is needed,
1067  * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
1068  */
1069  if (codepage != 0)
1070  {
1071  utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1072  dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
1073  utf16[dstlen] = (WCHAR) 0;
1074  }
1075  else
1076  {
1077  char *utf8;
1078 
1079  /*
1080  * XXX pg_do_encoding_conversion() requires a transaction. In the
1081  * absence of one, hope for the input to be valid UTF8.
1082  */
1083  if (IsTransactionState())
1084  {
1085  utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
1086  len,
1088  PG_UTF8);
1089  if (utf8 != str)
1090  len = strlen(utf8);
1091  }
1092  else
1093  utf8 = (char *) str;
1094 
1095  utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1096  dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
1097  utf16[dstlen] = (WCHAR) 0;
1098 
1099  if (utf8 != str)
1100  pfree(utf8);
1101  }
1102 
1103  if (dstlen == 0 && len > 0)
1104  {
1105  pfree(utf16);
1106  return NULL; /* error */
1107  }
1108 
1109  if (utf16len)
1110  *utf16len = dstlen;
1111  return utf16;
1112 }
1113 
1114 #endif
#define NIL
Definition: pg_list.h:69
#define PG_GETARG_INT32(n)
Definition: fmgr.h:225
Definition: fmgr.h:53
int pg_mbcharcliplen(const char *mbstr, int len, int limit)
Definition: mbutils.c:873
Datum namein(PG_FUNCTION_ARGS)
Definition: name.c:46
void SetMessageEncoding(int encoding)
Definition: mbutils.c:919
int pg_char_to_encoding(const char *name)
Definition: encnames.c:475
#define VARDATA_ANY(PTR)
Definition: postgres.h:349
#define VARDATA(PTR)
Definition: postgres.h:305
static int32 next
Definition: blutils.c:210
int PrepareClientEncoding(int encoding)
Definition: mbutils.c:114
Datum getdatabaseencoding(PG_FUNCTION_ARGS)
Definition: mbutils.c:1027
int pg_encoding_mb2wchar_with_len(int encoding, const char *from, pg_wchar *to, int len)
Definition: mbutils.c:741
char * pg_server_to_client(const char *s, int len)
Definition: mbutils.c:634
#define PG_GETARG_DATUM(n)
Definition: fmgr.h:224
#define VARHDRSZ
Definition: c.h:440
int pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
Definition: wchar.c:1894
FmgrInfo to_server_info
Definition: mbutils.c:68
FmgrInfo to_client_info
Definition: mbutils.c:69
static bool backend_startup_complete
Definition: mbutils.c:94
#define Min(x, y)
Definition: c.h:801
static int pending_client_encoding
Definition: mbutils.c:95
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:109
#define PG_RETURN_INT32(x)
Definition: fmgr.h:298
static FmgrInfo * ToServerConvProc
Definition: mbutils.c:78
#define write_stderr(str)
Definition: parallel.c:182
static char * perform_default_encoding_conversion(const char *src, int len, bool is_client_to_server)
Definition: mbutils.c:679
int errcode(int sqlerrcode)
Definition: elog.c:575
const char * name
Definition: pg_wchar.h:330
static const pg_enc2name * ClientEncoding
Definition: mbutils.c:84
size_t pg_wchar_strlen(const pg_wchar *str)
Definition: wstrncmp.c:70
#define DirectFunctionCall1(func, arg1)
Definition: fmgr.h:555
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
#define PG_RETURN_BYTEA_P(x)
Definition: fmgr.h:313
#define LOG
Definition: elog.h:26
unsigned int Oid
Definition: postgres_ext.h:31
unsigned char * pg_do_encoding_conversion(unsigned char *src, int len, int src_encoding, int dest_encoding)
Definition: mbutils.c:337
#define OidIsValid(objectId)
Definition: c.h:533
char * pg_server_to_any(const char *s, int len, int encoding)
Definition: mbutils.c:645
int pg_wchar2mb(const pg_wchar *from, char *to)
Definition: mbutils.c:749
const pg_enc2gettext pg_enc2gettext_tbl[]
Definition: encnames.c:359
int pg_mbstrlen_with_len(const char *mbstr, int limit)
Definition: mbutils.c:805
mbdisplaylen_converter dsplen
Definition: pg_wchar.h:361
int s_encoding
Definition: mbutils.c:66
Datum pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
Definition: mbutils.c:540
void pfree(void *pointer)
Definition: mcxt.c:992
char * pg_client_to_server(const char *s, int len)
Definition: mbutils.c:556
#define IS_HIGHBIT_SET(ch)
Definition: c.h:968
#define ERROR
Definition: elog.h:43
#define FunctionCall5(flinfo, arg1, arg2, arg3, arg4, arg5)
Definition: fmgr.h:581
#define FATAL
Definition: elog.h:52
Datum pg_client_encoding(PG_FUNCTION_ARGS)
Definition: mbutils.c:1033
int pg_mbcliplen(const char *mbstr, int len, int limit)
Definition: mbutils.c:831
static List * ConvProcList
Definition: mbutils.c:72
int pg_encoding_mbcliplen(int encoding, const char *mbstr, int len, int limit)
Definition: mbutils.c:841
int pg_encoding_max_length(int encoding)
Definition: wchar.c:1820
#define PG_VALID_ENCODING(_enc)
Definition: pg_wchar.h:299
const pg_enc2name pg_enc2name_tbl[]
Definition: encnames.c:307
int pg_database_encoding_max_length(void)
Definition: wchar.c:1833
int errdetail(const char *fmt,...)
Definition: elog.c:873
#define CStringGetDatum(X)
Definition: postgres.h:586
int SetClientEncoding(int encoding)
Definition: mbutils.c:212
static ListCell * list_head(const List *l)
Definition: pg_list.h:77
const char * name
Definition: pg_wchar.h:315
struct ConvProcInfo ConvProcInfo
MemoryContext CurrentMemoryContext
Definition: mcxt.c:37
#define PG_VALID_FE_ENCODING(_enc)
Definition: pg_wchar.h:303
void fmgr_info_cxt(Oid functionId, FmgrInfo *finfo, MemoryContext mcxt)
Definition: fmgr.c:169
bool pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
Definition: wchar.c:1877
int pg_mb2wchar(const char *from, pg_wchar *to)
Definition: mbutils.c:727
#define lnext(lc)
Definition: pg_list.h:105
#define ereport(elevel, rest)
Definition: elog.h:122
unsigned int pg_wchar
Definition: mbprint.c:31
MemoryContext TopMemoryContext
Definition: mcxt.c:43
int pg_encoding_wchar2mb_with_len(int encoding, const pg_wchar *from, char *to, int len)
Definition: mbutils.c:763
#define DirectFunctionCall3(func, arg1, arg2, arg3)
Definition: fmgr.h:559
#define MaxAllocSize
Definition: memutils.h:40
List * list_delete_cell(List *list, ListCell *cell, ListCell *prev)
Definition: list.c:528
Datum length_in_encoding(PG_FUNCTION_ARGS)
Definition: mbutils.c:511
uintptr_t Datum
Definition: postgres.h:374
void SetDatabaseEncoding(int encoding)
Definition: mbutils.c:909
#define PG_RETURN_DATUM(x)
Definition: fmgr.h:297
int GetDatabaseEncoding(void)
Definition: mbutils.c:1015
int pg_get_client_encoding(void)
Definition: mbutils.c:317
int pg_mbstrlen(const char *mbstr)
Definition: mbutils.c:785
int pg_get_encoding_from_locale(const char *ctype, bool write_message)
Definition: chklocale.c:438
wchar2mb_with_len_converter wchar2mb_with_len
Definition: pg_wchar.h:358
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
Definition: mbutils.c:734
pg_enc encoding
Definition: pg_wchar.h:316
int pg_dsplen(const char *mbstr)
Definition: mbutils.c:778
static char * encoding
Definition: initdb.c:121
const char * pg_encoding_to_char(int encoding)
Definition: encnames.c:531
List * lcons(void *datum, List *list)
Definition: list.c:259
#define MAX_CONVERSION_GROWTH
Definition: mbutils.c:53
#define PG_VALID_BE_ENCODING(_enc)
Definition: pg_wchar.h:293
#define NULL
Definition: c.h:226
#define Assert(condition)
Definition: c.h:670
#define lfirst(lc)
Definition: pg_list.h:106
const char * GetDatabaseEncodingName(void)
Definition: mbutils.c:1021
static int cliplen(const char *str, int len, int limit)
Definition: mbutils.c:898
Datum pg_convert(PG_FUNCTION_ARGS)
Definition: mbutils.c:449
size_t Size
Definition: c.h:352
const char * pg_get_client_encoding_name(void)
Definition: mbutils.c:326
int(* mblen_converter)(const unsigned char *mbstr)
Definition: pg_wchar.h:346
#define PG_GETARG_BYTEA_PP(n)
Definition: fmgr.h:268
Oid FindDefaultConversionProc(int32 for_encoding, int32 to_encoding)
Definition: namespace.c:3452
bool IsTransactionState(void)
Definition: xact.c:349
#define PG_FREE_IF_COPY(ptr, n)
Definition: fmgr.h:216
Datum pg_convert_from(PG_FUNCTION_ARGS)
Definition: mbutils.c:422
int pg_mblen(const char *mbstr)
Definition: mbutils.c:771
static const pg_enc2name * MessageEncoding
Definition: mbutils.c:86
void InitializeClientEncoding(void)
Definition: mbutils.c:293
const char * name
Definition: encode.c:521
int GetMessageEncoding(void)
Definition: mbutils.c:1045
#define Int32GetDatum(X)
Definition: postgres.h:487
static FmgrInfo * ToClientConvProc
Definition: mbutils.c:79
#define VARSIZE_ANY_EXHDR(PTR)
Definition: postgres.h:342
void * palloc(Size size)
Definition: mcxt.c:891
int errmsg(const char *fmt,...)
Definition: elog.c:797
const pg_wchar_tbl pg_wchar_table[]
Definition: wchar.c:1729
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:749
static const pg_enc2name * DatabaseEncoding
Definition: mbutils.c:85
int i
#define NameStr(name)
Definition: c.h:494
Definition: c.h:434
#define PG_FUNCTION_ARGS
Definition: fmgr.h:150
mb2wchar_with_len_converter mb2wchar_with_len
Definition: pg_wchar.h:356
#define SET_VARSIZE(PTR, len)
Definition: postgres.h:330
#define elog
Definition: elog.h:219
int pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
Definition: mbutils.c:756
char * pg_any_to_server(const char *s, int len, int encoding)
Definition: mbutils.c:572
mblen_converter mblen
Definition: pg_wchar.h:360
int c_encoding
Definition: mbutils.c:67
Definition: pg_list.h:45
Datum pg_convert_to(PG_FUNCTION_ARGS)
Definition: mbutils.c:397
#define PG_RETURN_NULL()
Definition: fmgr.h:289
#define PG_GETARG_NAME(n)
Definition: fmgr.h:234
#define OidFunctionCall5(functionId, arg1, arg2, arg3, arg4, arg5)
Definition: fmgr.h:601