PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
mbutils.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * mbutils.c
4  * This file contains functions for encoding conversion.
5  *
6  * The string-conversion functions in this file share some API quirks.
7  * Note the following:
8  *
9  * The functions return a palloc'd, null-terminated string if conversion
10  * is required. However, if no conversion is performed, the given source
11  * string pointer is returned as-is.
12  *
13  * Although the presence of a length argument means that callers can pass
14  * non-null-terminated strings, care is required because the same string
15  * will be passed back if no conversion occurs. Such callers *must* check
16  * whether result == src and handle that case differently.
17  *
18  * If the source and destination encodings are the same, the source string
19  * is returned without any verification; it's assumed to be valid data.
20  * If that might not be the case, the caller is responsible for validating
21  * the string using a separate call to pg_verify_mbstr(). Whenever the
22  * source and destination encodings are different, the functions ensure that
23  * the result is validly encoded according to the destination encoding.
24  *
25  *
26  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
27  * Portions Copyright (c) 1994, Regents of the University of California
28  *
29  *
30  * IDENTIFICATION
31  * src/backend/utils/mb/mbutils.c
32  *
33  *-------------------------------------------------------------------------
34  */
35 #include "postgres.h"
36 
37 #include "access/xact.h"
38 #include "catalog/namespace.h"
39 #include "mb/pg_wchar.h"
40 #include "utils/builtins.h"
41 #include "utils/memutils.h"
42 #include "utils/syscache.h"
43 
44 /*
45  * We maintain a simple linked list caching the fmgr lookup info for the
46  * currently selected conversion functions, as well as any that have been
47  * selected previously in the current session. (We remember previous
48  * settings because we must be able to restore a previous setting during
49  * transaction rollback, without doing any fresh catalog accesses.)
50  *
51  * Since we'll never release this data, we just keep it in TopMemoryContext.
52  */
53 typedef struct ConvProcInfo
54 {
55  int s_encoding; /* server and client encoding IDs */
57  FmgrInfo to_server_info; /* lookup info for conversion procs */
59 } ConvProcInfo;
60 
61 static List *ConvProcList = NIL; /* List of ConvProcInfo */
62 
63 /*
64  * These variables point to the currently active conversion functions,
65  * or are NULL when no conversion is needed.
66  */
67 static FmgrInfo *ToServerConvProc = NULL;
68 static FmgrInfo *ToClientConvProc = NULL;
69 
70 /*
71  * These variables track the currently-selected encodings.
72  */
76 
77 /*
78  * During backend startup we can't set client encoding because we (a)
79  * can't look up the conversion functions, and (b) may not know the database
80  * encoding yet either. So SetClientEncoding() just accepts anything and
81  * remembers it for InitializeClientEncoding() to apply later.
82  */
83 static bool backend_startup_complete = false;
85 
86 
87 /* Internal functions */
88 static char *perform_default_encoding_conversion(const char *src,
89  int len, bool is_client_to_server);
90 static int cliplen(const char *str, int len, int limit);
91 
92 
93 /*
94  * Prepare for a future call to SetClientEncoding. Success should mean
95  * that SetClientEncoding is guaranteed to succeed for this encoding request.
96  *
97  * (But note that success before backend_startup_complete does not guarantee
98  * success after ...)
99  *
100  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
101  */
102 int
104 {
105  int current_server_encoding;
106  ListCell *lc;
107 
108  if (!PG_VALID_FE_ENCODING(encoding))
109  return -1;
110 
111  /* Can't do anything during startup, per notes above */
113  return 0;
114 
115  current_server_encoding = GetDatabaseEncoding();
116 
117  /*
118  * Check for cases that require no conversion function.
119  */
120  if (current_server_encoding == encoding ||
121  current_server_encoding == PG_SQL_ASCII ||
122  encoding == PG_SQL_ASCII)
123  return 0;
124 
125  if (IsTransactionState())
126  {
127  /*
128  * If we're in a live transaction, it's safe to access the catalogs,
129  * so look up the functions. We repeat the lookup even if the info is
130  * already cached, so that we can react to changes in the contents of
131  * pg_conversion.
132  */
133  Oid to_server_proc,
134  to_client_proc;
135  ConvProcInfo *convinfo;
136  MemoryContext oldcontext;
137 
138  to_server_proc = FindDefaultConversionProc(encoding,
139  current_server_encoding);
140  if (!OidIsValid(to_server_proc))
141  return -1;
142  to_client_proc = FindDefaultConversionProc(current_server_encoding,
143  encoding);
144  if (!OidIsValid(to_client_proc))
145  return -1;
146 
147  /*
148  * Load the fmgr info into TopMemoryContext (could still fail here)
149  */
151  sizeof(ConvProcInfo));
152  convinfo->s_encoding = current_server_encoding;
153  convinfo->c_encoding = encoding;
154  fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
156  fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
158 
159  /* Attach new info to head of list */
161  ConvProcList = lcons(convinfo, ConvProcList);
162  MemoryContextSwitchTo(oldcontext);
163 
164  /*
165  * We cannot yet remove any older entry for the same encoding pair,
166  * since it could still be in use. SetClientEncoding will clean up.
167  */
168 
169  return 0; /* success */
170  }
171  else
172  {
173  /*
174  * If we're not in a live transaction, the only thing we can do is
175  * restore a previous setting using the cache. This covers all
176  * transaction-rollback cases. The only case it might not work for is
177  * trying to change client_encoding on the fly by editing
178  * postgresql.conf and SIGHUP'ing. Which would probably be a stupid
179  * thing to do anyway.
180  */
181  foreach(lc, ConvProcList)
182  {
183  ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
184 
185  if (oldinfo->s_encoding == current_server_encoding &&
186  oldinfo->c_encoding == encoding)
187  return 0;
188  }
189 
190  return -1; /* it's not cached, so fail */
191  }
192 }
193 
194 /*
195  * Set the active client encoding and set up the conversion-function pointers.
196  * PrepareClientEncoding should have been called previously for this encoding.
197  *
198  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
199  */
200 int
202 {
203  int current_server_encoding;
204  bool found;
205  ListCell *lc;
206  ListCell *prev;
207  ListCell *next;
208 
209  if (!PG_VALID_FE_ENCODING(encoding))
210  return -1;
211 
212  /* Can't do anything during startup, per notes above */
214  {
216  return 0;
217  }
218 
219  current_server_encoding = GetDatabaseEncoding();
220 
221  /*
222  * Check for cases that require no conversion function.
223  */
224  if (current_server_encoding == encoding ||
225  current_server_encoding == PG_SQL_ASCII ||
226  encoding == PG_SQL_ASCII)
227  {
228  ClientEncoding = &pg_enc2name_tbl[encoding];
229  ToServerConvProc = NULL;
230  ToClientConvProc = NULL;
231  return 0;
232  }
233 
234  /*
235  * Search the cache for the entry previously prepared by
236  * PrepareClientEncoding; if there isn't one, we lose. While at it,
237  * release any duplicate entries so that repeated Prepare/Set cycles don't
238  * leak memory.
239  */
240  found = false;
241  prev = NULL;
242  for (lc = list_head(ConvProcList); lc; lc = next)
243  {
244  ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
245 
246  next = lnext(lc);
247 
248  if (convinfo->s_encoding == current_server_encoding &&
249  convinfo->c_encoding == encoding)
250  {
251  if (!found)
252  {
253  /* Found newest entry, so set up */
254  ClientEncoding = &pg_enc2name_tbl[encoding];
255  ToServerConvProc = &convinfo->to_server_info;
256  ToClientConvProc = &convinfo->to_client_info;
257  found = true;
258  }
259  else
260  {
261  /* Duplicate entry, release it */
262  ConvProcList = list_delete_cell(ConvProcList, lc, prev);
263  pfree(convinfo);
264  continue; /* prev mustn't advance */
265  }
266  }
267 
268  prev = lc;
269  }
270 
271  if (found)
272  return 0; /* success */
273  else
274  return -1; /* it's not cached, so fail */
275 }
276 
277 /*
278  * Initialize client encoding conversions.
279  * Called from InitPostgres() once during backend startup.
280  */
281 void
283 {
286 
289  {
290  /*
291  * Oops, the requested conversion is not available. We couldn't fail
292  * before, but we can now.
293  */
294  ereport(FATAL,
295  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
296  errmsg("conversion between %s and %s is not supported",
299  }
300 }
301 
302 /*
303  * returns the current client encoding
304  */
305 int
307 {
308  return ClientEncoding->encoding;
309 }
310 
311 /*
312  * returns the current client encoding name
313  */
314 const char *
316 {
317  return ClientEncoding->name;
318 }
319 
320 /*
321  * Convert src string to another encoding (general case).
322  *
323  * See the notes about string conversion functions at the top of this file.
324  */
325 unsigned char *
326 pg_do_encoding_conversion(unsigned char *src, int len,
327  int src_encoding, int dest_encoding)
328 {
329  unsigned char *result;
330  Oid proc;
331 
332  if (len <= 0)
333  return src; /* empty string is always valid */
334 
335  if (src_encoding == dest_encoding)
336  return src; /* no conversion required, assume valid */
337 
338  if (dest_encoding == PG_SQL_ASCII)
339  return src; /* any string is valid in SQL_ASCII */
340 
341  if (src_encoding == PG_SQL_ASCII)
342  {
343  /* No conversion is possible, but we must validate the result */
344  (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
345  return src;
346  }
347 
348  if (!IsTransactionState()) /* shouldn't happen */
349  elog(ERROR, "cannot perform encoding conversion outside a transaction");
350 
351  proc = FindDefaultConversionProc(src_encoding, dest_encoding);
352  if (!OidIsValid(proc))
353  ereport(ERROR,
354  (errcode(ERRCODE_UNDEFINED_FUNCTION),
355  errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
356  pg_encoding_to_char(src_encoding),
357  pg_encoding_to_char(dest_encoding))));
358 
359  /*
360  * Allocate space for conversion result, being wary of integer overflow
361  */
362  if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
363  ereport(ERROR,
364  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
365  errmsg("out of memory"),
366  errdetail("String of %d bytes is too long for encoding conversion.",
367  len)));
368 
369  result = palloc(len * MAX_CONVERSION_GROWTH + 1);
370 
371  OidFunctionCall5(proc,
372  Int32GetDatum(src_encoding),
373  Int32GetDatum(dest_encoding),
374  CStringGetDatum(src),
375  CStringGetDatum(result),
376  Int32GetDatum(len));
377  return result;
378 }
379 
380 /*
381  * Convert string to encoding encoding_name. The source
382  * encoding is the DB encoding.
383  *
384  * BYTEA convert_to(TEXT string, NAME encoding_name) */
385 Datum
387 {
388  Datum string = PG_GETARG_DATUM(0);
389  Datum dest_encoding_name = PG_GETARG_DATUM(1);
390  Datum src_encoding_name = DirectFunctionCall1(namein,
391  CStringGetDatum(DatabaseEncoding->name));
392  Datum result;
393 
394  /*
395  * pg_convert expects a bytea as its first argument. We're passing it a
396  * text argument here, relying on the fact that they are both in fact
397  * varlena types, and thus structurally identical.
398  */
399  result = DirectFunctionCall3(pg_convert, string,
400  src_encoding_name, dest_encoding_name);
401 
402  PG_RETURN_DATUM(result);
403 }
404 
405 /*
406  * Convert string from encoding encoding_name. The destination
407  * encoding is the DB encoding.
408  *
409  * TEXT convert_from(BYTEA string, NAME encoding_name) */
410 Datum
412 {
413  Datum string = PG_GETARG_DATUM(0);
414  Datum src_encoding_name = PG_GETARG_DATUM(1);
415  Datum dest_encoding_name = DirectFunctionCall1(namein,
416  CStringGetDatum(DatabaseEncoding->name));
417  Datum result;
418 
419  result = DirectFunctionCall3(pg_convert, string,
420  src_encoding_name, dest_encoding_name);
421 
422  /*
423  * pg_convert returns a bytea, which we in turn return as text, relying on
424  * the fact that they are both in fact varlena types, and thus
425  * structurally identical. Although not all bytea values are valid text,
426  * in this case it will be because we've told pg_convert to return one
427  * that is valid as text in the current database encoding.
428  */
429  PG_RETURN_DATUM(result);
430 }
431 
432 /*
433  * Convert string between two arbitrary encodings.
434  *
435  * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
436  */
437 Datum
439 {
440  bytea *string = PG_GETARG_BYTEA_PP(0);
441  char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
442  int src_encoding = pg_char_to_encoding(src_encoding_name);
443  char *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
444  int dest_encoding = pg_char_to_encoding(dest_encoding_name);
445  const char *src_str;
446  char *dest_str;
447  bytea *retval;
448  int len;
449 
450  if (src_encoding < 0)
451  ereport(ERROR,
452  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
453  errmsg("invalid source encoding name \"%s\"",
454  src_encoding_name)));
455  if (dest_encoding < 0)
456  ereport(ERROR,
457  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
458  errmsg("invalid destination encoding name \"%s\"",
459  dest_encoding_name)));
460 
461  /* make sure that source string is valid */
462  len = VARSIZE_ANY_EXHDR(string);
463  src_str = VARDATA_ANY(string);
464  pg_verify_mbstr_len(src_encoding, src_str, len, false);
465 
466  /* perform conversion */
467  dest_str = (char *) pg_do_encoding_conversion((unsigned char *) src_str,
468  len,
469  src_encoding,
470  dest_encoding);
471 
472  /* update len if conversion actually happened */
473  if (dest_str != src_str)
474  len = strlen(dest_str);
475 
476  /*
477  * build bytea data type structure.
478  */
479  retval = (bytea *) palloc(len + VARHDRSZ);
480  SET_VARSIZE(retval, len + VARHDRSZ);
481  memcpy(VARDATA(retval), dest_str, len);
482 
483  if (dest_str != src_str)
484  pfree(dest_str);
485 
486  /* free memory if allocated by the toaster */
487  PG_FREE_IF_COPY(string, 0);
488 
489  PG_RETURN_BYTEA_P(retval);
490 }
491 
492 /*
493  * get the length of the string considered as text in the specified
494  * encoding. Raises an error if the data is not valid in that
495  * encoding.
496  *
497  * INT4 length (BYTEA string, NAME src_encoding_name)
498  */
499 Datum
501 {
502  bytea *string = PG_GETARG_BYTEA_PP(0);
503  char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
504  int src_encoding = pg_char_to_encoding(src_encoding_name);
505  const char *src_str;
506  int len;
507  int retval;
508 
509  if (src_encoding < 0)
510  ereport(ERROR,
511  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
512  errmsg("invalid encoding name \"%s\"",
513  src_encoding_name)));
514 
515  len = VARSIZE_ANY_EXHDR(string);
516  src_str = VARDATA_ANY(string);
517 
518  retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
519 
520  PG_RETURN_INT32(retval);
521 }
522 
523 /*
524  * Get maximum multibyte character length in the specified encoding.
525  *
526  * Note encoding is specified numerically, not by name as above.
527  */
528 Datum
530 {
531  int encoding = PG_GETARG_INT32(0);
532 
533  if (PG_VALID_ENCODING(encoding))
534  PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
535  else
536  PG_RETURN_NULL();
537 }
538 
539 /*
540  * Convert client encoding to server encoding.
541  *
542  * See the notes about string conversion functions at the top of this file.
543  */
544 char *
545 pg_client_to_server(const char *s, int len)
546 {
547  return pg_any_to_server(s, len, ClientEncoding->encoding);
548 }
549 
550 /*
551  * Convert any encoding to server encoding.
552  *
553  * See the notes about string conversion functions at the top of this file.
554  *
555  * Unlike the other string conversion functions, this will apply validation
556  * even if encoding == DatabaseEncoding->encoding. This is because this is
557  * used to process data coming in from outside the database, and we never
558  * want to just assume validity.
559  */
560 char *
561 pg_any_to_server(const char *s, int len, int encoding)
562 {
563  if (len <= 0)
564  return (char *) s; /* empty string is always valid */
565 
566  if (encoding == DatabaseEncoding->encoding ||
567  encoding == PG_SQL_ASCII)
568  {
569  /*
570  * No conversion is needed, but we must still validate the data.
571  */
572  (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
573  return (char *) s;
574  }
575 
576  if (DatabaseEncoding->encoding == PG_SQL_ASCII)
577  {
578  /*
579  * No conversion is possible, but we must still validate the data,
580  * because the client-side code might have done string escaping using
581  * the selected client_encoding. If the client encoding is ASCII-safe
582  * then we just do a straight validation under that encoding. For an
583  * ASCII-unsafe encoding we have a problem: we dare not pass such data
584  * to the parser but we have no way to convert it. We compromise by
585  * rejecting the data if it contains any non-ASCII characters.
586  */
587  if (PG_VALID_BE_ENCODING(encoding))
588  (void) pg_verify_mbstr(encoding, s, len, false);
589  else
590  {
591  int i;
592 
593  for (i = 0; i < len; i++)
594  {
595  if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
596  ereport(ERROR,
597  (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
598  errmsg("invalid byte value for encoding \"%s\": 0x%02x",
600  (unsigned char) s[i])));
601  }
602  }
603  return (char *) s;
604  }
605 
606  /* Fast path if we can use cached conversion function */
607  if (encoding == ClientEncoding->encoding)
608  return perform_default_encoding_conversion(s, len, true);
609 
610  /* General case ... will not work outside transactions */
611  return (char *) pg_do_encoding_conversion((unsigned char *) s,
612  len,
613  encoding,
614  DatabaseEncoding->encoding);
615 }
616 
617 /*
618  * Convert server encoding to client encoding.
619  *
620  * See the notes about string conversion functions at the top of this file.
621  */
622 char *
623 pg_server_to_client(const char *s, int len)
624 {
625  return pg_server_to_any(s, len, ClientEncoding->encoding);
626 }
627 
628 /*
629  * Convert server encoding to any encoding.
630  *
631  * See the notes about string conversion functions at the top of this file.
632  */
633 char *
634 pg_server_to_any(const char *s, int len, int encoding)
635 {
636  if (len <= 0)
637  return (char *) s; /* empty string is always valid */
638 
639  if (encoding == DatabaseEncoding->encoding ||
640  encoding == PG_SQL_ASCII)
641  return (char *) s; /* assume data is valid */
642 
643  if (DatabaseEncoding->encoding == PG_SQL_ASCII)
644  {
645  /* No conversion is possible, but we must validate the result */
646  (void) pg_verify_mbstr(encoding, s, len, false);
647  return (char *) s;
648  }
649 
650  /* Fast path if we can use cached conversion function */
651  if (encoding == ClientEncoding->encoding)
652  return perform_default_encoding_conversion(s, len, false);
653 
654  /* General case ... will not work outside transactions */
655  return (char *) pg_do_encoding_conversion((unsigned char *) s,
656  len,
657  DatabaseEncoding->encoding,
658  encoding);
659 }
660 
661 /*
662  * Perform default encoding conversion using cached FmgrInfo. Since
663  * this function does not access database at all, it is safe to call
664  * outside transactions. If the conversion has not been set up by
665  * SetClientEncoding(), no conversion is performed.
666  */
667 static char *
668 perform_default_encoding_conversion(const char *src, int len,
669  bool is_client_to_server)
670 {
671  char *result;
672  int src_encoding,
673  dest_encoding;
674  FmgrInfo *flinfo;
675 
676  if (is_client_to_server)
677  {
678  src_encoding = ClientEncoding->encoding;
679  dest_encoding = DatabaseEncoding->encoding;
680  flinfo = ToServerConvProc;
681  }
682  else
683  {
684  src_encoding = DatabaseEncoding->encoding;
685  dest_encoding = ClientEncoding->encoding;
686  flinfo = ToClientConvProc;
687  }
688 
689  if (flinfo == NULL)
690  return (char *) src;
691 
692  /*
693  * Allocate space for conversion result, being wary of integer overflow
694  */
695  if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
696  ereport(ERROR,
697  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
698  errmsg("out of memory"),
699  errdetail("String of %d bytes is too long for encoding conversion.",
700  len)));
701 
702  result = palloc(len * MAX_CONVERSION_GROWTH + 1);
703 
704  FunctionCall5(flinfo,
705  Int32GetDatum(src_encoding),
706  Int32GetDatum(dest_encoding),
707  CStringGetDatum(src),
708  CStringGetDatum(result),
709  Int32GetDatum(len));
710  return result;
711 }
712 
713 
714 /* convert a multibyte string to a wchar */
715 int
716 pg_mb2wchar(const char *from, pg_wchar *to)
717 {
718  return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, strlen(from));
719 }
720 
721 /* convert a multibyte string to a wchar with a limited length */
722 int
723 pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
724 {
725  return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
726 }
727 
728 /* same, with any encoding */
729 int
731  const char *from, pg_wchar *to, int len)
732 {
733  return pg_wchar_table[encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
734 }
735 
736 /* convert a wchar string to a multibyte */
737 int
738 pg_wchar2mb(const pg_wchar *from, char *to)
739 {
740  return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, pg_wchar_strlen(from));
741 }
742 
743 /* convert a wchar string to a multibyte with a limited length */
744 int
745 pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
746 {
747  return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
748 }
749 
750 /* same, with any encoding */
751 int
753  const pg_wchar *from, char *to, int len)
754 {
755  return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
756 }
757 
758 /* returns the byte length of a multibyte character */
759 int
760 pg_mblen(const char *mbstr)
761 {
762  return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
763 }
764 
765 /* returns the display length of a multibyte character */
766 int
767 pg_dsplen(const char *mbstr)
768 {
769  return pg_wchar_table[DatabaseEncoding->encoding].dsplen((const unsigned char *) mbstr);
770 }
771 
772 /* returns the length (counted in wchars) of a multibyte string */
773 int
774 pg_mbstrlen(const char *mbstr)
775 {
776  int len = 0;
777 
778  /* optimization for single byte encoding */
780  return strlen(mbstr);
781 
782  while (*mbstr)
783  {
784  mbstr += pg_mblen(mbstr);
785  len++;
786  }
787  return len;
788 }
789 
790 /* returns the length (counted in wchars) of a multibyte string
791  * (not necessarily NULL terminated)
792  */
793 int
794 pg_mbstrlen_with_len(const char *mbstr, int limit)
795 {
796  int len = 0;
797 
798  /* optimization for single byte encoding */
800  return limit;
801 
802  while (limit > 0 && *mbstr)
803  {
804  int l = pg_mblen(mbstr);
805 
806  limit -= l;
807  mbstr += l;
808  len++;
809  }
810  return len;
811 }
812 
813 /*
814  * returns the byte length of a multibyte string
815  * (not necessarily NULL terminated)
816  * that is no longer than limit.
817  * this function does not break multibyte character boundary.
818  */
819 int
820 pg_mbcliplen(const char *mbstr, int len, int limit)
821 {
822  return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
823  len, limit);
824 }
825 
826 /*
827  * pg_mbcliplen with specified encoding
828  */
829 int
830 pg_encoding_mbcliplen(int encoding, const char *mbstr,
831  int len, int limit)
832 {
833  mblen_converter mblen_fn;
834  int clen = 0;
835  int l;
836 
837  /* optimization for single byte encoding */
838  if (pg_encoding_max_length(encoding) == 1)
839  return cliplen(mbstr, len, limit);
840 
841  mblen_fn = pg_wchar_table[encoding].mblen;
842 
843  while (len > 0 && *mbstr)
844  {
845  l = (*mblen_fn) ((const unsigned char *) mbstr);
846  if ((clen + l) > limit)
847  break;
848  clen += l;
849  if (clen == limit)
850  break;
851  len -= l;
852  mbstr += l;
853  }
854  return clen;
855 }
856 
857 /*
858  * Similar to pg_mbcliplen except the limit parameter specifies the
859  * character length, not the byte length.
860  */
861 int
862 pg_mbcharcliplen(const char *mbstr, int len, int limit)
863 {
864  int clen = 0;
865  int nch = 0;
866  int l;
867 
868  /* optimization for single byte encoding */
870  return cliplen(mbstr, len, limit);
871 
872  while (len > 0 && *mbstr)
873  {
874  l = pg_mblen(mbstr);
875  nch++;
876  if (nch > limit)
877  break;
878  clen += l;
879  len -= l;
880  mbstr += l;
881  }
882  return clen;
883 }
884 
/*
 * mbcliplen for any single-byte encoding: count the bytes of str up to the
 * smaller of len and limit, stopping early at a null byte.
 */
static int
cliplen(const char *str, int len, int limit)
{
	int			bound = (len < limit) ? len : limit;
	int			n;

	for (n = 0; n < bound; n++)
	{
		if (str[n] == '\0')
			break;
	}
	return n;
}
896 
897 void
899 {
900  if (!PG_VALID_BE_ENCODING(encoding))
901  elog(ERROR, "invalid database encoding: %d", encoding);
902 
903  DatabaseEncoding = &pg_enc2name_tbl[encoding];
904  Assert(DatabaseEncoding->encoding == encoding);
905 }
906 
907 void
909 {
910  /* Some calls happen before we can elog()! */
911  Assert(PG_VALID_ENCODING(encoding));
912 
913  MessageEncoding = &pg_enc2name_tbl[encoding];
914  Assert(MessageEncoding->encoding == encoding);
915 }
916 
917 #ifdef ENABLE_NLS
918 /*
919  * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
920  * codeset. Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
921  * fail for gettext-internal causes like out-of-memory.
922  */
923 static bool
924 raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
925 {
926  bool elog_ok = (CurrentMemoryContext != NULL);
927  int i;
928 
929  for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++)
930  {
931  if (pg_enc2gettext_tbl[i].encoding == encoding)
932  {
933  if (bind_textdomain_codeset(domainname,
934  pg_enc2gettext_tbl[i].name) != NULL)
935  return true;
936 
937  if (elog_ok)
938  elog(LOG, "bind_textdomain_codeset failed");
939  else
940  write_stderr("bind_textdomain_codeset failed");
941 
942  break;
943  }
944  }
945 
946  return false;
947 }
948 
949 /*
950  * Bind a gettext message domain to the codeset corresponding to the database
951  * encoding. For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
952  * Return the MessageEncoding implied by the new settings.
953  *
954  * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
955  * When that matches the database encoding, we don't need to do anything. In
956  * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
957  * database encoding, except for the C locale. (On Windows, we also permit a
958  * discrepancy under the UTF8 encoding.) For the C locale, explicitly bind
959  * gettext to the right codeset.
960  *
961  * On Windows, gettext defaults to the Windows ANSI code page. This is a
962  * convenient departure for software that passes the strings to Windows ANSI
963  * APIs, but we don't do that. Compel gettext to use database encoding or,
964  * failing that, the LC_CTYPE encoding as it would on other platforms.
965  *
966  * This function is called before elog() and palloc() are usable.
967  */
968 int
969 pg_bind_textdomain_codeset(const char *domainname)
970 {
971  bool elog_ok = (CurrentMemoryContext != NULL);
972  int encoding = GetDatabaseEncoding();
973  int new_msgenc;
974 
975 #ifndef WIN32
976  const char *ctype = setlocale(LC_CTYPE, NULL);
977 
978  if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
979 #endif
980  if (encoding != PG_SQL_ASCII &&
981  raw_pg_bind_textdomain_codeset(domainname, encoding))
982  return encoding;
983 
984  new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
985  if (new_msgenc < 0)
986  new_msgenc = PG_SQL_ASCII;
987 
988 #ifdef WIN32
989  if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
990  /* On failure, the old message encoding remains valid. */
991  return GetMessageEncoding();
992 #endif
993 
994  return new_msgenc;
995 }
996 #endif
997 
998 /*
999  * The database encoding, also called the server encoding, represents the
1000  * encoding of data stored in text-like data types. Affected types include
1001  * cstring, text, varchar, name, xml, and json.
1002  */
1003 int
1005 {
1006  return DatabaseEncoding->encoding;
1007 }
1008 
1009 const char *
1011 {
1012  return DatabaseEncoding->name;
1013 }
1014 
1015 Datum
1017 {
1018  return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
1019 }
1020 
1021 Datum
1023 {
1024  return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
1025 }
1026 
1027 /*
1028  * gettext() returns messages in this encoding. This often matches the
1029  * database encoding, but it differs for SQL_ASCII databases, for processes
1030  * not attached to a database, and under a database encoding lacking iconv
1031  * support (MULE_INTERNAL).
1032  */
1033 int
1035 {
1036  return MessageEncoding->encoding;
1037 }
1038 
1039 #ifdef WIN32
1040 /*
1041  * Result is palloc'ed null-terminated utf16 string. The character length
1042  * is also passed to utf16len if not null. Returns NULL iff failed.
1043  */
1044 WCHAR *
1045 pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
1046 {
1047  WCHAR *utf16;
1048  int dstlen;
1049  UINT codepage;
1050 
1051  codepage = pg_enc2name_tbl[GetMessageEncoding()].codepage;
1052 
1053  /*
1054  * Use MultiByteToWideChar directly if there is a corresponding codepage,
1055  * or double conversion through UTF8 if not. Double conversion is needed,
1056  * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
1057  */
1058  if (codepage != 0)
1059  {
1060  utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1061  dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
1062  utf16[dstlen] = (WCHAR) 0;
1063  }
1064  else
1065  {
1066  char *utf8;
1067 
1068  /*
1069  * XXX pg_do_encoding_conversion() requires a transaction. In the
1070  * absence of one, hope for the input to be valid UTF8.
1071  */
1072  if (IsTransactionState())
1073  {
1074  utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
1075  len,
1077  PG_UTF8);
1078  if (utf8 != str)
1079  len = strlen(utf8);
1080  }
1081  else
1082  utf8 = (char *) str;
1083 
1084  utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1085  dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
1086  utf16[dstlen] = (WCHAR) 0;
1087 
1088  if (utf8 != str)
1089  pfree(utf8);
1090  }
1091 
1092  if (dstlen == 0 && len > 0)
1093  {
1094  pfree(utf16);
1095  return NULL; /* error */
1096  }
1097 
1098  if (utf16len)
1099  *utf16len = dstlen;
1100  return utf16;
1101 }
1102 
1103 #endif
#define NIL
Definition: pg_list.h:69
#define PG_GETARG_INT32(n)
Definition: fmgr.h:234
Definition: fmgr.h:56
int pg_mbcharcliplen(const char *mbstr, int len, int limit)
Definition: mbutils.c:862
Datum namein(PG_FUNCTION_ARGS)
Definition: name.c:46
void SetMessageEncoding(int encoding)
Definition: mbutils.c:908
int pg_char_to_encoding(const char *name)
Definition: encnames.c:551
#define VARDATA_ANY(PTR)
Definition: postgres.h:347
#define VARDATA(PTR)
Definition: postgres.h:303
static int32 next
Definition: blutils.c:210
int PrepareClientEncoding(int encoding)
Definition: mbutils.c:103
Datum getdatabaseencoding(PG_FUNCTION_ARGS)
Definition: mbutils.c:1016
int pg_encoding_mb2wchar_with_len(int encoding, const char *from, pg_wchar *to, int len)
Definition: mbutils.c:730
char * pg_server_to_client(const char *s, int len)
Definition: mbutils.c:623
#define PG_GETARG_DATUM(n)
Definition: fmgr.h:233
#define VARHDRSZ
Definition: c.h:439
int pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
Definition: wchar.c:1894
FmgrInfo to_server_info
Definition: mbutils.c:57
FmgrInfo to_client_info
Definition: mbutils.c:58
static bool backend_startup_complete
Definition: mbutils.c:83
#define Min(x, y)
Definition: c.h:812
static int pending_client_encoding
Definition: mbutils.c:84
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:109
#define PG_RETURN_INT32(x)
Definition: fmgr.h:314
static FmgrInfo * ToServerConvProc
Definition: mbutils.c:67
#define write_stderr(str)
Definition: parallel.c:182
static char * perform_default_encoding_conversion(const char *src, int len, bool is_client_to_server)
Definition: mbutils.c:668
int errcode(int sqlerrcode)
Definition: elog.c:575
const char * name
Definition: pg_wchar.h:343
static const pg_enc2name * ClientEncoding
Definition: mbutils.c:73
size_t pg_wchar_strlen(const pg_wchar *str)
Definition: wstrncmp.c:70
#define DirectFunctionCall1(func, arg1)
Definition: fmgr.h:585
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
#define PG_RETURN_BYTEA_P(x)
Definition: fmgr.h:330
#define LOG
Definition: elog.h:26
unsigned int Oid
Definition: postgres_ext.h:31
unsigned char * pg_do_encoding_conversion(unsigned char *src, int len, int src_encoding, int dest_encoding)
Definition: mbutils.c:326
#define OidIsValid(objectId)
Definition: c.h:532
char * pg_server_to_any(const char *s, int len, int encoding)
Definition: mbutils.c:634
int pg_wchar2mb(const pg_wchar *from, char *to)
Definition: mbutils.c:738
const pg_enc2gettext pg_enc2gettext_tbl[]
Definition: encnames.c:359
int pg_mbstrlen_with_len(const char *mbstr, int limit)
Definition: mbutils.c:794
mbdisplaylen_converter dsplen
Definition: pg_wchar.h:380
int s_encoding
Definition: mbutils.c:55
Datum pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
Definition: mbutils.c:529
void pfree(void *pointer)
Definition: mcxt.c:949
char * pg_client_to_server(const char *s, int len)
Definition: mbutils.c:545
#define IS_HIGHBIT_SET(ch)
Definition: c.h:979
#define ERROR
Definition: elog.h:43
#define FunctionCall5(flinfo, arg1, arg2, arg3, arg4, arg5)
Definition: fmgr.h:611
#define FATAL
Definition: elog.h:52
Datum pg_client_encoding(PG_FUNCTION_ARGS)
Definition: mbutils.c:1022
int pg_mbcliplen(const char *mbstr, int len, int limit)
Definition: mbutils.c:820
static List * ConvProcList
Definition: mbutils.c:61
int pg_encoding_mbcliplen(int encoding, const char *mbstr, int len, int limit)
Definition: mbutils.c:830
int pg_encoding_max_length(int encoding)
Definition: wchar.c:1820
#define PG_VALID_ENCODING(_enc)
Definition: pg_wchar.h:301
const pg_enc2name pg_enc2name_tbl[]
Definition: encnames.c:307
int pg_database_encoding_max_length(void)
Definition: wchar.c:1833
int errdetail(const char *fmt,...)
Definition: elog.c:873
#define CStringGetDatum(X)
Definition: postgres.h:584
int SetClientEncoding(int encoding)
Definition: mbutils.c:201
static ListCell * list_head(const List *l)
Definition: pg_list.h:77
const char * name
Definition: pg_wchar.h:328
struct ConvProcInfo ConvProcInfo
MemoryContext CurrentMemoryContext
Definition: mcxt.c:37
#define PG_VALID_FE_ENCODING(_enc)
Definition: pg_wchar.h:305
void fmgr_info_cxt(Oid functionId, FmgrInfo *finfo, MemoryContext mcxt)
Definition: fmgr.c:132
bool pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
Definition: wchar.c:1877
int pg_mb2wchar(const char *from, pg_wchar *to)
Definition: mbutils.c:716
#define lnext(lc)
Definition: pg_list.h:105
#define ereport(elevel, rest)
Definition: elog.h:122
unsigned int pg_wchar
Definition: mbprint.c:31
MemoryContext TopMemoryContext
Definition: mcxt.c:43
int pg_encoding_wchar2mb_with_len(int encoding, const pg_wchar *from, char *to, int len)
Definition: mbutils.c:752
#define DirectFunctionCall3(func, arg1, arg2, arg3)
Definition: fmgr.h:589
#define MaxAllocSize
Definition: memutils.h:40
List * list_delete_cell(List *list, ListCell *cell, ListCell *prev)
Definition: list.c:528
Datum length_in_encoding(PG_FUNCTION_ARGS)
Definition: mbutils.c:500
uintptr_t Datum
Definition: postgres.h:372
void SetDatabaseEncoding(int encoding)
Definition: mbutils.c:898
#define PG_RETURN_DATUM(x)
Definition: fmgr.h:313
int GetDatabaseEncoding(void)
Definition: mbutils.c:1004
int pg_get_client_encoding(void)
Definition: mbutils.c:306
int pg_mbstrlen(const char *mbstr)
Definition: mbutils.c:774
int pg_get_encoding_from_locale(const char *ctype, bool write_message)
Definition: chklocale.c:433
wchar2mb_with_len_converter wchar2mb_with_len
Definition: pg_wchar.h:377
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
Definition: mbutils.c:723
pg_enc encoding
Definition: pg_wchar.h:329
int pg_dsplen(const char *mbstr)
Definition: mbutils.c:767
static char * encoding
Definition: initdb.c:123
const char * pg_encoding_to_char(int encoding)
Definition: encnames.c:607
List * lcons(void *datum, List *list)
Definition: list.c:259
#define PG_VALID_BE_ENCODING(_enc)
Definition: pg_wchar.h:295
#define Assert(condition)
Definition: c.h:681
#define lfirst(lc)
Definition: pg_list.h:106
const char * GetDatabaseEncodingName(void)
Definition: mbutils.c:1010
static int cliplen(const char *str, int len, int limit)
Definition: mbutils.c:887
Datum pg_convert(PG_FUNCTION_ARGS)
Definition: mbutils.c:438
size_t Size
Definition: c.h:350
const char * pg_get_client_encoding_name(void)
Definition: mbutils.c:315
int(* mblen_converter)(const unsigned char *mbstr)
Definition: pg_wchar.h:365
#define PG_GETARG_BYTEA_PP(n)
Definition: fmgr.h:272
Oid FindDefaultConversionProc(int32 for_encoding, int32 to_encoding)
Definition: namespace.c:3602
bool IsTransactionState(void)
Definition: xact.c:351
#define PG_FREE_IF_COPY(ptr, n)
Definition: fmgr.h:225
Datum pg_convert_from(PG_FUNCTION_ARGS)
Definition: mbutils.c:411
int pg_mblen(const char *mbstr)
Definition: mbutils.c:760
static const pg_enc2name * MessageEncoding
Definition: mbutils.c:75
void InitializeClientEncoding(void)
Definition: mbutils.c:282
const char * name
Definition: encode.c:521
int GetMessageEncoding(void)
Definition: mbutils.c:1034
#define MAX_CONVERSION_GROWTH
Definition: pg_wchar.h:316
#define Int32GetDatum(X)
Definition: postgres.h:485
static FmgrInfo * ToClientConvProc
Definition: mbutils.c:68
#define VARSIZE_ANY_EXHDR(PTR)
Definition: postgres.h:340
void * palloc(Size size)
Definition: mcxt.c:848
int errmsg(const char *fmt,...)
Definition: elog.c:797
const pg_wchar_tbl pg_wchar_table[]
Definition: wchar.c:1729
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:706
static const pg_enc2name * DatabaseEncoding
Definition: mbutils.c:74
int i
#define NameStr(name)
Definition: c.h:493
Definition: c.h:433
#define PG_FUNCTION_ARGS
Definition: fmgr.h:158
mb2wchar_with_len_converter mb2wchar_with_len
Definition: pg_wchar.h:375
#define SET_VARSIZE(PTR, len)
Definition: postgres.h:328
#define elog
Definition: elog.h:219
int pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
Definition: mbutils.c:745
char * pg_any_to_server(const char *s, int len, int encoding)
Definition: mbutils.c:561
mblen_converter mblen
Definition: pg_wchar.h:379
int c_encoding
Definition: mbutils.c:56
Definition: pg_list.h:45
Datum pg_convert_to(PG_FUNCTION_ARGS)
Definition: mbutils.c:386
#define PG_RETURN_NULL()
Definition: fmgr.h:305
#define PG_GETARG_NAME(n)
Definition: fmgr.h:243
#define OidFunctionCall5(functionId, arg1, arg2, arg3, arg4, arg5)
Definition: fmgr.h:631