PostgreSQL Source Code git master
Loading...
Searching...
No Matches
mbutils.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * mbutils.c
4 * This file contains functions for encoding conversion.
5 *
6 * The string-conversion functions in this file share some API quirks.
7 * Note the following:
8 *
9 * The functions return a palloc'd, null-terminated string if conversion
10 * is required. However, if no conversion is performed, the given source
11 * string pointer is returned as-is.
12 *
13 * Although the presence of a length argument means that callers can pass
14 * non-null-terminated strings, care is required because the same string
15 * will be passed back if no conversion occurs. Such callers *must* check
16 * whether result == src and handle that case differently.
17 *
18 * If the source and destination encodings are the same, the source string
19 * is returned without any verification; it's assumed to be valid data.
20 * If that might not be the case, the caller is responsible for validating
21 * the string using a separate call to pg_verify_mbstr(). Whenever the
22 * source and destination encodings are different, the functions ensure that
23 * the result is validly encoded according to the destination encoding.
24 *
25 *
26 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
27 * Portions Copyright (c) 1994, Regents of the University of California
28 *
29 *
30 * IDENTIFICATION
31 * src/backend/utils/mb/mbutils.c
32 *
33 *-------------------------------------------------------------------------
34 */
35#include "postgres.h"
36
37#include "access/xact.h"
38#include "catalog/namespace.h"
39#include "mb/pg_wchar.h"
40#include "utils/fmgrprotos.h"
41#include "utils/memdebug.h"
42#include "utils/memutils.h"
43#include "utils/relcache.h"
44#include "varatt.h"
45
46/*
47 * We maintain a simple linked list caching the fmgr lookup info for the
48 * currently selected conversion functions, as well as any that have been
49 * selected previously in the current session. (We remember previous
50 * settings because we must be able to restore a previous setting during
51 * transaction rollback, without doing any fresh catalog accesses.)
52 *
53 * Since we'll never release this data, we just keep it in TopMemoryContext.
54 */
55typedef struct ConvProcInfo
56{
57 int s_encoding; /* server and client encoding IDs */
59 FmgrInfo to_server_info; /* lookup info for conversion procs */
62
63static List *ConvProcList = NIL; /* List of ConvProcInfo */
64
65/*
66 * These variables point to the currently active conversion functions,
67 * or are NULL when no conversion is needed.
68 */
71
72/*
73 * This variable stores the conversion function to convert from UTF-8
74 * to the server encoding. It's NULL if the server encoding *is* UTF-8,
75 * or if we lack a conversion function for this.
76 */
78
79/*
80 * These variables track the currently-selected encodings.
81 */
85
86/*
87 * During backend startup we can't set client encoding because we (a)
88 * can't look up the conversion functions, and (b) may not know the database
89 * encoding yet either. So SetClientEncoding() just accepts anything and
90 * remembers it for InitializeClientEncoding() to apply later.
91 */
92static bool backend_startup_complete = false;
94
95
96/* Internal functions */
97static char *perform_default_encoding_conversion(const char *src,
98 int len, bool is_client_to_server);
99static int cliplen(const char *str, int len, int limit);
100
102static void report_invalid_encoding_int(int encoding, const char *mbstr,
103 int mblen, int len);
104
106static void report_invalid_encoding_db(const char *mbstr, int mblen, int len);
107
108
109/*
110 * Prepare for a future call to SetClientEncoding. Success should mean
111 * that SetClientEncoding is guaranteed to succeed for this encoding request.
112 *
113 * (But note that success before backend_startup_complete does not guarantee
114 * success after ...)
115 *
116 * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
117 */
118int
120{
122 ListCell *lc;
123
125 return -1;
126
127 /* Can't do anything during startup, per notes above */
129 return 0;
130
132
133 /*
134 * Check for cases that require no conversion function.
135 */
139 return 0;
140
141 if (IsTransactionState())
142 {
143 /*
144 * If we're in a live transaction, it's safe to access the catalogs,
145 * so look up the functions. We repeat the lookup even if the info is
146 * already cached, so that we can react to changes in the contents of
147 * pg_conversion.
148 */
152 MemoryContext oldcontext;
153
157 return -1;
159 encoding);
161 return -1;
162
163 /*
164 * Load the fmgr info into TopMemoryContext (could still fail here)
165 */
167 sizeof(ConvProcInfo));
168 convinfo->s_encoding = current_server_encoding;
169 convinfo->c_encoding = encoding;
170 fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
172 fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
174
175 /* Attach new info to head of list */
178 MemoryContextSwitchTo(oldcontext);
179
180 /*
181 * We cannot yet remove any older entry for the same encoding pair,
182 * since it could still be in use. SetClientEncoding will clean up.
183 */
184
185 return 0; /* success */
186 }
187 else
188 {
189 /*
190 * If we're not in a live transaction, the only thing we can do is
191 * restore a previous setting using the cache. This covers all
192 * transaction-rollback cases. The only case it might not work for is
193 * trying to change client_encoding on the fly by editing
194 * postgresql.conf and SIGHUP'ing. Which would probably be a stupid
195 * thing to do anyway.
196 */
197 foreach(lc, ConvProcList)
198 {
200
201 if (oldinfo->s_encoding == current_server_encoding &&
202 oldinfo->c_encoding == encoding)
203 return 0;
204 }
205
206 return -1; /* it's not cached, so fail */
207 }
208}
209
210/*
211 * Set the active client encoding and set up the conversion-function pointers.
212 * PrepareClientEncoding should have been called previously for this encoding.
213 *
214 * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
215 */
216int
218{
220 bool found;
221 ListCell *lc;
222
224 return -1;
225
226 /* Can't do anything during startup, per notes above */
228 {
230 return 0;
231 }
232
234
235 /*
236 * Check for cases that require no conversion function.
237 */
241 {
245 return 0;
246 }
247
248 /*
249 * Search the cache for the entry previously prepared by
250 * PrepareClientEncoding; if there isn't one, we lose. While at it,
251 * release any duplicate entries so that repeated Prepare/Set cycles don't
252 * leak memory.
253 */
254 found = false;
255 foreach(lc, ConvProcList)
256 {
258
259 if (convinfo->s_encoding == current_server_encoding &&
260 convinfo->c_encoding == encoding)
261 {
262 if (!found)
263 {
264 /* Found newest entry, so set up */
266 ToServerConvProc = &convinfo->to_server_info;
267 ToClientConvProc = &convinfo->to_client_info;
268 found = true;
269 }
270 else
271 {
272 /* Duplicate entry, release it */
275 }
276 }
277 }
278
279 if (found)
280 return 0; /* success */
281 else
282 return -1; /* it's not cached, so fail */
283}
284
285/*
286 * Initialize client encoding conversions.
287 * Called from InitPostgres() once during backend startup.
288 */
289void
291{
293
296
299 {
300 /*
301 * Oops, the requested conversion is not available. We couldn't fail
302 * before, but we can now.
303 */
306 errmsg("conversion between %s and %s is not supported",
309 }
310
311 /*
312 * Also look up the UTF8-to-server conversion function if needed. Since
313 * the server encoding is fixed within any one backend process, we don't
314 * have to do this more than once.
315 */
319 {
321
326 /* If there's no such conversion, just leave the pointer as NULL */
328 {
329 FmgrInfo *finfo;
330
332 sizeof(FmgrInfo));
335 /* Set Utf8ToServerConvProc only after data is fully valid */
336 Utf8ToServerConvProc = finfo;
337 }
338 }
339}
340
341/*
342 * returns the current client encoding
343 */
344int
346{
347 return ClientEncoding->encoding;
348}
349
350/*
351 * returns the current client encoding name
352 */
353const char *
358
359/*
360 * Convert src string to another encoding (general case).
361 *
362 * See the notes about string conversion functions at the top of this file.
363 */
364unsigned char *
365pg_do_encoding_conversion(unsigned char *src, int len,
367{
368 unsigned char *result;
369 Oid proc;
370
371 if (len <= 0)
372 return src; /* empty string is always valid */
373
375 return src; /* no conversion required, assume valid */
376
378 return src; /* any string is valid in SQL_ASCII */
379
381 {
382 /* No conversion is possible, but we must validate the result */
383 (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
384 return src;
385 }
386
387 if (!IsTransactionState()) /* shouldn't happen */
388 elog(ERROR, "cannot perform encoding conversion outside a transaction");
389
391 if (!OidIsValid(proc))
394 errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
397
398 /*
399 * Allocate space for conversion result, being wary of integer overflow.
400 *
401 * len * MAX_CONVERSION_GROWTH is typically a vast overestimate of the
402 * required space, so it might exceed MaxAllocSize even though the result
403 * would actually fit. We do not want to hand back a result string that
404 * exceeds MaxAllocSize, because callers might not cope gracefully --- but
405 * if we just allocate more than that, and don't use it, that's fine.
406 */
410 errmsg("out of memory"),
411 errdetail("String of %d bytes is too long for encoding conversion.",
412 len)));
413
414 result = (unsigned char *)
417
418 (void) OidFunctionCall6(proc,
421 CStringGetDatum((char *) src),
422 CStringGetDatum((char *) result),
424 BoolGetDatum(false));
425
426 /*
427 * If the result is large, it's worth repalloc'ing to release any extra
428 * space we asked for. The cutoff here is somewhat arbitrary, but we
429 * *must* check when len * MAX_CONVERSION_GROWTH exceeds MaxAllocSize.
430 */
431 if (len > 1000000)
432 {
433 Size resultlen = strlen((char *) result);
434
435 if (resultlen >= MaxAllocSize)
438 errmsg("out of memory"),
439 errdetail("String of %d bytes is too long for encoding conversion.",
440 len)));
441
442 result = (unsigned char *) repalloc(result, resultlen + 1);
443 }
444
445 return result;
446}
447
448/*
449 * Convert src string to another encoding.
450 *
451 * This function has a different API than the other conversion functions.
452 * The caller should've looked up the conversion function using
453 * FindDefaultConversionProc(). Unlike the other functions, the converted
454 * result is not palloc'd. It is written to the caller-supplied buffer
455 * instead.
456 *
457 * src_encoding - encoding to convert from
458 * dest_encoding - encoding to convert to
459 * src, srclen - input buffer and its length in bytes
460 * dest, destlen - destination buffer and its size in bytes
461 *
462 * The output is null-terminated.
463 *
464 * If destlen < srclen * MAX_CONVERSION_GROWTH + 1, the converted output
465 * wouldn't necessarily fit in the output buffer, and the function will not
466 * convert the whole input.
467 *
468 * TODO: The conversion function interface is not great. Firstly, it
469 * would be nice to pass through the destination buffer size to the
470 * conversion function, so that if you pass a shorter destination buffer, it
471 * could still continue to fill up the whole buffer. Currently, we have to
472 * assume worst case expansion and stop the conversion short, even if there
473 * is in fact space left in the destination buffer. Secondly, it would be
474 * nice to return the number of bytes written to the caller, to avoid a call
475 * to strlen().
476 */
477int
479 int src_encoding,
480 int dest_encoding,
481 unsigned char *src, int srclen,
482 unsigned char *dest, int destlen,
483 bool noError)
484{
485 Datum result;
486
487 /*
488 * If the destination buffer is not large enough to hold the result in the
489 * worst case, limit the input size passed to the conversion function.
490 */
491 if ((Size) srclen >= ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH))
493
494 result = OidFunctionCall6(proc,
497 CStringGetDatum((char *) src),
498 CStringGetDatum((char *) dest),
501 return DatumGetInt32(result);
502}
503
504/*
505 * Convert string to encoding encoding_name. The source
506 * encoding is the DB encoding.
507 *
508 * BYTEA convert_to(TEXT string, NAME encoding_name)
509 */
510Datum
512{
513 Datum string = PG_GETARG_DATUM(0);
517 Datum result;
518
519 /*
520 * pg_convert expects a bytea as its first argument. We're passing it a
521 * text argument here, relying on the fact that they are both in fact
522 * varlena types, and thus structurally identical.
523 */
524 result = DirectFunctionCall3(pg_convert, string,
526
527 PG_RETURN_DATUM(result);
528}
529
530/*
531 * Convert string from encoding encoding_name. The destination
532 * encoding is the DB encoding.
533 *
534 * TEXT convert_from(BYTEA string, NAME encoding_name)
535 */
536Datum
538{
539 Datum string = PG_GETARG_DATUM(0);
543 Datum result;
544
545 result = DirectFunctionCall3(pg_convert, string,
547
548 /*
549 * pg_convert returns a bytea, which we in turn return as text, relying on
550 * the fact that they are both in fact varlena types, and thus
551 * structurally identical. Although not all bytea values are valid text,
552 * in this case it will be because we've told pg_convert to return one
553 * that is valid as text in the current database encoding.
554 */
555 PG_RETURN_DATUM(result);
556}
557
558/*
559 * Convert string between two arbitrary encodings.
560 *
561 * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
562 */
563Datum
565{
566 bytea *string = PG_GETARG_BYTEA_PP(0);
571 const char *src_str;
572 char *dest_str;
573 bytea *retval;
574 int len;
575
576 if (src_encoding < 0)
579 errmsg("invalid source encoding name \"%s\"",
581 if (dest_encoding < 0)
584 errmsg("invalid destination encoding name \"%s\"",
586
587 /* make sure that source string is valid */
588 len = VARSIZE_ANY_EXHDR(string);
589 src_str = VARDATA_ANY(string);
591
592 /* perform conversion */
593 dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str),
594 len,
597
598
599 /* return source string if no conversion happened */
600 if (dest_str == src_str)
601 PG_RETURN_BYTEA_P(string);
602
603 /*
604 * build bytea data type structure.
605 */
607 retval = (bytea *) palloc(len + VARHDRSZ);
608 SET_VARSIZE(retval, len + VARHDRSZ);
609 memcpy(VARDATA(retval), dest_str, len);
611
612 /* free memory if allocated by the toaster */
613 PG_FREE_IF_COPY(string, 0);
614
615 PG_RETURN_BYTEA_P(retval);
616}
617
618/*
619 * get the length of the string considered as text in the specified
620 * encoding. Raises an error if the data is not valid in that
621 * encoding.
622 *
623 * INT4 length (BYTEA string, NAME src_encoding_name)
624 */
625Datum
627{
628 bytea *string = PG_GETARG_BYTEA_PP(0);
631 const char *src_str;
632 int len;
633 int retval;
634
635 if (src_encoding < 0)
638 errmsg("invalid encoding name \"%s\"",
640
641 len = VARSIZE_ANY_EXHDR(string);
642 src_str = VARDATA_ANY(string);
643
644 retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
645
646 PG_RETURN_INT32(retval);
647}
648
649/*
650 * Get maximum multibyte character length in the specified encoding.
651 *
652 * Note encoding is specified numerically, not by name as above.
653 */
654Datum
664
665/*
666 * Convert client encoding to server encoding.
667 *
668 * See the notes about string conversion functions at the top of this file.
669 */
670char *
671pg_client_to_server(const char *s, int len)
672{
674}
675
676/*
677 * Convert any encoding to server encoding.
678 *
679 * See the notes about string conversion functions at the top of this file.
680 *
681 * Unlike the other string conversion functions, this will apply validation
682 * even if encoding == DatabaseEncoding->encoding. This is because this is
683 * used to process data coming in from outside the database, and we never
684 * want to just assume validity.
685 */
686char *
687pg_any_to_server(const char *s, int len, int encoding)
688{
689 if (len <= 0)
690 return unconstify(char *, s); /* empty string is always valid */
691
694 {
695 /*
696 * No conversion is needed, but we must still validate the data.
697 */
699 return unconstify(char *, s);
700 }
701
703 {
704 /*
705 * No conversion is possible, but we must still validate the data,
706 * because the client-side code might have done string escaping using
707 * the selected client_encoding. If the client encoding is ASCII-safe
708 * then we just do a straight validation under that encoding. For an
709 * ASCII-unsafe encoding we have a problem: we dare not pass such data
710 * to the parser but we have no way to convert it. We compromise by
711 * rejecting the data if it contains any non-ASCII characters.
712 */
714 (void) pg_verify_mbstr(encoding, s, len, false);
715 else
716 {
717 int i;
718
719 for (i = 0; i < len; i++)
720 {
721 if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
724 errmsg("invalid byte value for encoding \"%s\": 0x%02x",
726 (unsigned char) s[i])));
727 }
728 }
729 return unconstify(char *, s);
730 }
731
732 /* Fast path if we can use cached conversion function */
735
736 /* General case ... will not work outside transactions */
737 return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
738 len,
739 encoding,
741}
742
743/*
744 * Convert server encoding to client encoding.
745 *
746 * See the notes about string conversion functions at the top of this file.
747 */
748char *
749pg_server_to_client(const char *s, int len)
750{
752}
753
754/*
755 * Convert server encoding to any encoding.
756 *
757 * See the notes about string conversion functions at the top of this file.
758 */
759char *
760pg_server_to_any(const char *s, int len, int encoding)
761{
762 if (len <= 0)
763 return unconstify(char *, s); /* empty string is always valid */
764
767 return unconstify(char *, s); /* assume data is valid */
768
770 {
771 /* No conversion is possible, but we must validate the result */
772 (void) pg_verify_mbstr(encoding, s, len, false);
773 return unconstify(char *, s);
774 }
775
776 /* Fast path if we can use cached conversion function */
779
780 /* General case ... will not work outside transactions */
781 return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
782 len,
784 encoding);
785}
786
787/*
788 * Perform default encoding conversion using cached FmgrInfo. Since
789 * this function does not access the database at all, it is safe to call
790 * outside transactions. If the conversion has not been set up by
791 * SetClientEncoding(), no conversion is performed.
792 */
793static char *
796{
797 char *result;
798 int src_encoding,
800 FmgrInfo *flinfo;
801
803 {
806 flinfo = ToServerConvProc;
807 }
808 else
809 {
812 flinfo = ToClientConvProc;
813 }
814
815 if (flinfo == NULL)
816 return unconstify(char *, src);
817
818 /*
819 * Allocate space for conversion result, being wary of integer overflow.
820 * See comments in pg_do_encoding_conversion.
821 */
825 errmsg("out of memory"),
826 errdetail("String of %d bytes is too long for encoding conversion.",
827 len)));
828
829 result = (char *)
832
833 FunctionCall6(flinfo,
836 CStringGetDatum(src),
837 CStringGetDatum(result),
839 BoolGetDatum(false));
840
841 /*
842 * Release extra space if there might be a lot --- see comments in
843 * pg_do_encoding_conversion.
844 */
845 if (len > 1000000)
846 {
847 Size resultlen = strlen(result);
848
849 if (resultlen >= MaxAllocSize)
852 errmsg("out of memory"),
853 errdetail("String of %d bytes is too long for encoding conversion.",
854 len)));
855
856 result = (char *) repalloc(result, resultlen + 1);
857 }
858
859 return result;
860}
861
862/*
863 * Convert a single Unicode code point into a string in the server encoding.
864 *
865 * The code point given by "c" is converted and stored at *s, which must
866 * have at least MAX_UNICODE_EQUIVALENT_STRING+1 bytes available.
867 * The output will have a trailing '\0'. Throws error if the conversion
868 * cannot be performed.
869 *
870 * Note that this relies on having previously looked up any required
871 * conversion function. That's partly for speed but mostly because the parser
872 * may call this outside any transaction, or in an aborted transaction.
873 */
874void
875pg_unicode_to_server(char32_t c, unsigned char *s)
876{
877 unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
878 int c_as_utf8_len;
879 int server_encoding;
880
881 /*
882 * Complain if invalid Unicode code point. The choice of errcode here is
883 * debatable, but really our caller should have checked this anyway.
884 */
888 errmsg("invalid Unicode code point")));
889
890 /* Otherwise, if it's in ASCII range, conversion is trivial */
891 if (c <= 0x7F)
892 {
893 s[0] = (unsigned char) c;
894 s[1] = '\0';
895 return;
896 }
897
898 /* If the server encoding is UTF-8, we just need to reformat the code */
901 {
902 unicode_to_utf8(c, s);
903 s[pg_utf_mblen(s)] = '\0';
904 return;
905 }
906
907 /* For all other cases, we must have a conversion function available */
911 errmsg("conversion between %s and %s is not supported",
914
915 /* Construct UTF-8 source string */
918 c_as_utf8[c_as_utf8_len] = '\0';
919
920 /* Convert, or throw error if we can't */
924 CStringGetDatum((char *) c_as_utf8),
925 CStringGetDatum((char *) s),
927 BoolGetDatum(false));
928}
929
930/*
931 * Convert a single Unicode code point into a string in the server encoding.
932 *
933 * Same as pg_unicode_to_server(), except that we don't throw errors,
934 * but simply return false on conversion failure.
935 */
936bool
937pg_unicode_to_server_noerror(char32_t c, unsigned char *s)
938{
939 unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
940 int c_as_utf8_len;
941 int converted_len;
942 int server_encoding;
943
944 /* Fail if invalid Unicode code point */
946 return false;
947
948 /* Otherwise, if it's in ASCII range, conversion is trivial */
949 if (c <= 0x7F)
950 {
951 s[0] = (unsigned char) c;
952 s[1] = '\0';
953 return true;
954 }
955
956 /* If the server encoding is UTF-8, we just need to reformat the code */
959 {
960 unicode_to_utf8(c, s);
961 s[pg_utf_mblen(s)] = '\0';
962 return true;
963 }
964
965 /* For all other cases, we must have a conversion function available */
967 return false;
968
969 /* Construct UTF-8 source string */
972 c_as_utf8[c_as_utf8_len] = '\0';
973
974 /* Convert, but without throwing error if we can't */
978 CStringGetDatum((char *) c_as_utf8),
979 CStringGetDatum((char *) s),
981 BoolGetDatum(true)));
982
983 /* Conversion was successful iff it consumed the whole input */
984 return (converted_len == c_as_utf8_len);
985}
986
987
988/* convert a multibyte string to a wchar */
989int
990pg_mb2wchar(const char *from, pg_wchar *to)
991{
992 return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, strlen(from));
993}
994
995/* convert a multibyte string to a wchar with a limited length */
996int
997pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
998{
999 return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
1000}
1001
1002/* same, with any encoding */
1003int
1005 const char *from, pg_wchar *to, int len)
1006{
1007 return pg_wchar_table[encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
1008}
1009
1010/* convert a wchar string to a multibyte */
1011int
1012pg_wchar2mb(const pg_wchar *from, char *to)
1013{
1014 return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, pg_wchar_strlen(from));
1015}
1016
1017/* convert a wchar string to a multibyte with a limited length */
1018int
1019pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
1020{
1021 return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
1022}
1023
1024/* same, with any encoding */
1025int
1027 const pg_wchar *from, char *to, int len)
1028{
1029 return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
1030}
1031
1032/*
1033 * Returns the byte length of a multibyte character sequence in a
1034 * null-terminated string. Raises an illegal byte sequence error if the
1035 * sequence would hit a null terminator.
1036 *
1037 * The caller is expected to have checked for a terminator at *mbstr == 0
1038 * before calling, but some callers want 1 in that case, so this function
1039 * continues that tradition.
1040 *
1041 * This must only be used for strings that have a null-terminator to enable
1042 * bounds detection.
1043 */
1044int
1046{
1047 int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
1048
1049 /*
1050 * The .mblen functions return 1 when given a pointer to a terminator.
1051 * Some callers depend on that, so we tolerate it for now. Well-behaved
1052 * callers check the leading byte for a terminator *before* calling.
1053 */
1054 for (int i = 1; i < length; ++i)
1055 if (unlikely(mbstr[i] == 0))
1057
1058 /*
1059 * String should be NUL-terminated, but checking that would make typical
1060 * callers O(N^2), tripling Valgrind check-world time. Unless
1061 * VALGRIND_EXPENSIVE, check 1 byte after each actual character. (If we
1062 * found a character, not a terminator, the next byte must be a terminator
1063 * or the start of the next character.) If the caller iterates the whole
1064 * string, the last call will diagnose a missing terminator.
1065 */
1066 if (mbstr[0] != '\0')
1067 {
1068#ifdef VALGRIND_EXPENSIVE
1070#else
1072#endif
1073 }
1074
1075 return length;
1076}
1077
1078/*
1079 * Returns the byte length of a multibyte character sequence bounded by a range
1080 * [mbstr, end) of at least one byte in size. Raises an illegal byte sequence
1081 * error if the sequence would exceed the range.
1082 */
1083int
1084pg_mblen_range(const char *mbstr, const char *end)
1085{
1086 int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
1087
1088 Assert(end > mbstr);
1089
1090 if (unlikely(mbstr + length > end))
1091 report_invalid_encoding_db(mbstr, length, end - mbstr);
1092
1093#ifdef VALGRIND_EXPENSIVE
1095#else
1097#endif
1098
1099 return length;
1100}
1101
1102/*
1103 * Returns the byte length of a multibyte character sequence bounded by a range
1104 * extending for 'limit' bytes, which must be at least one. Raises an illegal
1105 * byte sequence error if the sequence would exceed the range.
1106 */
1107int
1108pg_mblen_with_len(const char *mbstr, int limit)
1109{
1110 int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
1111
1112 Assert(limit >= 1);
1113
1114 if (unlikely(length > limit))
1115 report_invalid_encoding_db(mbstr, length, limit);
1116
1117#ifdef VALGRIND_EXPENSIVE
1119#else
1121#endif
1122
1123 return length;
1124}
1125
1126
1127/*
1128 * Returns the length of a multibyte character sequence, without any
1129 * validation of bounds.
1130 *
1131 * PLEASE NOTE: This function can only be used safely if the caller has
1132 * already verified the input string, since otherwise there is a risk of
1133 * overrunning the buffer if the string is invalid. A prior call to a
1134 * pg_mbstrlen* function suffices.
1135 */
1136int
1138{
1139 int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
1140
1142
1143 return length;
1144}
1145
/*
 * pg_mblen
 *		Legacy spelling of pg_mblen_unbounded().
 *
 * Retained only as a forwarding wrapper.  It should not be used and will
 * be removed in a later version.
 */
int
pg_mblen(const char *mbstr)
{
	int			length = pg_mblen_unbounded(mbstr);

	return length;
}
1155
1156/* returns the display length of a multibyte character */
1157int
1158pg_dsplen(const char *mbstr)
1159{
1160 return pg_wchar_table[DatabaseEncoding->encoding].dsplen((const unsigned char *) mbstr);
1161}
1162
1163/* returns the length (counted in wchars) of a multibyte string */
1164int
1165pg_mbstrlen(const char *mbstr)
1166{
1167 int len = 0;
1168
1169 /* optimization for single byte encoding */
1171 return strlen(mbstr);
1172
1173 while (*mbstr)
1174 {
1176 len++;
1177 }
1178 return len;
1179}
1180
1181/* returns the length (counted in wchars) of a multibyte string
1182 * (stops at the first of "limit" or a NUL)
1183 */
1184int
1185pg_mbstrlen_with_len(const char *mbstr, int limit)
1186{
1187 int len = 0;
1188
1189 /* optimization for single byte encoding */
1191 return limit;
1192
1193 while (limit > 0 && *mbstr)
1194 {
1195 int l = pg_mblen_with_len(mbstr, limit);
1196
1197 limit -= l;
1198 mbstr += l;
1199 len++;
1200 }
1201 return len;
1202}
1203
1204/*
1205 * returns the byte length of a multibyte string
1206 * (not necessarily NUL-terminated)
1207 * that is no longer than limit.
1208 * this function does not break multibyte character boundary.
1209 */
1210int
1211pg_mbcliplen(const char *mbstr, int len, int limit)
1212{
1214 len, limit);
1215}
1216
1217/*
1218 * pg_mbcliplen with specified encoding; string must be valid in encoding
1219 */
1220int
1222 int len, int limit)
1223{
1225 int clen = 0;
1226 int l;
1227
1228 /* optimization for single byte encoding */
1230 return cliplen(mbstr, len, limit);
1231
1233
1234 while (len > 0 && *mbstr)
1235 {
1236 l = (*mblen_fn) ((const unsigned char *) mbstr);
1237 if ((clen + l) > limit)
1238 break;
1239 clen += l;
1240 if (clen == limit)
1241 break;
1242 len -= l;
1243 mbstr += l;
1244 }
1245 return clen;
1246}
1247
1248/*
1249 * Similar to pg_mbcliplen except the limit parameter specifies the
1250 * character length, not the byte length.
1251 */
1252int
1253pg_mbcharcliplen(const char *mbstr, int len, int limit)
1254{
1255 int clen = 0;
1256 int nch = 0;
1257 int l;
1258
1259 /* optimization for single byte encoding */
1261 return cliplen(mbstr, len, limit);
1262
1263 while (len > 0 && *mbstr)
1264 {
1266 nch++;
1267 if (nch > limit)
1268 break;
1269 clen += l;
1270 len -= l;
1271 mbstr += l;
1272 }
1273 return clen;
1274}
1275
/*
 * cliplen
 *		mbcliplen for any single-byte encoding.
 *
 * Returns the number of bytes of "str" that can be kept: the scan stops at
 * the smaller of "len" and "limit", or at the first NUL byte, whichever
 * comes first.  A non-positive bound yields zero.
 */
static int
cliplen(const char *str, int len, int limit)
{
	/* Scan no further than the tighter of the two bounds. */
	int			maxbytes = (len < limit) ? len : limit;
	int			nbytes;

	for (nbytes = 0; nbytes < maxbytes; nbytes++)
	{
		/* An embedded terminator ends the string early. */
		if (str[nbytes] == '\0')
			break;
	}

	return nbytes;
}
1287
1288void
1290{
1292 elog(ERROR, "invalid database encoding: %d", encoding);
1293
1296}
1297
1298void
1300{
1301 /* Some calls happen before we can elog()! */
1303
1306}
1307
1308#ifdef ENABLE_NLS
1309/*
1310 * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
1311 * codeset. Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
1312 * fail for gettext-internal causes like out-of-memory.
1313 */
1314static bool
1315raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
1316{
1317 bool elog_ok = (CurrentMemoryContext != NULL);
1318
1320 return false;
1321
1322 if (bind_textdomain_codeset(domainname,
1324 return true;
1325
1326 if (elog_ok)
1327 elog(LOG, "bind_textdomain_codeset failed");
1328 else
1329 write_stderr("bind_textdomain_codeset failed");
1330
1331 return false;
1332}
1333
1334/*
1335 * Bind a gettext message domain to the codeset corresponding to the database
1336 * encoding. For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
1337 * Return the MessageEncoding implied by the new settings.
1338 *
1339 * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
1340 * When that matches the database encoding, we don't need to do anything. In
1341 * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
1342 * database encoding, except for the C locale. (On Windows, we also permit a
1343 * discrepancy under the UTF8 encoding.) For the C locale, explicitly bind
1344 * gettext to the right codeset.
1345 *
1346 * On Windows, gettext defaults to the Windows ANSI code page. This is a
1347 * convenient departure for software that passes the strings to Windows ANSI
1348 * APIs, but we don't do that. Compel gettext to use database encoding or,
1349 * failing that, the LC_CTYPE encoding as it would on other platforms.
1350 *
1351 * This function is called before elog() and palloc() are usable.
1352 */
1353int
1354pg_bind_textdomain_codeset(const char *domainname)
1355{
1356 bool elog_ok = (CurrentMemoryContext != NULL);
1358 int new_msgenc;
1359
1360#ifndef WIN32
1361 const char *ctype = setlocale(LC_CTYPE, NULL);
1362
1363 if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
1364#endif
1365 if (encoding != PG_SQL_ASCII &&
1367 return encoding;
1368
1370 if (new_msgenc < 0)
1372
1373#ifdef WIN32
1375 /* On failure, the old message encoding remains valid. */
1376 return GetMessageEncoding();
1377#endif
1378
1379 return new_msgenc;
1380}
1381#endif
1382
1383/*
1384 * The database encoding, also called the server encoding, represents the
1385 * encoding of data stored in text-like data types. Affected types include
1386 * cstring, text, varchar, name, xml, and json.
1387 */
1388int
1390{
1391 return DatabaseEncoding->encoding;
1392}
1393
1394const char *
1396{
1397 return DatabaseEncoding->name;
1398}
1399
1400Datum
1405
1406Datum
1411
1412Datum
1419
1420Datum
1428
1429/*
1430 * gettext() returns messages in this encoding. This often matches the
1431 * database encoding, but it differs for SQL_ASCII databases, for processes
1432 * not attached to a database, and under a database encoding lacking iconv
1433 * support (MULE_INTERNAL).
1434 */
1435int
1437{
1438 return MessageEncoding->encoding;
1439}
1440
1441
1442/*
1443 * Generic character incrementer function.
1444 *
1445 * Not knowing anything about the properties of the encoding in use, we just
1446 * keep incrementing the last byte until we get a validly-encoded result,
1447 * or we run out of values to try. We don't bother to try incrementing
1448 * higher-order bytes, so there's no growth in runtime for wider characters.
1449 * (If we did try to do that, we'd need to consider the likelihood that 255
1450 * is not a valid final byte in the encoding.)
1451 */
1452static bool
1453pg_generic_charinc(unsigned char *charptr, int len)
1454{
1455 unsigned char *lastbyte = charptr + len - 1;
1457
1458 /* We can just invoke the character verifier directly. */
1460
1461 while (*lastbyte < (unsigned char) 255)
1462 {
1463 (*lastbyte)++;
1464 if ((*mbverify) (charptr, len) == len)
1465 return true;
1466 }
1467
1468 return false;
1469}
1470
/*
 * UTF-8 character incrementer function.
 *
 * charptr points at one UTF-8 character of "length" bytes, which is modified
 * in place.  Returns true if some byte was incremented, false if no byte
 * could be (the character is left untouched in that case).
 *
 * We work from the last byte towards the first and bump the first byte we
 * find that is still below its ceiling:
 *
 *	- third and fourth bytes are ordinary continuation bytes, ceiling 0xBF;
 *	- the second byte normally has ceiling 0xBF too, but only 0x9F after a
 *	  0xED lead byte (going further would enter the surrogate range, which
 *	  must not appear in valid UTF-8) and only 0x8F after a 0xF4 lead byte
 *	  (the top of the 4-byte range);
 *	- the lead byte is bumped unless it is 0x7F, 0xDF, 0xEF or 0xF4, i.e.
 *	  already the last lead byte of its length class.
 *
 * Lower-order bytes are deliberately not reset to their minimums after a
 * higher-order byte is bumped: we can't afford an exhaustive search (see
 * make_greater_string).
 */
static bool
pg_utf8_increment(unsigned char *charptr, int length)
{
	int			pos;
	unsigned char lead;

	/* Only 1..4 byte characters are handled; reject lengths 5 and 6 for now */
	if (length < 1 || length > 4)
		return false;

	/* Trailing continuation bytes beyond the second byte. */
	for (pos = length - 1; pos >= 2; pos--)
	{
		if (charptr[pos] < 0xBF)
		{
			charptr[pos]++;
			return true;
		}
	}

	/* Second byte: its ceiling depends on the lead byte. */
	if (length >= 2)
	{
		unsigned char ceiling;

		if (charptr[0] == 0xED)
			ceiling = 0x9F;
		else if (charptr[0] == 0xF4)
			ceiling = 0x8F;
		else
			ceiling = 0xBF;

		if (charptr[1] < ceiling)
		{
			charptr[1]++;
			return true;
		}
	}

	/* Everything after the lead byte is maxed out; try the lead byte. */
	lead = charptr[0];
	if (lead == 0x7F || lead == 0xDF || lead == 0xEF || lead == 0xF4)
		return false;

	charptr[0]++;
	return true;
}
1543
1544/*
1545 * EUC-JP character incrementer function.
1546 *
1547 * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
1548 * representing JIS X 0201 characters with the second byte ranging between
1549 * 0xa1 and 0xdf. We just increment the last byte if it's less than 0xdf,
1550 * and otherwise rewrite the whole sequence to 0xa1 0xa1.
1551 *
1552 * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
1553 * in which the last two bytes range between 0xa1 and 0xfe. The last byte
1554 * is incremented if possible, otherwise the second-to-last byte.
1555 *
1556 * If the sequence starts with a value other than the above and its MSB
1557 * is set, it must be a two-byte sequence representing JIS X 0208 characters
1558 * with both bytes ranging between 0xa1 and 0xfe. The last byte is
1559 * incremented if possible, otherwise the second-to-last byte.
1560 *
1561 * Otherwise, the sequence is a single-byte ASCII character. It is
1562 * incremented up to 0x7f.
1563 */
1564static bool
1565pg_eucjp_increment(unsigned char *charptr, int length)
1566{
1567 unsigned char c1,
1568 c2;
1569 int i;
1570
1571 c1 = *charptr;
1572
1573 switch (c1)
1574 {
1575 case SS2: /* JIS X 0201 */
1576 if (length != 2)
1577 return false;
1578
1579 c2 = charptr[1];
1580
1581 if (c2 >= 0xdf)
1582 charptr[0] = charptr[1] = 0xa1;
1583 else if (c2 < 0xa1)
1584 charptr[1] = 0xa1;
1585 else
1586 charptr[1]++;
1587 break;
1588
1589 case SS3: /* JIS X 0212 */
1590 if (length != 3)
1591 return false;
1592
1593 for (i = 2; i > 0; i--)
1594 {
1595 c2 = charptr[i];
1596 if (c2 < 0xa1)
1597 {
1598 charptr[i] = 0xa1;
1599 return true;
1600 }
1601 else if (c2 < 0xfe)
1602 {
1603 charptr[i]++;
1604 return true;
1605 }
1606 }
1607
1608 /* Out of 3-byte code region */
1609 return false;
1610
1611 default:
1612 if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1613 {
1614 if (length != 2)
1615 return false;
1616
1617 for (i = 1; i >= 0; i--)
1618 {
1619 c2 = charptr[i];
1620 if (c2 < 0xa1)
1621 {
1622 charptr[i] = 0xa1;
1623 return true;
1624 }
1625 else if (c2 < 0xfe)
1626 {
1627 charptr[i]++;
1628 return true;
1629 }
1630 }
1631
1632 /* Out of 2 byte code region */
1633 return false;
1634 }
1635 else
1636 { /* ASCII, single byte */
1637 if (c1 > 0x7e)
1638 return false;
1639 (*charptr)++;
1640 }
1641 break;
1642 }
1643
1644 return true;
1645}
1646
1647/*
1648 * get the character incrementer for the encoding for the current database
1649 */
1652{
1653 /*
1654 * Eventually it might be best to add a field to pg_wchar_table[], but for
1655 * now we just use a switch.
1656 */
1657 switch (GetDatabaseEncoding())
1658 {
1659 case PG_UTF8:
1660 return pg_utf8_increment;
1661
1662 case PG_EUC_JP:
1663 return pg_eucjp_increment;
1664
1665 default:
1666 return pg_generic_charinc;
1667 }
1668}
1669
1670/*
1671 * fetch maximum length of the encoding for the current database
1672 */
1673int
1678
1679/*
1680 * Verify mbstr to make sure that it is validly encoded in the current
1681 * database encoding. Otherwise same as pg_verify_mbstr().
1682 */
1683bool
1684pg_verifymbstr(const char *mbstr, int len, bool noError)
1685{
1687}
1688
1689/*
1690 * Verify mbstr to make sure that it is validly encoded in the specified
1691 * encoding.
1692 */
1693bool
1694pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
1695{
1696 int oklen;
1697
1699
1700 oklen = pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len);
1701 if (oklen != len)
1702 {
1703 if (noError)
1704 return false;
1706 }
1707 return true;
1708}
1709
1710/*
1711 * Verify mbstr to make sure that it is validly encoded in the specified
1712 * encoding.
1713 *
1714 * mbstr is not necessarily zero terminated; length of mbstr is
1715 * specified by len.
1716 *
1717 * If OK, return length of string in the encoding.
1718 * If a problem is found, return -1 when noError is
1719 * true; when noError is false, ereport() a descriptive message.
1720 *
1721 * Note: We cannot use the faster encoding-specific mbverifystr() function
1722 * here, because we need to count the number of characters in the string.
1723 */
1724int
1725pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
1726{
1727 mbchar_verifier mbverifychar;
1728 int mb_len;
1729
1731
1732 /*
1733 * In single-byte encodings, we need only reject nulls (\0).
1734 */
1736 {
1737 const char *nullpos = memchr(mbstr, 0, len);
1738
1739 if (nullpos == NULL)
1740 return len;
1741 if (noError)
1742 return -1;
1744 }
1745
1746 /* fetch function pointer just once */
1747 mbverifychar = pg_wchar_table[encoding].mbverifychar;
1748
1749 mb_len = 0;
1750
1751 while (len > 0)
1752 {
1753 int l;
1754
1755 /* fast path for ASCII-subset characters */
1756 if (!IS_HIGHBIT_SET(*mbstr))
1757 {
1758 if (*mbstr != '\0')
1759 {
1760 mb_len++;
1761 mbstr++;
1762 len--;
1763 continue;
1764 }
1765 if (noError)
1766 return -1;
1768 }
1769
1770 l = (*mbverifychar) ((const unsigned char *) mbstr, len);
1771
1772 if (l < 0)
1773 {
1774 if (noError)
1775 return -1;
1777 }
1778
1779 mbstr += l;
1780 len -= l;
1781 mb_len++;
1782 }
1783 return mb_len;
1784}
1785
1786/*
1787 * check_encoding_conversion_args: check arguments of a conversion function
1788 *
1789 * "expected" arguments can be either an encoding ID or -1 to indicate that
1790 * the caller will check whether it accepts the ID.
1791 *
1792 * Note: the errors here are not really user-facing, so elog instead of
1793 * ereport seems sufficient. Also, we trust that the "expected" encoding
1794 * arguments are valid encoding IDs, but we don't trust the actuals.
1795 */
1796void
1798 int dest_encoding,
1799 int len,
1802{
1804 elog(ERROR, "invalid source encoding ID: %d", src_encoding);
1806 elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
1810 elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
1812 elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
1815 if (len < 0)
1816 elog(ERROR, "encoding conversion length must not be negative");
1817}
1818
1819/*
1820 * report_invalid_encoding: complain about invalid multibyte character
1821 *
1822 * note: len is remaining length of string, not length of character;
1823 * len must be greater than zero (or we'd neglect initializing "buf").
1824 */
1825void
1832
1833static void
1834report_invalid_encoding_int(int encoding, const char *mbstr, int mblen, int len)
1835{
1836 char buf[8 * 5 + 1];
1837 char *p = buf;
1838 int j,
1839 jlimit;
1840
1841 jlimit = Min(mblen, len);
1842 jlimit = Min(jlimit, 8); /* prevent buffer overrun */
1843
1844 for (j = 0; j < jlimit; j++)
1845 {
1846 p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
1847 if (j < jlimit - 1)
1848 p += sprintf(p, " ");
1849 }
1850
1851 ereport(ERROR,
1853 errmsg("invalid byte sequence for encoding \"%s\": %s",
1855 buf)));
1856}
1857
1858static void
1859report_invalid_encoding_db(const char *mbstr, int mblen, int len)
1860{
1862}
1863
1864/*
1865 * report_untranslatable_char: complain about untranslatable character
1866 *
1867 * note: len is remaining length of string, not length of character;
1868 * len must be greater than zero (or we'd neglect initializing "buf").
1869 */
1870void
1872 const char *mbstr, int len)
1873{
1874 int l;
1875 char buf[8 * 5 + 1];
1876 char *p = buf;
1877 int j,
1878 jlimit;
1879
1880 /*
1881 * We probably could use plain pg_encoding_mblen(), because
1882 * gb18030_to_utf8() verifies before it converts. All conversions should.
1883 * For src_encoding!=GB18030, len>0 meets pg_encoding_mblen() needs. Even
1884 * so, be defensive, since a buggy conversion might pass invalid data.
1885 * This is not a performance-critical path.
1886 */
1888 jlimit = Min(l, len);
1889 jlimit = Min(jlimit, 8); /* prevent buffer overrun */
1890
1891 for (j = 0; j < jlimit; j++)
1892 {
1893 p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
1894 if (j < jlimit - 1)
1895 p += sprintf(p, " ");
1896 }
1897
1898 ereport(ERROR,
1900 errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
1901 buf,
1904}
1905
1906
1907#ifdef WIN32
1908/*
1909 * Convert from MessageEncoding to a palloc'ed, null-terminated utf16
1910 * string. The character length is also passed to utf16len if not
1911 * null. Returns NULL iff failed. Before MessageEncoding initialization, "str"
1912 * should be ASCII-only; this will function as though MessageEncoding is UTF8.
1913 */
1914WCHAR *
1915pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
1916{
1917 int msgenc = GetMessageEncoding();
1918 WCHAR *utf16;
1919 int dstlen;
1920 UINT codepage;
1921
1922 if (msgenc == PG_SQL_ASCII)
1923 /* No conversion is possible, and SQL_ASCII is never utf16. */
1924 return NULL;
1925
1926 codepage = pg_enc2name_tbl[msgenc].codepage;
1927
1928 /*
1929 * Use MultiByteToWideChar directly if there is a corresponding codepage,
1930 * or double conversion through UTF8 if not. Double conversion is needed,
1931 * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
1932 */
1933 if (codepage != 0)
1934 {
1935 utf16 = palloc_array(WCHAR, len + 1);
1937 utf16[dstlen] = (WCHAR) 0;
1938 }
1939 else
1940 {
1941 char *utf8;
1942
1943 /*
1944 * XXX pg_do_encoding_conversion() requires a transaction. In the
1945 * absence of one, hope for the input to be valid UTF8.
1946 */
1947 if (IsTransactionState())
1948 {
1949 utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
1950 len,
1951 msgenc,
1952 PG_UTF8);
1953 if (utf8 != str)
1954 len = strlen(utf8);
1955 }
1956 else
1957 utf8 = (char *) str;
1958
1959 utf16 = palloc_array(WCHAR, len + 1);
1961 utf16[dstlen] = (WCHAR) 0;
1962
1963 if (utf8 != str)
1964 pfree(utf8);
1965 }
1966
1967 if (dstlen == 0 && len > 0)
1968 {
1969 pfree(utf16);
1970 return NULL; /* error */
1971 }
1972
1973 if (utf16len)
1974 *utf16len = dstlen;
1975 return utf16;
1976}
1977
1978#endif /* WIN32 */
#define write_stderr(str)
Definition parallel.c:186
#define NameStr(name)
Definition c.h:777
#define unconstify(underlying_type, expr)
Definition c.h:1262
#define Min(x, y)
Definition c.h:1019
#define IS_HIGHBIT_SET(ch)
Definition c.h:1172
#define pg_noreturn
Definition c.h:176
#define VARHDRSZ
Definition c.h:723
#define Assert(condition)
Definition c.h:885
int32_t int32
Definition c.h:554
#define unlikely(x)
Definition c.h:424
#define pg_fallthrough
Definition c.h:144
#define OidIsValid(objectId)
Definition c.h:800
size_t Size
Definition c.h:631
int errcode(int sqlerrcode)
Definition elog.c:874
int errmsg(const char *fmt,...)
Definition elog.c:1093
#define LOG
Definition elog.h:31
int errdetail(const char *fmt,...) pg_attribute_printf(1
#define FATAL
Definition elog.h:41
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
const char * pg_enc2gettext_tbl[]
Definition encnames.c:360
const pg_enc2name pg_enc2name_tbl[]
Definition encnames.c:308
#define MaxAllocSize
Definition fe_memutils.h:22
#define palloc_array(type, count)
Definition fe_memutils.h:76
void fmgr_info_cxt(Oid functionId, FmgrInfo *finfo, MemoryContext mcxt)
Definition fmgr.c:138
#define PG_FREE_IF_COPY(ptr, n)
Definition fmgr.h:260
#define PG_GETARG_BYTEA_PP(n)
Definition fmgr.h:309
#define PG_RETURN_BYTEA_P(x)
Definition fmgr.h:373
#define DirectFunctionCall1(func, arg1)
Definition fmgr.h:684
#define PG_GETARG_DATUM(n)
Definition fmgr.h:268
#define PG_RETURN_NULL()
Definition fmgr.h:346
#define PG_GETARG_NAME(n)
Definition fmgr.h:279
#define OidFunctionCall6(functionId, arg1, arg2, arg3, arg4, arg5, arg6)
Definition fmgr.h:732
#define PG_RETURN_INT32(x)
Definition fmgr.h:355
#define PG_GETARG_INT32(n)
Definition fmgr.h:269
#define PG_RETURN_DATUM(x)
Definition fmgr.h:354
#define DirectFunctionCall3(func, arg1, arg2, arg3)
Definition fmgr.h:688
#define FunctionCall6(flinfo, arg1, arg2, arg3, arg4, arg5, arg6)
Definition fmgr.h:712
#define PG_FUNCTION_ARGS
Definition fmgr.h:193
const char * str
static char * encoding
Definition initdb.c:139
int a
Definition isn.c:73
int j
Definition isn.c:78
int i
Definition isn.c:77
List * lcons(void *datum, List *list)
Definition list.c:495
#define PG_UTF8
Definition mbprint.c:43
unsigned int pg_wchar
Definition mbprint.c:31
static pg_noreturn void report_invalid_encoding_db(const char *mbstr, int mblen, int len)
Definition mbutils.c:1859
char * pg_client_to_server(const char *s, int len)
Definition mbutils.c:671
static pg_noreturn void report_invalid_encoding_int(int encoding, const char *mbstr, int mblen, int len)
Definition mbutils.c:1834
int GetDatabaseEncoding(void)
Definition mbutils.c:1389
Datum pg_convert_to(PG_FUNCTION_ARGS)
Definition mbutils.c:511
int pg_encoding_wchar2mb_with_len(int encoding, const pg_wchar *from, char *to, int len)
Definition mbutils.c:1026
int pg_mblen_cstr(const char *mbstr)
Definition mbutils.c:1045
static bool pg_generic_charinc(unsigned char *charptr, int len)
Definition mbutils.c:1453
static const pg_enc2name * ClientEncoding
Definition mbutils.c:82
static FmgrInfo * ToServerConvProc
Definition mbutils.c:69
static FmgrInfo * ToClientConvProc
Definition mbutils.c:70
int pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
Definition mbutils.c:1725
void InitializeClientEncoding(void)
Definition mbutils.c:290
int pg_dsplen(const char *mbstr)
Definition mbutils.c:1158
int pg_mbstrlen_with_len(const char *mbstr, int limit)
Definition mbutils.c:1185
mbcharacter_incrementer pg_database_encoding_character_incrementer(void)
Definition mbutils.c:1651
int pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
Definition mbutils.c:1019
static FmgrInfo * Utf8ToServerConvProc
Definition mbutils.c:77
static List * ConvProcList
Definition mbutils.c:63
int pg_mb2wchar(const char *from, pg_wchar *to)
Definition mbutils.c:990
char * pg_any_to_server(const char *s, int len, int encoding)
Definition mbutils.c:687
int pg_mblen_unbounded(const char *mbstr)
Definition mbutils.c:1137
int pg_mbcharcliplen(const char *mbstr, int len, int limit)
Definition mbutils.c:1253
bool pg_unicode_to_server_noerror(char32_t c, unsigned char *s)
Definition mbutils.c:937
Datum PG_char_to_encoding(PG_FUNCTION_ARGS)
Definition mbutils.c:1413
int pg_mblen_range(const char *mbstr, const char *end)
Definition mbutils.c:1084
unsigned char * pg_do_encoding_conversion(unsigned char *src, int len, int src_encoding, int dest_encoding)
Definition mbutils.c:365
static const pg_enc2name * MessageEncoding
Definition mbutils.c:84
void report_untranslatable_char(int src_encoding, int dest_encoding, const char *mbstr, int len)
Definition mbutils.c:1871
int pg_wchar2mb(const pg_wchar *from, char *to)
Definition mbutils.c:1012
int pg_mbstrlen(const char *mbstr)
Definition mbutils.c:1165
bool pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
Definition mbutils.c:1694
static char * perform_default_encoding_conversion(const char *src, int len, bool is_client_to_server)
Definition mbutils.c:794
bool pg_verifymbstr(const char *mbstr, int len, bool noError)
Definition mbutils.c:1684
int pg_mbcliplen(const char *mbstr, int len, int limit)
Definition mbutils.c:1211
int GetMessageEncoding(void)
Definition mbutils.c:1436
Datum pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
Definition mbutils.c:655
Datum getdatabaseencoding(PG_FUNCTION_ARGS)
Definition mbutils.c:1401
int pg_mblen_with_len(const char *mbstr, int limit)
Definition mbutils.c:1108
int pg_do_encoding_conversion_buf(Oid proc, int src_encoding, int dest_encoding, unsigned char *src, int srclen, unsigned char *dest, int destlen, bool noError)
Definition mbutils.c:478
void report_invalid_encoding(int encoding, const char *mbstr, int len)
Definition mbutils.c:1826
const char * GetDatabaseEncodingName(void)
Definition mbutils.c:1395
int SetClientEncoding(int encoding)
Definition mbutils.c:217
void SetMessageEncoding(int encoding)
Definition mbutils.c:1299
char * pg_server_to_client(const char *s, int len)
Definition mbutils.c:749
Datum pg_convert(PG_FUNCTION_ARGS)
Definition mbutils.c:564
void check_encoding_conversion_args(int src_encoding, int dest_encoding, int len, int expected_src_encoding, int expected_dest_encoding)
Definition mbutils.c:1797
int pg_database_encoding_max_length(void)
Definition mbutils.c:1674
int PrepareClientEncoding(int encoding)
Definition mbutils.c:119
static bool backend_startup_complete
Definition mbutils.c:92
Datum pg_convert_from(PG_FUNCTION_ARGS)
Definition mbutils.c:537
int pg_get_client_encoding(void)
Definition mbutils.c:345
void pg_unicode_to_server(char32_t c, unsigned char *s)
Definition mbutils.c:875
static bool pg_utf8_increment(unsigned char *charptr, int length)
Definition mbutils.c:1487
char * pg_server_to_any(const char *s, int len, int encoding)
Definition mbutils.c:760
Datum length_in_encoding(PG_FUNCTION_ARGS)
Definition mbutils.c:626
static int cliplen(const char *str, int len, int limit)
Definition mbutils.c:1278
static int pending_client_encoding
Definition mbutils.c:93
void SetDatabaseEncoding(int encoding)
Definition mbutils.c:1289
int pg_encoding_mbcliplen(int encoding, const char *mbstr, int len, int limit)
Definition mbutils.c:1221
const char * pg_get_client_encoding_name(void)
Definition mbutils.c:354
Datum pg_client_encoding(PG_FUNCTION_ARGS)
Definition mbutils.c:1407
Datum PG_encoding_to_char(PG_FUNCTION_ARGS)
Definition mbutils.c:1421
int pg_encoding_mb2wchar_with_len(int encoding, const char *from, pg_wchar *to, int len)
Definition mbutils.c:1004
static bool pg_eucjp_increment(unsigned char *charptr, int length)
Definition mbutils.c:1565
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
Definition mbutils.c:997
static const pg_enc2name * DatabaseEncoding
Definition mbutils.c:83
int pg_mblen(const char *mbstr)
Definition mbutils.c:1151
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition mcxt.c:1232
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
void pfree(void *pointer)
Definition mcxt.c:1616
MemoryContext TopMemoryContext
Definition mcxt.c:166
void * palloc(Size size)
Definition mcxt.c:1387
MemoryContext CurrentMemoryContext
Definition mcxt.c:160
void * MemoryContextAllocHuge(MemoryContext context, Size size)
Definition mcxt.c:1725
#define VALGRIND_CHECK_MEM_IS_DEFINED(addr, size)
Definition memdebug.h:23
#define MaxAllocHugeSize
Definition memutils.h:45
Datum namein(PG_FUNCTION_ARGS)
Definition name.c:48
Oid FindDefaultConversionProc(int32 for_encoding, int32 to_encoding)
Definition namespace.c:4152
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition palloc.h:124
const void size_t len
#define lfirst(lc)
Definition pg_list.h:172
#define NIL
Definition pg_list.h:68
#define foreach_delete_current(lst, var_or_cell)
Definition pg_list.h:391
static char buf[DEFAULT_XLOG_SEG_SIZE]
#define MAX_MULTIBYTE_CHAR_LEN
Definition pg_wchar.h:33
#define MAX_CONVERSION_GROWTH
Definition pg_wchar.h:302
#define pg_utf_mblen
Definition pg_wchar.h:633
@ PG_SQL_ASCII
Definition pg_wchar.h:226
@ PG_EUC_JP
Definition pg_wchar.h:227
static unsigned char * unicode_to_utf8(char32_t c, unsigned char *utf8string)
Definition pg_wchar.h:575
#define SS2
Definition pg_wchar.h:38
bool(* mbcharacter_incrementer)(unsigned char *mbstr, int len)
Definition pg_wchar.h:370
int(* mbchar_verifier)(const unsigned char *mbstr, int len)
Definition pg_wchar.h:372
#define PG_VALID_ENCODING(_enc)
Definition pg_wchar.h:287
#define PG_VALID_FE_ENCODING(_enc)
Definition pg_wchar.h:291
#define PG_VALID_BE_ENCODING(_enc)
Definition pg_wchar.h:281
static bool is_valid_unicode_codepoint(char32_t c)
Definition pg_wchar.h:519
#define pg_encoding_to_char
Definition pg_wchar.h:630
#define pg_char_to_encoding
Definition pg_wchar.h:629
#define SS3
Definition pg_wchar.h:39
int(* mblen_converter)(const unsigned char *mbstr)
Definition pg_wchar.h:366
int pg_strcasecmp(const char *s1, const char *s2)
#define sprintf
Definition port.h:262
int pg_get_encoding_from_locale(const char *ctype, bool write_message)
Definition chklocale.c:301
static Datum BoolGetDatum(bool X)
Definition postgres.h:112
uint64_t Datum
Definition postgres.h:70
static Datum CStringGetDatum(const char *X)
Definition postgres.h:380
static Datum Int32GetDatum(int32 X)
Definition postgres.h:222
static int32 DatumGetInt32(Datum X)
Definition postgres.h:212
unsigned int Oid
char * c
static int fb(int x)
static void AssertCouldGetRelation(void)
Definition relcache.h:44
int s_encoding
Definition mbutils.c:57
FmgrInfo to_client_info
Definition mbutils.c:60
int c_encoding
Definition mbutils.c:58
FmgrInfo to_server_info
Definition mbutils.c:59
Definition pg_list.h:54
Definition c.h:772
pg_enc encoding
Definition pg_wchar.h:342
const char * name
Definition pg_wchar.h:341
mbstr_verifier mbverifystr
Definition pg_wchar.h:385
wchar2mb_with_len_converter wchar2mb_with_len
Definition pg_wchar.h:380
mb2wchar_with_len_converter mb2wchar_with_len
Definition pg_wchar.h:378
mblen_converter mblen
Definition pg_wchar.h:382
mbdisplaylen_converter dsplen
Definition pg_wchar.h:383
mbchar_verifier mbverifychar
Definition pg_wchar.h:384
Definition c.h:718
static Size VARSIZE_ANY_EXHDR(const void *PTR)
Definition varatt.h:472
static char * VARDATA(const void *PTR)
Definition varatt.h:305
static char * VARDATA_ANY(const void *PTR)
Definition varatt.h:486
static void SET_VARSIZE(void *PTR, Size len)
Definition varatt.h:432
const char * name
int pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr, size_t remaining)
Definition wchar.c:2169
const pg_wchar_tbl pg_wchar_table[]
Definition wchar.c:2086
int pg_encoding_max_length(int encoding)
Definition wchar.c:2235
#define setlocale(a, b)
Definition win32_port.h:472
size_t pg_wchar_strlen(const pg_wchar *str)
Definition wstrncmp.c:70
bool IsTransactionState(void)
Definition xact.c:388