PostgreSQL Source Code git master
pg_locale_libc.c
Go to the documentation of this file.
1/*-----------------------------------------------------------------------
2 *
3 * PostgreSQL locale utilities for libc
4 *
5 * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
6 *
7 * src/backend/utils/adt/pg_locale_libc.c
8 *
9 *-----------------------------------------------------------------------
10 */
11
12#include "postgres.h"
13
14#include <limits.h>
15#include <wctype.h>
16
17#include "access/htup_details.h"
18#include "catalog/pg_database.h"
20#include "mb/pg_wchar.h"
21#include "miscadmin.h"
22#include "utils/builtins.h"
23#include "utils/formatting.h"
24#include "utils/memutils.h"
25#include "utils/pg_locale.h"
26#include "utils/syscache.h"
27
28#ifdef __GLIBC__
29#include <gnu/libc-version.h>
30#endif
31
32#ifdef WIN32
33#include <shlwapi.h>
34#endif
35
36/*
37 * For the libc provider, to provide as much functionality as possible on a
38 * variety of platforms without going so far as to implement everything from
39 * scratch, we use several implementation strategies depending on the
40 * situation:
41 *
42 * 1. In C/POSIX collations, we use hard-wired code. We can't depend on
43 * the <ctype.h> functions since those will obey LC_CTYPE. Note that these
44 * collations don't give a fig about multibyte characters.
45 *
46 * 2. When working in UTF8 encoding, we use the <wctype.h> functions.
47 * This assumes that every platform uses Unicode codepoints directly
48 * as the wchar_t representation of Unicode. On some platforms
49 * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
50 *
51 * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar
52 * values up to 255, and punt for values above that. This is 100% correct
53 * only in single-byte encodings such as LATINn. However, non-Unicode
54 * multibyte encodings are mostly Far Eastern character sets for which the
55 * properties being tested here aren't very relevant for higher code values
56 * anyway. The difficulty with using the <wctype.h> functions with
57 * non-Unicode multibyte encodings is that we can have no certainty that
58 * the platform's wchar_t representation matches what we do in pg_wchar
59 * conversions.
60 *
61 * As a special case, in the "default" collation, (2) and (3) force ASCII
62 * letters to follow ASCII upcase/downcase rules, while in a non-default
63 * collation we just let the library functions do what they will. The case
64 * where this matters is treatment of I/i in Turkish, and the behavior is
65 * meant to match the upper()/lower() SQL functions.
66 *
67 * We store the active collation setting in static variables. In principle
68 * it could be passed down to here via the regex library's "struct vars" data
69 * structure; but that would require somewhat invasive changes in the regex
70 * library, and right now there's no real benefit to be gained from that.
71 *
72 * NB: the coding here assumes pg_wchar is an unsigned type.
73 */
74
75/*
76 * Size of stack buffer to use for string transformations, used to avoid heap
77 * allocations in typical cases. This should be large enough that most strings
78 * will fit, but small enough that we feel comfortable putting it on the
79 * stack.
80 */
81#define TEXTBUFLEN 1024
82
84
85static int strncoll_libc(const char *arg1, ssize_t len1,
86 const char *arg2, ssize_t len2,
88static size_t strnxfrm_libc(char *dest, size_t destsize,
89 const char *src, ssize_t srclen,
91extern char *get_collation_actual_version_libc(const char *collcollate);
92static locale_t make_libc_collator(const char *collate,
93 const char *ctype);
94
95#ifdef WIN32
96static int strncoll_libc_win32_utf8(const char *arg1, ssize_t len1,
97 const char *arg2, ssize_t len2,
99#endif
100
101static size_t char2wchar(wchar_t *to, size_t tolen, const char *from,
102 size_t fromlen, locale_t loc);
103
104static size_t strlower_libc_sb(char *dest, size_t destsize,
105 const char *src, ssize_t srclen,
107static size_t strlower_libc_mb(char *dest, size_t destsize,
108 const char *src, ssize_t srclen,
110static size_t strtitle_libc_sb(char *dest, size_t destsize,
111 const char *src, ssize_t srclen,
113static size_t strtitle_libc_mb(char *dest, size_t destsize,
114 const char *src, ssize_t srclen,
116static size_t strupper_libc_sb(char *dest, size_t destsize,
117 const char *src, ssize_t srclen,
119static size_t strupper_libc_mb(char *dest, size_t destsize,
120 const char *src, ssize_t srclen,
122
123static bool
125{
126 return isdigit_l((unsigned char) wc, locale->lt);
127}
128
129static bool
131{
132 return isalpha_l((unsigned char) wc, locale->lt);
133}
134
135static bool
137{
138 return isalnum_l((unsigned char) wc, locale->lt);
139}
140
141static bool
143{
144 return isupper_l((unsigned char) wc, locale->lt);
145}
146
147static bool
149{
150 return islower_l((unsigned char) wc, locale->lt);
151}
152
153static bool
155{
156 return isgraph_l((unsigned char) wc, locale->lt);
157}
158
159static bool
161{
162 return isprint_l((unsigned char) wc, locale->lt);
163}
164
165static bool
167{
168 return ispunct_l((unsigned char) wc, locale->lt);
169}
170
171static bool
173{
174 return isspace_l((unsigned char) wc, locale->lt);
175}
176
177static bool
179{
180#ifndef WIN32
181 return isxdigit_l((unsigned char) wc, locale->lt);
182#else
183 return _isxdigit_l((unsigned char) wc, locale->lt);
184#endif
185}
186
187static bool
189{
190 return iswdigit_l((wint_t) wc, locale->lt);
191}
192
193static bool
195{
196 return iswalpha_l((wint_t) wc, locale->lt);
197}
198
199static bool
201{
202 return iswalnum_l((wint_t) wc, locale->lt);
203}
204
205static bool
207{
208 return iswupper_l((wint_t) wc, locale->lt);
209}
210
211static bool
213{
214 return iswlower_l((wint_t) wc, locale->lt);
215}
216
217static bool
219{
220 return iswgraph_l((wint_t) wc, locale->lt);
221}
222
223static bool
225{
226 return iswprint_l((wint_t) wc, locale->lt);
227}
228
229static bool
231{
232 return iswpunct_l((wint_t) wc, locale->lt);
233}
234
235static bool
237{
238 return iswspace_l((wint_t) wc, locale->lt);
239}
240
241static bool
243{
244#ifndef WIN32
245 return iswxdigit_l((wint_t) wc, locale->lt);
246#else
247 return _iswxdigit_l((wint_t) wc, locale->lt);
248#endif
249}
250
251static char
253{
255 return tolower_l(ch, locale->lt);
256}
257
258static bool
260{
261 bool is_multibyte = pg_database_encoding_max_length() > 1;
262
263 if (is_multibyte && IS_HIGHBIT_SET(ch))
264 return true;
265 else
266 return isalpha_l((unsigned char) ch, locale->lt);
267}
268
269static pg_wchar
271{
273
274 /* force C behavior for ASCII characters, per comments above */
275 if (locale->is_default && wc <= (pg_wchar) 127)
276 return pg_ascii_toupper((unsigned char) wc);
277 if (wc <= (pg_wchar) UCHAR_MAX)
278 return toupper_l((unsigned char) wc, locale->lt);
279 else
280 return wc;
281}
282
283static pg_wchar
285{
287
288 /* force C behavior for ASCII characters, per comments above */
289 if (locale->is_default && wc <= (pg_wchar) 127)
290 return pg_ascii_toupper((unsigned char) wc);
291 if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
292 return towupper_l((wint_t) wc, locale->lt);
293 else
294 return wc;
295}
296
297static pg_wchar
299{
301
302 /* force C behavior for ASCII characters, per comments above */
303 if (locale->is_default && wc <= (pg_wchar) 127)
304 return pg_ascii_tolower((unsigned char) wc);
305 if (wc <= (pg_wchar) UCHAR_MAX)
306 return tolower_l((unsigned char) wc, locale->lt);
307 else
308 return wc;
309}
310
311static pg_wchar
313{
315
316 /* force C behavior for ASCII characters, per comments above */
317 if (locale->is_default && wc <= (pg_wchar) 127)
318 return pg_ascii_tolower((unsigned char) wc);
319 if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
320 return towlower_l((wint_t) wc, locale->lt);
321 else
322 return wc;
323}
324
327 .strtitle = strtitle_libc_sb,
328 .strupper = strupper_libc_sb,
329 /* in libc, casefolding is the same as lowercasing */
330 .strfold = strlower_libc_sb,
331 .wc_isdigit = wc_isdigit_libc_sb,
332 .wc_isalpha = wc_isalpha_libc_sb,
333 .wc_isalnum = wc_isalnum_libc_sb,
334 .wc_isupper = wc_isupper_libc_sb,
335 .wc_islower = wc_islower_libc_sb,
336 .wc_isgraph = wc_isgraph_libc_sb,
337 .wc_isprint = wc_isprint_libc_sb,
338 .wc_ispunct = wc_ispunct_libc_sb,
339 .wc_isspace = wc_isspace_libc_sb,
340 .wc_isxdigit = wc_isxdigit_libc_sb,
341 .char_is_cased = char_is_cased_libc,
342 .char_tolower = char_tolower_libc,
343 .wc_toupper = toupper_libc_sb,
344 .wc_tolower = tolower_libc_sb,
345};
346
347/*
348 * Non-UTF8 multibyte encodings use multibyte semantics for case mapping, but
349 * single-byte semantics for pattern matching.
350 */
353 .strtitle = strtitle_libc_mb,
354 .strupper = strupper_libc_mb,
355 /* in libc, casefolding is the same as lowercasing */
356 .strfold = strlower_libc_mb,
357 .wc_isdigit = wc_isdigit_libc_sb,
358 .wc_isalpha = wc_isalpha_libc_sb,
359 .wc_isalnum = wc_isalnum_libc_sb,
360 .wc_isupper = wc_isupper_libc_sb,
361 .wc_islower = wc_islower_libc_sb,
362 .wc_isgraph = wc_isgraph_libc_sb,
363 .wc_isprint = wc_isprint_libc_sb,
364 .wc_ispunct = wc_ispunct_libc_sb,
365 .wc_isspace = wc_isspace_libc_sb,
366 .wc_isxdigit = wc_isxdigit_libc_sb,
367 .char_is_cased = char_is_cased_libc,
368 .char_tolower = char_tolower_libc,
369 .wc_toupper = toupper_libc_sb,
370 .wc_tolower = tolower_libc_sb,
371};
372
375 .strtitle = strtitle_libc_mb,
376 .strupper = strupper_libc_mb,
377 /* in libc, casefolding is the same as lowercasing */
378 .strfold = strlower_libc_mb,
379 .wc_isdigit = wc_isdigit_libc_mb,
380 .wc_isalpha = wc_isalpha_libc_mb,
381 .wc_isalnum = wc_isalnum_libc_mb,
382 .wc_isupper = wc_isupper_libc_mb,
383 .wc_islower = wc_islower_libc_mb,
384 .wc_isgraph = wc_isgraph_libc_mb,
385 .wc_isprint = wc_isprint_libc_mb,
386 .wc_ispunct = wc_ispunct_libc_mb,
387 .wc_isspace = wc_isspace_libc_mb,
388 .wc_isxdigit = wc_isxdigit_libc_mb,
389 .char_is_cased = char_is_cased_libc,
390 .char_tolower = char_tolower_libc,
391 .wc_toupper = toupper_libc_mb,
392 .wc_tolower = tolower_libc_mb,
393};
394
397 .strnxfrm = strnxfrm_libc,
398 .strnxfrm_prefix = NULL,
399
400 /*
401 * Unfortunately, it seems that strxfrm() for non-C collations is broken
402 * on many common platforms; testing of multiple versions of glibc reveals
403 * that, for many locales, strcoll() and strxfrm() do not return
404 * consistent results. While no other libc other than Cygwin has so far
405 * been shown to have a problem, we take the conservative course of action
406 * for right now and disable this categorically. (Users who are certain
407 * this isn't a problem on their system can define TRUST_STRXFRM.)
408 */
409#ifdef TRUST_STRXFRM
410 .strxfrm_is_safe = true,
411#else
412 .strxfrm_is_safe = false,
413#endif
414};
415
416#ifdef WIN32
417static const struct collate_methods collate_methods_libc_win32_utf8 = {
418 .strncoll = strncoll_libc_win32_utf8,
419 .strnxfrm = strnxfrm_libc,
420 .strnxfrm_prefix = NULL,
421#ifdef TRUST_STRXFRM
422 .strxfrm_is_safe = true,
423#else
424 .strxfrm_is_safe = false,
425#endif
426};
427#endif
428
429static size_t
430strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
432{
433 if (srclen < 0)
434 srclen = strlen(src);
435
436 if (srclen + 1 <= destsize)
437 {
438 locale_t loc = locale->lt;
439 char *p;
440
441 memcpy(dest, src, srclen);
442 dest[srclen] = '\0';
443
444 /*
445 * Note: we assume that tolower_l() will not be so broken as to need
446 * an isupper_l() guard test. When using the default collation, we
447 * apply the traditional Postgres behavior that forces ASCII-style
448 * treatment of I/i, but in non-default collations you get exactly
449 * what the collation says.
450 */
451 for (p = dest; *p; p++)
452 {
453 if (locale->is_default)
454 {
455 if (*p >= 'A' && *p <= 'Z')
456 *p += 'a' - 'A';
457 else if (IS_HIGHBIT_SET(*p) && isupper_l(*p, loc))
458 *p = tolower_l((unsigned char) *p, loc);
459 }
460 else
461 *p = tolower_l((unsigned char) *p, loc);
462 }
463 }
464
465 return srclen;
466}
467
468static size_t
469strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
471{
472 locale_t loc = locale->lt;
473 size_t result_size;
474 wchar_t *workspace;
475 char *result;
476 size_t curr_char;
477 size_t max_size;
478
479 if (srclen < 0)
480 srclen = strlen(src);
481
482 /* Overflow paranoia */
483 if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
485 (errcode(ERRCODE_OUT_OF_MEMORY),
486 errmsg("out of memory")));
487
488 /* Output workspace cannot have more codes than input bytes */
489 workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
490
491 char2wchar(workspace, srclen + 1, src, srclen, loc);
492
493 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
494 workspace[curr_char] = towlower_l(workspace[curr_char], loc);
495
496 /*
497 * Make result large enough; case change might change number of bytes
498 */
499 max_size = curr_char * pg_database_encoding_max_length();
500 result = palloc(max_size + 1);
501
502 result_size = wchar2char(result, workspace, max_size + 1, loc);
503
504 if (result_size + 1 > destsize)
505 return result_size;
506
507 memcpy(dest, result, result_size);
508 dest[result_size] = '\0';
509
510 pfree(workspace);
511 pfree(result);
512
513 return result_size;
514}
515
516static size_t
517strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
519{
520 if (srclen < 0)
521 srclen = strlen(src);
522
523 if (srclen + 1 <= destsize)
524 {
525 locale_t loc = locale->lt;
526 int wasalnum = false;
527 char *p;
528
529 memcpy(dest, src, srclen);
530 dest[srclen] = '\0';
531
532 /*
533 * Note: we assume that toupper_l()/tolower_l() will not be so broken
534 * as to need guard tests. When using the default collation, we apply
535 * the traditional Postgres behavior that forces ASCII-style treatment
536 * of I/i, but in non-default collations you get exactly what the
537 * collation says.
538 */
539 for (p = dest; *p; p++)
540 {
541 if (locale->is_default)
542 {
543 if (wasalnum)
544 {
545 if (*p >= 'A' && *p <= 'Z')
546 *p += 'a' - 'A';
547 else if (IS_HIGHBIT_SET(*p) && isupper_l(*p, loc))
548 *p = tolower_l((unsigned char) *p, loc);
549 }
550 else
551 {
552 if (*p >= 'a' && *p <= 'z')
553 *p -= 'a' - 'A';
554 else if (IS_HIGHBIT_SET(*p) && islower_l(*p, loc))
555 *p = toupper_l((unsigned char) *p, loc);
556 }
557 }
558 else
559 {
560 if (wasalnum)
561 *p = tolower_l((unsigned char) *p, loc);
562 else
563 *p = toupper_l((unsigned char) *p, loc);
564 }
565 wasalnum = isalnum_l((unsigned char) *p, loc);
566 }
567 }
568
569 return srclen;
570}
571
572static size_t
573strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
575{
576 locale_t loc = locale->lt;
577 int wasalnum = false;
578 size_t result_size;
579 wchar_t *workspace;
580 char *result;
581 size_t curr_char;
582 size_t max_size;
583
584 if (srclen < 0)
585 srclen = strlen(src);
586
587 /* Overflow paranoia */
588 if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
590 (errcode(ERRCODE_OUT_OF_MEMORY),
591 errmsg("out of memory")));
592
593 /* Output workspace cannot have more codes than input bytes */
594 workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
595
596 char2wchar(workspace, srclen + 1, src, srclen, loc);
597
598 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
599 {
600 if (wasalnum)
601 workspace[curr_char] = towlower_l(workspace[curr_char], loc);
602 else
603 workspace[curr_char] = towupper_l(workspace[curr_char], loc);
604 wasalnum = iswalnum_l(workspace[curr_char], loc);
605 }
606
607 /*
608 * Make result large enough; case change might change number of bytes
609 */
610 max_size = curr_char * pg_database_encoding_max_length();
611 result = palloc(max_size + 1);
612
613 result_size = wchar2char(result, workspace, max_size + 1, loc);
614
615 if (result_size + 1 > destsize)
616 return result_size;
617
618 memcpy(dest, result, result_size);
619 dest[result_size] = '\0';
620
621 pfree(workspace);
622 pfree(result);
623
624 return result_size;
625}
626
627static size_t
628strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
630{
631 if (srclen < 0)
632 srclen = strlen(src);
633
634 if (srclen + 1 <= destsize)
635 {
636 locale_t loc = locale->lt;
637 char *p;
638
639 memcpy(dest, src, srclen);
640 dest[srclen] = '\0';
641
642 /*
643 * Note: we assume that toupper_l() will not be so broken as to need
644 * an islower_l() guard test. When using the default collation, we
645 * apply the traditional Postgres behavior that forces ASCII-style
646 * treatment of I/i, but in non-default collations you get exactly
647 * what the collation says.
648 */
649 for (p = dest; *p; p++)
650 {
651 if (locale->is_default)
652 {
653 if (*p >= 'a' && *p <= 'z')
654 *p -= 'a' - 'A';
655 else if (IS_HIGHBIT_SET(*p) && islower_l(*p, loc))
656 *p = toupper_l((unsigned char) *p, loc);
657 }
658 else
659 *p = toupper_l((unsigned char) *p, loc);
660 }
661 }
662
663 return srclen;
664}
665
666static size_t
667strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
669{
670 locale_t loc = locale->lt;
671 size_t result_size;
672 wchar_t *workspace;
673 char *result;
674 size_t curr_char;
675 size_t max_size;
676
677 if (srclen < 0)
678 srclen = strlen(src);
679
680 /* Overflow paranoia */
681 if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
683 (errcode(ERRCODE_OUT_OF_MEMORY),
684 errmsg("out of memory")));
685
686 /* Output workspace cannot have more codes than input bytes */
687 workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
688
689 char2wchar(workspace, srclen + 1, src, srclen, loc);
690
691 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
692 workspace[curr_char] = towupper_l(workspace[curr_char], loc);
693
694 /*
695 * Make result large enough; case change might change number of bytes
696 */
697 max_size = curr_char * pg_database_encoding_max_length();
698 result = palloc(max_size + 1);
699
700 result_size = wchar2char(result, workspace, max_size + 1, loc);
701
702 if (result_size + 1 > destsize)
703 return result_size;
704
705 memcpy(dest, result, result_size);
706 dest[result_size] = '\0';
707
708 pfree(workspace);
709 pfree(result);
710
711 return result_size;
712}
713
716{
717 const char *collate;
718 const char *ctype;
719 locale_t loc;
720 pg_locale_t result;
721
722 if (collid == DEFAULT_COLLATION_OID)
723 {
724 HeapTuple tp;
725 Datum datum;
726
728 if (!HeapTupleIsValid(tp))
729 elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
730 datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
731 Anum_pg_database_datcollate);
732 collate = TextDatumGetCString(datum);
733 datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
734 Anum_pg_database_datctype);
735 ctype = TextDatumGetCString(datum);
736
737 ReleaseSysCache(tp);
738 }
739 else
740 {
741 HeapTuple tp;
742 Datum datum;
743
745 if (!HeapTupleIsValid(tp))
746 elog(ERROR, "cache lookup failed for collation %u", collid);
747
748 datum = SysCacheGetAttrNotNull(COLLOID, tp,
749 Anum_pg_collation_collcollate);
750 collate = TextDatumGetCString(datum);
751 datum = SysCacheGetAttrNotNull(COLLOID, tp,
752 Anum_pg_collation_collctype);
753 ctype = TextDatumGetCString(datum);
754
755 ReleaseSysCache(tp);
756 }
757
758
759 loc = make_libc_collator(collate, ctype);
760
761 result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
762 result->deterministic = true;
763 result->collate_is_c = (strcmp(collate, "C") == 0) ||
764 (strcmp(collate, "POSIX") == 0);
765 result->ctype_is_c = (strcmp(ctype, "C") == 0) ||
766 (strcmp(ctype, "POSIX") == 0);
767 result->lt = loc;
768 if (!result->collate_is_c)
769 {
770#ifdef WIN32
772 result->collate = &collate_methods_libc_win32_utf8;
773 else
774#endif
775 result->collate = &collate_methods_libc;
776 }
777 if (!result->ctype_is_c)
778 {
783 else
784 result->ctype = &ctype_methods_libc_sb;
785 }
786
787 return result;
788}
789
790/*
791 * Create a locale_t with the given collation and ctype.
792 *
793 * The "C" and "POSIX" locales are not actually handled by libc, so return
794 * NULL.
795 *
796 * Ensure that no path leaks a locale_t.
797 */
798static locale_t
799make_libc_collator(const char *collate, const char *ctype)
800{
801 locale_t loc = 0;
802
803 if (strcmp(collate, ctype) == 0)
804 {
805 if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
806 {
807 /* Normal case where they're the same */
808 errno = 0;
809#ifndef WIN32
810 loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate,
811 NULL);
812#else
813 loc = _create_locale(LC_ALL, collate);
814#endif
815 if (!loc)
817 }
818 }
819 else
820 {
821#ifndef WIN32
822 /* We need two newlocale() steps */
823 locale_t loc1 = 0;
824
825 if (strcmp(collate, "C") != 0 && strcmp(collate, "POSIX") != 0)
826 {
827 errno = 0;
828 loc1 = newlocale(LC_COLLATE_MASK, collate, NULL);
829 if (!loc1)
831 }
832
833 if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
834 {
835 errno = 0;
836 loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
837 if (!loc)
838 {
839 if (loc1)
840 freelocale(loc1);
842 }
843 }
844 else
845 loc = loc1;
846#else
847
848 /*
849 * XXX The _create_locale() API doesn't appear to support this. Could
850 * perhaps be worked around by changing pg_locale_t to contain two
851 * separate fields.
852 */
854 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
855 errmsg("collations with different collate and ctype values are not supported on this platform")));
856#endif
857 }
858
859 return loc;
860}
861
862/*
863 * strncoll_libc
864 *
865 * NUL-terminate arguments, if necessary, and pass to strcoll_l().
866 *
867 * An input string length of -1 means that it's already NUL-terminated.
868 */
869int
870strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
872{
873 char sbuf[TEXTBUFLEN];
874 char *buf = sbuf;
875 size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1;
876 size_t bufsize2 = (len2 == -1) ? 0 : len2 + 1;
877 const char *arg1n;
878 const char *arg2n;
879 int result;
880
881 if (bufsize1 + bufsize2 > TEXTBUFLEN)
882 buf = palloc(bufsize1 + bufsize2);
883
884 /* nul-terminate arguments if necessary */
885 if (len1 == -1)
886 {
887 arg1n = arg1;
888 }
889 else
890 {
891 char *buf1 = buf;
892
893 memcpy(buf1, arg1, len1);
894 buf1[len1] = '\0';
895 arg1n = buf1;
896 }
897
898 if (len2 == -1)
899 {
900 arg2n = arg2;
901 }
902 else
903 {
904 char *buf2 = buf + bufsize1;
905
906 memcpy(buf2, arg2, len2);
907 buf2[len2] = '\0';
908 arg2n = buf2;
909 }
910
911 result = strcoll_l(arg1n, arg2n, locale->lt);
912
913 if (buf != sbuf)
914 pfree(buf);
915
916 return result;
917}
918
919/*
920 * strnxfrm_libc
921 *
922 * NUL-terminate src, if necessary, and pass to strxfrm_l().
923 *
924 * A source length of -1 means that it's already NUL-terminated.
925 */
926size_t
927strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen,
929{
930 char sbuf[TEXTBUFLEN];
931 char *buf = sbuf;
932 size_t bufsize = srclen + 1;
933 size_t result;
934
935 if (srclen == -1)
936 return strxfrm_l(dest, src, destsize, locale->lt);
937
938 if (bufsize > TEXTBUFLEN)
939 buf = palloc(bufsize);
940
941 /* nul-terminate argument */
942 memcpy(buf, src, srclen);
943 buf[srclen] = '\0';
944
945 result = strxfrm_l(dest, buf, destsize, locale->lt);
946
947 if (buf != sbuf)
948 pfree(buf);
949
950 /* if dest is defined, it should be nul-terminated */
951 Assert(result >= destsize || dest[result] == '\0');
952
953 return result;
954}
955
956char *
957get_collation_actual_version_libc(const char *collcollate)
958{
959 char *collversion = NULL;
960
961 if (pg_strcasecmp("C", collcollate) != 0 &&
962 pg_strncasecmp("C.", collcollate, 2) != 0 &&
963 pg_strcasecmp("POSIX", collcollate) != 0)
964 {
965#if defined(__GLIBC__)
966 /* Use the glibc version because we don't have anything better. */
967 collversion = pstrdup(gnu_get_libc_version());
968#elif defined(LC_VERSION_MASK)
969 locale_t loc;
970
971 /* Look up FreeBSD collation version. */
972 loc = newlocale(LC_COLLATE_MASK, collcollate, NULL);
973 if (loc)
974 {
975 collversion =
976 pstrdup(querylocale(LC_COLLATE_MASK | LC_VERSION_MASK, loc));
977 freelocale(loc);
978 }
979 else
981 (errmsg("could not load locale \"%s\"", collcollate)));
982#elif defined(WIN32)
983 /*
984 * If we are targeting Windows Vista and above, we can ask for a name
985 * given a collation name (earlier versions required a location code
986 * that we don't have).
987 */
988 NLSVERSIONINFOEX version = {sizeof(NLSVERSIONINFOEX)};
989 WCHAR wide_collcollate[LOCALE_NAME_MAX_LENGTH];
990
991 MultiByteToWideChar(CP_ACP, 0, collcollate, -1, wide_collcollate,
992 LOCALE_NAME_MAX_LENGTH);
993 if (!GetNLSVersionEx(COMPARE_STRING, wide_collcollate, &version))
994 {
995 /*
996 * GetNLSVersionEx() wants a language tag such as "en-US", not a
997 * locale name like "English_United States.1252". Until those
998 * values can be prevented from entering the system, or 100%
999 * reliably converted to the more useful tag format, tolerate the
1000 * resulting error and report that we have no version data.
1001 */
1002 if (GetLastError() == ERROR_INVALID_PARAMETER)
1003 return NULL;
1004
1005 ereport(ERROR,
1006 (errmsg("could not get collation version for locale \"%s\": error code %lu",
1007 collcollate,
1008 GetLastError())));
1009 }
1010 collversion = psprintf("%lu.%lu,%lu.%lu",
1011 (version.dwNLSVersion >> 8) & 0xFFFF,
1012 version.dwNLSVersion & 0xFF,
1013 (version.dwDefinedVersion >> 8) & 0xFFFF,
1014 version.dwDefinedVersion & 0xFF);
1015#endif
1016 }
1017
1018 return collversion;
1019}
1020
1021/*
1022 * strncoll_libc_win32_utf8
1023 *
1024 * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
1025 * invoke wcscoll_l().
1026 *
1027 * An input string length of -1 means that it's NUL-terminated.
1028 */
1029#ifdef WIN32
1030static int
1031strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2,
1032 ssize_t len2, pg_locale_t locale)
1033{
1034 char sbuf[TEXTBUFLEN];
1035 char *buf = sbuf;
1036 char *a1p,
1037 *a2p;
1038 int a1len;
1039 int a2len;
1040 int r;
1041 int result;
1042
1044
1045 if (len1 == -1)
1046 len1 = strlen(arg1);
1047 if (len2 == -1)
1048 len2 = strlen(arg2);
1049
1050 a1len = len1 * 2 + 2;
1051 a2len = len2 * 2 + 2;
1052
1053 if (a1len + a2len > TEXTBUFLEN)
1054 buf = palloc(a1len + a2len);
1055
1056 a1p = buf;
1057 a2p = buf + a1len;
1058
1059 /* API does not work for zero-length input */
1060 if (len1 == 0)
1061 r = 0;
1062 else
1063 {
1064 r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1065 (LPWSTR) a1p, a1len / 2);
1066 if (!r)
1067 ereport(ERROR,
1068 (errmsg("could not convert string to UTF-16: error code %lu",
1069 GetLastError())));
1070 }
1071 ((LPWSTR) a1p)[r] = 0;
1072
1073 if (len2 == 0)
1074 r = 0;
1075 else
1076 {
1077 r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1078 (LPWSTR) a2p, a2len / 2);
1079 if (!r)
1080 ereport(ERROR,
1081 (errmsg("could not convert string to UTF-16: error code %lu",
1082 GetLastError())));
1083 }
1084 ((LPWSTR) a2p)[r] = 0;
1085
1086 errno = 0;
1087 result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->lt);
1088 if (result == 2147483647) /* _NLSCMPERROR; missing from mingw headers */
1089 ereport(ERROR,
1090 (errmsg("could not compare Unicode strings: %m")));
1091
1092 if (buf != sbuf)
1093 pfree(buf);
1094
1095 return result;
1096}
1097#endif /* WIN32 */
1098
1099/* simple subroutine for reporting errors from newlocale() */
1100void
1101report_newlocale_failure(const char *localename)
1102{
1103 int save_errno;
1104
1105 /*
1106 * Windows doesn't provide any useful error indication from
1107 * _create_locale(), and BSD-derived platforms don't seem to feel they
1108 * need to set errno either (even though POSIX is pretty clear that
1109 * newlocale should do so). So, if errno hasn't been set, assume ENOENT
1110 * is what to report.
1111 */
1112 if (errno == 0)
1113 errno = ENOENT;
1114
1115 /*
1116 * ENOENT means "no such locale", not "no such file", so clarify that
1117 * errno with an errdetail message.
1118 */
1119 save_errno = errno; /* auxiliary funcs might change errno */
1120 ereport(ERROR,
1121 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1122 errmsg("could not create locale \"%s\": %m",
1123 localename),
1124 (save_errno == ENOENT ?
1125 errdetail("The operating system could not find any locale data for the locale name \"%s\".",
1126 localename) : 0)));
1127}
1128
1129/*
1130 * POSIX doesn't define _l-variants of these functions, but several systems
1131 * have them. We provide our own replacements here.
1132 */
#ifndef HAVE_MBSTOWCS_L
/*
 * Replacement for mbstowcs_l() on platforms that lack it: same arguments and
 * result as mbstowcs(), with the conversion performed under locale "loc".
 */
static size_t
mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
{
#ifndef WIN32
	/* Switch this thread to "loc" just for the duration of the call */
	locale_t	prev_locale = uselocale(loc);
	size_t		nconverted = mbstowcs(dest, src, n);

	uselocale(prev_locale);
	return nconverted;
#else
	return _mbstowcs_l(dest, src, n, loc);
#endif
}
#endif
#ifndef HAVE_WCSTOMBS_L
/*
 * Replacement for wcstombs_l() on platforms that lack it: same arguments and
 * result as wcstombs(), with the conversion performed under locale "loc".
 */
static size_t
wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
{
#ifndef WIN32
	/* Switch this thread to "loc" just for the duration of the call */
	locale_t	prev_locale = uselocale(loc);
	size_t		nbytes = wcstombs(dest, src, n);

	uselocale(prev_locale);
	return nbytes;
#else
	return _wcstombs_l(dest, src, n, loc);
#endif
}
#endif
1165
1166/*
1167 * These functions convert from/to libc's wchar_t, *not* pg_wchar.
1168 * Therefore we keep them here rather than with the mbutils code.
1169 */
1170
1171/*
1172 * wchar2char --- convert wide characters to multibyte format
1173 *
1174 * This has the same API as the standard wcstombs_l() function; in particular,
1175 * tolen is the maximum number of bytes to store at *to, and *from must be
1176 * zero-terminated. The output will be zero-terminated iff there is room.
1177 */
1178size_t
1179wchar2char(char *to, const wchar_t *from, size_t tolen, locale_t loc)
1180{
1181 size_t result;
1182
1183 if (tolen == 0)
1184 return 0;
1185
1186#ifdef WIN32
1187
1188 /*
1189 * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
1190 * for some reason mbstowcs and wcstombs won't do this for us, so we use
1191 * MultiByteToWideChar().
1192 */
1194 {
1195 result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
1196 NULL, NULL);
1197 /* A zero return is failure */
1198 if (result <= 0)
1199 result = -1;
1200 else
1201 {
1202 Assert(result <= tolen);
1203 /* Microsoft counts the zero terminator in the result */
1204 result--;
1205 }
1206 }
1207 else
1208#endif /* WIN32 */
1209 if (loc == (locale_t) 0)
1210 {
1211 /* Use wcstombs directly for the default locale */
1212 result = wcstombs(to, from, tolen);
1213 }
1214 else
1215 {
1216 /* Use wcstombs_l for nondefault locales */
1217 result = wcstombs_l(to, from, tolen, loc);
1218 }
1219
1220 return result;
1221}
1222
1223/*
1224 * char2wchar --- convert multibyte characters to wide characters
1225 *
1226 * This has almost the API of mbstowcs_l(), except that *from need not be
1227 * null-terminated; instead, the number of input bytes is specified as
1228 * fromlen. Also, we ereport() rather than returning -1 for invalid
1229 * input encoding. tolen is the maximum number of wchar_t's to store at *to.
1230 * The output will be zero-terminated iff there is room.
1231 */
1232static size_t
1233char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
1234 locale_t loc)
1235{
1236 size_t result;
1237
1238 if (tolen == 0)
1239 return 0;
1240
1241#ifdef WIN32
1242 /* See WIN32 "Unicode" comment above */
1244 {
1245 /* Win32 API does not work for zero-length input */
1246 if (fromlen == 0)
1247 result = 0;
1248 else
1249 {
1250 result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
1251 /* A zero return is failure */
1252 if (result == 0)
1253 result = -1;
1254 }
1255
1256 if (result != -1)
1257 {
1258 Assert(result < tolen);
1259 /* Append trailing null wchar (MultiByteToWideChar() does not) */
1260 to[result] = 0;
1261 }
1262 }
1263 else
1264#endif /* WIN32 */
1265 {
1266 /* mbstowcs requires ending '\0' */
1267 char *str = pnstrdup(from, fromlen);
1268
1269 if (loc == (locale_t) 0)
1270 {
1271 /* Use mbstowcs directly for the default locale */
1272 result = mbstowcs(to, str, tolen);
1273 }
1274 else
1275 {
1276 /* Use mbstowcs_l for nondefault locales */
1277 result = mbstowcs_l(to, str, tolen, loc);
1278 }
1279
1280 pfree(str);
1281 }
1282
1283 if (result == -1)
1284 {
1285 /*
1286 * Invalid multibyte character encountered. We try to give a useful
1287 * error message by letting pg_verifymbstr check the string. But it's
1288 * possible that the string is OK to us, and not OK to mbstowcs ---
1289 * this suggests that the LC_CTYPE locale is different from the
1290 * database encoding. Give a generic error message if pg_verifymbstr
1291 * can't find anything wrong.
1292 */
1293 pg_verifymbstr(from, fromlen, false); /* might not return */
1294 /* but if it does ... */
1295 ereport(ERROR,
1296 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1297 errmsg("invalid multibyte character for locale"),
1298 errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
1299 }
1300
1301 return result;
1302}
#define TextDatumGetCString(d)
Definition: builtins.h:98
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1143
Oid collid
int errdetail(const char *fmt,...)
Definition: elog.c:1216
int errhint(const char *fmt,...)
Definition: elog.c:1330
int errcode(int sqlerrcode)
Definition: elog.c:863
int errmsg(const char *fmt,...)
Definition: elog.c:1080
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
Oid MyDatabaseId
Definition: globals.c:94
Assert(PointerIsAligned(start, uint64))
const char * str
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
#define bufsize
Definition: indent_globs.h:36
static char * locale
Definition: initdb.c:140
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81
unsigned int pg_wchar
Definition: mbprint.c:31
int GetDatabaseEncoding(void)
Definition: mbutils.c:1262
bool pg_verifymbstr(const char *mbstr, int len, bool noError)
Definition: mbutils.c:1557
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1547
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1263
char * pstrdup(const char *in)
Definition: mcxt.c:1759
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc(Size size)
Definition: mcxt.c:1365
char * pnstrdup(const char *in, Size len)
Definition: mcxt.c:1770
static bool wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_ispunct_libc_sb(pg_wchar wc, pg_locale_t locale)
static const struct ctype_methods ctype_methods_libc_other_mb
static const struct ctype_methods ctype_methods_libc_utf8
static pg_wchar toupper_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context)
size_t wchar2char(char *to, const wchar_t *from, size_t tolen, locale_t loc)
static bool wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_islower_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale)
static pg_wchar toupper_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, locale_t loc)
static bool wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isalnum_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isalpha_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isprint_libc_sb(pg_wchar wc, pg_locale_t locale)
char * get_collation_actual_version_libc(const char *collcollate)
static bool wc_isupper_libc_sb(pg_wchar wc, pg_locale_t locale)
static locale_t make_libc_collator(const char *collate, const char *ctype)
static bool wc_isgraph_libc_sb(pg_wchar wc, pg_locale_t locale)
static pg_wchar tolower_libc_sb(pg_wchar wc, pg_locale_t locale)
static size_t wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
static const struct collate_methods collate_methods_libc
static bool wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static int strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
static size_t strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale)
static const struct ctype_methods ctype_methods_libc_sb
static size_t strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_isxdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
void report_newlocale_failure(const char *localename)
static pg_wchar tolower_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
static char char_tolower_libc(unsigned char ch, pg_locale_t locale)
static bool char_is_cased_libc(char ch, pg_locale_t locale)
static bool wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale)
#define TEXTBUFLEN
static size_t strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_isxdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
static size_t strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static char * buf
Definition: pg_test_fsync.c:72
@ PG_UTF8
Definition: pg_wchar.h:232
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:32
static unsigned char pg_ascii_tolower(unsigned char ch)
Definition: port.h:188
static unsigned char pg_ascii_toupper(unsigned char ch)
Definition: port.h:177
int pg_strncasecmp(const char *s1, const char *s2, size_t n)
Definition: pgstrcasecmp.c:65
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:262
uint64_t Datum
Definition: postgres.h:70
unsigned int Oid
Definition: postgres_ext.h:32
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43
int(* strncoll)(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
Definition: pg_locale.h:75
size_t(* strlower)(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
Definition: pg_locale.h:101
const struct ctype_methods * ctype
Definition: pg_locale.h:161
const struct collate_methods * collate
Definition: pg_locale.h:160
void ReleaseSysCache(HeapTuple tuple)
Definition: syscache.c:264
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Definition: syscache.c:220
Datum SysCacheGetAttrNotNull(int cacheId, HeapTuple tup, AttrNumber attributeNumber)
Definition: syscache.c:625
#define locale_t
Definition: win32_port.h:432
#define toupper_l
Definition: win32_port.h:434
#define iswalnum_l
Definition: win32_port.h:442
#define isgraph_l
Definition: win32_port.h:447
#define towupper_l
Definition: win32_port.h:436
#define ispunct_l
Definition: win32_port.h:451
#define isalpha_l
Definition: win32_port.h:439
#define strcoll_l
Definition: win32_port.h:455
#define iswgraph_l
Definition: win32_port.h:448
#define strxfrm_l
Definition: win32_port.h:456
#define towlower_l
Definition: win32_port.h:435
#define iswspace_l
Definition: win32_port.h:454
#define isdigit_l
Definition: win32_port.h:437
#define wcscoll_l
Definition: win32_port.h:457
#define tolower_l
Definition: win32_port.h:433
#define iswupper_l
Definition: win32_port.h:444
#define iswalpha_l
Definition: win32_port.h:440
#define isprint_l
Definition: win32_port.h:449
#define iswprint_l
Definition: win32_port.h:450
#define isupper_l
Definition: win32_port.h:443
#define isalnum_l
Definition: win32_port.h:441
#define islower_l
Definition: win32_port.h:445
#define iswlower_l
Definition: win32_port.h:446
#define iswpunct_l
Definition: win32_port.h:452
#define isspace_l
Definition: win32_port.h:453
#define iswdigit_l
Definition: win32_port.h:438