PostgreSQL Source Code git master
pg_locale_libc.c
Go to the documentation of this file.
1/*-----------------------------------------------------------------------
2 *
3 * PostgreSQL locale utilities for libc
4 *
5 * Portions Copyright (c) 2002-2026, PostgreSQL Global Development Group
6 *
7 * src/backend/utils/adt/pg_locale_libc.c
8 *
9 *-----------------------------------------------------------------------
10 */
11
12#include "postgres.h"
13
14#include <limits.h>
15#include <wctype.h>
16
17#include "access/htup_details.h"
18#include "catalog/pg_database.h"
20#include "mb/pg_wchar.h"
21#include "miscadmin.h"
22#include "utils/builtins.h"
23#include "utils/formatting.h"
24#include "utils/memutils.h"
25#include "utils/pg_locale.h"
26#include "utils/syscache.h"
27
28#ifdef __GLIBC__
29#include <gnu/libc-version.h>
30#endif
31
32#ifdef WIN32
33#include <shlwapi.h>
34#endif
35
36/*
37 * For the libc provider, to provide as much functionality as possible on a
38 * variety of platforms without going so far as to implement everything from
39 * scratch, we use several implementation strategies depending on the
40 * situation:
41 *
42 * 1. In C/POSIX collations, we use hard-wired code. We can't depend on
43 * the <ctype.h> functions since those will obey LC_CTYPE. Note that these
44 * collations don't give a fig about multibyte characters.
45 *
46 * 2. When working in UTF8 encoding, we use the <wctype.h> functions.
47 * This assumes that every platform uses Unicode codepoints directly
48 * as the wchar_t representation of Unicode. On some platforms
49 * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
50 *
51 * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar
52 * values up to 255, and punt for values above that. This is 100% correct
53 * only in single-byte encodings such as LATINn. However, non-Unicode
54 * multibyte encodings are mostly Far Eastern character sets for which the
55 * properties being tested here aren't very relevant for higher code values
56 * anyway. The difficulty with using the <wctype.h> functions with
57 * non-Unicode multibyte encodings is that we can have no certainty that
58 * the platform's wchar_t representation matches what we do in pg_wchar
59 * conversions.
60 *
61 * As a special case, in the "default" collation, (2) and (3) force ASCII
62 * letters to follow ASCII upcase/downcase rules, while in a non-default
63 * collation we just let the library functions do what they will. The case
64 * where this matters is treatment of I/i in Turkish, and the behavior is
65 * meant to match the upper()/lower() SQL functions.
66 *
67 * We store the active collation setting in static variables. In principle
68 * it could be passed down to here via the regex library's "struct vars" data
69 * structure; but that would require somewhat invasive changes in the regex
70 * library, and right now there's no real benefit to be gained from that.
71 *
72 * NB: the coding here assumes pg_wchar is an unsigned type.
73 */
74
75/*
76 * Size of stack buffer to use for string transformations, used to avoid heap
77 * allocations in typical cases. This should be large enough that most strings
78 * will fit, but small enough that we feel comfortable putting it on the
79 * stack.
80 */
81#define TEXTBUFLEN 1024
82
84
85static int strncoll_libc(const char *arg1, ssize_t len1,
86 const char *arg2, ssize_t len2,
88static size_t strnxfrm_libc(char *dest, size_t destsize,
89 const char *src, ssize_t srclen,
91extern char *get_collation_actual_version_libc(const char *collcollate);
92static locale_t make_libc_collator(const char *collate,
93 const char *ctype);
94
95#ifdef WIN32
96static int strncoll_libc_win32_utf8(const char *arg1, ssize_t len1,
97 const char *arg2, ssize_t len2,
99#endif
100
101static size_t char2wchar(wchar_t *to, size_t tolen, const char *from,
102 size_t fromlen, locale_t loc);
103
104static size_t strlower_libc_sb(char *dest, size_t destsize,
105 const char *src, ssize_t srclen,
107static size_t strlower_libc_mb(char *dest, size_t destsize,
108 const char *src, ssize_t srclen,
110static size_t strtitle_libc_sb(char *dest, size_t destsize,
111 const char *src, ssize_t srclen,
113static size_t strtitle_libc_mb(char *dest, size_t destsize,
114 const char *src, ssize_t srclen,
116static size_t strupper_libc_sb(char *dest, size_t destsize,
117 const char *src, ssize_t srclen,
119static size_t strupper_libc_mb(char *dest, size_t destsize,
120 const char *src, ssize_t srclen,
122
123static bool
125{
126 return isdigit_l((unsigned char) wc, locale->lt);
127}
128
129static bool
131{
132 return isalpha_l((unsigned char) wc, locale->lt);
133}
134
135static bool
137{
138 return isalnum_l((unsigned char) wc, locale->lt);
139}
140
141static bool
143{
144 return isupper_l((unsigned char) wc, locale->lt);
145}
146
147static bool
149{
150 return islower_l((unsigned char) wc, locale->lt);
151}
152
153static bool
155{
156 return isgraph_l((unsigned char) wc, locale->lt);
157}
158
159static bool
161{
162 return isprint_l((unsigned char) wc, locale->lt);
163}
164
165static bool
167{
168 return ispunct_l((unsigned char) wc, locale->lt);
169}
170
171static bool
173{
174 return isspace_l((unsigned char) wc, locale->lt);
175}
176
177static bool
179{
180#ifndef WIN32
181 return isxdigit_l((unsigned char) wc, locale->lt);
182#else
183 return _isxdigit_l((unsigned char) wc, locale->lt);
184#endif
185}
186
187static bool
189{
190 return isupper_l((unsigned char) wc, locale->lt) ||
191 islower_l((unsigned char) wc, locale->lt);
192}
193
194static bool
196{
197 return iswdigit_l((wint_t) wc, locale->lt);
198}
199
200static bool
202{
203 return iswalpha_l((wint_t) wc, locale->lt);
204}
205
206static bool
208{
209 return iswalnum_l((wint_t) wc, locale->lt);
210}
211
212static bool
214{
215 return iswupper_l((wint_t) wc, locale->lt);
216}
217
218static bool
220{
221 return iswlower_l((wint_t) wc, locale->lt);
222}
223
224static bool
226{
227 return iswgraph_l((wint_t) wc, locale->lt);
228}
229
230static bool
232{
233 return iswprint_l((wint_t) wc, locale->lt);
234}
235
236static bool
238{
239 return iswpunct_l((wint_t) wc, locale->lt);
240}
241
242static bool
244{
245 return iswspace_l((wint_t) wc, locale->lt);
246}
247
248static bool
250{
251#ifndef WIN32
252 return iswxdigit_l((wint_t) wc, locale->lt);
253#else
254 return _iswxdigit_l((wint_t) wc, locale->lt);
255#endif
256}
257
258static bool
260{
261 return iswupper_l((wint_t) wc, locale->lt) ||
262 iswlower_l((wint_t) wc, locale->lt);
263}
264
265static pg_wchar
267{
269
270 /* force C behavior for ASCII characters, per comments above */
271 if (locale->is_default && wc <= (pg_wchar) 127)
272 return pg_ascii_toupper((unsigned char) wc);
273 if (wc <= (pg_wchar) UCHAR_MAX)
274 return toupper_l((unsigned char) wc, locale->lt);
275 else
276 return wc;
277}
278
279static pg_wchar
281{
283
284 /* force C behavior for ASCII characters, per comments above */
285 if (locale->is_default && wc <= (pg_wchar) 127)
286 return pg_ascii_toupper((unsigned char) wc);
287 if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
288 return towupper_l((wint_t) wc, locale->lt);
289 else
290 return wc;
291}
292
293static pg_wchar
295{
297
298 /* force C behavior for ASCII characters, per comments above */
299 if (locale->is_default && wc <= (pg_wchar) 127)
300 return pg_ascii_tolower((unsigned char) wc);
301 if (wc <= (pg_wchar) UCHAR_MAX)
302 return tolower_l((unsigned char) wc, locale->lt);
303 else
304 return wc;
305}
306
307static pg_wchar
309{
311
312 /* force C behavior for ASCII characters, per comments above */
313 if (locale->is_default && wc <= (pg_wchar) 127)
314 return pg_ascii_tolower((unsigned char) wc);
315 if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
316 return towlower_l((wint_t) wc, locale->lt);
317 else
318 return wc;
319}
320
321/*
322 * Characters A..Z always downcase to a..z, even in the Turkish
323 * locale. Characters beyond 127 use tolower().
324 */
325static size_t
326downcase_ident_libc_sb(char *dst, size_t dstsize, const char *src,
327 ssize_t srclen, pg_locale_t locale)
328{
329 locale_t loc = locale->lt;
330 int i;
331
332 for (i = 0; i < srclen && i < dstsize; i++)
333 {
334 unsigned char ch = (unsigned char) src[i];
335
336 if (ch >= 'A' && ch <= 'Z')
337 ch = pg_ascii_tolower(ch);
338 else if (IS_HIGHBIT_SET(ch) && isupper_l(ch, loc))
339 ch = tolower_l(ch, loc);
340 dst[i] = (char) ch;
341 }
342
343 if (i < dstsize)
344 dst[i] = '\0';
345
346 return srclen;
347}
348
351 .strtitle = strtitle_libc_sb,
352 .strupper = strupper_libc_sb,
353 /* in libc, casefolding is the same as lowercasing */
354 .strfold = strlower_libc_sb,
355 .downcase_ident = downcase_ident_libc_sb,
356 .wc_isdigit = wc_isdigit_libc_sb,
357 .wc_isalpha = wc_isalpha_libc_sb,
358 .wc_isalnum = wc_isalnum_libc_sb,
359 .wc_isupper = wc_isupper_libc_sb,
360 .wc_islower = wc_islower_libc_sb,
361 .wc_isgraph = wc_isgraph_libc_sb,
362 .wc_isprint = wc_isprint_libc_sb,
363 .wc_ispunct = wc_ispunct_libc_sb,
364 .wc_isspace = wc_isspace_libc_sb,
365 .wc_isxdigit = wc_isxdigit_libc_sb,
366 .wc_iscased = wc_iscased_libc_sb,
367 .wc_toupper = toupper_libc_sb,
368 .wc_tolower = tolower_libc_sb,
369};
370
371/*
372 * Non-UTF8 multibyte encodings use multibyte semantics for case mapping, but
373 * single-byte semantics for pattern matching.
374 */
377 .strtitle = strtitle_libc_mb,
378 .strupper = strupper_libc_mb,
379 /* in libc, casefolding is the same as lowercasing */
380 .strfold = strlower_libc_mb,
381 /* uses plain ASCII semantics for historical reasons */
382 .downcase_ident = NULL,
383 .wc_isdigit = wc_isdigit_libc_sb,
384 .wc_isalpha = wc_isalpha_libc_sb,
385 .wc_isalnum = wc_isalnum_libc_sb,
386 .wc_isupper = wc_isupper_libc_sb,
387 .wc_islower = wc_islower_libc_sb,
388 .wc_isgraph = wc_isgraph_libc_sb,
389 .wc_isprint = wc_isprint_libc_sb,
390 .wc_ispunct = wc_ispunct_libc_sb,
391 .wc_isspace = wc_isspace_libc_sb,
392 .wc_isxdigit = wc_isxdigit_libc_sb,
393 .wc_iscased = wc_iscased_libc_sb,
394 .wc_toupper = toupper_libc_sb,
395 .wc_tolower = tolower_libc_sb,
396};
397
400 .strtitle = strtitle_libc_mb,
401 .strupper = strupper_libc_mb,
402 /* in libc, casefolding is the same as lowercasing */
403 .strfold = strlower_libc_mb,
404 /* uses plain ASCII semantics for historical reasons */
405 .downcase_ident = NULL,
406 .wc_isdigit = wc_isdigit_libc_mb,
407 .wc_isalpha = wc_isalpha_libc_mb,
408 .wc_isalnum = wc_isalnum_libc_mb,
409 .wc_isupper = wc_isupper_libc_mb,
410 .wc_islower = wc_islower_libc_mb,
411 .wc_isgraph = wc_isgraph_libc_mb,
412 .wc_isprint = wc_isprint_libc_mb,
413 .wc_ispunct = wc_ispunct_libc_mb,
414 .wc_isspace = wc_isspace_libc_mb,
415 .wc_isxdigit = wc_isxdigit_libc_mb,
416 .wc_iscased = wc_iscased_libc_mb,
417 .wc_toupper = toupper_libc_mb,
418 .wc_tolower = tolower_libc_mb,
419};
420
423 .strnxfrm = strnxfrm_libc,
424 .strnxfrm_prefix = NULL,
425
426 /*
427 * Unfortunately, it seems that strxfrm() for non-C collations is broken
428 * on many common platforms; testing of multiple versions of glibc reveals
429 * that, for many locales, strcoll() and strxfrm() do not return
430 * consistent results. While no other libc other than Cygwin has so far
431 * been shown to have a problem, we take the conservative course of action
432 * for right now and disable this categorically. (Users who are certain
433 * this isn't a problem on their system can define TRUST_STRXFRM.)
434 */
435#ifdef TRUST_STRXFRM
436 .strxfrm_is_safe = true,
437#else
438 .strxfrm_is_safe = false,
439#endif
440};
441
#ifdef WIN32
/*
 * Collation method table that routes strncoll through
 * strncoll_libc_win32_utf8 while sharing strnxfrm_libc with the generic libc
 * table.  strxfrm_is_safe is true only when the build defines TRUST_STRXFRM.
 */
static const struct collate_methods collate_methods_libc_win32_utf8 = {
	.strncoll = strncoll_libc_win32_utf8,
	.strnxfrm = strnxfrm_libc,
	.strnxfrm_prefix = NULL,
#ifndef TRUST_STRXFRM
	.strxfrm_is_safe = false,
#else
	.strxfrm_is_safe = true,
#endif
};
#endif							/* WIN32 */
454
455static size_t
456strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
458{
459 if (srclen < 0)
460 srclen = strlen(src);
461
462 if (srclen + 1 <= destsize)
463 {
464 locale_t loc = locale->lt;
465 char *p;
466
467 memcpy(dest, src, srclen);
468 dest[srclen] = '\0';
469
470 /*
471 * Note: we assume that tolower_l() will not be so broken as to need
472 * an isupper_l() guard test. When using the default collation, we
473 * apply the traditional Postgres behavior that forces ASCII-style
474 * treatment of I/i, but in non-default collations you get exactly
475 * what the collation says.
476 */
477 for (p = dest; *p; p++)
478 {
479 if (locale->is_default)
480 {
481 if (*p >= 'A' && *p <= 'Z')
482 *p += 'a' - 'A';
483 else if (IS_HIGHBIT_SET(*p) && isupper_l(*p, loc))
484 *p = tolower_l((unsigned char) *p, loc);
485 }
486 else
487 *p = tolower_l((unsigned char) *p, loc);
488 }
489 }
490
491 return srclen;
492}
493
494static size_t
495strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
497{
498 locale_t loc = locale->lt;
499 size_t result_size;
500 wchar_t *workspace;
501 char *result;
502 size_t curr_char;
503 size_t max_size;
504
505 if (srclen < 0)
506 srclen = strlen(src);
507
508 /* Overflow paranoia */
509 if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
511 (errcode(ERRCODE_OUT_OF_MEMORY),
512 errmsg("out of memory")));
513
514 /* Output workspace cannot have more codes than input bytes */
515 workspace = palloc_array(wchar_t, srclen + 1);
516
517 char2wchar(workspace, srclen + 1, src, srclen, loc);
518
519 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
520 workspace[curr_char] = towlower_l(workspace[curr_char], loc);
521
522 /*
523 * Make result large enough; case change might change number of bytes
524 */
525 max_size = curr_char * pg_database_encoding_max_length();
526 result = palloc(max_size + 1);
527
528 result_size = wchar2char(result, workspace, max_size + 1, loc);
529
530 if (result_size + 1 > destsize)
531 return result_size;
532
533 memcpy(dest, result, result_size);
534 dest[result_size] = '\0';
535
536 pfree(workspace);
537 pfree(result);
538
539 return result_size;
540}
541
542static size_t
543strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
545{
546 if (srclen < 0)
547 srclen = strlen(src);
548
549 if (srclen + 1 <= destsize)
550 {
551 locale_t loc = locale->lt;
552 int wasalnum = false;
553 char *p;
554
555 memcpy(dest, src, srclen);
556 dest[srclen] = '\0';
557
558 /*
559 * Note: we assume that toupper_l()/tolower_l() will not be so broken
560 * as to need guard tests. When using the default collation, we apply
561 * the traditional Postgres behavior that forces ASCII-style treatment
562 * of I/i, but in non-default collations you get exactly what the
563 * collation says.
564 */
565 for (p = dest; *p; p++)
566 {
567 if (locale->is_default)
568 {
569 if (wasalnum)
570 {
571 if (*p >= 'A' && *p <= 'Z')
572 *p += 'a' - 'A';
573 else if (IS_HIGHBIT_SET(*p) && isupper_l(*p, loc))
574 *p = tolower_l((unsigned char) *p, loc);
575 }
576 else
577 {
578 if (*p >= 'a' && *p <= 'z')
579 *p -= 'a' - 'A';
580 else if (IS_HIGHBIT_SET(*p) && islower_l(*p, loc))
581 *p = toupper_l((unsigned char) *p, loc);
582 }
583 }
584 else
585 {
586 if (wasalnum)
587 *p = tolower_l((unsigned char) *p, loc);
588 else
589 *p = toupper_l((unsigned char) *p, loc);
590 }
591 wasalnum = isalnum_l((unsigned char) *p, loc);
592 }
593 }
594
595 return srclen;
596}
597
598static size_t
599strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
601{
602 locale_t loc = locale->lt;
603 int wasalnum = false;
604 size_t result_size;
605 wchar_t *workspace;
606 char *result;
607 size_t curr_char;
608 size_t max_size;
609
610 if (srclen < 0)
611 srclen = strlen(src);
612
613 /* Overflow paranoia */
614 if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
616 (errcode(ERRCODE_OUT_OF_MEMORY),
617 errmsg("out of memory")));
618
619 /* Output workspace cannot have more codes than input bytes */
620 workspace = palloc_array(wchar_t, srclen + 1);
621
622 char2wchar(workspace, srclen + 1, src, srclen, loc);
623
624 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
625 {
626 if (wasalnum)
627 workspace[curr_char] = towlower_l(workspace[curr_char], loc);
628 else
629 workspace[curr_char] = towupper_l(workspace[curr_char], loc);
630 wasalnum = iswalnum_l(workspace[curr_char], loc);
631 }
632
633 /*
634 * Make result large enough; case change might change number of bytes
635 */
636 max_size = curr_char * pg_database_encoding_max_length();
637 result = palloc(max_size + 1);
638
639 result_size = wchar2char(result, workspace, max_size + 1, loc);
640
641 if (result_size + 1 > destsize)
642 return result_size;
643
644 memcpy(dest, result, result_size);
645 dest[result_size] = '\0';
646
647 pfree(workspace);
648 pfree(result);
649
650 return result_size;
651}
652
653static size_t
654strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
656{
657 if (srclen < 0)
658 srclen = strlen(src);
659
660 if (srclen + 1 <= destsize)
661 {
662 locale_t loc = locale->lt;
663 char *p;
664
665 memcpy(dest, src, srclen);
666 dest[srclen] = '\0';
667
668 /*
669 * Note: we assume that toupper_l() will not be so broken as to need
670 * an islower_l() guard test. When using the default collation, we
671 * apply the traditional Postgres behavior that forces ASCII-style
672 * treatment of I/i, but in non-default collations you get exactly
673 * what the collation says.
674 */
675 for (p = dest; *p; p++)
676 {
677 if (locale->is_default)
678 {
679 if (*p >= 'a' && *p <= 'z')
680 *p -= 'a' - 'A';
681 else if (IS_HIGHBIT_SET(*p) && islower_l(*p, loc))
682 *p = toupper_l((unsigned char) *p, loc);
683 }
684 else
685 *p = toupper_l((unsigned char) *p, loc);
686 }
687 }
688
689 return srclen;
690}
691
692static size_t
693strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
695{
696 locale_t loc = locale->lt;
697 size_t result_size;
698 wchar_t *workspace;
699 char *result;
700 size_t curr_char;
701 size_t max_size;
702
703 if (srclen < 0)
704 srclen = strlen(src);
705
706 /* Overflow paranoia */
707 if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
709 (errcode(ERRCODE_OUT_OF_MEMORY),
710 errmsg("out of memory")));
711
712 /* Output workspace cannot have more codes than input bytes */
713 workspace = palloc_array(wchar_t, srclen + 1);
714
715 char2wchar(workspace, srclen + 1, src, srclen, loc);
716
717 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
718 workspace[curr_char] = towupper_l(workspace[curr_char], loc);
719
720 /*
721 * Make result large enough; case change might change number of bytes
722 */
723 max_size = curr_char * pg_database_encoding_max_length();
724 result = palloc(max_size + 1);
725
726 result_size = wchar2char(result, workspace, max_size + 1, loc);
727
728 if (result_size + 1 > destsize)
729 return result_size;
730
731 memcpy(dest, result, result_size);
732 dest[result_size] = '\0';
733
734 pfree(workspace);
735 pfree(result);
736
737 return result_size;
738}
739
742{
743 const char *collate;
744 const char *ctype;
745 locale_t loc;
746 pg_locale_t result;
747
748 if (collid == DEFAULT_COLLATION_OID)
749 {
750 HeapTuple tp;
751 Datum datum;
752
754 if (!HeapTupleIsValid(tp))
755 elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
756 datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
757 Anum_pg_database_datcollate);
758 collate = TextDatumGetCString(datum);
759 datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
760 Anum_pg_database_datctype);
761 ctype = TextDatumGetCString(datum);
762
763 ReleaseSysCache(tp);
764 }
765 else
766 {
767 HeapTuple tp;
768 Datum datum;
769
771 if (!HeapTupleIsValid(tp))
772 elog(ERROR, "cache lookup failed for collation %u", collid);
773
774 datum = SysCacheGetAttrNotNull(COLLOID, tp,
775 Anum_pg_collation_collcollate);
776 collate = TextDatumGetCString(datum);
777 datum = SysCacheGetAttrNotNull(COLLOID, tp,
778 Anum_pg_collation_collctype);
779 ctype = TextDatumGetCString(datum);
780
781 ReleaseSysCache(tp);
782 }
783
784
785 loc = make_libc_collator(collate, ctype);
786
787 result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
788 result->deterministic = true;
789 result->collate_is_c = (strcmp(collate, "C") == 0) ||
790 (strcmp(collate, "POSIX") == 0);
791 result->ctype_is_c = (strcmp(ctype, "C") == 0) ||
792 (strcmp(ctype, "POSIX") == 0);
793 result->lt = loc;
794 if (!result->collate_is_c)
795 {
796#ifdef WIN32
798 result->collate = &collate_methods_libc_win32_utf8;
799 else
800#endif
801 result->collate = &collate_methods_libc;
802 }
803 if (!result->ctype_is_c)
804 {
809 else
810 result->ctype = &ctype_methods_libc_sb;
811 }
812
813 return result;
814}
815
816/*
817 * Create a locale_t with the given collation and ctype.
818 *
819 * The "C" and "POSIX" locales are not actually handled by libc, so return
820 * NULL.
821 *
822 * Ensure that no path leaks a locale_t.
823 */
824static locale_t
825make_libc_collator(const char *collate, const char *ctype)
826{
827 locale_t loc = 0;
828
829 if (strcmp(collate, ctype) == 0)
830 {
831 if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
832 {
833 /* Normal case where they're the same */
834 errno = 0;
835#ifndef WIN32
836 loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate,
837 NULL);
838#else
839 loc = _create_locale(LC_ALL, collate);
840#endif
841 if (!loc)
843 }
844 }
845 else
846 {
847#ifndef WIN32
848 /* We need two newlocale() steps */
849 locale_t loc1 = 0;
850
851 if (strcmp(collate, "C") != 0 && strcmp(collate, "POSIX") != 0)
852 {
853 errno = 0;
854 loc1 = newlocale(LC_COLLATE_MASK, collate, NULL);
855 if (!loc1)
857 }
858
859 if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
860 {
861 errno = 0;
862 loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
863 if (!loc)
864 {
865 if (loc1)
866 freelocale(loc1);
868 }
869 }
870 else
871 loc = loc1;
872#else
873
874 /*
875 * XXX The _create_locale() API doesn't appear to support this. Could
876 * perhaps be worked around by changing pg_locale_t to contain two
877 * separate fields.
878 */
880 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
881 errmsg("collations with different collate and ctype values are not supported on this platform")));
882#endif
883 }
884
885 return loc;
886}
887
888/*
889 * strncoll_libc
890 *
891 * NUL-terminate arguments, if necessary, and pass to strcoll_l().
892 *
893 * An input string length of -1 means that it's already NUL-terminated.
894 */
895int
896strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
898{
899 char sbuf[TEXTBUFLEN];
900 char *buf = sbuf;
901 size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1;
902 size_t bufsize2 = (len2 == -1) ? 0 : len2 + 1;
903 const char *arg1n;
904 const char *arg2n;
905 int result;
906
907 if (bufsize1 + bufsize2 > TEXTBUFLEN)
908 buf = palloc(bufsize1 + bufsize2);
909
910 /* nul-terminate arguments if necessary */
911 if (len1 == -1)
912 {
913 arg1n = arg1;
914 }
915 else
916 {
917 char *buf1 = buf;
918
919 memcpy(buf1, arg1, len1);
920 buf1[len1] = '\0';
921 arg1n = buf1;
922 }
923
924 if (len2 == -1)
925 {
926 arg2n = arg2;
927 }
928 else
929 {
930 char *buf2 = buf + bufsize1;
931
932 memcpy(buf2, arg2, len2);
933 buf2[len2] = '\0';
934 arg2n = buf2;
935 }
936
937 result = strcoll_l(arg1n, arg2n, locale->lt);
938
939 if (buf != sbuf)
940 pfree(buf);
941
942 return result;
943}
944
945/*
946 * strnxfrm_libc
947 *
948 * NUL-terminate src, if necessary, and pass to strxfrm_l().
949 *
950 * A source length of -1 means that it's already NUL-terminated.
951 */
952size_t
953strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen,
955{
956 char sbuf[TEXTBUFLEN];
957 char *buf = sbuf;
958 size_t bufsize = srclen + 1;
959 size_t result;
960
961 if (srclen == -1)
962 return strxfrm_l(dest, src, destsize, locale->lt);
963
964 if (bufsize > TEXTBUFLEN)
965 buf = palloc(bufsize);
966
967 /* nul-terminate argument */
968 memcpy(buf, src, srclen);
969 buf[srclen] = '\0';
970
971 result = strxfrm_l(dest, buf, destsize, locale->lt);
972
973 if (buf != sbuf)
974 pfree(buf);
975
976 /* if dest is defined, it should be nul-terminated */
977 Assert(result >= destsize || dest[result] == '\0');
978
979 return result;
980}
981
/*
 * get_collation_actual_version_libc
 *
 * Return a version string for the libc collation named by collcollate, or
 * NULL when no version is available.
 *
 * "C", "POSIX" and any name starting with "C." (compared case-insensitively)
 * always yield NULL.  Otherwise the source of the version depends on the
 * platform: the glibc version string, the FreeBSD collation version obtained
 * through querylocale(), or the NLS version reported by GetNLSVersionEx() on
 * Windows.  On any other platform the result is NULL.  A non-NULL result is
 * allocated with pstrdup() or psprintf().
 */
char *
get_collation_actual_version_libc(const char *collcollate)
{
	char	   *collversion = NULL;

	if (pg_strcasecmp("C", collcollate) != 0 &&
		pg_strncasecmp("C.", collcollate, 2) != 0 &&
		pg_strcasecmp("POSIX", collcollate) != 0)
	{
#if defined(__GLIBC__)
		/* Use the glibc version because we don't have anything better. */
		collversion = pstrdup(gnu_get_libc_version());
#elif defined(LC_VERSION_MASK)
		locale_t	loc;

		/* Look up FreeBSD collation version. */
		loc = newlocale(LC_COLLATE_MASK, collcollate, NULL);
		if (loc)
		{
			collversion =
				pstrdup(querylocale(LC_COLLATE_MASK | LC_VERSION_MASK, loc));
			freelocale(loc);
		}
		else
			ereport(ERROR,
					(errmsg("could not load locale \"%s\"", collcollate)));
#elif defined(WIN32)

		/*
		 * If we are targeting Windows Vista and above, we can ask for a name
		 * given a collation name (earlier versions required a location code
		 * that we don't have).
		 */
		NLSVERSIONINFOEX version = {sizeof(NLSVERSIONINFOEX)};
		WCHAR		wide_collcollate[LOCALE_NAME_MAX_LENGTH];

		/*
		 * A zero return means the conversion failed and the buffer contents
		 * are unusable.  Such a name cannot be passed to GetNLSVersionEx(),
		 * so report that we have no version data, as for the tolerated
		 * ERROR_INVALID_PARAMETER case below.
		 */
		if (MultiByteToWideChar(CP_ACP, 0, collcollate, -1, wide_collcollate,
								LOCALE_NAME_MAX_LENGTH) == 0)
			return NULL;

		if (!GetNLSVersionEx(COMPARE_STRING, wide_collcollate, &version))
		{
			/* capture the code once, before anything else can change it */
			DWORD		err = GetLastError();

			/*
			 * GetNLSVersionEx() wants a language tag such as "en-US", not a
			 * locale name like "English_United States.1252".  Until those
			 * values can be prevented from entering the system, or 100%
			 * reliably converted to the more useful tag format, tolerate the
			 * resulting error and report that we have no version data.
			 */
			if (err == ERROR_INVALID_PARAMETER)
				return NULL;

			ereport(ERROR,
					(errmsg("could not get collation version for locale \"%s\": error code %lu",
							collcollate,
							err)));
		}
		collversion = psprintf("%lu.%lu,%lu.%lu",
							   (version.dwNLSVersion >> 8) & 0xFFFF,
							   version.dwNLSVersion & 0xFF,
							   (version.dwDefinedVersion >> 8) & 0xFFFF,
							   version.dwDefinedVersion & 0xFF);
#endif
	}

	return collversion;
}
1046
1047/*
1048 * strncoll_libc_win32_utf8
1049 *
1050 * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
1051 * invoke wcscoll_l().
1052 *
1053 * An input string length of -1 means that it's NUL-terminated.
1054 */
1055#ifdef WIN32
1056static int
1057strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2,
1058 ssize_t len2, pg_locale_t locale)
1059{
1060 char sbuf[TEXTBUFLEN];
1061 char *buf = sbuf;
1062 char *a1p,
1063 *a2p;
1064 int a1len;
1065 int a2len;
1066 int r;
1067 int result;
1068
1070
1071 if (len1 == -1)
1072 len1 = strlen(arg1);
1073 if (len2 == -1)
1074 len2 = strlen(arg2);
1075
1076 a1len = len1 * 2 + 2;
1077 a2len = len2 * 2 + 2;
1078
1079 if (a1len + a2len > TEXTBUFLEN)
1080 buf = palloc(a1len + a2len);
1081
1082 a1p = buf;
1083 a2p = buf + a1len;
1084
1085 /* API does not work for zero-length input */
1086 if (len1 == 0)
1087 r = 0;
1088 else
1089 {
1090 r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1091 (LPWSTR) a1p, a1len / 2);
1092 if (!r)
1093 ereport(ERROR,
1094 (errmsg("could not convert string to UTF-16: error code %lu",
1095 GetLastError())));
1096 }
1097 ((LPWSTR) a1p)[r] = 0;
1098
1099 if (len2 == 0)
1100 r = 0;
1101 else
1102 {
1103 r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1104 (LPWSTR) a2p, a2len / 2);
1105 if (!r)
1106 ereport(ERROR,
1107 (errmsg("could not convert string to UTF-16: error code %lu",
1108 GetLastError())));
1109 }
1110 ((LPWSTR) a2p)[r] = 0;
1111
1112 errno = 0;
1113 result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->lt);
1114 if (result == 2147483647) /* _NLSCMPERROR; missing from mingw headers */
1115 ereport(ERROR,
1116 (errmsg("could not compare Unicode strings: %m")));
1117
1118 if (buf != sbuf)
1119 pfree(buf);
1120
1121 return result;
1122}
1123#endif /* WIN32 */
1124
1125/* simple subroutine for reporting errors from newlocale() */
1126void
1127report_newlocale_failure(const char *localename)
1128{
1129 int save_errno;
1130
1131 /*
1132 * Windows doesn't provide any useful error indication from
1133 * _create_locale(), and BSD-derived platforms don't seem to feel they
1134 * need to set errno either (even though POSIX is pretty clear that
1135 * newlocale should do so). So, if errno hasn't been set, assume ENOENT
1136 * is what to report.
1137 */
1138 if (errno == 0)
1139 errno = ENOENT;
1140
1141 /*
1142 * ENOENT means "no such locale", not "no such file", so clarify that
1143 * errno with an errdetail message.
1144 */
1145 save_errno = errno; /* auxiliary funcs might change errno */
1146 ereport(ERROR,
1147 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1148 errmsg("could not create locale \"%s\": %m",
1149 localename),
1150 (save_errno == ENOENT ?
1151 errdetail("The operating system could not find any locale data for the locale name \"%s\".",
1152 localename) : 0)));
1153}
1154
1155/*
1156 * POSIX doesn't define _l-variants of these functions, but several systems
1157 * have them. We provide our own replacements here.
1158 */
#ifndef HAVE_MBSTOWCS_L
/*
 * Replacement for mbstowcs_l(): convert the multibyte string src into at
 * most n wide characters at dest, using locale loc.
 */
static size_t
mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
{
#ifndef WIN32
	/* switch to loc around the mbstowcs() call, then switch back */
	locale_t	prev_locale = uselocale(loc);
	size_t		nconverted = mbstowcs(dest, src, n);

	uselocale(prev_locale);
	return nconverted;
#else
	return _mbstowcs_l(dest, src, n, loc);
#endif
}
#endif
#ifndef HAVE_WCSTOMBS_L
/*
 * Replacement for wcstombs_l(): convert the wide string src into at most n
 * bytes of multibyte output at dest, using locale loc.
 */
static size_t
wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
{
#ifndef WIN32
	/* switch to loc around the wcstombs() call, then switch back */
	locale_t	prev_locale = uselocale(loc);
	size_t		nbytes = wcstombs(dest, src, n);

	uselocale(prev_locale);
	return nbytes;
#else
	return _wcstombs_l(dest, src, n, loc);
#endif
}
#endif
1191
1192/*
1193 * These functions convert from/to libc's wchar_t, *not* pg_wchar.
1194 * Therefore we keep them here rather than with the mbutils code.
1195 */
1196
1197/*
1198 * wchar2char --- convert wide characters to multibyte format
1199 *
1200 * This has the same API as the standard wcstombs_l() function; in particular,
1201 * tolen is the maximum number of bytes to store at *to, and *from must be
1202 * zero-terminated. The output will be zero-terminated iff there is room.
1203 */
1204size_t
1205wchar2char(char *to, const wchar_t *from, size_t tolen, locale_t loc)
1206{
1207 size_t result;
1208
1209 if (tolen == 0)
1210 return 0;
1211
1212#ifdef WIN32
1213
1214 /*
1215 * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
1216 * for some reason mbstowcs and wcstombs won't do this for us, so we use
1217 * MultiByteToWideChar().
1218 */
1220 {
1221 result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
1222 NULL, NULL);
1223 /* A zero return is failure */
1224 if (result <= 0)
1225 result = -1;
1226 else
1227 {
1228 Assert(result <= tolen);
1229 /* Microsoft counts the zero terminator in the result */
1230 result--;
1231 }
1232 }
1233 else
1234#endif /* WIN32 */
1235 if (loc == (locale_t) 0)
1236 {
1237 /* Use wcstombs directly for the default locale */
1238 result = wcstombs(to, from, tolen);
1239 }
1240 else
1241 {
1242 /* Use wcstombs_l for nondefault locales */
1243 result = wcstombs_l(to, from, tolen, loc);
1244 }
1245
1246 return result;
1247}
1248
1249/*
1250 * char2wchar --- convert multibyte characters to wide characters
1251 *
1252 * This has almost the API of mbstowcs_l(), except that *from need not be
1253 * null-terminated; instead, the number of input bytes is specified as
1254 * fromlen. Also, we ereport() rather than returning -1 for invalid
1255 * input encoding. tolen is the maximum number of wchar_t's to store at *to.
1256 * The output will be zero-terminated iff there is room.
1257 */
1258static size_t
1259char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
1260 locale_t loc)
1261{
1262 size_t result;
1263
1264 if (tolen == 0)
1265 return 0;
1266
1267#ifdef WIN32
1268 /* See WIN32 "Unicode" comment above */
1270 {
1271 /* Win32 API does not work for zero-length input */
1272 if (fromlen == 0)
1273 result = 0;
1274 else
1275 {
1276 result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
1277 /* A zero return is failure */
1278 if (result == 0)
1279 result = -1;
1280 }
1281
1282 if (result != -1)
1283 {
1284 Assert(result < tolen);
1285 /* Append trailing null wchar (MultiByteToWideChar() does not) */
1286 to[result] = 0;
1287 }
1288 }
1289 else
1290#endif /* WIN32 */
1291 {
1292 /* mbstowcs requires ending '\0' */
1293 char *str = pnstrdup(from, fromlen);
1294
1295 if (loc == (locale_t) 0)
1296 {
1297 /* Use mbstowcs directly for the default locale */
1298 result = mbstowcs(to, str, tolen);
1299 }
1300 else
1301 {
1302 /* Use mbstowcs_l for nondefault locales */
1303 result = mbstowcs_l(to, str, tolen, loc);
1304 }
1305
1306 pfree(str);
1307 }
1308
1309 if (result == -1)
1310 {
1311 /*
1312 * Invalid multibyte character encountered. We try to give a useful
1313 * error message by letting pg_verifymbstr check the string. But it's
1314 * possible that the string is OK to us, and not OK to mbstowcs ---
1315 * this suggests that the LC_CTYPE locale is different from the
1316 * database encoding. Give a generic error message if pg_verifymbstr
1317 * can't find anything wrong.
1318 */
1319 pg_verifymbstr(from, fromlen, false); /* might not return */
1320 /* but if it does ... */
1321 ereport(ERROR,
1322 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1323 errmsg("invalid multibyte character for locale"),
1324 errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
1325 }
1326
1327 return result;
1328}
#define TextDatumGetCString(d)
Definition: builtins.h:98
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1140
Oid collid
int errdetail(const char *fmt,...)
Definition: elog.c:1216
int errhint(const char *fmt,...)
Definition: elog.c:1330
int errcode(int sqlerrcode)
Definition: elog.c:863
int errmsg(const char *fmt,...)
Definition: elog.c:1080
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
#define palloc_array(type, count)
Definition: fe_memutils.h:76
Oid MyDatabaseId
Definition: globals.c:94
Assert(PointerIsAligned(start, uint64))
const char * str
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
#define bufsize
Definition: indent_globs.h:36
static char * locale
Definition: initdb.c:140
int i
Definition: isn.c:77
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81
unsigned int pg_wchar
Definition: mbprint.c:31
int GetDatabaseEncoding(void)
Definition: mbutils.c:1264
bool pg_verifymbstr(const char *mbstr, int len, bool noError)
Definition: mbutils.c:1559
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1549
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1266
char * pstrdup(const char *in)
Definition: mcxt.c:1781
void pfree(void *pointer)
Definition: mcxt.c:1616
void * palloc(Size size)
Definition: mcxt.c:1387
char * pnstrdup(const char *in, Size len)
Definition: mcxt.c:1792
static bool wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_ispunct_libc_sb(pg_wchar wc, pg_locale_t locale)
static const struct ctype_methods ctype_methods_libc_other_mb
static const struct ctype_methods ctype_methods_libc_utf8
static pg_wchar toupper_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t downcase_ident_libc_sb(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_iscased_libc_sb(pg_wchar wc, pg_locale_t locale)
static size_t strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context)
size_t wchar2char(char *to, const wchar_t *from, size_t tolen, locale_t loc)
static bool wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_islower_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale)
static pg_wchar toupper_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, locale_t loc)
static bool wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isalnum_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isalpha_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isprint_libc_sb(pg_wchar wc, pg_locale_t locale)
char * get_collation_actual_version_libc(const char *collcollate)
static bool wc_isupper_libc_sb(pg_wchar wc, pg_locale_t locale)
static locale_t make_libc_collator(const char *collate, const char *ctype)
static bool wc_isgraph_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_iscased_libc_mb(pg_wchar wc, pg_locale_t locale)
static pg_wchar tolower_libc_sb(pg_wchar wc, pg_locale_t locale)
static size_t wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
static const struct collate_methods collate_methods_libc
static bool wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static int strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
static size_t strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale)
static const struct ctype_methods ctype_methods_libc_sb
static size_t strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_isxdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
void report_newlocale_failure(const char *localename)
static pg_wchar tolower_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale)
#define TEXTBUFLEN
static size_t strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_isxdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
static size_t strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static char buf[DEFAULT_XLOG_SEG_SIZE]
Definition: pg_test_fsync.c:71
@ PG_UTF8
Definition: pg_wchar.h:232
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:32
static unsigned char pg_ascii_tolower(unsigned char ch)
Definition: port.h:188
static unsigned char pg_ascii_toupper(unsigned char ch)
Definition: port.h:177
int pg_strncasecmp(const char *s1, const char *s2, size_t n)
Definition: pgstrcasecmp.c:65
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:262
uint64_t Datum
Definition: postgres.h:70
unsigned int Oid
Definition: postgres_ext.h:32
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43
int(* strncoll)(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
Definition: pg_locale.h:66
size_t(* strlower)(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
Definition: pg_locale.h:92
const struct ctype_methods * ctype
Definition: pg_locale.h:146
const struct collate_methods * collate
Definition: pg_locale.h:145
void ReleaseSysCache(HeapTuple tuple)
Definition: syscache.c:264
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Definition: syscache.c:220
Datum SysCacheGetAttrNotNull(int cacheId, HeapTuple tup, AttrNumber attributeNumber)
Definition: syscache.c:625
#define locale_t
Definition: win32_port.h:429
#define toupper_l
Definition: win32_port.h:431
#define iswalnum_l
Definition: win32_port.h:439
#define isgraph_l
Definition: win32_port.h:444
#define towupper_l
Definition: win32_port.h:433
#define ispunct_l
Definition: win32_port.h:448
#define isalpha_l
Definition: win32_port.h:436
#define strcoll_l
Definition: win32_port.h:452
#define iswgraph_l
Definition: win32_port.h:445
#define strxfrm_l
Definition: win32_port.h:453
#define towlower_l
Definition: win32_port.h:432
#define iswspace_l
Definition: win32_port.h:451
#define isdigit_l
Definition: win32_port.h:434
#define wcscoll_l
Definition: win32_port.h:454
#define tolower_l
Definition: win32_port.h:430
#define iswupper_l
Definition: win32_port.h:441
#define iswalpha_l
Definition: win32_port.h:437
#define isprint_l
Definition: win32_port.h:446
#define iswprint_l
Definition: win32_port.h:447
#define isupper_l
Definition: win32_port.h:440
#define isalnum_l
Definition: win32_port.h:438
#define islower_l
Definition: win32_port.h:442
#define iswlower_l
Definition: win32_port.h:443
#define iswpunct_l
Definition: win32_port.h:449
#define isspace_l
Definition: win32_port.h:450
#define iswdigit_l
Definition: win32_port.h:435