PostgreSQL Source Code git master
Loading...
Searching...
No Matches
pg_locale_libc.c
Go to the documentation of this file.
1/*-----------------------------------------------------------------------
2 *
3 * PostgreSQL locale utilities for libc
4 *
5 * Portions Copyright (c) 2002-2026, PostgreSQL Global Development Group
6 *
7 * src/backend/utils/adt/pg_locale_libc.c
8 *
9 *-----------------------------------------------------------------------
10 */
11
12#include "postgres.h"
13
14#include <limits.h>
15#include <wctype.h>
16
17#include "access/htup_details.h"
18#include "catalog/pg_database.h"
20#include "mb/pg_wchar.h"
21#include "miscadmin.h"
22#include "utils/builtins.h"
23#include "utils/formatting.h"
24#include "utils/memutils.h"
25#include "utils/pg_locale.h"
26#include "utils/syscache.h"
27
28#ifdef __GLIBC__
29#include <gnu/libc-version.h>
30#endif
31
32#ifdef WIN32
33#include <shlwapi.h>
34#endif
35
36/*
37 * For the libc provider, to provide as much functionality as possible on a
38 * variety of platforms without going so far as to implement everything from
39 * scratch, we use several implementation strategies depending on the
40 * situation:
41 *
42 * 1. In C/POSIX collations, we use hard-wired code. We can't depend on
43 * the <ctype.h> functions since those will obey LC_CTYPE. Note that these
44 * collations don't give a fig about multibyte characters.
45 *
46 * 2. When working in UTF8 encoding, we use the <wctype.h> functions.
47 * This assumes that every platform uses Unicode codepoints directly
48 * as the wchar_t representation of Unicode. On some platforms
49 * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
50 *
51 * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar
52 * values up to 255, and punt for values above that. This is 100% correct
53 * only in single-byte encodings such as LATINn. However, non-Unicode
54 * multibyte encodings are mostly Far Eastern character sets for which the
55 * properties being tested here aren't very relevant for higher code values
56 * anyway. The difficulty with using the <wctype.h> functions with
57 * non-Unicode multibyte encodings is that we can have no certainty that
58 * the platform's wchar_t representation matches what we do in pg_wchar
59 * conversions.
60 *
61 * As a special case, in the "default" collation, (2) and (3) force ASCII
62 * letters to follow ASCII upcase/downcase rules, while in a non-default
63 * collation we just let the library functions do what they will. The case
64 * where this matters is treatment of I/i in Turkish, and the behavior is
65 * meant to match the upper()/lower() SQL functions.
66 *
67 * We store the active collation setting in static variables. In principle
68 * it could be passed down to here via the regex library's "struct vars" data
69 * structure; but that would require somewhat invasive changes in the regex
70 * library, and right now there's no real benefit to be gained from that.
71 *
72 * NB: the coding here assumes pg_wchar is an unsigned type.
73 */
74
75/*
76 * Size of stack buffer to use for string transformations, used to avoid heap
77 * allocations in typical cases. This should be large enough that most strings
78 * will fit, but small enough that we feel comfortable putting it on the
79 * stack.
80 */
81#define TEXTBUFLEN 1024
82
84
85static int strncoll_libc(const char *arg1, size_t len1,
86 const char *arg2, size_t len2,
87 pg_locale_t locale);
88static int strcoll_libc(const char *arg1, const char *arg2,
89 pg_locale_t locale);
90static size_t strnxfrm_libc(char *dest, size_t destsize,
91 const char *src, size_t srclen,
92 pg_locale_t locale);
93static size_t strxfrm_libc(char *dest, size_t destsize,
94 const char *src, pg_locale_t locale);
95extern char *get_collation_actual_version_libc(const char *collcollate);
96static locale_t make_libc_collator(const char *collate,
97 const char *ctype);
98
99#ifdef WIN32
100static int strncoll_libc_win32_utf8(const char *arg1, size_t len1,
101 const char *arg2, size_t len2,
102 pg_locale_t locale);
103static int strcoll_libc_win32_utf8(const char *arg1, const char *arg2,
104 pg_locale_t locale);
105#endif
106
107static size_t char2wchar(wchar_t *to, size_t tolen, const char *from,
108 size_t fromlen, locale_t loc);
109
110static size_t strlower_libc_sb(char *dest, size_t destsize,
111 const char *src, size_t srclen,
112 pg_locale_t locale);
113static size_t strlower_libc_mb(char *dest, size_t destsize,
114 const char *src, size_t srclen,
115 pg_locale_t locale);
116static size_t strtitle_libc_sb(char *dest, size_t destsize,
117 const char *src, size_t srclen,
118 pg_locale_t locale);
119static size_t strtitle_libc_mb(char *dest, size_t destsize,
120 const char *src, size_t srclen,
121 pg_locale_t locale);
122static size_t strupper_libc_sb(char *dest, size_t destsize,
123 const char *src, size_t srclen,
124 pg_locale_t locale);
125static size_t strupper_libc_mb(char *dest, size_t destsize,
126 const char *src, size_t srclen,
127 pg_locale_t locale);
128
129static bool
131{
132 return isdigit_l((unsigned char) wc, locale->lt);
133}
134
135static bool
137{
138 return isalpha_l((unsigned char) wc, locale->lt);
139}
140
141static bool
143{
144 return isalnum_l((unsigned char) wc, locale->lt);
145}
146
147static bool
149{
150 return isupper_l((unsigned char) wc, locale->lt);
151}
152
153static bool
155{
156 return islower_l((unsigned char) wc, locale->lt);
157}
158
159static bool
161{
162 return isgraph_l((unsigned char) wc, locale->lt);
163}
164
165static bool
167{
168 return isprint_l((unsigned char) wc, locale->lt);
169}
170
171static bool
173{
174 return ispunct_l((unsigned char) wc, locale->lt);
175}
176
177static bool
179{
180 return isspace_l((unsigned char) wc, locale->lt);
181}
182
183static bool
185{
186#ifndef WIN32
187 return isxdigit_l((unsigned char) wc, locale->lt);
188#else
189 return _isxdigit_l((unsigned char) wc, locale->lt);
190#endif
191}
192
193static bool
195{
196 return isupper_l((unsigned char) wc, locale->lt) ||
197 islower_l((unsigned char) wc, locale->lt);
198}
199
200static bool
202{
203 return iswdigit_l((wint_t) wc, locale->lt);
204}
205
206static bool
208{
209 return iswalpha_l((wint_t) wc, locale->lt);
210}
211
212static bool
214{
215 return iswalnum_l((wint_t) wc, locale->lt);
216}
217
218static bool
220{
221 return iswupper_l((wint_t) wc, locale->lt);
222}
223
224static bool
226{
227 return iswlower_l((wint_t) wc, locale->lt);
228}
229
230static bool
232{
233 return iswgraph_l((wint_t) wc, locale->lt);
234}
235
236static bool
238{
239 return iswprint_l((wint_t) wc, locale->lt);
240}
241
242static bool
244{
245 return iswpunct_l((wint_t) wc, locale->lt);
246}
247
248static bool
250{
251 return iswspace_l((wint_t) wc, locale->lt);
252}
253
254static bool
256{
257#ifndef WIN32
258 return iswxdigit_l((wint_t) wc, locale->lt);
259#else
260 return _iswxdigit_l((wint_t) wc, locale->lt);
261#endif
262}
263
264static bool
266{
267 return iswupper_l((wint_t) wc, locale->lt) ||
268 iswlower_l((wint_t) wc, locale->lt);
269}
270
271static pg_wchar
273{
275
276 /* force C behavior for ASCII characters, per comments above */
277 if (locale->is_default && wc <= (pg_wchar) 127)
278 return pg_ascii_toupper((unsigned char) wc);
279 if (wc <= (pg_wchar) UCHAR_MAX)
280 return toupper_l((unsigned char) wc, locale->lt);
281 else
282 return wc;
283}
284
285static pg_wchar
287{
289
290 /* force C behavior for ASCII characters, per comments above */
291 if (locale->is_default && wc <= (pg_wchar) 127)
292 return pg_ascii_toupper((unsigned char) wc);
293 if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
294 return towupper_l((wint_t) wc, locale->lt);
295 else
296 return wc;
297}
298
299static pg_wchar
301{
303
304 /* force C behavior for ASCII characters, per comments above */
305 if (locale->is_default && wc <= (pg_wchar) 127)
306 return pg_ascii_tolower((unsigned char) wc);
307 if (wc <= (pg_wchar) UCHAR_MAX)
308 return tolower_l((unsigned char) wc, locale->lt);
309 else
310 return wc;
311}
312
313static pg_wchar
315{
317
318 /* force C behavior for ASCII characters, per comments above */
319 if (locale->is_default && wc <= (pg_wchar) 127)
320 return pg_ascii_tolower((unsigned char) wc);
321 if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
322 return towlower_l((wint_t) wc, locale->lt);
323 else
324 return wc;
325}
326
327/*
328 * Characters A..Z always downcase to a..z, even in the Turkish
329 * locale. Characters beyond 127 use tolower().
330 */
331static size_t
332downcase_ident_libc_sb(char *dst, size_t dstsize, const char *src,
333 size_t srclen, pg_locale_t locale)
334{
335 locale_t loc = locale->lt;
336 int i;
337
338 for (i = 0; i < srclen && i < dstsize; i++)
339 {
340 unsigned char ch = (unsigned char) src[i];
341
342 if (ch >= 'A' && ch <= 'Z')
344 else if (IS_HIGHBIT_SET(ch) && isupper_l(ch, loc))
345 ch = tolower_l(ch, loc);
346 dst[i] = (char) ch;
347 }
348
349 if (i < dstsize)
350 dst[i] = '\0';
351
352 return srclen;
353}
354
357 .strtitle = strtitle_libc_sb,
358 .strupper = strupper_libc_sb,
359 /* in libc, casefolding is the same as lowercasing */
360 .strfold = strlower_libc_sb,
361 .downcase_ident = downcase_ident_libc_sb,
362 .wc_isdigit = wc_isdigit_libc_sb,
363 .wc_isalpha = wc_isalpha_libc_sb,
364 .wc_isalnum = wc_isalnum_libc_sb,
365 .wc_isupper = wc_isupper_libc_sb,
366 .wc_islower = wc_islower_libc_sb,
367 .wc_isgraph = wc_isgraph_libc_sb,
368 .wc_isprint = wc_isprint_libc_sb,
369 .wc_ispunct = wc_ispunct_libc_sb,
370 .wc_isspace = wc_isspace_libc_sb,
371 .wc_isxdigit = wc_isxdigit_libc_sb,
372 .wc_iscased = wc_iscased_libc_sb,
373 .wc_toupper = toupper_libc_sb,
374 .wc_tolower = tolower_libc_sb,
375};
376
377/*
378 * Non-UTF8 multibyte encodings use multibyte semantics for case mapping, but
379 * single-byte semantics for pattern matching.
380 */
383 .strtitle = strtitle_libc_mb,
384 .strupper = strupper_libc_mb,
385 /* in libc, casefolding is the same as lowercasing */
386 .strfold = strlower_libc_mb,
387 /* uses plain ASCII semantics for historical reasons */
388 .downcase_ident = NULL,
389 .wc_isdigit = wc_isdigit_libc_sb,
390 .wc_isalpha = wc_isalpha_libc_sb,
391 .wc_isalnum = wc_isalnum_libc_sb,
392 .wc_isupper = wc_isupper_libc_sb,
393 .wc_islower = wc_islower_libc_sb,
394 .wc_isgraph = wc_isgraph_libc_sb,
395 .wc_isprint = wc_isprint_libc_sb,
396 .wc_ispunct = wc_ispunct_libc_sb,
397 .wc_isspace = wc_isspace_libc_sb,
398 .wc_isxdigit = wc_isxdigit_libc_sb,
399 .wc_iscased = wc_iscased_libc_sb,
400 .wc_toupper = toupper_libc_sb,
401 .wc_tolower = tolower_libc_sb,
402};
403
406 .strtitle = strtitle_libc_mb,
407 .strupper = strupper_libc_mb,
408 /* in libc, casefolding is the same as lowercasing */
409 .strfold = strlower_libc_mb,
410 /* uses plain ASCII semantics for historical reasons */
411 .downcase_ident = NULL,
412 .wc_isdigit = wc_isdigit_libc_mb,
413 .wc_isalpha = wc_isalpha_libc_mb,
414 .wc_isalnum = wc_isalnum_libc_mb,
415 .wc_isupper = wc_isupper_libc_mb,
416 .wc_islower = wc_islower_libc_mb,
417 .wc_isgraph = wc_isgraph_libc_mb,
418 .wc_isprint = wc_isprint_libc_mb,
419 .wc_ispunct = wc_ispunct_libc_mb,
420 .wc_isspace = wc_isspace_libc_mb,
421 .wc_isxdigit = wc_isxdigit_libc_mb,
422 .wc_iscased = wc_iscased_libc_mb,
423 .wc_toupper = toupper_libc_mb,
424 .wc_tolower = tolower_libc_mb,
425};
426
429 .strcoll = strcoll_libc,
430 .strnxfrm = strnxfrm_libc,
431 .strxfrm = strxfrm_libc,
432 .strnxfrm_prefix = NULL,
433 .strxfrm_prefix = NULL,
434
435 /*
436 * Unfortunately, it seems that strxfrm() for non-C collations is broken
437 * on many common platforms; testing of multiple versions of glibc reveals
438 * that, for many locales, strcoll() and strxfrm() do not return
439 * consistent results. While no other libc other than Cygwin has so far
440 * been shown to have a problem, we take the conservative course of action
441 * for right now and disable this categorically. (Users who are certain
442 * this isn't a problem on their system can define TRUST_STRXFRM.)
443 */
444#ifdef TRUST_STRXFRM
445 .strxfrm_is_safe = true,
446#else
447 .strxfrm_is_safe = false,
448#endif
449};
450
451#ifdef WIN32
454 .strcoll = strcoll_libc_win32_utf8,
455 .strnxfrm = strnxfrm_libc,
456 .strxfrm = strxfrm_libc,
457 .strnxfrm_prefix = NULL,
458#ifdef TRUST_STRXFRM
459 .strxfrm_is_safe = true,
460#else
461 .strxfrm_is_safe = false,
462#endif
463};
464#endif
465
466static size_t
467strlower_libc_sb(char *dest, size_t destsize, const char *src, size_t srclen,
468 pg_locale_t locale)
469{
470 if (srclen + 1 <= destsize)
471 {
472 locale_t loc = locale->lt;
473 char *p;
474
475 memcpy(dest, src, srclen);
476 dest[srclen] = '\0';
477
478 /*
479 * Note: we assume that tolower_l() will not be so broken as to need
480 * an isupper_l() guard test. When using the default collation, we
481 * apply the traditional Postgres behavior that forces ASCII-style
482 * treatment of I/i, but in non-default collations you get exactly
483 * what the collation says.
484 */
485 for (p = dest; *p; p++)
486 {
487 if (locale->is_default)
488 {
489 if (*p >= 'A' && *p <= 'Z')
490 *p += 'a' - 'A';
491 else if (IS_HIGHBIT_SET(*p) && isupper_l(*p, loc))
492 *p = tolower_l((unsigned char) *p, loc);
493 }
494 else
495 *p = tolower_l((unsigned char) *p, loc);
496 }
497 }
498
499 return srclen;
500}
501
502static size_t
503strlower_libc_mb(char *dest, size_t destsize, const char *src, size_t srclen,
504 pg_locale_t locale)
505{
506 locale_t loc = locale->lt;
507 size_t result_size;
508 wchar_t *workspace;
509 char *result;
510 size_t curr_char;
511 size_t max_size;
512
513 /* Overflow paranoia */
514 if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
517 errmsg("out of memory")));
518
519 /* Output workspace cannot have more codes than input bytes */
520 workspace = palloc_array(wchar_t, srclen + 1);
521
522 char2wchar(workspace, srclen + 1, src, srclen, loc);
523
524 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
525 workspace[curr_char] = towlower_l(workspace[curr_char], loc);
526
527 /*
528 * Make result large enough; case change might change number of bytes
529 */
531 result = palloc(max_size + 1);
532
533 result_size = wchar2char(result, workspace, max_size + 1, loc);
534
535 if (destsize >= result_size + 1)
536 {
537 memcpy(dest, result, result_size);
538 dest[result_size] = '\0';
539 }
540
541 pfree(workspace);
542 pfree(result);
543
544 return result_size;
545}
546
547static size_t
548strtitle_libc_sb(char *dest, size_t destsize, const char *src, size_t srclen,
549 pg_locale_t locale)
550{
551 if (srclen + 1 <= destsize)
552 {
553 locale_t loc = locale->lt;
554 int wasalnum = false;
555 char *p;
556
557 memcpy(dest, src, srclen);
558 dest[srclen] = '\0';
559
560 /*
561 * Note: we assume that toupper_l()/tolower_l() will not be so broken
562 * as to need guard tests. When using the default collation, we apply
563 * the traditional Postgres behavior that forces ASCII-style treatment
564 * of I/i, but in non-default collations you get exactly what the
565 * collation says.
566 */
567 for (p = dest; *p; p++)
568 {
569 if (locale->is_default)
570 {
571 if (wasalnum)
572 {
573 if (*p >= 'A' && *p <= 'Z')
574 *p += 'a' - 'A';
575 else if (IS_HIGHBIT_SET(*p) && isupper_l(*p, loc))
576 *p = tolower_l((unsigned char) *p, loc);
577 }
578 else
579 {
580 if (*p >= 'a' && *p <= 'z')
581 *p -= 'a' - 'A';
582 else if (IS_HIGHBIT_SET(*p) && islower_l(*p, loc))
583 *p = toupper_l((unsigned char) *p, loc);
584 }
585 }
586 else
587 {
588 if (wasalnum)
589 *p = tolower_l((unsigned char) *p, loc);
590 else
591 *p = toupper_l((unsigned char) *p, loc);
592 }
593 wasalnum = isalnum_l((unsigned char) *p, loc);
594 }
595 }
596
597 return srclen;
598}
599
600static size_t
601strtitle_libc_mb(char *dest, size_t destsize, const char *src, size_t srclen,
602 pg_locale_t locale)
603{
604 locale_t loc = locale->lt;
605 int wasalnum = false;
606 size_t result_size;
607 wchar_t *workspace;
608 char *result;
609 size_t curr_char;
610 size_t max_size;
611
612 /* Overflow paranoia */
613 if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
616 errmsg("out of memory")));
617
618 /* Output workspace cannot have more codes than input bytes */
619 workspace = palloc_array(wchar_t, srclen + 1);
620
621 char2wchar(workspace, srclen + 1, src, srclen, loc);
622
623 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
624 {
625 if (wasalnum)
626 workspace[curr_char] = towlower_l(workspace[curr_char], loc);
627 else
628 workspace[curr_char] = towupper_l(workspace[curr_char], loc);
629 wasalnum = iswalnum_l(workspace[curr_char], loc);
630 }
631
632 /*
633 * Make result large enough; case change might change number of bytes
634 */
636 result = palloc(max_size + 1);
637
638 result_size = wchar2char(result, workspace, max_size + 1, loc);
639
640 if (destsize >= result_size + 1)
641 {
642 memcpy(dest, result, result_size);
643 dest[result_size] = '\0';
644 }
645
646 pfree(workspace);
647 pfree(result);
648
649 return result_size;
650}
651
652static size_t
653strupper_libc_sb(char *dest, size_t destsize, const char *src, size_t srclen,
654 pg_locale_t locale)
655{
656 if (srclen + 1 <= destsize)
657 {
658 locale_t loc = locale->lt;
659 char *p;
660
661 memcpy(dest, src, srclen);
662 dest[srclen] = '\0';
663
664 /*
665 * Note: we assume that toupper_l() will not be so broken as to need
666 * an islower_l() guard test. When using the default collation, we
667 * apply the traditional Postgres behavior that forces ASCII-style
668 * treatment of I/i, but in non-default collations you get exactly
669 * what the collation says.
670 */
671 for (p = dest; *p; p++)
672 {
673 if (locale->is_default)
674 {
675 if (*p >= 'a' && *p <= 'z')
676 *p -= 'a' - 'A';
677 else if (IS_HIGHBIT_SET(*p) && islower_l(*p, loc))
678 *p = toupper_l((unsigned char) *p, loc);
679 }
680 else
681 *p = toupper_l((unsigned char) *p, loc);
682 }
683 }
684
685 return srclen;
686}
687
688static size_t
689strupper_libc_mb(char *dest, size_t destsize, const char *src, size_t srclen,
690 pg_locale_t locale)
691{
692 locale_t loc = locale->lt;
693 size_t result_size;
694 wchar_t *workspace;
695 char *result;
696 size_t curr_char;
697 size_t max_size;
698
699 /* Overflow paranoia */
700 if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
703 errmsg("out of memory")));
704
705 /* Output workspace cannot have more codes than input bytes */
706 workspace = palloc_array(wchar_t, srclen + 1);
707
708 char2wchar(workspace, srclen + 1, src, srclen, loc);
709
710 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
711 workspace[curr_char] = towupper_l(workspace[curr_char], loc);
712
713 /*
714 * Make result large enough; case change might change number of bytes
715 */
717 result = palloc(max_size + 1);
718
719 result_size = wchar2char(result, workspace, max_size + 1, loc);
720
721 if (destsize >= result_size + 1)
722 {
723 memcpy(dest, result, result_size);
724 dest[result_size] = '\0';
725 }
726
727 pfree(workspace);
728 pfree(result);
729
730 return result_size;
731}
732
735{
736 const char *collate;
737 const char *ctype;
738 locale_t loc;
740
742 {
743 HeapTuple tp;
744 Datum datum;
745
747 if (!HeapTupleIsValid(tp))
748 elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
751 collate = TextDatumGetCString(datum);
754 ctype = TextDatumGetCString(datum);
755
756 ReleaseSysCache(tp);
757 }
758 else
759 {
760 HeapTuple tp;
761 Datum datum;
762
764 if (!HeapTupleIsValid(tp))
765 elog(ERROR, "cache lookup failed for collation %u", collid);
766
769 collate = TextDatumGetCString(datum);
772 ctype = TextDatumGetCString(datum);
773
774 ReleaseSysCache(tp);
775 }
776
777
778 loc = make_libc_collator(collate, ctype);
779
780 result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
781 result->deterministic = true;
782 result->collate_is_c = (strcmp(collate, "C") == 0) ||
783 (strcmp(collate, "POSIX") == 0);
784 result->ctype_is_c = (strcmp(ctype, "C") == 0) ||
785 (strcmp(ctype, "POSIX") == 0);
786 result->lt = loc;
787 if (!result->collate_is_c)
788 {
789#ifdef WIN32
792 else
793#endif
794 result->collate = &collate_methods_libc;
795 }
796 if (!result->ctype_is_c)
797 {
800 else if (pg_database_encoding_max_length() > 1)
802 else
804 }
805
806 return result;
807}
808
809/*
810 * Create a locale_t with the given collation and ctype.
811 *
812 * The "C" and "POSIX" locales are not actually handled by libc, so return
813 * NULL.
814 *
815 * Ensure that no path leaks a locale_t.
816 */
817static locale_t
818make_libc_collator(const char *collate, const char *ctype)
819{
820 locale_t loc = 0;
821
822 if (strcmp(collate, ctype) == 0)
823 {
824 if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
825 {
826 /* Normal case where they're the same */
827 errno = 0;
828#ifndef WIN32
829 loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate,
830 NULL);
831#else
832 loc = _create_locale(LC_ALL, collate);
833#endif
834 if (!loc)
836 }
837 }
838 else
839 {
840#ifndef WIN32
841 /* We need two newlocale() steps */
842 locale_t loc1 = 0;
843
844 if (strcmp(collate, "C") != 0 && strcmp(collate, "POSIX") != 0)
845 {
846 errno = 0;
847 loc1 = newlocale(LC_COLLATE_MASK, collate, NULL);
848 if (!loc1)
850 }
851
852 if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
853 {
854 errno = 0;
855 loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
856 if (!loc)
857 {
858 if (loc1)
861 }
862 }
863 else
864 loc = loc1;
865#else
866
867 /*
868 * XXX The _create_locale() API doesn't appear to support this. Could
869 * perhaps be worked around by changing pg_locale_t to contain two
870 * separate fields.
871 */
874 errmsg("collations with different collate and ctype values are not supported on this platform")));
875#endif
876 }
877
878 return loc;
879}
880
881/*
882 * strncoll_libc
883 *
884 * NUL-terminate arguments and pass to strcoll_l().
885 */
886static int
887strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
888 pg_locale_t locale)
889{
890 char sbuf[TEXTBUFLEN];
891 char *buf = sbuf;
892 size_t bufsize1 = len1 + 1;
893 size_t bufsize2 = len2 + 1;
894 char *buf1;
895 char *buf2;
896 const char *arg1n;
897 const char *arg2n;
898 int result;
899
902
903 buf1 = buf;
904 buf2 = buf + bufsize1;
905
906 memcpy(buf1, arg1, len1);
907 buf1[len1] = '\0';
908 arg1n = buf1;
909
910 memcpy(buf2, arg2, len2);
911 buf2[len2] = '\0';
912 arg2n = buf2;
913
914 result = strcoll_l(arg1n, arg2n, locale->lt);
915
916 if (buf != sbuf)
917 pfree(buf);
918
919 return result;
920}
921
922/*
923 * strcoll_libc
924 */
925static int
926strcoll_libc(const char *arg1, const char *arg2, pg_locale_t locale)
927{
928 return strcoll_l(arg1, arg2, locale->lt);
929}
930
931/*
932 * strnxfrm_libc
933 *
934 * NUL-terminate src and pass to strxfrm_l().
935 */
936static size_t
937strnxfrm_libc(char *dest, size_t destsize, const char *src, size_t srclen,
938 pg_locale_t locale)
939{
940 char sbuf[TEXTBUFLEN];
941 char *buf = sbuf;
942 size_t bufsize = srclen + 1;
943 size_t result;
944
945 if (bufsize > TEXTBUFLEN)
946 buf = palloc(bufsize);
947
948 /* nul-terminate argument */
949 memcpy(buf, src, srclen);
950 buf[srclen] = '\0';
951
952 result = strxfrm_l(dest, buf, destsize, locale->lt);
953
954 if (buf != sbuf)
955 pfree(buf);
956
957 /* if dest is defined, it should be nul-terminated */
958 Assert(result >= destsize || dest[result] == '\0');
959
960 return result;
961}
962
963/*
964 * strxfrm_libc
965 */
966static size_t
967strxfrm_libc(char *dest, size_t destsize, const char *src, pg_locale_t locale)
968{
969 return strxfrm_l(dest, src, destsize, locale->lt);
970}
971
972char *
974{
975 char *collversion = NULL;
976
977 if (pg_strcasecmp("C", collcollate) != 0 &&
978 pg_strncasecmp("C.", collcollate, 2) != 0 &&
979 pg_strcasecmp("POSIX", collcollate) != 0)
980 {
981#if defined(__GLIBC__)
982 /* Use the glibc version because we don't have anything better. */
984#elif defined(LC_VERSION_MASK)
985 locale_t loc;
986
987 /* Look up FreeBSD collation version. */
989 if (loc)
990 {
993 freelocale(loc);
994 }
995 else
997 (errmsg("could not load locale \"%s\"", collcollate)));
998#elif defined(WIN32)
999 /*
1000 * If we are targeting Windows Vista and above, we can ask for a name
1001 * given a collation name (earlier versions required a location code
1002 * that we don't have).
1003 */
1004 NLSVERSIONINFOEX version = {sizeof(NLSVERSIONINFOEX)};
1006
1010 {
1011 /*
1012 * GetNLSVersionEx() wants a language tag such as "en-US", not a
1013 * locale name like "English_United States.1252". Until those
1014 * values can be prevented from entering the system, or 100%
1015 * reliably converted to the more useful tag format, tolerate the
1016 * resulting error and report that we have no version data.
1017 */
1019 return NULL;
1020
1021 ereport(ERROR,
1022 (errmsg("could not get collation version for locale \"%s\": error code %lu",
1024 GetLastError())));
1025 }
1026 collversion = psprintf("%lu.%lu,%lu.%lu",
1027 (version.dwNLSVersion >> 8) & 0xFFFF,
1028 version.dwNLSVersion & 0xFF,
1029 (version.dwDefinedVersion >> 8) & 0xFFFF,
1030 version.dwDefinedVersion & 0xFF);
1031#endif
1032 }
1033
1034 return collversion;
1035}
1036
1037/*
1038 * strncoll_libc_win32_utf8
1039 *
1040 * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
1041 * invoke wcscoll_l().
1042 */
1043#ifdef WIN32
1044static int
1045strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
1046 size_t len2, pg_locale_t locale)
1047{
1048 char sbuf[TEXTBUFLEN];
1049 char *buf = sbuf;
1050 char *a1p,
1051 *a2p;
1052 size_t a1len,
1053 a2len,
1054 buflen;
1055 int r;
1056 int result;
1057
1059
1060 /*
1061 * In a 32-bit build, twice the input length can overflow size_t, so we
1062 * must be careful.
1063 */
1064 a1len = add_size(add_size(len1, len1), 2);
1065 a2len = add_size(add_size(len2, len2), 2);
1066 buflen = add_size(a1len, a2len);
1067
1068 if (buflen > TEXTBUFLEN)
1069 buf = palloc(buflen);
1070
1071 a1p = buf;
1072 a2p = buf + a1len;
1073
1074 /* API does not work for zero-length input */
1075 if (len1 == 0)
1076 r = 0;
1077 else
1078 {
1079 r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1080 (LPWSTR) a1p, a1len / 2);
1081 if (!r)
1082 ereport(ERROR,
1083 (errmsg("could not convert string to UTF-16: error code %lu",
1084 GetLastError())));
1085 }
1086 ((LPWSTR) a1p)[r] = 0;
1087
1088 if (len2 == 0)
1089 r = 0;
1090 else
1091 {
1092 r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1093 (LPWSTR) a2p, a2len / 2);
1094 if (!r)
1095 ereport(ERROR,
1096 (errmsg("could not convert string to UTF-16: error code %lu",
1097 GetLastError())));
1098 }
1099 ((LPWSTR) a2p)[r] = 0;
1100
1101 errno = 0;
1102 result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->lt);
1103 if (result == 2147483647) /* _NLSCMPERROR; missing from mingw headers */
1104 ereport(ERROR,
1105 (errmsg("could not compare Unicode strings: %m")));
1106
1107 if (buf != sbuf)
1108 pfree(buf);
1109
1110 return result;
1111}
1112
1113static int
1114strcoll_libc_win32_utf8(const char *arg1, const char *arg2,
1115 pg_locale_t locale)
1116{
1117 size_t len1 = strlen(arg1);
1118 size_t len2 = strlen(arg2);
1119
1120 return strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
1121}
1122#endif /* WIN32 */
1123
1124/* simple subroutine for reporting errors from newlocale() */
1125void
1126report_newlocale_failure(const char *localename)
1127{
1128 int save_errno;
1129
1130 /*
1131 * Windows doesn't provide any useful error indication from
1132 * _create_locale(), and BSD-derived platforms don't seem to feel they
1133 * need to set errno either (even though POSIX is pretty clear that
1134 * newlocale should do so). So, if errno hasn't been set, assume ENOENT
1135 * is what to report.
1136 */
1137 if (errno == 0)
1138 errno = ENOENT;
1139
1140 /*
1141 * ENOENT means "no such locale", not "no such file", so clarify that
1142 * errno with an errdetail message.
1143 */
1144 save_errno = errno; /* auxiliary funcs might change errno */
1145 ereport(ERROR,
1147 errmsg("could not create locale \"%s\": %m",
1148 localename),
1149 (save_errno == ENOENT ?
1150 errdetail("The operating system could not find any locale data for the locale name \"%s\".",
1151 localename) : 0)));
1152}
1153
1154/*
1155 * POSIX doesn't define _l-variants of these functions, but several systems
1156 * have them. We provide our own replacements here.
1157 */
1158#ifndef HAVE_MBSTOWCS_L
1159static size_t
1160mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
1161{
1162#ifdef WIN32
1163 return _mbstowcs_l(dest, src, n, loc);
1164#else
1165 size_t result;
1167
1168 result = mbstowcs(dest, src, n);
1170 return result;
1171#endif
1172}
1173#endif
1174#ifndef HAVE_WCSTOMBS_L
1175static size_t
1176wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
1177{
1178#ifdef WIN32
1179 return _wcstombs_l(dest, src, n, loc);
1180#else
1181 size_t result;
1183
1184 result = wcstombs(dest, src, n);
1186 return result;
1187#endif
1188}
1189#endif
1190
1191/*
1192 * These functions convert from/to libc's wchar_t, *not* pg_wchar.
1193 * Therefore we keep them here rather than with the mbutils code.
1194 */
1195
1196/*
1197 * wchar2char --- convert wide characters to multibyte format
1198 *
1199 * This has the same API as the standard wcstombs_l() function; in particular,
1200 * tolen is the maximum number of bytes to store at *to, and *from must be
1201 * zero-terminated. The output will be zero-terminated iff there is room.
1202 */
1203size_t
1204wchar2char(char *to, const wchar_t *from, size_t tolen, locale_t loc)
1205{
1206 size_t result;
1207
1208 if (tolen == 0)
1209 return 0;
1210
1211#ifdef WIN32
1212
	/*
	 * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
	 * for some reason mbstowcs and wcstombs won't do this for us, so we use
	 * WideCharToMultiByte() here (and MultiByteToWideChar() in char2wchar).
	 */
1219 {
1220 result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
1221 NULL, NULL);
1222 /* A zero return is failure */
1223 if (result <= 0)
1224 result = -1;
1225 else
1226 {
1227 Assert(result <= tolen);
1228 /* Microsoft counts the zero terminator in the result */
1229 result--;
1230 }
1231 }
1232 else
1233#endif /* WIN32 */
1234 if (loc == (locale_t) 0)
1235 {
1236 /* Use wcstombs directly for the default locale */
1237 result = wcstombs(to, from, tolen);
1238 }
1239 else
1240 {
1241 /* Use wcstombs_l for nondefault locales */
1242 result = wcstombs_l(to, from, tolen, loc);
1243 }
1244
1245 return result;
1246}
1247
1248/*
1249 * char2wchar --- convert multibyte characters to wide characters
1250 *
1251 * This has almost the API of mbstowcs_l(), except that *from need not be
1252 * null-terminated; instead, the number of input bytes is specified as
1253 * fromlen. Also, we ereport() rather than returning -1 for invalid
1254 * input encoding. tolen is the maximum number of wchar_t's to store at *to.
1255 * The output will be zero-terminated iff there is room.
1256 */
1257static size_t
1258char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
1259 locale_t loc)
1260{
1261 size_t result;
1262
1263 if (tolen == 0)
1264 return 0;
1265
1266#ifdef WIN32
1267 /* See WIN32 "Unicode" comment above */
1269 {
1270 /* Win32 API does not work for zero-length input */
1271 if (fromlen == 0)
1272 result = 0;
1273 else
1274 {
1275 result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
1276 /* A zero return is failure */
1277 if (result == 0)
1278 result = -1;
1279 }
1280
1281 if (result != -1)
1282 {
1283 Assert(result < tolen);
1284 /* Append trailing null wchar (MultiByteToWideChar() does not) */
1285 to[result] = 0;
1286 }
1287 }
1288 else
1289#endif /* WIN32 */
1290 {
1291 /* mbstowcs requires ending '\0' */
1292 char *str = pnstrdup(from, fromlen);
1293
1294 if (loc == (locale_t) 0)
1295 {
1296 /* Use mbstowcs directly for the default locale */
1297 result = mbstowcs(to, str, tolen);
1298 }
1299 else
1300 {
1301 /* Use mbstowcs_l for nondefault locales */
1302 result = mbstowcs_l(to, str, tolen, loc);
1303 }
1304
1305 pfree(str);
1306 }
1307
1308 if (result == -1)
1309 {
1310 /*
1311 * Invalid multibyte character encountered. We try to give a useful
1312 * error message by letting pg_verifymbstr check the string. But it's
1313 * possible that the string is OK to us, and not OK to mbstowcs ---
1314 * this suggests that the LC_CTYPE locale is different from the
1315 * database encoding. Give a generic error message if pg_verifymbstr
1316 * can't find anything wrong.
1317 */
1318 pg_verifymbstr(from, fromlen, false); /* might not return */
1319 /* but if it does ... */
1320 ereport(ERROR,
1322 errmsg("invalid multibyte character for locale"),
1323 errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
1324 }
1325
1326 return result;
1327}
#define TextDatumGetCString(d)
Definition builtins.h:99
#define IS_HIGHBIT_SET(ch)
Definition c.h:1244
#define Assert(condition)
Definition c.h:943
uint32 result
memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets))
Oid collid
int errcode(int sqlerrcode)
Definition elog.c:875
int errhint(const char *fmt,...) pg_attribute_printf(1
int errdetail(const char *fmt,...) pg_attribute_printf(1
#define ERROR
Definition elog.h:40
#define elog(elevel,...)
Definition elog.h:228
#define ereport(elevel,...)
Definition elog.h:152
#define palloc_array(type, count)
Definition fe_memutils.h:91
Oid MyDatabaseId
Definition globals.c:96
const char * str
#define HeapTupleIsValid(tuple)
Definition htup.h:78
#define bufsize
int i
Definition isn.c:77
#define PG_UTF8
Definition mbprint.c:43
unsigned int pg_wchar
Definition mbprint.c:31
int GetDatabaseEncoding(void)
Definition mbutils.c:1389
bool pg_verifymbstr(const char *mbstr, int len, bool noError)
Definition mbutils.c:1683
int pg_database_encoding_max_length(void)
Definition mbutils.c:1673
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition mcxt.c:1269
Size add_size(Size s1, Size s2)
Definition mcxt.c:1733
char * pstrdup(const char *in)
Definition mcxt.c:1910
void pfree(void *pointer)
Definition mcxt.c:1619
void * palloc(Size size)
Definition mcxt.c:1390
char * pnstrdup(const char *in, Size len)
Definition mcxt.c:1921
static char * errmsg
static size_t strupper_libc_sb(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale)
static bool wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_ispunct_libc_sb(pg_wchar wc, pg_locale_t locale)
static const struct ctype_methods ctype_methods_libc_other_mb
static const struct ctype_methods ctype_methods_libc_utf8
static pg_wchar toupper_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_iscased_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context)
size_t wchar2char(char *to, const wchar_t *from, size_t tolen, locale_t loc)
static bool wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale)
static int strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2, pg_locale_t locale)
static bool wc_islower_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t strtitle_libc_sb(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale)
static pg_wchar toupper_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, locale_t loc)
static bool wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isalnum_libc_sb(pg_wchar wc, pg_locale_t locale)
static size_t strtitle_libc_mb(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale)
static bool wc_isalpha_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isprint_libc_sb(pg_wchar wc, pg_locale_t locale)
char * get_collation_actual_version_libc(const char *collcollate)
static bool wc_isupper_libc_sb(pg_wchar wc, pg_locale_t locale)
static locale_t make_libc_collator(const char *collate, const char *ctype)
static bool wc_isgraph_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_iscased_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t strxfrm_libc(char *dest, size_t destsize, const char *src, pg_locale_t locale)
static pg_wchar tolower_libc_sb(pg_wchar wc, pg_locale_t locale)
static size_t wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
static const struct collate_methods collate_methods_libc
static bool wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t strnxfrm_libc(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale)
static bool wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale)
static const struct ctype_methods ctype_methods_libc_sb
static bool wc_isxdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
void report_newlocale_failure(const char *localename)
static pg_wchar tolower_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t downcase_ident_libc_sb(char *dst, size_t dstsize, const char *src, size_t srclen, pg_locale_t locale)
static bool wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t strlower_libc_sb(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale)
static size_t strupper_libc_mb(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale)
#define TEXTBUFLEN
static int strcoll_libc(const char *arg1, const char *arg2, pg_locale_t locale)
static bool wc_isxdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
static size_t strlower_libc_mb(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale)
static char buf[DEFAULT_XLOG_SEG_SIZE]
int pg_strcasecmp(const char *s1, const char *s2)
static unsigned char pg_ascii_tolower(unsigned char ch)
Definition port.h:189
static unsigned char pg_ascii_toupper(unsigned char ch)
Definition port.h:178
int pg_strncasecmp(const char *s1, const char *s2, size_t n)
static Datum ObjectIdGetDatum(Oid X)
Definition postgres.h:252
uint64_t Datum
Definition postgres.h:70
unsigned int Oid
static int fb(int x)
char * psprintf(const char *fmt,...)
Definition psprintf.c:43
int(* strncoll)(const char *arg1, size_t len1, const char *arg2, size_t len2, pg_locale_t locale)
Definition pg_locale.h:66
size_t(* strlower)(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale)
Definition pg_locale.h:101
void ReleaseSysCache(HeapTuple tuple)
Definition syscache.c:265
Datum SysCacheGetAttrNotNull(SysCacheIdentifier cacheId, HeapTuple tup, AttrNumber attributeNumber)
Definition syscache.c:626
HeapTuple SearchSysCache1(SysCacheIdentifier cacheId, Datum key1)
Definition syscache.c:221
#define locale_t
Definition win32_port.h:429
#define toupper_l
Definition win32_port.h:431
#define iswalnum_l
Definition win32_port.h:439
#define isgraph_l
Definition win32_port.h:444
#define towupper_l
Definition win32_port.h:433
#define ispunct_l
Definition win32_port.h:448
#define isalpha_l
Definition win32_port.h:436
#define strcoll_l
Definition win32_port.h:452
#define iswgraph_l
Definition win32_port.h:445
#define strxfrm_l
Definition win32_port.h:453
#define towlower_l
Definition win32_port.h:432
#define iswspace_l
Definition win32_port.h:451
#define isdigit_l
Definition win32_port.h:434
#define wcscoll_l
Definition win32_port.h:454
#define tolower_l
Definition win32_port.h:430
#define iswupper_l
Definition win32_port.h:441
#define iswalpha_l
Definition win32_port.h:437
#define isprint_l
Definition win32_port.h:446
#define iswprint_l
Definition win32_port.h:447
#define isupper_l
Definition win32_port.h:440
#define isalnum_l
Definition win32_port.h:438
#define islower_l
Definition win32_port.h:442
#define iswlower_l
Definition win32_port.h:443
#define iswpunct_l
Definition win32_port.h:449
#define isspace_l
Definition win32_port.h:450
#define iswdigit_l
Definition win32_port.h:435