PostgreSQL Source Code git master
pg_locale_libc.c
Go to the documentation of this file.
1/*-----------------------------------------------------------------------
2 *
3 * PostgreSQL locale utilities for libc
4 *
5 * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
6 *
7 * src/backend/utils/adt/pg_locale_libc.c
8 *
9 *-----------------------------------------------------------------------
10 */
11
12#include "postgres.h"
13
14#include <limits.h>
15#include <wctype.h>
16
17#include "access/htup_details.h"
18#include "catalog/pg_database.h"
20#include "mb/pg_wchar.h"
21#include "miscadmin.h"
22#include "utils/builtins.h"
23#include "utils/formatting.h"
24#include "utils/memutils.h"
25#include "utils/pg_locale.h"
26#include "utils/syscache.h"
27
28#ifdef __GLIBC__
29#include <gnu/libc-version.h>
30#endif
31
32#ifdef WIN32
33#include <shlwapi.h>
34#endif
35
36/*
37 * For the libc provider, to provide as much functionality as possible on a
38 * variety of platforms without going so far as to implement everything from
39 * scratch, we use several implementation strategies depending on the
40 * situation:
41 *
42 * 1. In C/POSIX collations, we use hard-wired code. We can't depend on
43 * the <ctype.h> functions since those will obey LC_CTYPE. Note that these
44 * collations don't give a fig about multibyte characters.
45 *
46 * 2. When working in UTF8 encoding, we use the <wctype.h> functions.
47 * This assumes that every platform uses Unicode codepoints directly
48 * as the wchar_t representation of Unicode. On some platforms
49 * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
50 *
51 * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar
52 * values up to 255, and punt for values above that. This is 100% correct
53 * only in single-byte encodings such as LATINn. However, non-Unicode
54 * multibyte encodings are mostly Far Eastern character sets for which the
55 * properties being tested here aren't very relevant for higher code values
56 * anyway. The difficulty with using the <wctype.h> functions with
57 * non-Unicode multibyte encodings is that we can have no certainty that
58 * the platform's wchar_t representation matches what we do in pg_wchar
59 * conversions.
60 *
61 * As a special case, in the "default" collation, (2) and (3) force ASCII
62 * letters to follow ASCII upcase/downcase rules, while in a non-default
63 * collation we just let the library functions do what they will. The case
64 * where this matters is treatment of I/i in Turkish, and the behavior is
65 * meant to match the upper()/lower() SQL functions.
66 *
67 * We store the active collation setting in static variables. In principle
68 * it could be passed down to here via the regex library's "struct vars" data
69 * structure; but that would require somewhat invasive changes in the regex
70 * library, and right now there's no real benefit to be gained from that.
71 *
72 * NB: the coding here assumes pg_wchar is an unsigned type.
73 */
74
75/*
76 * Size of stack buffer to use for string transformations, used to avoid heap
77 * allocations in typical cases. This should be large enough that most strings
78 * will fit, but small enough that we feel comfortable putting it on the
79 * stack.
80 */
81#define TEXTBUFLEN 1024
82
84
85static int strncoll_libc(const char *arg1, ssize_t len1,
86 const char *arg2, ssize_t len2,
88static size_t strnxfrm_libc(char *dest, size_t destsize,
89 const char *src, ssize_t srclen,
91extern char *get_collation_actual_version_libc(const char *collcollate);
92static locale_t make_libc_collator(const char *collate,
93 const char *ctype);
94
95#ifdef WIN32
96static int strncoll_libc_win32_utf8(const char *arg1, ssize_t len1,
97 const char *arg2, ssize_t len2,
99#endif
100
101static size_t char2wchar(wchar_t *to, size_t tolen, const char *from,
102 size_t fromlen, locale_t loc);
103
104static size_t strlower_libc_sb(char *dest, size_t destsize,
105 const char *src, ssize_t srclen,
107static size_t strlower_libc_mb(char *dest, size_t destsize,
108 const char *src, ssize_t srclen,
110static size_t strtitle_libc_sb(char *dest, size_t destsize,
111 const char *src, ssize_t srclen,
113static size_t strtitle_libc_mb(char *dest, size_t destsize,
114 const char *src, ssize_t srclen,
116static size_t strupper_libc_sb(char *dest, size_t destsize,
117 const char *src, ssize_t srclen,
119static size_t strupper_libc_mb(char *dest, size_t destsize,
120 const char *src, ssize_t srclen,
122
123static bool
125{
126 return isdigit_l((unsigned char) wc, locale->lt);
127}
128
129static bool
131{
132 return isalpha_l((unsigned char) wc, locale->lt);
133}
134
135static bool
137{
138 return isalnum_l((unsigned char) wc, locale->lt);
139}
140
141static bool
143{
144 return isupper_l((unsigned char) wc, locale->lt);
145}
146
147static bool
149{
150 return islower_l((unsigned char) wc, locale->lt);
151}
152
153static bool
155{
156 return isgraph_l((unsigned char) wc, locale->lt);
157}
158
159static bool
161{
162 return isprint_l((unsigned char) wc, locale->lt);
163}
164
165static bool
167{
168 return ispunct_l((unsigned char) wc, locale->lt);
169}
170
171static bool
173{
174 return isspace_l((unsigned char) wc, locale->lt);
175}
176
177static bool
179{
180#ifndef WIN32
181 return isxdigit_l((unsigned char) wc, locale->lt);
182#else
183 return _isxdigit_l((unsigned char) wc, locale->lt);
184#endif
185}
186
187static bool
189{
190 return isupper_l((unsigned char) wc, locale->lt) ||
191 islower_l((unsigned char) wc, locale->lt);
192}
193
194static bool
196{
197 return iswdigit_l((wint_t) wc, locale->lt);
198}
199
200static bool
202{
203 return iswalpha_l((wint_t) wc, locale->lt);
204}
205
206static bool
208{
209 return iswalnum_l((wint_t) wc, locale->lt);
210}
211
212static bool
214{
215 return iswupper_l((wint_t) wc, locale->lt);
216}
217
218static bool
220{
221 return iswlower_l((wint_t) wc, locale->lt);
222}
223
224static bool
226{
227 return iswgraph_l((wint_t) wc, locale->lt);
228}
229
230static bool
232{
233 return iswprint_l((wint_t) wc, locale->lt);
234}
235
236static bool
238{
239 return iswpunct_l((wint_t) wc, locale->lt);
240}
241
242static bool
244{
245 return iswspace_l((wint_t) wc, locale->lt);
246}
247
248static bool
250{
251#ifndef WIN32
252 return iswxdigit_l((wint_t) wc, locale->lt);
253#else
254 return _iswxdigit_l((wint_t) wc, locale->lt);
255#endif
256}
257
258static bool
260{
261 return iswupper_l((wint_t) wc, locale->lt) ||
262 iswlower_l((wint_t) wc, locale->lt);
263}
264
265static bool
267{
268 bool is_multibyte = pg_database_encoding_max_length() > 1;
269
270 if (is_multibyte && IS_HIGHBIT_SET(ch))
271 return true;
272 else
273 return isalpha_l((unsigned char) ch, locale->lt);
274}
275
276static pg_wchar
278{
280
281 /* force C behavior for ASCII characters, per comments above */
282 if (locale->is_default && wc <= (pg_wchar) 127)
283 return pg_ascii_toupper((unsigned char) wc);
284 if (wc <= (pg_wchar) UCHAR_MAX)
285 return toupper_l((unsigned char) wc, locale->lt);
286 else
287 return wc;
288}
289
290static pg_wchar
292{
294
295 /* force C behavior for ASCII characters, per comments above */
296 if (locale->is_default && wc <= (pg_wchar) 127)
297 return pg_ascii_toupper((unsigned char) wc);
298 if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
299 return towupper_l((wint_t) wc, locale->lt);
300 else
301 return wc;
302}
303
304static pg_wchar
306{
308
309 /* force C behavior for ASCII characters, per comments above */
310 if (locale->is_default && wc <= (pg_wchar) 127)
311 return pg_ascii_tolower((unsigned char) wc);
312 if (wc <= (pg_wchar) UCHAR_MAX)
313 return tolower_l((unsigned char) wc, locale->lt);
314 else
315 return wc;
316}
317
318static pg_wchar
320{
322
323 /* force C behavior for ASCII characters, per comments above */
324 if (locale->is_default && wc <= (pg_wchar) 127)
325 return pg_ascii_tolower((unsigned char) wc);
326 if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
327 return towlower_l((wint_t) wc, locale->lt);
328 else
329 return wc;
330}
331
334 .strtitle = strtitle_libc_sb,
335 .strupper = strupper_libc_sb,
336 /* in libc, casefolding is the same as lowercasing */
337 .strfold = strlower_libc_sb,
338 .wc_isdigit = wc_isdigit_libc_sb,
339 .wc_isalpha = wc_isalpha_libc_sb,
340 .wc_isalnum = wc_isalnum_libc_sb,
341 .wc_isupper = wc_isupper_libc_sb,
342 .wc_islower = wc_islower_libc_sb,
343 .wc_isgraph = wc_isgraph_libc_sb,
344 .wc_isprint = wc_isprint_libc_sb,
345 .wc_ispunct = wc_ispunct_libc_sb,
346 .wc_isspace = wc_isspace_libc_sb,
347 .wc_isxdigit = wc_isxdigit_libc_sb,
348 .char_is_cased = char_is_cased_libc,
349 .wc_iscased = wc_iscased_libc_sb,
350 .wc_toupper = toupper_libc_sb,
351 .wc_tolower = tolower_libc_sb,
352};
353
354/*
355 * Non-UTF8 multibyte encodings use multibyte semantics for case mapping, but
356 * single-byte semantics for pattern matching.
357 */
360 .strtitle = strtitle_libc_mb,
361 .strupper = strupper_libc_mb,
362 /* in libc, casefolding is the same as lowercasing */
363 .strfold = strlower_libc_mb,
364 .wc_isdigit = wc_isdigit_libc_sb,
365 .wc_isalpha = wc_isalpha_libc_sb,
366 .wc_isalnum = wc_isalnum_libc_sb,
367 .wc_isupper = wc_isupper_libc_sb,
368 .wc_islower = wc_islower_libc_sb,
369 .wc_isgraph = wc_isgraph_libc_sb,
370 .wc_isprint = wc_isprint_libc_sb,
371 .wc_ispunct = wc_ispunct_libc_sb,
372 .wc_isspace = wc_isspace_libc_sb,
373 .wc_isxdigit = wc_isxdigit_libc_sb,
374 .char_is_cased = char_is_cased_libc,
375 .wc_iscased = wc_iscased_libc_sb,
376 .wc_toupper = toupper_libc_sb,
377 .wc_tolower = tolower_libc_sb,
378};
379
382 .strtitle = strtitle_libc_mb,
383 .strupper = strupper_libc_mb,
384 /* in libc, casefolding is the same as lowercasing */
385 .strfold = strlower_libc_mb,
386 .wc_isdigit = wc_isdigit_libc_mb,
387 .wc_isalpha = wc_isalpha_libc_mb,
388 .wc_isalnum = wc_isalnum_libc_mb,
389 .wc_isupper = wc_isupper_libc_mb,
390 .wc_islower = wc_islower_libc_mb,
391 .wc_isgraph = wc_isgraph_libc_mb,
392 .wc_isprint = wc_isprint_libc_mb,
393 .wc_ispunct = wc_ispunct_libc_mb,
394 .wc_isspace = wc_isspace_libc_mb,
395 .wc_isxdigit = wc_isxdigit_libc_mb,
396 .char_is_cased = char_is_cased_libc,
397 .wc_iscased = wc_iscased_libc_mb,
398 .wc_toupper = toupper_libc_mb,
399 .wc_tolower = tolower_libc_mb,
400};
401
404 .strnxfrm = strnxfrm_libc,
405 .strnxfrm_prefix = NULL,
406
407 /*
408 * Unfortunately, it seems that strxfrm() for non-C collations is broken
409 * on many common platforms; testing of multiple versions of glibc reveals
410 * that, for many locales, strcoll() and strxfrm() do not return
411 * consistent results. While no other libc besides Cygwin's has so far
412 * been shown to have a problem, we take the conservative course of action
413 * for right now and disable this categorically. (Users who are certain
414 * this isn't a problem on their system can define TRUST_STRXFRM.)
415 */
416#ifdef TRUST_STRXFRM
417 .strxfrm_is_safe = true,
418#else
419 .strxfrm_is_safe = false,
420#endif
421};
422
#ifdef WIN32
/*
 * Collation method table for WIN32 builds whose comparisons are routed
 * through strncoll_libc_win32_utf8().  Key generation still uses
 * strnxfrm_libc(), and no prefix-transform method is provided.
 *
 * strxfrm_is_safe is true only when the build defines TRUST_STRXFRM.
 */
static const struct collate_methods collate_methods_libc_win32_utf8 = {
#ifdef TRUST_STRXFRM
	.strxfrm_is_safe = true,
#else
	.strxfrm_is_safe = false,
#endif
	.strnxfrm_prefix = NULL,
	.strnxfrm = strnxfrm_libc,
	.strncoll = strncoll_libc_win32_utf8,
};
#endif
435
436static size_t
437strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
439{
440 if (srclen < 0)
441 srclen = strlen(src);
442
443 if (srclen + 1 <= destsize)
444 {
445 locale_t loc = locale->lt;
446 char *p;
447
448 memcpy(dest, src, srclen);
449 dest[srclen] = '\0';
450
451 /*
452 * Note: we assume that tolower_l() will not be so broken as to need
453 * an isupper_l() guard test. When using the default collation, we
454 * apply the traditional Postgres behavior that forces ASCII-style
455 * treatment of I/i, but in non-default collations you get exactly
456 * what the collation says.
457 */
458 for (p = dest; *p; p++)
459 {
460 if (locale->is_default)
461 {
462 if (*p >= 'A' && *p <= 'Z')
463 *p += 'a' - 'A';
464 else if (IS_HIGHBIT_SET(*p) && isupper_l(*p, loc))
465 *p = tolower_l((unsigned char) *p, loc);
466 }
467 else
468 *p = tolower_l((unsigned char) *p, loc);
469 }
470 }
471
472 return srclen;
473}
474
475static size_t
476strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
478{
479 locale_t loc = locale->lt;
480 size_t result_size;
481 wchar_t *workspace;
482 char *result;
483 size_t curr_char;
484 size_t max_size;
485
486 if (srclen < 0)
487 srclen = strlen(src);
488
489 /* Overflow paranoia */
490 if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
492 (errcode(ERRCODE_OUT_OF_MEMORY),
493 errmsg("out of memory")));
494
495 /* Output workspace cannot have more codes than input bytes */
496 workspace = palloc_array(wchar_t, srclen + 1);
497
498 char2wchar(workspace, srclen + 1, src, srclen, loc);
499
500 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
501 workspace[curr_char] = towlower_l(workspace[curr_char], loc);
502
503 /*
504 * Make result large enough; case change might change number of bytes
505 */
506 max_size = curr_char * pg_database_encoding_max_length();
507 result = palloc(max_size + 1);
508
509 result_size = wchar2char(result, workspace, max_size + 1, loc);
510
511 if (result_size + 1 > destsize)
512 return result_size;
513
514 memcpy(dest, result, result_size);
515 dest[result_size] = '\0';
516
517 pfree(workspace);
518 pfree(result);
519
520 return result_size;
521}
522
523static size_t
524strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
526{
527 if (srclen < 0)
528 srclen = strlen(src);
529
530 if (srclen + 1 <= destsize)
531 {
532 locale_t loc = locale->lt;
533 int wasalnum = false;
534 char *p;
535
536 memcpy(dest, src, srclen);
537 dest[srclen] = '\0';
538
539 /*
540 * Note: we assume that toupper_l()/tolower_l() will not be so broken
541 * as to need guard tests. When using the default collation, we apply
542 * the traditional Postgres behavior that forces ASCII-style treatment
543 * of I/i, but in non-default collations you get exactly what the
544 * collation says.
545 */
546 for (p = dest; *p; p++)
547 {
548 if (locale->is_default)
549 {
550 if (wasalnum)
551 {
552 if (*p >= 'A' && *p <= 'Z')
553 *p += 'a' - 'A';
554 else if (IS_HIGHBIT_SET(*p) && isupper_l(*p, loc))
555 *p = tolower_l((unsigned char) *p, loc);
556 }
557 else
558 {
559 if (*p >= 'a' && *p <= 'z')
560 *p -= 'a' - 'A';
561 else if (IS_HIGHBIT_SET(*p) && islower_l(*p, loc))
562 *p = toupper_l((unsigned char) *p, loc);
563 }
564 }
565 else
566 {
567 if (wasalnum)
568 *p = tolower_l((unsigned char) *p, loc);
569 else
570 *p = toupper_l((unsigned char) *p, loc);
571 }
572 wasalnum = isalnum_l((unsigned char) *p, loc);
573 }
574 }
575
576 return srclen;
577}
578
579static size_t
580strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
582{
583 locale_t loc = locale->lt;
584 int wasalnum = false;
585 size_t result_size;
586 wchar_t *workspace;
587 char *result;
588 size_t curr_char;
589 size_t max_size;
590
591 if (srclen < 0)
592 srclen = strlen(src);
593
594 /* Overflow paranoia */
595 if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
597 (errcode(ERRCODE_OUT_OF_MEMORY),
598 errmsg("out of memory")));
599
600 /* Output workspace cannot have more codes than input bytes */
601 workspace = palloc_array(wchar_t, srclen + 1);
602
603 char2wchar(workspace, srclen + 1, src, srclen, loc);
604
605 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
606 {
607 if (wasalnum)
608 workspace[curr_char] = towlower_l(workspace[curr_char], loc);
609 else
610 workspace[curr_char] = towupper_l(workspace[curr_char], loc);
611 wasalnum = iswalnum_l(workspace[curr_char], loc);
612 }
613
614 /*
615 * Make result large enough; case change might change number of bytes
616 */
617 max_size = curr_char * pg_database_encoding_max_length();
618 result = palloc(max_size + 1);
619
620 result_size = wchar2char(result, workspace, max_size + 1, loc);
621
622 if (result_size + 1 > destsize)
623 return result_size;
624
625 memcpy(dest, result, result_size);
626 dest[result_size] = '\0';
627
628 pfree(workspace);
629 pfree(result);
630
631 return result_size;
632}
633
634static size_t
635strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
637{
638 if (srclen < 0)
639 srclen = strlen(src);
640
641 if (srclen + 1 <= destsize)
642 {
643 locale_t loc = locale->lt;
644 char *p;
645
646 memcpy(dest, src, srclen);
647 dest[srclen] = '\0';
648
649 /*
650 * Note: we assume that toupper_l() will not be so broken as to need
651 * an islower_l() guard test. When using the default collation, we
652 * apply the traditional Postgres behavior that forces ASCII-style
653 * treatment of I/i, but in non-default collations you get exactly
654 * what the collation says.
655 */
656 for (p = dest; *p; p++)
657 {
658 if (locale->is_default)
659 {
660 if (*p >= 'a' && *p <= 'z')
661 *p -= 'a' - 'A';
662 else if (IS_HIGHBIT_SET(*p) && islower_l(*p, loc))
663 *p = toupper_l((unsigned char) *p, loc);
664 }
665 else
666 *p = toupper_l((unsigned char) *p, loc);
667 }
668 }
669
670 return srclen;
671}
672
673static size_t
674strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
676{
677 locale_t loc = locale->lt;
678 size_t result_size;
679 wchar_t *workspace;
680 char *result;
681 size_t curr_char;
682 size_t max_size;
683
684 if (srclen < 0)
685 srclen = strlen(src);
686
687 /* Overflow paranoia */
688 if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
690 (errcode(ERRCODE_OUT_OF_MEMORY),
691 errmsg("out of memory")));
692
693 /* Output workspace cannot have more codes than input bytes */
694 workspace = palloc_array(wchar_t, srclen + 1);
695
696 char2wchar(workspace, srclen + 1, src, srclen, loc);
697
698 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
699 workspace[curr_char] = towupper_l(workspace[curr_char], loc);
700
701 /*
702 * Make result large enough; case change might change number of bytes
703 */
704 max_size = curr_char * pg_database_encoding_max_length();
705 result = palloc(max_size + 1);
706
707 result_size = wchar2char(result, workspace, max_size + 1, loc);
708
709 if (result_size + 1 > destsize)
710 return result_size;
711
712 memcpy(dest, result, result_size);
713 dest[result_size] = '\0';
714
715 pfree(workspace);
716 pfree(result);
717
718 return result_size;
719}
720
723{
724 const char *collate;
725 const char *ctype;
726 locale_t loc;
727 pg_locale_t result;
728
729 if (collid == DEFAULT_COLLATION_OID)
730 {
731 HeapTuple tp;
732 Datum datum;
733
735 if (!HeapTupleIsValid(tp))
736 elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
737 datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
738 Anum_pg_database_datcollate);
739 collate = TextDatumGetCString(datum);
740 datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
741 Anum_pg_database_datctype);
742 ctype = TextDatumGetCString(datum);
743
744 ReleaseSysCache(tp);
745 }
746 else
747 {
748 HeapTuple tp;
749 Datum datum;
750
752 if (!HeapTupleIsValid(tp))
753 elog(ERROR, "cache lookup failed for collation %u", collid);
754
755 datum = SysCacheGetAttrNotNull(COLLOID, tp,
756 Anum_pg_collation_collcollate);
757 collate = TextDatumGetCString(datum);
758 datum = SysCacheGetAttrNotNull(COLLOID, tp,
759 Anum_pg_collation_collctype);
760 ctype = TextDatumGetCString(datum);
761
762 ReleaseSysCache(tp);
763 }
764
765
766 loc = make_libc_collator(collate, ctype);
767
768 result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
769 result->deterministic = true;
770 result->collate_is_c = (strcmp(collate, "C") == 0) ||
771 (strcmp(collate, "POSIX") == 0);
772 result->ctype_is_c = (strcmp(ctype, "C") == 0) ||
773 (strcmp(ctype, "POSIX") == 0);
774 result->lt = loc;
775 if (!result->collate_is_c)
776 {
777#ifdef WIN32
779 result->collate = &collate_methods_libc_win32_utf8;
780 else
781#endif
782 result->collate = &collate_methods_libc;
783 }
784 if (!result->ctype_is_c)
785 {
790 else
791 result->ctype = &ctype_methods_libc_sb;
792 }
793
794 return result;
795}
796
797/*
798 * Create a locale_t with the given collation and ctype.
799 *
800 * The "C" and "POSIX" locales are not actually handled by libc, so return
801 * NULL.
802 *
803 * Ensure that no path leaks a locale_t.
804 */
805static locale_t
806make_libc_collator(const char *collate, const char *ctype)
807{
808 locale_t loc = 0;
809
810 if (strcmp(collate, ctype) == 0)
811 {
812 if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
813 {
814 /* Normal case where they're the same */
815 errno = 0;
816#ifndef WIN32
817 loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate,
818 NULL);
819#else
820 loc = _create_locale(LC_ALL, collate);
821#endif
822 if (!loc)
824 }
825 }
826 else
827 {
828#ifndef WIN32
829 /* We need two newlocale() steps */
830 locale_t loc1 = 0;
831
832 if (strcmp(collate, "C") != 0 && strcmp(collate, "POSIX") != 0)
833 {
834 errno = 0;
835 loc1 = newlocale(LC_COLLATE_MASK, collate, NULL);
836 if (!loc1)
838 }
839
840 if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
841 {
842 errno = 0;
843 loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
844 if (!loc)
845 {
846 if (loc1)
847 freelocale(loc1);
849 }
850 }
851 else
852 loc = loc1;
853#else
854
855 /*
856 * XXX The _create_locale() API doesn't appear to support this. Could
857 * perhaps be worked around by changing pg_locale_t to contain two
858 * separate fields.
859 */
861 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
862 errmsg("collations with different collate and ctype values are not supported on this platform")));
863#endif
864 }
865
866 return loc;
867}
868
869/*
870 * strncoll_libc
871 *
872 * NUL-terminate arguments, if necessary, and pass to strcoll_l().
873 *
874 * An input string length of -1 means that it's already NUL-terminated.
875 */
876int
877strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
879{
880 char sbuf[TEXTBUFLEN];
881 char *buf = sbuf;
882 size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1;
883 size_t bufsize2 = (len2 == -1) ? 0 : len2 + 1;
884 const char *arg1n;
885 const char *arg2n;
886 int result;
887
888 if (bufsize1 + bufsize2 > TEXTBUFLEN)
889 buf = palloc(bufsize1 + bufsize2);
890
891 /* nul-terminate arguments if necessary */
892 if (len1 == -1)
893 {
894 arg1n = arg1;
895 }
896 else
897 {
898 char *buf1 = buf;
899
900 memcpy(buf1, arg1, len1);
901 buf1[len1] = '\0';
902 arg1n = buf1;
903 }
904
905 if (len2 == -1)
906 {
907 arg2n = arg2;
908 }
909 else
910 {
911 char *buf2 = buf + bufsize1;
912
913 memcpy(buf2, arg2, len2);
914 buf2[len2] = '\0';
915 arg2n = buf2;
916 }
917
918 result = strcoll_l(arg1n, arg2n, locale->lt);
919
920 if (buf != sbuf)
921 pfree(buf);
922
923 return result;
924}
925
926/*
927 * strnxfrm_libc
928 *
929 * NUL-terminate src, if necessary, and pass to strxfrm_l().
930 *
931 * A source length of -1 means that it's already NUL-terminated.
932 */
933size_t
934strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen,
936{
937 char sbuf[TEXTBUFLEN];
938 char *buf = sbuf;
939 size_t bufsize = srclen + 1;
940 size_t result;
941
942 if (srclen == -1)
943 return strxfrm_l(dest, src, destsize, locale->lt);
944
945 if (bufsize > TEXTBUFLEN)
946 buf = palloc(bufsize);
947
948 /* nul-terminate argument */
949 memcpy(buf, src, srclen);
950 buf[srclen] = '\0';
951
952 result = strxfrm_l(dest, buf, destsize, locale->lt);
953
954 if (buf != sbuf)
955 pfree(buf);
956
957 /* if dest is defined, it should be nul-terminated */
958 Assert(result >= destsize || dest[result] == '\0');
959
960 return result;
961}
962
/*
 * get_collation_actual_version_libc
 *
 * Return a palloc'd string describing the version of the collation named
 * by collcollate, or NULL when no version is available.
 *
 * "C", "POSIX" and names beginning with "C." (all matched
 * case-insensitively) never get a version.  For anything else the source of
 * the version depends on the platform; on platforms matching none of the
 * cases below the result is NULL.
 */
char *
get_collation_actual_version_libc(const char *collcollate)
{
	char	   *version_str = NULL;
	bool		unversioned;

	unversioned = (pg_strcasecmp("C", collcollate) == 0 ||
				   pg_strncasecmp("C.", collcollate, 2) == 0 ||
				   pg_strcasecmp("POSIX", collcollate) == 0);

	if (!unversioned)
	{
#if defined(__GLIBC__)
		/* Use the glibc version because we don't have anything better. */
		version_str = pstrdup(gnu_get_libc_version());
#elif defined(LC_VERSION_MASK)
		locale_t	coll_loc;

		/* Look up FreeBSD collation version. */
		coll_loc = newlocale(LC_COLLATE_MASK, collcollate, NULL);
		if (!coll_loc)
			ereport(ERROR,
					(errmsg("could not load locale \"%s\"", collcollate)));
		else
		{
			version_str =
				pstrdup(querylocale(LC_COLLATE_MASK | LC_VERSION_MASK, coll_loc));
			freelocale(coll_loc);
		}
#elif defined(WIN32)
		/*
		 * If we are targeting Windows Vista and above, we can ask for the
		 * NLS version given a collation name (earlier versions required a
		 * location code that we don't have).
		 */
		NLSVERSIONINFOEX version = {sizeof(NLSVERSIONINFOEX)};
		WCHAR		wide_collcollate[LOCALE_NAME_MAX_LENGTH];

		MultiByteToWideChar(CP_ACP, 0, collcollate, -1, wide_collcollate,
							LOCALE_NAME_MAX_LENGTH);
		if (!GetNLSVersionEx(COMPARE_STRING, wide_collcollate, &version))
		{
			/*
			 * GetNLSVersionEx() wants a language tag such as "en-US", not a
			 * locale name like "English_United States.1252".  Until those
			 * values can be prevented from entering the system, or 100%
			 * reliably converted to the more useful tag format, tolerate the
			 * resulting error and report that we have no version data.
			 */
			if (GetLastError() == ERROR_INVALID_PARAMETER)
				return NULL;

			ereport(ERROR,
					(errmsg("could not get collation version for locale \"%s\": error code %lu",
							collcollate,
							GetLastError())));
		}

		/* Report NLS and defined versions, each as "major.minor" */
		version_str = psprintf("%lu.%lu,%lu.%lu",
							   (version.dwNLSVersion >> 8) & 0xFFFF,
							   version.dwNLSVersion & 0xFF,
							   (version.dwDefinedVersion >> 8) & 0xFFFF,
							   version.dwDefinedVersion & 0xFF);
#endif
	}

	return version_str;
}
1027
1028/*
1029 * strncoll_libc_win32_utf8
1030 *
1031 * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
1032 * invoke wcscoll_l().
1033 *
1034 * An input string length of -1 means that it's NUL-terminated.
1035 */
1036#ifdef WIN32
1037static int
1038strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2,
1039 ssize_t len2, pg_locale_t locale)
1040{
1041 char sbuf[TEXTBUFLEN];
1042 char *buf = sbuf;
1043 char *a1p,
1044 *a2p;
1045 int a1len;
1046 int a2len;
1047 int r;
1048 int result;
1049
1051
1052 if (len1 == -1)
1053 len1 = strlen(arg1);
1054 if (len2 == -1)
1055 len2 = strlen(arg2);
1056
1057 a1len = len1 * 2 + 2;
1058 a2len = len2 * 2 + 2;
1059
1060 if (a1len + a2len > TEXTBUFLEN)
1061 buf = palloc(a1len + a2len);
1062
1063 a1p = buf;
1064 a2p = buf + a1len;
1065
1066 /* API does not work for zero-length input */
1067 if (len1 == 0)
1068 r = 0;
1069 else
1070 {
1071 r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1072 (LPWSTR) a1p, a1len / 2);
1073 if (!r)
1074 ereport(ERROR,
1075 (errmsg("could not convert string to UTF-16: error code %lu",
1076 GetLastError())));
1077 }
1078 ((LPWSTR) a1p)[r] = 0;
1079
1080 if (len2 == 0)
1081 r = 0;
1082 else
1083 {
1084 r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1085 (LPWSTR) a2p, a2len / 2);
1086 if (!r)
1087 ereport(ERROR,
1088 (errmsg("could not convert string to UTF-16: error code %lu",
1089 GetLastError())));
1090 }
1091 ((LPWSTR) a2p)[r] = 0;
1092
1093 errno = 0;
1094 result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->lt);
1095 if (result == 2147483647) /* _NLSCMPERROR; missing from mingw headers */
1096 ereport(ERROR,
1097 (errmsg("could not compare Unicode strings: %m")));
1098
1099 if (buf != sbuf)
1100 pfree(buf);
1101
1102 return result;
1103}
1104#endif /* WIN32 */
1105
1106/* simple subroutine for reporting errors from newlocale() */
1107void
1108report_newlocale_failure(const char *localename)
1109{
1110 int save_errno;
1111
1112 /*
1113 * Windows doesn't provide any useful error indication from
1114 * _create_locale(), and BSD-derived platforms don't seem to feel they
1115 * need to set errno either (even though POSIX is pretty clear that
1116 * newlocale should do so). So, if errno hasn't been set, assume ENOENT
1117 * is what to report.
1118 */
1119 if (errno == 0)
1120 errno = ENOENT;
1121
1122 /*
1123 * ENOENT means "no such locale", not "no such file", so clarify that
1124 * errno with an errdetail message.
1125 */
1126 save_errno = errno; /* auxiliary funcs might change errno */
1127 ereport(ERROR,
1128 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1129 errmsg("could not create locale \"%s\": %m",
1130 localename),
1131 (save_errno == ENOENT ?
1132 errdetail("The operating system could not find any locale data for the locale name \"%s\".",
1133 localename) : 0)));
1134}
1135
1136/*
1137 * POSIX doesn't define _l-variants of these functions, but several systems
1138 * have them. We provide our own replacements here.
1139 */
1140#ifndef HAVE_MBSTOWCS_L
1141static size_t
1142mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
1143{
1144#ifdef WIN32
1145 return _mbstowcs_l(dest, src, n, loc);
1146#else
1147 size_t result;
1148 locale_t save_locale = uselocale(loc);
1149
1150 result = mbstowcs(dest, src, n);
1151 uselocale(save_locale);
1152 return result;
1153#endif
1154}
1155#endif
1156#ifndef HAVE_WCSTOMBS_L
1157static size_t
1158wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
1159{
1160#ifdef WIN32
1161 return _wcstombs_l(dest, src, n, loc);
1162#else
1163 size_t result;
1164 locale_t save_locale = uselocale(loc);
1165
1166 result = wcstombs(dest, src, n);
1167 uselocale(save_locale);
1168 return result;
1169#endif
1170}
1171#endif
1172
1173/*
1174 * These functions convert from/to libc's wchar_t, *not* pg_wchar.
1175 * Therefore we keep them here rather than with the mbutils code.
1176 */
1177
1178/*
1179 * wchar2char --- convert wide characters to multibyte format
1180 *
1181 * This has the same API as the standard wcstombs_l() function; in particular,
1182 * tolen is the maximum number of bytes to store at *to, and *from must be
1183 * zero-terminated. The output will be zero-terminated iff there is room.
1184 */
1185size_t
1186wchar2char(char *to, const wchar_t *from, size_t tolen, locale_t loc)
1187{
1188 size_t result;
1189
1190 if (tolen == 0)
1191 return 0;
1192
1193#ifdef WIN32
1194
1195 /*
1196 * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
1197 * for some reason mbstowcs and wcstombs won't do this for us, so we use
1198 * WideCharToMultiByte() here (and MultiByteToWideChar() in char2wchar).
1199 */
1201 {
1202 result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
1203 NULL, NULL);
1204 /* A zero return is failure */
1205 if (result <= 0)
1206 result = -1;
1207 else
1208 {
1209 Assert(result <= tolen);
1210 /* Microsoft counts the zero terminator in the result */
1211 result--;
1212 }
1213 }
1214 else
1215#endif /* WIN32 */
1216 if (loc == (locale_t) 0)
1217 {
1218 /* Use wcstombs directly for the default locale */
1219 result = wcstombs(to, from, tolen);
1220 }
1221 else
1222 {
1223 /* Use wcstombs_l for nondefault locales */
1224 result = wcstombs_l(to, from, tolen, loc);
1225 }
1226
1227 return result;
1228}
1229
1230/*
1231 * char2wchar --- convert multibyte characters to wide characters
1232 *
1233 * This has almost the API of mbstowcs_l(), except that *from need not be
1234 * null-terminated; instead, the number of input bytes is specified as
1235 * fromlen. Also, we ereport() rather than returning -1 for invalid
1236 * input encoding. tolen is the maximum number of wchar_t's to store at *to.
1237 * The output will be zero-terminated iff there is room.
1238 */
1239static size_t
1240char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
1241 locale_t loc)
1242{
1243 size_t result;
1244
1245 if (tolen == 0)
1246 return 0;
1247
1248#ifdef WIN32
1249 /* See WIN32 "Unicode" comment above */
1251 {
1252 /* Win32 API does not work for zero-length input */
1253 if (fromlen == 0)
1254 result = 0;
1255 else
1256 {
1257 result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
1258 /* A zero return is failure */
1259 if (result == 0)
1260 result = -1;
1261 }
1262
1263 if (result != -1)
1264 {
1265 Assert(result < tolen);
1266 /* Append trailing null wchar (MultiByteToWideChar() does not) */
1267 to[result] = 0;
1268 }
1269 }
1270 else
1271#endif /* WIN32 */
1272 {
1273 /* mbstowcs requires ending '\0' */
1274 char *str = pnstrdup(from, fromlen);
1275
1276 if (loc == (locale_t) 0)
1277 {
1278 /* Use mbstowcs directly for the default locale */
1279 result = mbstowcs(to, str, tolen);
1280 }
1281 else
1282 {
1283 /* Use mbstowcs_l for nondefault locales */
1284 result = mbstowcs_l(to, str, tolen, loc);
1285 }
1286
1287 pfree(str);
1288 }
1289
1290 if (result == -1)
1291 {
1292 /*
1293 * Invalid multibyte character encountered. We try to give a useful
1294 * error message by letting pg_verifymbstr check the string. But it's
1295 * possible that the string is OK to us, and not OK to mbstowcs ---
1296 * this suggests that the LC_CTYPE locale is different from the
1297 * database encoding. Give a generic error message if pg_verifymbstr
1298 * can't find anything wrong.
1299 */
1300 pg_verifymbstr(from, fromlen, false); /* might not return */
1301 /* but if it does ... */
1302 ereport(ERROR,
1303 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1304 errmsg("invalid multibyte character for locale"),
1305 errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
1306 }
1307
1308 return result;
1309}
#define TextDatumGetCString(d)
Definition: builtins.h:98
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1153
Oid collid
int errdetail(const char *fmt,...)
Definition: elog.c:1216
int errhint(const char *fmt,...)
Definition: elog.c:1330
int errcode(int sqlerrcode)
Definition: elog.c:863
int errmsg(const char *fmt,...)
Definition: elog.c:1080
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
#define palloc_array(type, count)
Definition: fe_memutils.h:76
Oid MyDatabaseId
Definition: globals.c:94
Assert(PointerIsAligned(start, uint64))
const char * str
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
#define bufsize
Definition: indent_globs.h:36
static char * locale
Definition: initdb.c:140
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81
unsigned int pg_wchar
Definition: mbprint.c:31
int GetDatabaseEncoding(void)
Definition: mbutils.c:1264
bool pg_verifymbstr(const char *mbstr, int len, bool noError)
Definition: mbutils.c:1559
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1549
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1263
char * pstrdup(const char *in)
Definition: mcxt.c:1759
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc(Size size)
Definition: mcxt.c:1365
char * pnstrdup(const char *in, Size len)
Definition: mcxt.c:1770
static bool wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_ispunct_libc_sb(pg_wchar wc, pg_locale_t locale)
static const struct ctype_methods ctype_methods_libc_other_mb
static const struct ctype_methods ctype_methods_libc_utf8
static pg_wchar toupper_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_iscased_libc_sb(pg_wchar wc, pg_locale_t locale)
static size_t strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context)
size_t wchar2char(char *to, const wchar_t *from, size_t tolen, locale_t loc)
static bool wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_islower_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale)
static pg_wchar toupper_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, locale_t loc)
static bool wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isalnum_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isalpha_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isprint_libc_sb(pg_wchar wc, pg_locale_t locale)
char * get_collation_actual_version_libc(const char *collcollate)
static bool wc_isupper_libc_sb(pg_wchar wc, pg_locale_t locale)
static locale_t make_libc_collator(const char *collate, const char *ctype)
static bool wc_isgraph_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_iscased_libc_mb(pg_wchar wc, pg_locale_t locale)
static pg_wchar tolower_libc_sb(pg_wchar wc, pg_locale_t locale)
static size_t wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
static const struct collate_methods collate_methods_libc
static bool wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static int strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
static size_t strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale)
static const struct ctype_methods ctype_methods_libc_sb
static size_t strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_isxdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
void report_newlocale_failure(const char *localename)
static pg_wchar tolower_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool char_is_cased_libc(char ch, pg_locale_t locale)
static bool wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale)
#define TEXTBUFLEN
static size_t strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_isxdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
static size_t strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static char buf[DEFAULT_XLOG_SEG_SIZE]
Definition: pg_test_fsync.c:71
@ PG_UTF8
Definition: pg_wchar.h:232
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:32
static unsigned char pg_ascii_tolower(unsigned char ch)
Definition: port.h:188
static unsigned char pg_ascii_toupper(unsigned char ch)
Definition: port.h:177
int pg_strncasecmp(const char *s1, const char *s2, size_t n)
Definition: pgstrcasecmp.c:65
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:262
uint64_t Datum
Definition: postgres.h:70
unsigned int Oid
Definition: postgres_ext.h:32
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43
int(* strncoll)(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
Definition: pg_locale.h:75
size_t(* strlower)(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
Definition: pg_locale.h:101
const struct ctype_methods * ctype
Definition: pg_locale.h:155
const struct collate_methods * collate
Definition: pg_locale.h:154
void ReleaseSysCache(HeapTuple tuple)
Definition: syscache.c:264
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Definition: syscache.c:220
Datum SysCacheGetAttrNotNull(int cacheId, HeapTuple tup, AttrNumber attributeNumber)
Definition: syscache.c:625
#define locale_t
Definition: win32_port.h:429
#define toupper_l
Definition: win32_port.h:431
#define iswalnum_l
Definition: win32_port.h:439
#define isgraph_l
Definition: win32_port.h:444
#define towupper_l
Definition: win32_port.h:433
#define ispunct_l
Definition: win32_port.h:448
#define isalpha_l
Definition: win32_port.h:436
#define strcoll_l
Definition: win32_port.h:452
#define iswgraph_l
Definition: win32_port.h:445
#define strxfrm_l
Definition: win32_port.h:453
#define towlower_l
Definition: win32_port.h:432
#define iswspace_l
Definition: win32_port.h:451
#define isdigit_l
Definition: win32_port.h:434
#define wcscoll_l
Definition: win32_port.h:454
#define tolower_l
Definition: win32_port.h:430
#define iswupper_l
Definition: win32_port.h:441
#define iswalpha_l
Definition: win32_port.h:437
#define isprint_l
Definition: win32_port.h:446
#define iswprint_l
Definition: win32_port.h:447
#define isupper_l
Definition: win32_port.h:440
#define isalnum_l
Definition: win32_port.h:438
#define islower_l
Definition: win32_port.h:442
#define iswlower_l
Definition: win32_port.h:443
#define iswpunct_l
Definition: win32_port.h:449
#define isspace_l
Definition: win32_port.h:450
#define iswdigit_l
Definition: win32_port.h:435