PostgreSQL Source Code git master
Loading...
Searching...
No Matches
pg_locale_icu.c
Go to the documentation of this file.
1/*-----------------------------------------------------------------------
2 *
3 * PostgreSQL locale utilities for ICU
4 *
5 * Portions Copyright (c) 2002-2026, PostgreSQL Global Development Group
6 *
7 * src/backend/utils/adt/pg_locale_icu.c
8 *
9 *-----------------------------------------------------------------------
10 */
11
12#include "postgres.h"
13
14#ifdef USE_ICU
15#include <unicode/ucasemap.h>
16#include <unicode/ucnv.h>
17#include <unicode/ucol.h>
18#include <unicode/ustring.h>
19
20/*
21 * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
22 * (see
23 * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>)
24 */
25#if U_ICU_VERSION_MAJOR_NUM >= 53
26#define HAVE_UCOL_STRCOLLUTF8 1
27#else
28#undef HAVE_UCOL_STRCOLLUTF8
29#endif
30
31#endif
32
33#include "access/htup_details.h"
34#include "catalog/pg_database.h"
36#include "mb/pg_wchar.h"
37#include "miscadmin.h"
38#include "utils/builtins.h"
39#include "utils/formatting.h"
40#include "utils/memutils.h"
41#include "utils/pg_locale.h"
42#include "utils/syscache.h"
43
44/*
45 * Size of stack buffer to use for string transformations, used to avoid heap
46 * allocations in typical cases. This should be large enough that most strings
47 * will fit, but small enough that we feel comfortable putting it on the
48 * stack.
49 */
50#define TEXTBUFLEN 1024
51
53
54#ifdef USE_ICU
55
56extern UCollator *pg_ucol_open(const char *loc_str);
57static UCaseMap *pg_ucasemap_open(const char *loc_str);
58
59static size_t strlower_icu(char *dest, size_t destsize, const char *src,
60 ssize_t srclen, pg_locale_t locale);
61static size_t strtitle_icu(char *dest, size_t destsize, const char *src,
62 ssize_t srclen, pg_locale_t locale);
63static size_t strupper_icu(char *dest, size_t destsize, const char *src,
64 ssize_t srclen, pg_locale_t locale);
65static size_t strfold_icu(char *dest, size_t destsize, const char *src,
66 ssize_t srclen, pg_locale_t locale);
67static size_t strlower_icu_utf8(char *dest, size_t destsize, const char *src,
68 ssize_t srclen, pg_locale_t locale);
69static size_t strtitle_icu_utf8(char *dest, size_t destsize, const char *src,
70 ssize_t srclen, pg_locale_t locale);
71static size_t strupper_icu_utf8(char *dest, size_t destsize, const char *src,
72 ssize_t srclen, pg_locale_t locale);
73static size_t strfold_icu_utf8(char *dest, size_t destsize, const char *src,
74 ssize_t srclen, pg_locale_t locale);
75static size_t downcase_ident_icu(char *dst, size_t dstsize, const char *src,
76 ssize_t srclen, pg_locale_t locale);
77static int strncoll_icu(const char *arg1, ssize_t len1,
78 const char *arg2, ssize_t len2,
79 pg_locale_t locale);
80static size_t strnxfrm_icu(char *dest, size_t destsize,
81 const char *src, ssize_t srclen,
82 pg_locale_t locale);
83extern char *get_collation_actual_version_icu(const char *collcollate);
84
86 const UChar *src, int32_t srcLength,
87 const char *locale,
89
90/*
91 * Converter object for converting between ICU's UChar strings and C strings
92 * in database encoding. Since the database encoding doesn't change, we only
93 * need one of these per session.
94 */
96
97static UCollator *make_icu_collator(const char *iculocstr,
98 const char *icurules);
99static int strncoll_icu(const char *arg1, ssize_t len1,
100 const char *arg2, ssize_t len2,
101 pg_locale_t locale);
102static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
103 const char *src, ssize_t srclen,
104 pg_locale_t locale);
105#ifdef HAVE_UCOL_STRCOLLUTF8
106static int strncoll_icu_utf8(const char *arg1, ssize_t len1,
107 const char *arg2, ssize_t len2,
108 pg_locale_t locale);
109#endif
110static size_t strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
111 const char *src, ssize_t srclen,
112 pg_locale_t locale);
113static void init_icu_converter(void);
114static size_t uchar_length(UConverter *converter,
115 const char *str, int32_t len);
117 UChar *dest, int32_t destlen,
118 const char *src, int32_t srclen);
119static int32_t icu_to_uchar(UChar **buff_uchar, const char *buff,
120 size_t nbytes);
121static size_t icu_from_uchar(char *dest, size_t destsize,
123static void icu_set_collation_attributes(UCollator *collator, const char *loc,
124 UErrorCode *status);
125static int32_t icu_convert_case(ICU_Convert_Func func, char *dest,
126 size_t destsize, const char *src,
127 ssize_t srclen, pg_locale_t locale);
129 const UChar *src, int32_t srcLength,
130 const char *locale,
133 const UChar *src, int32_t srcLength,
134 const char *locale,
136static int32_t foldcase_options(const char *locale);
137
138/*
139 * XXX: many of the functions below rely on casts directly from pg_wchar to
140 * UChar32, which is correct for UTF-8 and LATIN1, but not in general.
141 */
142
143static pg_wchar
145{
146 return u_toupper(wc);
147}
148
149static pg_wchar
151{
152 return u_tolower(wc);
153}
154
155static const struct collate_methods collate_methods_icu = {
157 .strnxfrm = strnxfrm_icu,
158 .strnxfrm_prefix = strnxfrm_prefix_icu,
159 .strxfrm_is_safe = true,
160};
161
162static const struct collate_methods collate_methods_icu_utf8 = {
163#ifdef HAVE_UCOL_STRCOLLUTF8
165#else
166 .strncoll = strncoll_icu,
167#endif
168 .strnxfrm = strnxfrm_icu,
169 .strnxfrm_prefix = strnxfrm_prefix_icu_utf8,
170 .strxfrm_is_safe = true,
171};
172
173static bool
175{
176 return u_isdigit(wc);
177}
178
179static bool
181{
182 return u_isalpha(wc);
183}
184
185static bool
187{
188 return u_isalnum(wc);
189}
190
191static bool
193{
194 return u_isupper(wc);
195}
196
197static bool
199{
200 return u_islower(wc);
201}
202
203static bool
205{
206 return u_isgraph(wc);
207}
208
209static bool
211{
212 return u_isprint(wc);
213}
214
215static bool
217{
218 return u_ispunct(wc);
219}
220
221static bool
223{
224 return u_isspace(wc);
225}
226
227static bool
229{
230 return u_isxdigit(wc);
231}
232
233static bool
235{
237}
238
239static const struct ctype_methods ctype_methods_icu = {
241 .strtitle = strtitle_icu,
242 .strupper = strupper_icu,
243 .strfold = strfold_icu,
244 .downcase_ident = downcase_ident_icu,
245 .wc_isdigit = wc_isdigit_icu,
246 .wc_isalpha = wc_isalpha_icu,
247 .wc_isalnum = wc_isalnum_icu,
248 .wc_isupper = wc_isupper_icu,
249 .wc_islower = wc_islower_icu,
250 .wc_isgraph = wc_isgraph_icu,
251 .wc_isprint = wc_isprint_icu,
252 .wc_ispunct = wc_ispunct_icu,
253 .wc_isspace = wc_isspace_icu,
254 .wc_isxdigit = wc_isxdigit_icu,
255 .wc_iscased = wc_iscased_icu,
256 .wc_toupper = toupper_icu,
257 .wc_tolower = tolower_icu,
258};
259
260static const struct ctype_methods ctype_methods_icu_utf8 = {
262 .strtitle = strtitle_icu_utf8,
263 .strupper = strupper_icu_utf8,
264 .strfold = strfold_icu_utf8,
265 /* uses plain ASCII semantics for historical reasons */
266 .downcase_ident = NULL,
267 .wc_isdigit = wc_isdigit_icu,
268 .wc_isalpha = wc_isalpha_icu,
269 .wc_isalnum = wc_isalnum_icu,
270 .wc_isupper = wc_isupper_icu,
271 .wc_islower = wc_islower_icu,
272 .wc_isgraph = wc_isgraph_icu,
273 .wc_isprint = wc_isprint_icu,
274 .wc_ispunct = wc_ispunct_icu,
275 .wc_isspace = wc_isspace_icu,
276 .wc_isxdigit = wc_isxdigit_icu,
277 .wc_iscased = wc_iscased_icu,
278 .wc_toupper = toupper_icu,
279 .wc_tolower = tolower_icu,
280};
281
282/*
283 * ICU still depends on libc for compatibility with certain historical
284 * behavior for single-byte encodings. See downcase_ident_icu().
285 *
286 * XXX: consider fixing by decoding the single byte into a code point, and
287 * using u_tolower().
288 */
289static locale_t
290make_libc_ctype_locale(const char *ctype)
291{
292 locale_t loc;
293
294#ifndef WIN32
295 loc = newlocale(LC_CTYPE_MASK, ctype, NULL);
296#else
297 loc = _create_locale(LC_ALL, ctype);
298#endif
299 if (!loc)
301
302 return loc;
303}
304#endif
305
308{
309#ifdef USE_ICU
310 bool deterministic;
311 const char *iculocstr;
312 const char *icurules = NULL;
314 locale_t loc = (locale_t) 0;
315 pg_locale_t result;
316
318 {
319 HeapTuple tp;
320 Datum datum;
321 bool isnull;
322
324 if (!HeapTupleIsValid(tp))
325 elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
326
327 /* default database collation is always deterministic */
328 deterministic = true;
332 datum = SysCacheGetAttr(DATABASEOID, tp,
334 if (!isnull)
336
337 /* libc only needed for default locale and single-byte encoding */
339 {
340 const char *ctype;
341
344 ctype = TextDatumGetCString(datum);
345
346 loc = make_libc_ctype_locale(ctype);
347 }
348
349 ReleaseSysCache(tp);
350 }
351 else
352 {
354 HeapTuple tp;
355 Datum datum;
356 bool isnull;
357
359 if (!HeapTupleIsValid(tp))
360 elog(ERROR, "cache lookup failed for collation %u", collid);
362 deterministic = collform->collisdeterministic;
366 datum = SysCacheGetAttr(COLLOID, tp,
368 if (!isnull)
370
371 ReleaseSysCache(tp);
372 }
373
375
376 result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
377 result->icu.locale = MemoryContextStrdup(context, iculocstr);
378 result->icu.ucol = collator;
379 result->icu.lt = loc;
380 result->deterministic = deterministic;
381 result->collate_is_c = false;
382 result->ctype_is_c = false;
384 {
385 result->icu.ucasemap = pg_ucasemap_open(iculocstr);
387 result->ctype = &ctype_methods_icu_utf8;
388 }
389 else
390 {
391 result->collate = &collate_methods_icu;
392 result->ctype = &ctype_methods_icu;
393 }
394
395 return result;
396#else
397 /* could get here if a collation was created by a build with ICU */
400 errmsg("ICU is not supported in this build")));
401
402 return NULL;
403#endif
404}
405
406#ifdef USE_ICU
407
408/*
409 * Check locale string and fix it if necessary. Returns a new palloc'd string.
410 *
411 * In ICU versions 54 and earlier, "und" is not a recognized spelling of the
412 * root locale. If the first component of the locale is "und", replace with
413 * "root" before opening.
414 */
415static char *
416fix_icu_locale_str(const char *loc_str)
417{
418 /*
419 * Must never open default collator, because it depends on the environment
420 * and may change at any time. Should not happen, but check here to catch
421 * bugs that might be hard to catch otherwise.
422 *
423 * NB: the default collator is not the same as the collator for the root
424 * locale. The root locale may be specified as the empty string, "und", or
425 * "root". The default collator is opened by passing NULL to ucol_open().
426 */
427 if (loc_str == NULL)
428 elog(ERROR, "opening default collator is not supported");
429
431 {
432 char lang[ULOC_LANG_CAPACITY];
433 UErrorCode status = U_ZERO_ERROR;
434
436 if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
437 {
440 errmsg("could not get language from locale \"%s\": %s",
441 loc_str, u_errorName(status))));
442 }
443
444 if (strcmp(lang, "und") == 0)
445 {
446 const char *remainder = loc_str + strlen("und");
447 char *fixed_str;
448
449 fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
450 strcpy(fixed_str, "root");
452
453 return fixed_str;
454 }
455 }
456
457 return pstrdup(loc_str);
458}
459
460/*
461 * Wrapper around ucol_open() to handle API differences for older ICU
462 * versions.
463 *
464 * Ensure that no path leaks a UCollator.
465 */
466UCollator *
467pg_ucol_open(const char *loc_str)
468{
470 UErrorCode status;
471 char *fixed_str;
472
474
475 status = U_ZERO_ERROR;
476 collator = ucol_open(fixed_str, &status);
477 if (U_FAILURE(status))
479 /* use original string for error report */
481 errmsg("could not open collator for locale \"%s\": %s",
482 loc_str, u_errorName(status))));
483
485 {
486 status = U_ZERO_ERROR;
488
489 /*
490 * Pretend the error came from ucol_open(), for consistent error
491 * message across ICU versions.
492 */
493 if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
494 {
498 errmsg("could not open collator for locale \"%s\": %s",
499 loc_str, u_errorName(status))));
500 }
501 }
502
504
505 return collator;
506}
507
508/*
509 * Wrapper around ucasemap_open() to handle API differences for older ICU
510 * versions.
511 *
512 * Additionally makes sure we get the right options for case folding.
513 */
514static UCaseMap *
515pg_ucasemap_open(const char *loc_str)
516{
517 UErrorCode status = U_ZERO_ERROR;
519 char *fixed_str;
520
522
524 if (U_FAILURE(status))
525 /* use original string for error report */
528 errmsg("could not open casemap for locale \"%s\": %s",
529 loc_str, u_errorName(status)));
530
532
533 return casemap;
534}
535
536/*
537 * Create a UCollator with the given locale string and rules.
538 *
539 * Ensure that no path leaks a UCollator.
540 */
541static UCollator *
542make_icu_collator(const char *iculocstr, const char *icurules)
543{
544 if (!icurules)
545 {
546 /* simple case without rules */
547 return pg_ucol_open(iculocstr);
548 }
549 else
550 {
553 const UChar *std_rules;
556 int32_t length;
557 int32_t total;
558 UErrorCode status;
559
560 /*
561 * If rules are specified, we extract the rules of the standard
562 * collation, add our own rules, and make a new collator with the
563 * combined rules.
564 */
566
568
570
571 total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
572
573 /* avoid leaking collator on OOM */
575 if (!all_rules)
576 {
580 errmsg("out of memory")));
581 }
582
585
587
588 status = U_ZERO_ERROR;
591 NULL, &status);
592 if (U_FAILURE(status))
593 {
596 errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
597 iculocstr, icurules, u_errorName(status))));
598 }
599
602 return collator_all_rules;
603 }
604}
605
606static size_t
607strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
608 pg_locale_t locale)
609{
610 return icu_convert_case(u_strToLower, dest, destsize, src, srclen, locale);
611}
612
613static size_t
614strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
615 pg_locale_t locale)
616{
617 return icu_convert_case(u_strToTitle_default_BI, dest, destsize, src, srclen, locale);
618}
619
620static size_t
621strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
622 pg_locale_t locale)
623{
624 return icu_convert_case(u_strToUpper, dest, destsize, src, srclen, locale);
625}
626
627static size_t
628strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
629 pg_locale_t locale)
630{
631 return icu_convert_case(u_strFoldCase_default, dest, destsize, src, srclen, locale);
632}
633
634static size_t
635strlower_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen,
636 pg_locale_t locale)
637{
638 UErrorCode status = U_ZERO_ERROR;
640
641 needed = ucasemap_utf8ToLower(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
642 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
644 errmsg("case conversion failed: %s", u_errorName(status)));
645 return needed;
646}
647
648static size_t
649strtitle_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen,
650 pg_locale_t locale)
651{
652 UErrorCode status = U_ZERO_ERROR;
654
655 needed = ucasemap_utf8ToTitle(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
656 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
658 errmsg("case conversion failed: %s", u_errorName(status)));
659 return needed;
660}
661
662static size_t
663strupper_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen,
664 pg_locale_t locale)
665{
666 UErrorCode status = U_ZERO_ERROR;
668
669 needed = ucasemap_utf8ToUpper(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
670 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
672 errmsg("case conversion failed: %s", u_errorName(status)));
673 return needed;
674}
675
676static size_t
677strfold_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen,
678 pg_locale_t locale)
679{
680 UErrorCode status = U_ZERO_ERROR;
682
683 needed = ucasemap_utf8FoldCase(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
684 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
686 errmsg("case conversion failed: %s", u_errorName(status)));
687 return needed;
688}
689
/*
 * For historical compatibility, behavior is not multibyte-aware.
 *
 * NB: for single-byte encodings, high-bit characters are lowercased with
 * libc's isupper_l()/tolower_l() (also for historical compatibility), using
 * the libc locale stored in locale->icu.lt rather than the global LC_CTYPE
 * setting.
 */
696static size_t
697downcase_ident_icu(char *dst, size_t dstsize, const char *src,
698 ssize_t srclen, pg_locale_t locale)
699{
700 int i;
701 bool libc_lower;
702 locale_t lt = locale->icu.lt;
703
705
706 for (i = 0; i < srclen && i < dstsize; i++)
707 {
708 unsigned char ch = (unsigned char) src[i];
709
710 if (ch >= 'A' && ch <= 'Z')
712 else if (libc_lower && IS_HIGHBIT_SET(ch) && isupper_l(ch, lt))
713 ch = tolower_l(ch, lt);
714 dst[i] = (char) ch;
715 }
716
717 if (i < dstsize)
718 dst[i] = '\0';
719
720 return srclen;
721}
722
/*
 * strncoll_icu_utf8
 *
 * Compare two UTF-8 strings by calling ucol_strcollUTF8() on them directly,
 * without converting to UChar first. An argument length of -1 means the
 * string is NUL-terminated.
 */
730#ifdef HAVE_UCOL_STRCOLLUTF8
731int
732strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
733 pg_locale_t locale)
734{
735 int result;
736 UErrorCode status;
737
739
740 status = U_ZERO_ERROR;
741 result = ucol_strcollUTF8(locale->icu.ucol,
742 arg1, len1,
743 arg2, len2,
744 &status);
745 if (U_FAILURE(status))
747 (errmsg("collation failed: %s", u_errorName(status))));
748
749 return result;
750}
751#endif
752
753/* 'srclen' of -1 means the strings are NUL-terminated */
754size_t
755strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
756 pg_locale_t locale)
757{
758 char sbuf[TEXTBUFLEN];
759 char *buf = sbuf;
760 UChar *uchar;
762 size_t uchar_bsize;
764
766
768
769 uchar_bsize = (ulen + 1) * sizeof(UChar);
770
773
774 uchar = (UChar *) buf;
775
777
778 result_bsize = ucol_getSortKey(locale->icu.ucol,
779 uchar, ulen,
780 (uint8_t *) dest, destsize);
781
782 /*
783 * ucol_getSortKey() counts the nul-terminator in the result length, but
784 * this function should not.
785 */
786 Assert(result_bsize > 0);
787 result_bsize--;
788
789 if (buf != sbuf)
790 pfree(buf);
791
792 /* if dest is defined, it should be nul-terminated */
793 Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
794
795 return result_bsize;
796}
797
798/* 'srclen' of -1 means the strings are NUL-terminated */
799size_t
800strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
801 const char *src, ssize_t srclen,
802 pg_locale_t locale)
803{
804 size_t result;
805 UCharIterator iter;
806 uint32_t state[2];
807 UErrorCode status;
808
810
811 uiter_setUTF8(&iter, src, srclen);
812 state[0] = state[1] = 0; /* won't need that again */
813 status = U_ZERO_ERROR;
814 result = ucol_nextSortKeyPart(locale->icu.ucol,
815 &iter,
816 state,
817 (uint8_t *) dest,
818 destsize,
819 &status);
820 if (U_FAILURE(status))
822 (errmsg("sort key generation failed: %s",
823 u_errorName(status))));
824
825 return result;
826}
827
828char *
830{
834
836
839
841 return pstrdup(buf);
842}
843
844/*
845 * Convert a string in the database encoding into a string of UChars.
846 *
847 * The source string at buff is of length nbytes
848 * (it needn't be nul-terminated)
849 *
850 * *buff_uchar receives a pointer to the palloc'd result string, and
851 * the function's result is the number of UChars generated.
852 *
853 * The result string is nul-terminated, though most callers rely on the
854 * result length instead.
855 */
856static int32_t
857icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
858{
860
862
864
865 *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
867 *buff_uchar, len_uchar + 1, buff, nbytes);
868
869 return len_uchar;
870}
871
/*
 * Convert a string of UChars into the database encoding.
 *
 * The source string at buff_uchar is of length len_uchar
 * (it needn't be nul-terminated)
 *
 * The converted string is written to dest, which has room for destsize
 * bytes, and the function's result is the number of bytes in the converted
 * string (not counting nul). If the converted string plus its terminating
 * nul does not fit in destsize, only that length is returned.
 *
 * The result string is nul-terminated.
 */
883static size_t
884icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar)
885{
886 UErrorCode status;
888
890
891 status = U_ZERO_ERROR;
893 buff_uchar, len_uchar, &status);
894 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
896 (errmsg("%s failed: %s", "ucnv_fromUChars",
897 u_errorName(status))));
898
899 if (len_result + 1 > destsize)
900 return len_result;
901
902 status = U_ZERO_ERROR;
904 buff_uchar, len_uchar, &status);
905 if (U_FAILURE(status) ||
908 (errmsg("%s failed: %s", "ucnv_fromUChars",
909 u_errorName(status))));
910
911 return len_result;
912}
913
914static int32_t
917{
918 UErrorCode status;
920
921 len_dest = len_source; /* try first with same length */
922 *buff_dest = palloc(len_dest * sizeof(**buff_dest));
923 status = U_ZERO_ERROR;
925 mylocale->icu.locale, &status);
926 if (status == U_BUFFER_OVERFLOW_ERROR)
927 {
928 /* try again with adjusted length */
930 *buff_dest = palloc(len_dest * sizeof(**buff_dest));
931 status = U_ZERO_ERROR;
933 mylocale->icu.locale, &status);
934 }
935 if (U_FAILURE(status))
937 (errmsg("case conversion failed: %s", u_errorName(status))));
938 return len_dest;
939}
940
941static int32_t
942icu_convert_case(ICU_Convert_Func func, char *dest, size_t destsize,
943 const char *src, ssize_t srclen, pg_locale_t locale)
944{
949 size_t result_len;
950
952 len_conv = convert_case_uchar(func, locale, &buff_conv,
957
958 return result_len;
959}
960
961static int32_t
963 const UChar *src, int32_t srcLength,
964 const char *locale,
966{
967 return u_strToTitle(dest, destCapacity, src, srcLength,
968 NULL, locale, pErrorCode);
969}
970
971static int32_t
973 const UChar *src, int32_t srcLength,
974 const char *locale,
976{
977 return u_strFoldCase(dest, destCapacity, src, srcLength,
979}
980
981/*
982 * Return the correct u_strFoldCase() options for the given locale.
983 *
984 * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
985 * folding does not accept a locale. Instead it just supports a single option
986 * relevant to Turkic languages 'az' and 'tr'; check for those languages.
987 */
988static int32_t
989foldcase_options(const char *locale)
990{
992 char lang[3];
993 UErrorCode status = U_ZERO_ERROR;
994
995 uloc_getLanguage(locale, lang, 3, &status);
996 if (U_SUCCESS(status))
997 {
998 /*
999 * The option name is confusing, but it causes u_strFoldCase to use
1000 * the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
1001 */
1002 if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0)
1004 }
1005
1006 return options;
1007}
1008
1009/*
1010 * strncoll_icu
1011 *
1012 * Convert the arguments from the database encoding to UChar strings, then
1013 * call ucol_strcoll(). An argument length of -1 means that the string is
1014 * NUL-terminated.
1015 *
1016 * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
1017 * caller should call that instead.
1018 */
1019static int
1020strncoll_icu(const char *arg1, ssize_t len1,
1021 const char *arg2, ssize_t len2, pg_locale_t locale)
1022{
1023 char sbuf[TEXTBUFLEN];
1024 char *buf = sbuf;
1025 int32_t ulen1;
1026 int32_t ulen2;
1027 size_t bufsize1;
1028 size_t bufsize2;
1029 UChar *uchar1,
1030 *uchar2;
1031 int result;
1032
1033 /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */
1034#ifdef HAVE_UCOL_STRCOLLUTF8
1036#endif
1037
1039
1042
1043 bufsize1 = (ulen1 + 1) * sizeof(UChar);
1044 bufsize2 = (ulen2 + 1) * sizeof(UChar);
1045
1048
1049 uchar1 = (UChar *) buf;
1050 uchar2 = (UChar *) (buf + bufsize1);
1051
1054
1055 result = ucol_strcoll(locale->icu.ucol,
1056 uchar1, ulen1,
1057 uchar2, ulen2);
1058
1059 if (buf != sbuf)
1060 pfree(buf);
1061
1062 return result;
1063}
1064
1065/* 'srclen' of -1 means the strings are NUL-terminated */
1066static size_t
1067strnxfrm_prefix_icu(char *dest, size_t destsize,
1068 const char *src, ssize_t srclen,
1069 pg_locale_t locale)
1070{
1071 char sbuf[TEXTBUFLEN];
1072 char *buf = sbuf;
1073 UCharIterator iter;
1074 uint32_t state[2];
1075 UErrorCode status;
1076 int32_t ulen = -1;
1077 UChar *uchar = NULL;
1078 size_t uchar_bsize;
1080
1081 /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */
1083
1085
1087
1088 uchar_bsize = (ulen + 1) * sizeof(UChar);
1089
1090 if (uchar_bsize > TEXTBUFLEN)
1092
1093 uchar = (UChar *) buf;
1094
1096
1097 uiter_setString(&iter, uchar, ulen);
1098 state[0] = state[1] = 0; /* won't need that again */
1099 status = U_ZERO_ERROR;
1100 result_bsize = ucol_nextSortKeyPart(locale->icu.ucol,
1101 &iter,
1102 state,
1103 (uint8_t *) dest,
1104 destsize,
1105 &status);
1106 if (U_FAILURE(status))
1107 ereport(ERROR,
1108 (errmsg("sort key generation failed: %s",
1109 u_errorName(status))));
1110
1111 if (buf != sbuf)
1112 pfree(buf);
1113
1114 return result_bsize;
1115}
1116
1117static void
1119{
1120 const char *icu_encoding_name;
1121 UErrorCode status;
1123
1124 if (icu_converter)
1125 return; /* already done */
1126
1128 if (!icu_encoding_name)
1129 ereport(ERROR,
1131 errmsg("encoding \"%s\" not supported by ICU",
1133
1134 status = U_ZERO_ERROR;
1135 conv = ucnv_open(icu_encoding_name, &status);
1136 if (U_FAILURE(status))
1137 ereport(ERROR,
1138 (errmsg("could not open ICU converter for encoding \"%s\": %s",
1139 icu_encoding_name, u_errorName(status))));
1140
1142}
1143
1144/*
1145 * Find length, in UChars, of given string if converted to UChar string.
1146 *
1147 * A length of -1 indicates that the input string is NUL-terminated.
1148 */
1149static size_t
1151{
1152 UErrorCode status = U_ZERO_ERROR;
1153 int32_t ulen;
1154
1155 ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
1156 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
1157 ereport(ERROR,
1158 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
1159 return ulen;
1160}
1161
1162/*
1163 * Convert the given source string into a UChar string, stored in dest, and
1164 * return the length (in UChars).
1165 *
1166 * A srclen of -1 indicates that the input string is NUL-terminated.
1167 */
1168static int32_t
1170 const char *src, int32_t srclen)
1171{
1172 UErrorCode status = U_ZERO_ERROR;
1173 int32_t ulen;
1174
1175 status = U_ZERO_ERROR;
1176 ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
1177 if (U_FAILURE(status))
1178 ereport(ERROR,
1179 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
1180 return ulen;
1181}
1182
1183/*
1184 * Parse collation attributes from the given locale string and apply them to
1185 * the open collator.
1186 *
1187 * First, the locale string is canonicalized to an ICU format locale ID such
1188 * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
1189 * the key-value arguments.
1190 *
1191 * Starting with ICU version 54, the attributes are processed automatically by
1192 * ucol_open(), so this is only necessary for emulating this behavior on older
1193 * versions.
1194 */
1196static void
1198 UErrorCode *status)
1199{
1200 int32_t len;
1201 char *icu_locale_id;
1202 char *lower_str;
1203 char *str;
1204 char *token;
1205
1206 /*
1207 * The input locale may be a BCP 47 language tag, e.g.
1208 * "und-u-kc-ks-level1", which expresses the same attributes in a
1209 * different form. It will be converted to the equivalent ICU format
1210 * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
1211 * uloc_canonicalize().
1212 */
1213 *status = U_ZERO_ERROR;
1214 len = uloc_canonicalize(loc, NULL, 0, status);
1215 icu_locale_id = palloc(len + 1);
1216 *status = U_ZERO_ERROR;
1217 len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
1218 if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
1219 return;
1220
1222
1224
1225 str = strchr(lower_str, '@');
1226 if (!str)
1227 return;
1228 str++;
1229
1230 while ((token = strsep(&str, ";")))
1231 {
1232 char *e = strchr(token, '=');
1233
1234 if (e)
1235 {
1236 char *name;
1237 char *value;
1240
1241 *status = U_ZERO_ERROR;
1242
1243 *e = '\0';
1244 name = token;
1245 value = e + 1;
1246
1247 /*
1248 * See attribute name and value lists in ICU i18n/coll.cpp
1249 */
1250 if (strcmp(name, "colstrength") == 0)
1252 else if (strcmp(name, "colbackwards") == 0)
1254 else if (strcmp(name, "colcaselevel") == 0)
1256 else if (strcmp(name, "colcasefirst") == 0)
1258 else if (strcmp(name, "colalternate") == 0)
1260 else if (strcmp(name, "colnormalization") == 0)
1262 else if (strcmp(name, "colnumeric") == 0)
1264 else
1265 /* ignore if unknown */
1266 continue;
1267
1268 if (strcmp(value, "primary") == 0)
1270 else if (strcmp(value, "secondary") == 0)
1272 else if (strcmp(value, "tertiary") == 0)
1274 else if (strcmp(value, "quaternary") == 0)
1276 else if (strcmp(value, "identical") == 0)
1278 else if (strcmp(value, "no") == 0)
1279 uvalue = UCOL_OFF;
1280 else if (strcmp(value, "yes") == 0)
1281 uvalue = UCOL_ON;
1282 else if (strcmp(value, "shifted") == 0)
1284 else if (strcmp(value, "non-ignorable") == 0)
1286 else if (strcmp(value, "lower") == 0)
1288 else if (strcmp(value, "upper") == 0)
1290 else
1291 {
1292 *status = U_ILLEGAL_ARGUMENT_ERROR;
1293 break;
1294 }
1295
1297 }
1298 }
1299
1301}
1302
1303#endif /* USE_ICU */
#define TextDatumGetCString(d)
Definition builtins.h:99
#define pg_attribute_unused()
Definition c.h:132
#define IS_HIGHBIT_SET(ch)
Definition c.h:1172
#define Assert(condition)
Definition c.h:885
uint32_t uint32
Definition c.h:558
size_t Size
Definition c.h:631
Oid collid
int errcode(int sqlerrcode)
Definition elog.c:874
int errmsg(const char *fmt,...)
Definition elog.c:1093
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
const char * get_encoding_name_for_icu(int encoding)
Definition encnames.c:472
#define MCXT_ALLOC_NO_OOM
Definition fe_memutils.h:29
char * asc_tolower(const char *buff, size_t nbytes)
Oid MyDatabaseId
Definition globals.c:94
const char * str
size_t remainder
#define HeapTupleIsValid(tuple)
Definition htup.h:78
static void * GETSTRUCT(const HeapTupleData *tuple)
#define token
static struct @174 value
int i
Definition isn.c:77
#define PG_UTF8
Definition mbprint.c:43
unsigned int pg_wchar
Definition mbprint.c:31
int GetDatabaseEncoding(void)
Definition mbutils.c:1389
int pg_database_encoding_max_length(void)
Definition mbutils.c:1674
char * MemoryContextStrdup(MemoryContext context, const char *string)
Definition mcxt.c:1768
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition mcxt.c:1266
char * pstrdup(const char *in)
Definition mcxt.c:1781
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
void * palloc_extended(Size size, int flags)
Definition mcxt.c:1439
END_CATALOG_STRUCT typedef FormData_pg_collation * Form_pg_collation
const void size_t len
pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context)
#define TEXTBUFLEN
void report_newlocale_failure(const char *localename)
static char buf[DEFAULT_XLOG_SEG_SIZE]
#define pg_encoding_to_char
Definition pg_wchar.h:630
char * strsep(char **stringp, const char *delim)
Definition strsep.c:49
static unsigned char pg_ascii_tolower(unsigned char ch)
Definition port.h:188
static Datum ObjectIdGetDatum(Oid X)
Definition postgres.h:262
uint64_t Datum
Definition postgres.h:70
unsigned int Oid
e
static int fb(int x)
int(* strncoll)(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
Definition pg_locale.h:66
size_t(* strlower)(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
Definition pg_locale.h:92
const struct ctype_methods * ctype
Definition pg_locale.h:146
const struct collate_methods * collate
Definition pg_locale.h:145
const char * locale
Definition pg_locale.h:152
void ReleaseSysCache(HeapTuple tuple)
Definition syscache.c:264
Datum SysCacheGetAttrNotNull(SysCacheIdentifier cacheId, HeapTuple tup, AttrNumber attributeNumber)
Definition syscache.c:625
HeapTuple SearchSysCache1(SysCacheIdentifier cacheId, Datum key1)
Definition syscache.c:220
Datum SysCacheGetAttr(SysCacheIdentifier cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull)
Definition syscache.c:595
static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full, const char *src, size_t srclen, size_t srcoff, char32_t *simple, const char32_t **special)
const char * name
#define locale_t
Definition win32_port.h:429
#define tolower_l
Definition win32_port.h:430
#define isupper_l
Definition win32_port.h:440