PostgreSQL Source Code git master
Loading...
Searching...
No Matches
pg_locale_icu.c
Go to the documentation of this file.
1/*-----------------------------------------------------------------------
2 *
3 * PostgreSQL locale utilities for ICU
4 *
5 * Portions Copyright (c) 2002-2026, PostgreSQL Global Development Group
6 *
7 * src/backend/utils/adt/pg_locale_icu.c
8 *
9 *-----------------------------------------------------------------------
10 */
11
12#include "postgres.h"
13
14#ifdef USE_ICU
15#include <unicode/ucasemap.h>
16#include <unicode/ucnv.h>
17#include <unicode/ucol.h>
18#include <unicode/ustring.h>
19
20/*
21 * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
22 * (see
23 * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>)
24 */
25#if U_ICU_VERSION_MAJOR_NUM >= 53
26#define HAVE_UCOL_STRCOLLUTF8 1
27#else
28#undef HAVE_UCOL_STRCOLLUTF8
29#endif
30
31#endif
32
33#include "access/htup_details.h"
34#include "catalog/pg_database.h"
36#include "mb/pg_wchar.h"
37#include "miscadmin.h"
38#include "utils/builtins.h"
39#include "utils/formatting.h"
40#include "utils/memutils.h"
41#include "utils/pg_locale.h"
42#include "utils/syscache.h"
43
44/*
45 * Size of stack buffer to use for string transformations, used to avoid heap
46 * allocations in typical cases. This should be large enough that most strings
47 * will fit, but small enough that we feel comfortable putting it on the
48 * stack.
49 */
50#define TEXTBUFLEN 1024
51
53
54#ifdef USE_ICU
55
56extern UCollator *pg_ucol_open(const char *loc_str);
57static UCaseMap *pg_ucasemap_open(const char *loc_str);
58
59static size_t strlower_icu(char *dest, size_t destsize, const char *src,
60 ssize_t srclen, pg_locale_t locale);
61static size_t strtitle_icu(char *dest, size_t destsize, const char *src,
62 ssize_t srclen, pg_locale_t locale);
63static size_t strupper_icu(char *dest, size_t destsize, const char *src,
64 ssize_t srclen, pg_locale_t locale);
65static size_t strfold_icu(char *dest, size_t destsize, const char *src,
66 ssize_t srclen, pg_locale_t locale);
67static size_t strlower_icu_utf8(char *dest, size_t destsize, const char *src,
68 ssize_t srclen, pg_locale_t locale);
69static size_t strtitle_icu_utf8(char *dest, size_t destsize, const char *src,
70 ssize_t srclen, pg_locale_t locale);
71static size_t strupper_icu_utf8(char *dest, size_t destsize, const char *src,
72 ssize_t srclen, pg_locale_t locale);
73static size_t strfold_icu_utf8(char *dest, size_t destsize, const char *src,
74 ssize_t srclen, pg_locale_t locale);
75static size_t downcase_ident_icu(char *dst, size_t dstsize, const char *src,
76 ssize_t srclen, pg_locale_t locale);
77static int strncoll_icu(const char *arg1, ssize_t len1,
78 const char *arg2, ssize_t len2,
79 pg_locale_t locale);
80static size_t strnxfrm_icu(char *dest, size_t destsize,
81 const char *src, ssize_t srclen,
82 pg_locale_t locale);
83extern char *get_collation_actual_version_icu(const char *collcollate);
84
86 const UChar *src, int32_t srcLength,
87 const char *locale,
89
90/*
91 * Converter object for converting between ICU's UChar strings and C strings
92 * in database encoding. Since the database encoding doesn't change, we only
93 * need one of these per session.
94 */
96
97static UCollator *make_icu_collator(const char *iculocstr,
98 const char *icurules);
99static int strncoll_icu(const char *arg1, ssize_t len1,
100 const char *arg2, ssize_t len2,
101 pg_locale_t locale);
102static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
103 const char *src, ssize_t srclen,
104 pg_locale_t locale);
105#ifdef HAVE_UCOL_STRCOLLUTF8
106static int strncoll_icu_utf8(const char *arg1, ssize_t len1,
107 const char *arg2, ssize_t len2,
108 pg_locale_t locale);
109#endif
110static size_t strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
111 const char *src, ssize_t srclen,
112 pg_locale_t locale);
113static void init_icu_converter(void);
114static size_t uchar_length(UConverter *converter,
115 const char *str, int32_t len);
117 UChar *dest, int32_t destlen,
118 const char *src, int32_t srclen);
119static int32_t icu_to_uchar(UChar **buff_uchar, const char *buff,
120 size_t nbytes);
121static size_t icu_from_uchar(char *dest, size_t destsize,
123static void icu_set_collation_attributes(UCollator *collator, const char *loc,
124 UErrorCode *status);
125static int32_t icu_convert_case(ICU_Convert_Func func, char *dest,
126 size_t destsize, const char *src,
127 ssize_t srclen, pg_locale_t locale);
129 const UChar *src, int32_t srcLength,
130 const char *locale,
133 const UChar *src, int32_t srcLength,
134 const char *locale,
136static int32_t foldcase_options(const char *locale);
137
138/*
139 * XXX: many of the functions below rely on casts directly from pg_wchar to
140 * UChar32, which is correct for UTF-8 and LATIN1, but not in general.
141 */
142
143static pg_wchar
145{
146 return u_toupper(wc);
147}
148
149static pg_wchar
151{
152 return u_tolower(wc);
153}
154
155static const struct collate_methods collate_methods_icu = {
157 .strnxfrm = strnxfrm_icu,
158 .strnxfrm_prefix = strnxfrm_prefix_icu,
159 .strxfrm_is_safe = true,
160};
161
162static const struct collate_methods collate_methods_icu_utf8 = {
163#ifdef HAVE_UCOL_STRCOLLUTF8
165#else
166 .strncoll = strncoll_icu,
167#endif
168 .strnxfrm = strnxfrm_icu,
169 .strnxfrm_prefix = strnxfrm_prefix_icu_utf8,
170 .strxfrm_is_safe = true,
171};
172
173static bool
175{
176 return u_isdigit(wc);
177}
178
179static bool
181{
182 return u_isalpha(wc);
183}
184
185static bool
187{
188 return u_isalnum(wc);
189}
190
191static bool
193{
194 return u_isupper(wc);
195}
196
197static bool
199{
200 return u_islower(wc);
201}
202
203static bool
205{
206 return u_isgraph(wc);
207}
208
209static bool
211{
212 return u_isprint(wc);
213}
214
215static bool
217{
218 return u_ispunct(wc);
219}
220
221static bool
223{
224 return u_isspace(wc);
225}
226
227static bool
229{
230 return u_isxdigit(wc);
231}
232
233static bool
235{
237}
238
239static const struct ctype_methods ctype_methods_icu = {
241 .strtitle = strtitle_icu,
242 .strupper = strupper_icu,
243 .strfold = strfold_icu,
244 .downcase_ident = downcase_ident_icu,
245 .wc_isdigit = wc_isdigit_icu,
246 .wc_isalpha = wc_isalpha_icu,
247 .wc_isalnum = wc_isalnum_icu,
248 .wc_isupper = wc_isupper_icu,
249 .wc_islower = wc_islower_icu,
250 .wc_isgraph = wc_isgraph_icu,
251 .wc_isprint = wc_isprint_icu,
252 .wc_ispunct = wc_ispunct_icu,
253 .wc_isspace = wc_isspace_icu,
254 .wc_isxdigit = wc_isxdigit_icu,
255 .wc_iscased = wc_iscased_icu,
256 .wc_toupper = toupper_icu,
257 .wc_tolower = tolower_icu,
258};
259
260static const struct ctype_methods ctype_methods_icu_utf8 = {
262 .strtitle = strtitle_icu_utf8,
263 .strupper = strupper_icu_utf8,
264 .strfold = strfold_icu_utf8,
265 /* uses plain ASCII semantics for historical reasons */
266 .downcase_ident = NULL,
267 .wc_isdigit = wc_isdigit_icu,
268 .wc_isalpha = wc_isalpha_icu,
269 .wc_isalnum = wc_isalnum_icu,
270 .wc_isupper = wc_isupper_icu,
271 .wc_islower = wc_islower_icu,
272 .wc_isgraph = wc_isgraph_icu,
273 .wc_isprint = wc_isprint_icu,
274 .wc_ispunct = wc_ispunct_icu,
275 .wc_isspace = wc_isspace_icu,
276 .wc_isxdigit = wc_isxdigit_icu,
277 .wc_iscased = wc_iscased_icu,
278 .wc_toupper = toupper_icu,
279 .wc_tolower = tolower_icu,
280};
281
282/*
283 * ICU still depends on libc for compatibility with certain historical
284 * behavior for single-byte encodings. See downcase_ident_icu().
285 *
286 * XXX: consider fixing by decoding the single byte into a code point, and
287 * using u_tolower().
288 */
289static locale_t
290make_libc_ctype_locale(const char *ctype)
291{
292 locale_t loc;
293
294#ifndef WIN32
295 loc = newlocale(LC_CTYPE_MASK, ctype, NULL);
296#else
297 loc = _create_locale(LC_ALL, ctype);
298#endif
299 if (!loc)
301
302 return loc;
303}
304#endif
305
308{
309#ifdef USE_ICU
310 bool deterministic;
311 const char *iculocstr;
312 const char *icurules = NULL;
314 locale_t loc = (locale_t) 0;
315 pg_locale_t result;
316
318 {
319 HeapTuple tp;
320 Datum datum;
321 bool isnull;
322
324 if (!HeapTupleIsValid(tp))
325 elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
326
327 /* default database collation is always deterministic */
328 deterministic = true;
332 datum = SysCacheGetAttr(DATABASEOID, tp,
334 if (!isnull)
336
337 /* libc only needed for default locale and single-byte encoding */
339 {
340 const char *ctype;
341
344 ctype = TextDatumGetCString(datum);
345
346 loc = make_libc_ctype_locale(ctype);
347 }
348
349 ReleaseSysCache(tp);
350 }
351 else
352 {
354 HeapTuple tp;
355 Datum datum;
356 bool isnull;
357
359 if (!HeapTupleIsValid(tp))
360 elog(ERROR, "cache lookup failed for collation %u", collid);
362 deterministic = collform->collisdeterministic;
366 datum = SysCacheGetAttr(COLLOID, tp,
368 if (!isnull)
370
371 ReleaseSysCache(tp);
372 }
373
375
376 result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
377 result->icu.locale = MemoryContextStrdup(context, iculocstr);
378 result->icu.ucol = collator;
379 result->icu.lt = loc;
380 result->deterministic = deterministic;
381 result->collate_is_c = false;
382 result->ctype_is_c = false;
384 {
385 result->icu.ucasemap = pg_ucasemap_open(iculocstr);
387 result->ctype = &ctype_methods_icu_utf8;
388 }
389 else
390 {
391 result->collate = &collate_methods_icu;
392 result->ctype = &ctype_methods_icu;
393 }
394
395 return result;
396#else
397 /* could get here if a collation was created by a build with ICU */
400 errmsg("ICU is not supported in this build")));
401
402 return NULL;
403#endif
404}
405
406#ifdef USE_ICU
407
408/*
409 * Check locale string and fix it if necessary. Returns a new palloc'd string.
410 *
411 * In ICU versions 54 and earlier, "und" is not a recognized spelling of the
412 * root locale. If the first component of the locale is "und", replace with
413 * "root" before opening.
414 */
415static char *
416fix_icu_locale_str(const char *loc_str)
417{
418 /*
419 * Must never open default collator, because it depends on the environment
420 * and may change at any time. Should not happen, but check here to catch
421 * bugs that might be hard to catch otherwise.
422 *
423 * NB: the default collator is not the same as the collator for the root
424 * locale. The root locale may be specified as the empty string, "und", or
425 * "root". The default collator is opened by passing NULL to ucol_open().
426 */
427 if (loc_str == NULL)
428 elog(ERROR, "opening default collator is not supported");
429
431 {
432 char lang[ULOC_LANG_CAPACITY];
433 UErrorCode status = U_ZERO_ERROR;
434
436 if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
437 {
440 errmsg("could not get language from locale \"%s\": %s",
441 loc_str, u_errorName(status))));
442 }
443
444 if (strcmp(lang, "und") == 0)
445 {
446 const char *remainder = loc_str + strlen("und");
447 char *fixed_str;
448
449 fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
450 strcpy(fixed_str, "root");
452
453 return fixed_str;
454 }
455 }
456
457 return pstrdup(loc_str);
458}
459
460/*
461 * Wrapper around ucol_open() to handle API differences for older ICU
462 * versions.
463 *
464 * Ensure that no path leaks a UCollator.
465 */
466UCollator *
467pg_ucol_open(const char *loc_str)
468{
470 UErrorCode status;
471 char *fixed_str;
472
474
475 status = U_ZERO_ERROR;
476 collator = ucol_open(fixed_str, &status);
477 if (U_FAILURE(status))
479 /* use original string for error report */
481 errmsg("could not open collator for locale \"%s\": %s",
482 loc_str, u_errorName(status))));
483
485 {
486 status = U_ZERO_ERROR;
488
489 /*
490 * Pretend the error came from ucol_open(), for consistent error
491 * message across ICU versions.
492 */
493 if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
494 {
498 errmsg("could not open collator for locale \"%s\": %s",
499 loc_str, u_errorName(status))));
500 }
501 }
502
504
505 return collator;
506}
507
508/*
509 * Wrapper around ucasemap_open() to handle API differences for older ICU
510 * versions.
511 *
512 * Additionally makes sure we get the right options for case folding.
513 */
514static UCaseMap *
515pg_ucasemap_open(const char *loc_str)
516{
517 UErrorCode status = U_ZERO_ERROR;
519 char *fixed_str;
520
522
524 if (U_FAILURE(status))
525 /* use original string for error report */
528 errmsg("could not open casemap for locale \"%s\": %s",
529 loc_str, u_errorName(status)));
530
532
533 return casemap;
534}
535
536/*
537 * Create a UCollator with the given locale string and rules.
538 *
539 * Ensure that no path leaks a UCollator.
540 */
541static UCollator *
542make_icu_collator(const char *iculocstr, const char *icurules)
543{
544 if (!icurules)
545 {
546 /* simple case without rules */
547 return pg_ucol_open(iculocstr);
548 }
549 else
550 {
553 const UChar *std_rules;
556 int32_t length;
557 int32_t total;
558 UErrorCode status;
559
560 /*
561 * If rules are specified, we extract the rules of the standard
562 * collation, add our own rules, and make a new collator with the
563 * combined rules.
564 */
566
568
570
571 total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
572
573 /* avoid leaking collator on OOM */
575 if (!all_rules)
576 {
580 errmsg("out of memory")));
581 }
582
585
587
588 status = U_ZERO_ERROR;
591 NULL, &status);
592 if (U_FAILURE(status))
593 {
596 errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
597 iculocstr, icurules, u_errorName(status))));
598 }
599
600 return collator_all_rules;
601 }
602}
603
/*
 * strlower_icu
 *
 * Lowercase 'src' into 'dest' by delegating to icu_convert_case() with ICU's
 * u_strToLower() as the UChar-level conversion function.  All arguments are
 * passed through unchanged; the value returned by icu_convert_case() is
 * returned as a size_t.
 *
 * Installed as the .strupper/.strtitle/.strfold sibling in ctype_methods_icu
 * (the non-"_utf8" method table).
 */
604static size_t
605strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
606 pg_locale_t locale)
607{
608 return icu_convert_case(u_strToLower, dest, destsize, src, srclen, locale);
609}
610
/*
 * strtitle_icu
 *
 * Titlecase 'src' into 'dest' by delegating to icu_convert_case().  The
 * conversion function is u_strToTitle_default_BI, a local wrapper that calls
 * u_strToTitle() with a NULL break iterator argument.  The value returned by
 * icu_convert_case() is returned as a size_t.
 *
 * Installed as .strtitle in ctype_methods_icu.
 */
611static size_t
612strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
613 pg_locale_t locale)
614{
615 return icu_convert_case(u_strToTitle_default_BI, dest, destsize, src, srclen, locale);
616}
617
/*
 * strupper_icu
 *
 * Uppercase 'src' into 'dest' by delegating to icu_convert_case() with ICU's
 * u_strToUpper() as the UChar-level conversion function.  The value returned
 * by icu_convert_case() is returned as a size_t.
 *
 * Installed as .strupper in ctype_methods_icu.
 */
618static size_t
619strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
620 pg_locale_t locale)
621{
622 return icu_convert_case(u_strToUpper, dest, destsize, src, srclen, locale);
623}
624
/*
 * strfold_icu
 *
 * Case-fold 'src' into 'dest' by delegating to icu_convert_case().  The
 * conversion function is u_strFoldCase_default, a local wrapper around
 * u_strFoldCase() (NOTE(review): presumably it selects the folding options
 * via foldcase_options(locale) -- confirm against the wrapper's body).  The
 * value returned by icu_convert_case() is returned as a size_t.
 *
 * Installed as .strfold in ctype_methods_icu.
 */
625static size_t
626strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
627 pg_locale_t locale)
628{
629 return icu_convert_case(u_strFoldCase_default, dest, destsize, src, srclen, locale);
630}
631
632static size_t
633strlower_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen,
634 pg_locale_t locale)
635{
636 UErrorCode status = U_ZERO_ERROR;
638
639 needed = ucasemap_utf8ToLower(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
640 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
642 errmsg("case conversion failed: %s", u_errorName(status)));
643 return needed;
644}
645
646static size_t
647strtitle_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen,
648 pg_locale_t locale)
649{
650 UErrorCode status = U_ZERO_ERROR;
652
653 needed = ucasemap_utf8ToTitle(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
654 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
656 errmsg("case conversion failed: %s", u_errorName(status)));
657 return needed;
658}
659
660static size_t
661strupper_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen,
662 pg_locale_t locale)
663{
664 UErrorCode status = U_ZERO_ERROR;
666
667 needed = ucasemap_utf8ToUpper(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
668 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
670 errmsg("case conversion failed: %s", u_errorName(status)));
671 return needed;
672}
673
674static size_t
675strfold_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen,
676 pg_locale_t locale)
677{
678 UErrorCode status = U_ZERO_ERROR;
680
681 needed = ucasemap_utf8FoldCase(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
682 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
684 errmsg("case conversion failed: %s", u_errorName(status)));
685 return needed;
686}
687
688/*
689 * For historical compatibility, behavior is not multibyte-aware.
690 *
691 * NB: uses libc tolower_l() for single-byte encodings (also for historical
692 * compatibility), with the libc locale_t stored in locale->icu.lt.
693 */
694static size_t
695downcase_ident_icu(char *dst, size_t dstsize, const char *src,
696 ssize_t srclen, pg_locale_t locale)
697{
698 int i;
699 bool libc_lower;
700 locale_t lt = locale->icu.lt;
701
703
704 for (i = 0; i < srclen && i < dstsize; i++)
705 {
706 unsigned char ch = (unsigned char) src[i];
707
708 if (ch >= 'A' && ch <= 'Z')
710 else if (libc_lower && IS_HIGHBIT_SET(ch) && isupper_l(ch, lt))
711 ch = tolower_l(ch, lt);
712 dst[i] = (char) ch;
713 }
714
715 if (i < dstsize)
716 dst[i] = '\0';
717
718 return srclen;
719}
720
721/*
722 * strncoll_icu_utf8
723 *
724 * Compare two UTF-8 strings by passing them directly to ucol_strcollUTF8(),
725 * without first converting them to UChar. An argument length of -1 means the
726 * string is NUL-terminated.
727 */
728#ifdef HAVE_UCOL_STRCOLLUTF8
729int
730strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
731 pg_locale_t locale)
732{
733 int result;
734 UErrorCode status;
735
737
738 status = U_ZERO_ERROR;
739 result = ucol_strcollUTF8(locale->icu.ucol,
740 arg1, len1,
741 arg2, len2,
742 &status);
743 if (U_FAILURE(status))
745 (errmsg("collation failed: %s", u_errorName(status))));
746
747 return result;
748}
749#endif
750
751/* 'srclen' of -1 means the string is NUL-terminated */
752size_t
753strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
754 pg_locale_t locale)
755{
756 char sbuf[TEXTBUFLEN];
757 char *buf = sbuf;
758 UChar *uchar;
760 size_t uchar_bsize;
762
764
766
767 uchar_bsize = (ulen + 1) * sizeof(UChar);
768
771
772 uchar = (UChar *) buf;
773
775
776 result_bsize = ucol_getSortKey(locale->icu.ucol,
777 uchar, ulen,
778 (uint8_t *) dest, destsize);
779
780 /*
781 * ucol_getSortKey() counts the nul-terminator in the result length, but
782 * this function should not.
783 */
784 Assert(result_bsize > 0);
785 result_bsize--;
786
787 if (buf != sbuf)
788 pfree(buf);
789
790 /* if dest is defined, it should be nul-terminated */
791 Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
792
793 return result_bsize;
794}
795
796/* 'srclen' of -1 means the string is NUL-terminated */
797size_t
798strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
799 const char *src, ssize_t srclen,
800 pg_locale_t locale)
801{
802 size_t result;
803 UCharIterator iter;
804 uint32_t state[2];
805 UErrorCode status;
806
808
809 uiter_setUTF8(&iter, src, srclen);
810 state[0] = state[1] = 0; /* won't need that again */
811 status = U_ZERO_ERROR;
812 result = ucol_nextSortKeyPart(locale->icu.ucol,
813 &iter,
814 state,
815 (uint8_t *) dest,
816 destsize,
817 &status);
818 if (U_FAILURE(status))
820 (errmsg("sort key generation failed: %s",
821 u_errorName(status))));
822
823 return result;
824}
825
826char *
828{
832
834
837
839 return pstrdup(buf);
840}
841
842/*
843 * Convert a string in the database encoding into a string of UChars.
844 *
845 * The source string at buff is of length nbytes
846 * (it needn't be nul-terminated)
847 *
848 * *buff_uchar receives a pointer to the palloc'd result string, and
849 * the function's result is the number of UChars generated.
850 *
851 * The result string is nul-terminated, though most callers rely on the
852 * result length instead.
853 */
854static int32_t
855icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
856{
858
860
862
863 *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
865 *buff_uchar, len_uchar + 1, buff, nbytes);
866
867 return len_uchar;
868}
869
870/*
871 * Convert a string of UChars into the database encoding.
872 *
873 * The source string at buff_uchar is of length len_uchar
874 * (it needn't be nul-terminated)
875 *
876 * The result is stored in dest, which has room for destsize bytes, and the
877 * function's result is the number of bytes needed (not counting nul). If
878 * that plus the nul does not fit in destsize, only the length is returned.
879 * Otherwise the result string is nul-terminated.
880 */
881static size_t
882icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar)
883{
884 UErrorCode status;
886
888
889 status = U_ZERO_ERROR;
891 buff_uchar, len_uchar, &status);
892 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
894 (errmsg("%s failed: %s", "ucnv_fromUChars",
895 u_errorName(status))));
896
897 if (len_result + 1 > destsize)
898 return len_result;
899
900 status = U_ZERO_ERROR;
902 buff_uchar, len_uchar, &status);
903 if (U_FAILURE(status) ||
906 (errmsg("%s failed: %s", "ucnv_fromUChars",
907 u_errorName(status))));
908
909 return len_result;
910}
911
912static int32_t
915{
916 UErrorCode status;
918
919 len_dest = len_source; /* try first with same length */
920 *buff_dest = palloc(len_dest * sizeof(**buff_dest));
921 status = U_ZERO_ERROR;
923 mylocale->icu.locale, &status);
924 if (status == U_BUFFER_OVERFLOW_ERROR)
925 {
926 /* try again with adjusted length */
928 *buff_dest = palloc(len_dest * sizeof(**buff_dest));
929 status = U_ZERO_ERROR;
931 mylocale->icu.locale, &status);
932 }
933 if (U_FAILURE(status))
935 (errmsg("case conversion failed: %s", u_errorName(status))));
936 return len_dest;
937}
938
939static int32_t
940icu_convert_case(ICU_Convert_Func func, char *dest, size_t destsize,
941 const char *src, ssize_t srclen, pg_locale_t locale)
942{
947 size_t result_len;
948
950 len_conv = convert_case_uchar(func, locale, &buff_conv,
955
956 return result_len;
957}
958
959static int32_t
961 const UChar *src, int32_t srcLength,
962 const char *locale,
964{
965 return u_strToTitle(dest, destCapacity, src, srcLength,
966 NULL, locale, pErrorCode);
967}
968
969static int32_t
971 const UChar *src, int32_t srcLength,
972 const char *locale,
974{
975 return u_strFoldCase(dest, destCapacity, src, srcLength,
977}
978
979/*
980 * Return the correct u_strFoldCase() options for the given locale.
981 *
982 * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
983 * folding does not accept a locale. Instead it just supports a single option
984 * relevant to Turkic languages 'az' and 'tr'; check for those languages.
985 */
986static int32_t
987foldcase_options(const char *locale)
988{
990 char lang[3];
991 UErrorCode status = U_ZERO_ERROR;
992
993 uloc_getLanguage(locale, lang, 3, &status);
994 if (U_SUCCESS(status))
995 {
996 /*
997 * The option name is confusing, but it causes u_strFoldCase to use
998 * the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
999 */
1000 if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0)
1002 }
1003
1004 return options;
1005}
1006
1007/*
1008 * strncoll_icu
1009 *
1010 * Convert the arguments from the database encoding to UChar strings, then
1011 * call ucol_strcoll(). An argument length of -1 means that the string is
1012 * NUL-terminated.
1013 *
1014 * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
1015 * caller should call that instead.
1016 */
1017static int
1018strncoll_icu(const char *arg1, ssize_t len1,
1019 const char *arg2, ssize_t len2, pg_locale_t locale)
1020{
1021 char sbuf[TEXTBUFLEN];
1022 char *buf = sbuf;
1023 int32_t ulen1;
1024 int32_t ulen2;
1025 size_t bufsize1;
1026 size_t bufsize2;
1027 UChar *uchar1,
1028 *uchar2;
1029 int result;
1030
1031 /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */
1032#ifdef HAVE_UCOL_STRCOLLUTF8
1034#endif
1035
1037
1040
1041 bufsize1 = (ulen1 + 1) * sizeof(UChar);
1042 bufsize2 = (ulen2 + 1) * sizeof(UChar);
1043
1046
1047 uchar1 = (UChar *) buf;
1048 uchar2 = (UChar *) (buf + bufsize1);
1049
1052
1053 result = ucol_strcoll(locale->icu.ucol,
1054 uchar1, ulen1,
1055 uchar2, ulen2);
1056
1057 if (buf != sbuf)
1058 pfree(buf);
1059
1060 return result;
1061}
1062
1063/* 'srclen' of -1 means the string is NUL-terminated */
1064static size_t
1065strnxfrm_prefix_icu(char *dest, size_t destsize,
1066 const char *src, ssize_t srclen,
1067 pg_locale_t locale)
1068{
1069 char sbuf[TEXTBUFLEN];
1070 char *buf = sbuf;
1071 UCharIterator iter;
1072 uint32_t state[2];
1073 UErrorCode status;
1074 int32_t ulen = -1;
1075 UChar *uchar = NULL;
1076 size_t uchar_bsize;
1078
1079 /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */
1081
1083
1085
1086 uchar_bsize = (ulen + 1) * sizeof(UChar);
1087
1088 if (uchar_bsize > TEXTBUFLEN)
1090
1091 uchar = (UChar *) buf;
1092
1094
1095 uiter_setString(&iter, uchar, ulen);
1096 state[0] = state[1] = 0; /* won't need that again */
1097 status = U_ZERO_ERROR;
1098 result_bsize = ucol_nextSortKeyPart(locale->icu.ucol,
1099 &iter,
1100 state,
1101 (uint8_t *) dest,
1102 destsize,
1103 &status);
1104 if (U_FAILURE(status))
1105 ereport(ERROR,
1106 (errmsg("sort key generation failed: %s",
1107 u_errorName(status))));
1108
1109 return result_bsize;
1110}
1111
1112static void
1114{
1115 const char *icu_encoding_name;
1116 UErrorCode status;
1118
1119 if (icu_converter)
1120 return; /* already done */
1121
1123 if (!icu_encoding_name)
1124 ereport(ERROR,
1126 errmsg("encoding \"%s\" not supported by ICU",
1128
1129 status = U_ZERO_ERROR;
1130 conv = ucnv_open(icu_encoding_name, &status);
1131 if (U_FAILURE(status))
1132 ereport(ERROR,
1133 (errmsg("could not open ICU converter for encoding \"%s\": %s",
1134 icu_encoding_name, u_errorName(status))));
1135
1137}
1138
1139/*
1140 * Find length, in UChars, of given string if converted to UChar string.
1141 *
1142 * A length of -1 indicates that the input string is NUL-terminated.
1143 */
1144static size_t
1146{
1147 UErrorCode status = U_ZERO_ERROR;
1148 int32_t ulen;
1149
1150 ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
1151 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
1152 ereport(ERROR,
1153 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
1154 return ulen;
1155}
1156
1157/*
1158 * Convert the given source string into a UChar string, stored in dest, and
1159 * return the length (in UChars).
1160 *
1161 * A srclen of -1 indicates that the input string is NUL-terminated.
1162 */
1163static int32_t
1165 const char *src, int32_t srclen)
1166{
1167 UErrorCode status = U_ZERO_ERROR;
1168 int32_t ulen;
1169
1170 status = U_ZERO_ERROR;
1171 ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
1172 if (U_FAILURE(status))
1173 ereport(ERROR,
1174 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
1175 return ulen;
1176}
1177
1178/*
1179 * Parse collation attributes from the given locale string and apply them to
1180 * the open collator.
1181 *
1182 * First, the locale string is canonicalized to an ICU format locale ID such
1183 * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
1184 * the key-value arguments.
1185 *
1186 * Starting with ICU version 54, the attributes are processed automatically by
1187 * ucol_open(), so this is only necessary for emulating this behavior on older
1188 * versions.
1189 */
1191static void
1193 UErrorCode *status)
1194{
1195 int32_t len;
1196 char *icu_locale_id;
1197 char *lower_str;
1198 char *str;
1199 char *token;
1200
1201 /*
1202 * The input locale may be a BCP 47 language tag, e.g.
1203 * "und-u-kc-ks-level1", which expresses the same attributes in a
1204 * different form. It will be converted to the equivalent ICU format
1205 * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
1206 * uloc_canonicalize().
1207 */
1208 *status = U_ZERO_ERROR;
1209 len = uloc_canonicalize(loc, NULL, 0, status);
1210 icu_locale_id = palloc(len + 1);
1211 *status = U_ZERO_ERROR;
1212 len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
1213 if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
1214 return;
1215
1217
1219
1220 str = strchr(lower_str, '@');
1221 if (!str)
1222 return;
1223 str++;
1224
1225 while ((token = strsep(&str, ";")))
1226 {
1227 char *e = strchr(token, '=');
1228
1229 if (e)
1230 {
1231 char *name;
1232 char *value;
1235
1236 *status = U_ZERO_ERROR;
1237
1238 *e = '\0';
1239 name = token;
1240 value = e + 1;
1241
1242 /*
1243 * See attribute name and value lists in ICU i18n/coll.cpp
1244 */
1245 if (strcmp(name, "colstrength") == 0)
1247 else if (strcmp(name, "colbackwards") == 0)
1249 else if (strcmp(name, "colcaselevel") == 0)
1251 else if (strcmp(name, "colcasefirst") == 0)
1253 else if (strcmp(name, "colalternate") == 0)
1255 else if (strcmp(name, "colnormalization") == 0)
1257 else if (strcmp(name, "colnumeric") == 0)
1259 else
1260 /* ignore if unknown */
1261 continue;
1262
1263 if (strcmp(value, "primary") == 0)
1265 else if (strcmp(value, "secondary") == 0)
1267 else if (strcmp(value, "tertiary") == 0)
1269 else if (strcmp(value, "quaternary") == 0)
1271 else if (strcmp(value, "identical") == 0)
1273 else if (strcmp(value, "no") == 0)
1274 uvalue = UCOL_OFF;
1275 else if (strcmp(value, "yes") == 0)
1276 uvalue = UCOL_ON;
1277 else if (strcmp(value, "shifted") == 0)
1279 else if (strcmp(value, "non-ignorable") == 0)
1281 else if (strcmp(value, "lower") == 0)
1283 else if (strcmp(value, "upper") == 0)
1285 else
1286 {
1287 *status = U_ILLEGAL_ARGUMENT_ERROR;
1288 break;
1289 }
1290
1292 }
1293 }
1294
1296}
1297
1298#endif /* USE_ICU */
#define TextDatumGetCString(d)
Definition builtins.h:98
#define pg_attribute_unused()
Definition c.h:132
#define IS_HIGHBIT_SET(ch)
Definition c.h:1144
#define Assert(condition)
Definition c.h:873
uint32_t uint32
Definition c.h:546
size_t Size
Definition c.h:619
Oid collid
int errcode(int sqlerrcode)
Definition elog.c:863
int errmsg(const char *fmt,...)
Definition elog.c:1080
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
const char * get_encoding_name_for_icu(int encoding)
Definition encnames.c:472
#define MCXT_ALLOC_NO_OOM
Definition fe_memutils.h:29
char * asc_tolower(const char *buff, size_t nbytes)
Oid MyDatabaseId
Definition globals.c:94
const char * str
size_t remainder
#define HeapTupleIsValid(tuple)
Definition htup.h:78
static void * GETSTRUCT(const HeapTupleData *tuple)
#define token
static struct @172 value
int i
Definition isn.c:77
#define PG_UTF8
Definition mbprint.c:43
unsigned int pg_wchar
Definition mbprint.c:31
int GetDatabaseEncoding(void)
Definition mbutils.c:1264
int pg_database_encoding_max_length(void)
Definition mbutils.c:1549
char * MemoryContextStrdup(MemoryContext context, const char *string)
Definition mcxt.c:1768
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition mcxt.c:1266
char * pstrdup(const char *in)
Definition mcxt.c:1781
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
void * palloc_extended(Size size, int flags)
Definition mcxt.c:1439
FormData_pg_collation * Form_pg_collation
const void size_t len
pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context)
#define TEXTBUFLEN
void report_newlocale_failure(const char *localename)
static char buf[DEFAULT_XLOG_SEG_SIZE]
#define pg_encoding_to_char
Definition pg_wchar.h:630
char * strsep(char **stringp, const char *delim)
Definition strsep.c:49
static unsigned char pg_ascii_tolower(unsigned char ch)
Definition port.h:188
static Datum ObjectIdGetDatum(Oid X)
Definition postgres.h:262
uint64_t Datum
Definition postgres.h:70
unsigned int Oid
e
static int fb(int x)
int(* strncoll)(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
Definition pg_locale.h:66
size_t(* strlower)(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
Definition pg_locale.h:92
const struct ctype_methods * ctype
Definition pg_locale.h:146
const struct collate_methods * collate
Definition pg_locale.h:145
const char * locale
Definition pg_locale.h:152
void ReleaseSysCache(HeapTuple tuple)
Definition syscache.c:264
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Definition syscache.c:220
Datum SysCacheGetAttr(int cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull)
Definition syscache.c:595
Datum SysCacheGetAttrNotNull(int cacheId, HeapTuple tup, AttrNumber attributeNumber)
Definition syscache.c:625
static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full, const char *src, size_t srclen, size_t srcoff, char32_t *simple, const char32_t **special)
const char * name
#define locale_t
Definition win32_port.h:429
#define tolower_l
Definition win32_port.h:430
#define isupper_l
Definition win32_port.h:440