PostgreSQL Source Code git master
Loading...
Searching...
No Matches
pg_locale_icu.c
Go to the documentation of this file.
1/*-----------------------------------------------------------------------
2 *
3 * PostgreSQL locale utilities for ICU
4 *
5 * Portions Copyright (c) 2002-2026, PostgreSQL Global Development Group
6 *
7 * src/backend/utils/adt/pg_locale_icu.c
8 *
9 *-----------------------------------------------------------------------
10 */
11
12#include "postgres.h"
13
14#ifdef USE_ICU
15#include <unicode/ucasemap.h>
16#include <unicode/ucnv.h>
17#include <unicode/ucol.h>
18#include <unicode/ustring.h>
19
20/*
21 * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
22 * (see
23 * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>)
24 */
25#if U_ICU_VERSION_MAJOR_NUM >= 53
26#define HAVE_UCOL_STRCOLLUTF8 1
27#else
28#undef HAVE_UCOL_STRCOLLUTF8
29#endif
30
31#endif
32
33#include "access/htup_details.h"
34#include "catalog/pg_database.h"
36#include "mb/pg_wchar.h"
37#include "miscadmin.h"
38#include "utils/builtins.h"
39#include "utils/formatting.h"
40#include "utils/memutils.h"
41#include "utils/pg_locale.h"
42#include "utils/syscache.h"
43
44/*
45 * Size of stack buffer to use for string transformations, used to avoid heap
46 * allocations in typical cases. This should be large enough that most strings
47 * will fit, but small enough that we feel comfortable putting it on the
48 * stack.
49 */
50#define TEXTBUFLEN 1024
51
53
54#ifdef USE_ICU
55
56extern UCollator *pg_ucol_open(const char *loc_str);
57static UCaseMap *pg_ucasemap_open(const char *loc_str);
58
59static size_t strlower_icu(char *dest, size_t destsize, const char *src,
60 size_t srclen, pg_locale_t locale);
61static size_t strtitle_icu(char *dest, size_t destsize, const char *src,
62 size_t srclen, pg_locale_t locale);
63static size_t strupper_icu(char *dest, size_t destsize, const char *src,
64 size_t srclen, pg_locale_t locale);
65static size_t strfold_icu(char *dest, size_t destsize, const char *src,
66 size_t srclen, pg_locale_t locale);
67static size_t strlower_icu_utf8(char *dest, size_t destsize, const char *src,
68 size_t srclen, pg_locale_t locale);
69static size_t strtitle_icu_utf8(char *dest, size_t destsize, const char *src,
70 size_t srclen, pg_locale_t locale);
71static size_t strupper_icu_utf8(char *dest, size_t destsize, const char *src,
72 size_t srclen, pg_locale_t locale);
73static size_t strfold_icu_utf8(char *dest, size_t destsize, const char *src,
74 size_t srclen, pg_locale_t locale);
75static size_t downcase_ident_icu(char *dst, size_t dstsize, const char *src,
76 size_t srclen, pg_locale_t locale);
77static int strncoll_icu(const char *arg1, size_t len1,
78 const char *arg2, size_t len2,
79 pg_locale_t locale);
80static int strcoll_icu(const char *arg1, const char *arg2,
81 pg_locale_t locale);
82static size_t strnxfrm_icu(char *dest, size_t destsize,
83 const char *src, size_t srclen,
84 pg_locale_t locale);
85static size_t strxfrm_icu(char *dest, size_t destsize, const char *src,
86 pg_locale_t locale);
87extern char *get_collation_actual_version_icu(const char *collcollate);
88
90 const UChar *src, int32_t srcLength,
91 const char *locale,
93
94/*
95 * Converter object for converting between ICU's UChar strings and C strings
96 * in database encoding. Since the database encoding doesn't change, we only
97 * need one of these per session.
98 */
100
101static UCollator *make_icu_collator(const char *iculocstr,
102 const char *icurules);
103static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
104 const char *src, size_t srclen,
105 pg_locale_t locale);
106static size_t strxfrm_prefix_icu(char *dest, size_t destsize, const char *src,
107 pg_locale_t locale);
108#ifdef HAVE_UCOL_STRCOLLUTF8
109static int strncoll_icu_utf8(const char *arg1, size_t len1,
110 const char *arg2, size_t len2,
111 pg_locale_t locale);
112static int strcoll_icu_utf8(const char *arg1,
113 const char *arg2,
114 pg_locale_t locale);
115#endif
116static size_t strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
117 const char *src, size_t srclen,
118 pg_locale_t locale);
119static size_t strxfrm_prefix_icu_utf8(char *dest, size_t destsize, const char *src,
120 pg_locale_t locale);
121static void init_icu_converter(void);
123 const char *str, int32_t len);
125 UChar *dest, int32_t destlen,
126 const char *src, int32_t srclen);
127static int32_t icu_to_uchar(UChar **buff_uchar, const char *buff,
128 size_t nbytes);
129static size_t icu_from_uchar(char *dest, size_t destsize,
131static void icu_set_collation_attributes(UCollator *collator, const char *loc,
132 UErrorCode *status);
133static int32_t icu_convert_case(ICU_Convert_Func func, char *dest,
134 size_t destsize, const char *src,
135 size_t srclen, pg_locale_t locale);
137 const UChar *src, int32_t srcLength,
138 const char *locale,
141 const UChar *src, int32_t srcLength,
142 const char *locale,
144static int32_t foldcase_options(const char *locale);
145
146/*
147 * XXX: many of the functions below rely on casts directly from pg_wchar to
148 * UChar32, which is correct for UTF-8 and LATIN1, but not in general.
149 */
150
151static pg_wchar
153{
154 return u_toupper(wc);
155}
156
157static pg_wchar
159{
160 return u_tolower(wc);
161}
162
163static const struct collate_methods collate_methods_icu = {
165 .strcoll = strcoll_icu,
166 .strnxfrm = strnxfrm_icu,
167 .strxfrm = strxfrm_icu,
168 .strnxfrm_prefix = strnxfrm_prefix_icu,
169 .strxfrm_prefix = strxfrm_prefix_icu,
170 .strxfrm_is_safe = true,
171};
172
173static const struct collate_methods collate_methods_icu_utf8 = {
174#ifdef HAVE_UCOL_STRCOLLUTF8
176 .strcoll = strcoll_icu_utf8,
177#else
178 .strncoll = strncoll_icu,
179 .strcoll = strcoll_icu,
180#endif
181 .strnxfrm = strnxfrm_icu,
182 .strxfrm = strxfrm_icu,
183 .strnxfrm_prefix = strnxfrm_prefix_icu_utf8,
184 .strxfrm_prefix = strxfrm_prefix_icu_utf8,
185 .strxfrm_is_safe = true,
186};
187
188static bool
190{
191 return u_isdigit(wc);
192}
193
194static bool
196{
197 return u_isalpha(wc);
198}
199
200static bool
202{
203 return u_isalnum(wc);
204}
205
206static bool
208{
209 return u_isupper(wc);
210}
211
212static bool
214{
215 return u_islower(wc);
216}
217
218static bool
220{
221 return u_isgraph(wc);
222}
223
224static bool
226{
227 return u_isprint(wc);
228}
229
230static bool
232{
233 return u_ispunct(wc);
234}
235
236static bool
238{
239 return u_isspace(wc);
240}
241
242static bool
244{
245 return u_isxdigit(wc);
246}
247
248static bool
250{
252}
253
254static const struct ctype_methods ctype_methods_icu = {
256 .strtitle = strtitle_icu,
257 .strupper = strupper_icu,
258 .strfold = strfold_icu,
259 .downcase_ident = downcase_ident_icu,
260 .wc_isdigit = wc_isdigit_icu,
261 .wc_isalpha = wc_isalpha_icu,
262 .wc_isalnum = wc_isalnum_icu,
263 .wc_isupper = wc_isupper_icu,
264 .wc_islower = wc_islower_icu,
265 .wc_isgraph = wc_isgraph_icu,
266 .wc_isprint = wc_isprint_icu,
267 .wc_ispunct = wc_ispunct_icu,
268 .wc_isspace = wc_isspace_icu,
269 .wc_isxdigit = wc_isxdigit_icu,
270 .wc_iscased = wc_iscased_icu,
271 .wc_toupper = toupper_icu,
272 .wc_tolower = tolower_icu,
273};
274
275static const struct ctype_methods ctype_methods_icu_utf8 = {
277 .strtitle = strtitle_icu_utf8,
278 .strupper = strupper_icu_utf8,
279 .strfold = strfold_icu_utf8,
280 /* uses plain ASCII semantics for historical reasons */
281 .downcase_ident = NULL,
282 .wc_isdigit = wc_isdigit_icu,
283 .wc_isalpha = wc_isalpha_icu,
284 .wc_isalnum = wc_isalnum_icu,
285 .wc_isupper = wc_isupper_icu,
286 .wc_islower = wc_islower_icu,
287 .wc_isgraph = wc_isgraph_icu,
288 .wc_isprint = wc_isprint_icu,
289 .wc_ispunct = wc_ispunct_icu,
290 .wc_isspace = wc_isspace_icu,
291 .wc_isxdigit = wc_isxdigit_icu,
292 .wc_iscased = wc_iscased_icu,
293 .wc_toupper = toupper_icu,
294 .wc_tolower = tolower_icu,
295};
296
297/*
298 * ICU still depends on libc for compatibility with certain historical
299 * behavior for single-byte encodings. See downcase_ident_icu().
300 *
301 * XXX: consider fixing by decoding the single byte into a code point, and
302 * using u_tolower().
303 */
304static locale_t
305make_libc_ctype_locale(const char *ctype)
306{
307 locale_t loc;
308
309#ifndef WIN32
310 loc = newlocale(LC_CTYPE_MASK, ctype, NULL);
311#else
312 loc = _create_locale(LC_ALL, ctype);
313#endif
314 if (!loc)
316
317 return loc;
318}
319#endif
320
323{
324#ifdef USE_ICU
325 bool deterministic;
326 const char *iculocstr;
327 const char *icurules = NULL;
329 locale_t loc = (locale_t) 0;
331
333 {
334 HeapTuple tp;
335 Datum datum;
336 bool isnull;
337
339 if (!HeapTupleIsValid(tp))
340 elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
341
342 /* default database collation is always deterministic */
343 deterministic = true;
347 datum = SysCacheGetAttr(DATABASEOID, tp,
349 if (!isnull)
351
352 /* libc only needed for default locale and single-byte encoding */
354 {
355 const char *ctype;
356
359 ctype = TextDatumGetCString(datum);
360
361 loc = make_libc_ctype_locale(ctype);
362 }
363
364 ReleaseSysCache(tp);
365 }
366 else
367 {
369 HeapTuple tp;
370 Datum datum;
371 bool isnull;
372
374 if (!HeapTupleIsValid(tp))
375 elog(ERROR, "cache lookup failed for collation %u", collid);
377 deterministic = collform->collisdeterministic;
381 datum = SysCacheGetAttr(COLLOID, tp,
383 if (!isnull)
385
386 ReleaseSysCache(tp);
387 }
388
390
391 result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
392 result->icu.locale = MemoryContextStrdup(context, iculocstr);
393 result->icu.ucol = collator;
394 result->icu.lt = loc;
395 result->deterministic = deterministic;
396 result->collate_is_c = false;
397 result->ctype_is_c = false;
399 {
400 result->icu.ucasemap = pg_ucasemap_open(iculocstr);
403 }
404 else
405 {
406 result->collate = &collate_methods_icu;
407 result->ctype = &ctype_methods_icu;
408 }
409
410 return result;
411#else
412 /* could get here if a collation was created by a build with ICU */
415 errmsg("ICU is not supported in this build")));
416
417 return NULL;
418#endif
419}
420
421#ifdef USE_ICU
422
423/*
424 * Check locale string and fix it if necessary. Returns a new palloc'd string.
425 *
426 * In ICU versions 54 and earlier, "und" is not a recognized spelling of the
427 * root locale. If the first component of the locale is "und", replace with
428 * "root" before opening.
429 */
430static char *
431fix_icu_locale_str(const char *loc_str)
432{
433 /*
434 * Must never open default collator, because it depends on the environment
435 * and may change at any time. Should not happen, but check here to catch
436 * bugs that might be hard to catch otherwise.
437 *
438 * NB: the default collator is not the same as the collator for the root
439 * locale. The root locale may be specified as the empty string, "und", or
440 * "root". The default collator is opened by passing NULL to ucol_open().
441 */
442 if (loc_str == NULL)
443 elog(ERROR, "opening default collator is not supported");
444
446 {
447 char lang[ULOC_LANG_CAPACITY];
448 UErrorCode status = U_ZERO_ERROR;
449
451 if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
452 {
455 errmsg("could not get language from locale \"%s\": %s",
456 loc_str, u_errorName(status))));
457 }
458
459 if (strcmp(lang, "und") == 0)
460 {
461 const char *remainder = loc_str + strlen("und");
462 char *fixed_str;
463
464 fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
465 strcpy(fixed_str, "root");
467
468 return fixed_str;
469 }
470 }
471
472 return pstrdup(loc_str);
473}
474
475/*
476 * Wrapper around ucol_open() to handle API differences for older ICU
477 * versions.
478 *
479 * Ensure that no path leaks a UCollator.
480 */
481UCollator *
482pg_ucol_open(const char *loc_str)
483{
485 UErrorCode status;
486 char *fixed_str;
487
489
490 status = U_ZERO_ERROR;
491 collator = ucol_open(fixed_str, &status);
492 if (U_FAILURE(status))
494 /* use original string for error report */
496 errmsg("could not open collator for locale \"%s\": %s",
497 loc_str, u_errorName(status))));
498
500 {
501 status = U_ZERO_ERROR;
503
504 /*
505 * Pretend the error came from ucol_open(), for consistent error
506 * message across ICU versions.
507 */
508 if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
509 {
513 errmsg("could not open collator for locale \"%s\": %s",
514 loc_str, u_errorName(status))));
515 }
516 }
517
519
520 return collator;
521}
522
523/*
524 * Wrapper around ucasemap_open() to handle API differences for older ICU
525 * versions.
526 *
527 * Additionally makes sure we get the right options for case folding.
528 */
529static UCaseMap *
530pg_ucasemap_open(const char *loc_str)
531{
532 UErrorCode status = U_ZERO_ERROR;
534 char *fixed_str;
535
537
539 if (U_FAILURE(status))
540 /* use original string for error report */
543 errmsg("could not open casemap for locale \"%s\": %s",
544 loc_str, u_errorName(status)));
545
547
548 return casemap;
549}
550
551/*
552 * Create a UCollator with the given locale string and rules.
553 *
554 * Ensure that no path leaks a UCollator.
555 */
556static UCollator *
557make_icu_collator(const char *iculocstr, const char *icurules)
558{
559 if (!icurules)
560 {
561 /* simple case without rules */
562 return pg_ucol_open(iculocstr);
563 }
564 else
565 {
568 const UChar *std_rules;
571 int32_t length;
572 int32_t total;
573 UErrorCode status;
574
575 /*
576 * If rules are specified, we extract the rules of the standard
577 * collation, add our own rules, and make a new collator with the
578 * combined rules.
579 */
581
583
585
586 total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
587
588 /* avoid leaking collator on OOM */
590 if (!all_rules)
591 {
595 errmsg("out of memory")));
596 }
597
600
602
603 status = U_ZERO_ERROR;
606 NULL, &status);
607 if (U_FAILURE(status))
608 {
611 errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
612 iculocstr, icurules, u_errorName(status))));
613 }
614
617 return collator_all_rules;
618 }
619}
620
621static size_t
622strlower_icu(char *dest, size_t destsize, const char *src, size_t srclen,
623 pg_locale_t locale)
624{
625 return icu_convert_case(u_strToLower, dest, destsize, src, srclen, locale);
626}
627
628static size_t
629strtitle_icu(char *dest, size_t destsize, const char *src, size_t srclen,
630 pg_locale_t locale)
631{
632 return icu_convert_case(u_strToTitle_default_BI, dest, destsize, src, srclen, locale);
633}
634
635static size_t
636strupper_icu(char *dest, size_t destsize, const char *src, size_t srclen,
637 pg_locale_t locale)
638{
639 return icu_convert_case(u_strToUpper, dest, destsize, src, srclen, locale);
640}
641
642static size_t
643strfold_icu(char *dest, size_t destsize, const char *src, size_t srclen,
644 pg_locale_t locale)
645{
646 return icu_convert_case(u_strFoldCase_default, dest, destsize, src, srclen, locale);
647}
648
649static size_t
650strlower_icu_utf8(char *dest, size_t destsize, const char *src, size_t srclen,
651 pg_locale_t locale)
652{
653 UErrorCode status = U_ZERO_ERROR;
655
656 needed = ucasemap_utf8ToLower(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
657 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
659 errmsg("case conversion failed: %s", u_errorName(status)));
660 return needed;
661}
662
663static size_t
664strtitle_icu_utf8(char *dest, size_t destsize, const char *src, size_t srclen,
665 pg_locale_t locale)
666{
667 UErrorCode status = U_ZERO_ERROR;
669
670 needed = ucasemap_utf8ToTitle(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
671 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
673 errmsg("case conversion failed: %s", u_errorName(status)));
674 return needed;
675}
676
677static size_t
678strupper_icu_utf8(char *dest, size_t destsize, const char *src, size_t srclen,
679 pg_locale_t locale)
680{
681 UErrorCode status = U_ZERO_ERROR;
683
684 needed = ucasemap_utf8ToUpper(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
685 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
687 errmsg("case conversion failed: %s", u_errorName(status)));
688 return needed;
689}
690
691static size_t
692strfold_icu_utf8(char *dest, size_t destsize, const char *src, size_t srclen,
693 pg_locale_t locale)
694{
695 UErrorCode status = U_ZERO_ERROR;
697
698 needed = ucasemap_utf8FoldCase(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
699 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
701 errmsg("case conversion failed: %s", u_errorName(status)));
702 return needed;
703}
704
/*
 * For historical compatibility, behavior is not multibyte-aware.
 *
 * NB: uses libc isupper_l()/tolower_l() for single-byte encodings (also for
 * historical compatibility), and therefore relies on the libc locale object
 * stored in locale->icu.lt.
 */
711static size_t
712downcase_ident_icu(char *dst, size_t dstsize, const char *src,
713 size_t srclen, pg_locale_t locale)
714{
715 int i;
716 bool libc_lower;
717 locale_t lt = locale->icu.lt;
718
720
721 for (i = 0; i < srclen && i < dstsize; i++)
722 {
723 unsigned char ch = (unsigned char) src[i];
724
725 if (ch >= 'A' && ch <= 'Z')
727 else if (libc_lower && IS_HIGHBIT_SET(ch) && isupper_l(ch, lt))
728 ch = tolower_l(ch, lt);
729 dst[i] = (char) ch;
730 }
731
732 if (i < dstsize)
733 dst[i] = '\0';
734
735 return srclen;
736}
737
/*
 * strncoll_icu_utf8
 *
 * Compare two UTF-8 strings by passing them directly to ucol_strcollUTF8().
 * Only compiled when HAVE_UCOL_STRCOLLUTF8 is defined; otherwise callers
 * use strncoll_icu(), which goes through ucol_strcoll().
 */
744#ifdef HAVE_UCOL_STRCOLLUTF8
745int
746strncoll_icu_utf8(const char *arg1, size_t len1, const char *arg2, size_t len2,
747 pg_locale_t locale)
748{
749 int result;
750 UErrorCode status;
751
753
754 status = U_ZERO_ERROR;
755 result = ucol_strcollUTF8(locale->icu.ucol,
756 arg1, len1,
757 arg2, len2,
758 &status);
759 if (U_FAILURE(status))
761 (errmsg("collation failed: %s", u_errorName(status))));
762
763 return result;
764}
765
766int
767strcoll_icu_utf8(const char *arg1, const char *arg2, pg_locale_t locale)
768{
769 int result;
770 UErrorCode status;
771
773
774 status = U_ZERO_ERROR;
775 result = ucol_strcollUTF8(locale->icu.ucol,
776 arg1, -1,
777 arg2, -1,
778 &status);
779 if (U_FAILURE(status))
781 (errmsg("collation failed: %s", u_errorName(status))));
782
783 return result;
784}
785#endif
786
787static size_t
788strnxfrm_icu_internal(char *dest, size_t destsize, const char *src, ssize_t srclen,
789 pg_locale_t locale)
790{
791 UChar sbuf[TEXTBUFLEN / sizeof(UChar)];
792 UChar *uchar = sbuf;
795
797
799
800 if (ulen >= lengthof(sbuf))
802
804
805 result_bsize = ucol_getSortKey(locale->icu.ucol,
806 uchar, ulen,
807 (uint8_t *) dest, destsize);
808
809 /*
810 * ucol_getSortKey() counts the nul-terminator in the result length, but
811 * this function should not.
812 */
813 Assert(result_bsize > 0);
814 result_bsize--;
815
816 if (uchar != sbuf)
817 pfree(uchar);
818
819 /* if dest is defined, it should be nul-terminated */
820 Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
821
822 return result_bsize;
823}
824
825static size_t
826strnxfrm_icu(char *dest, size_t destsize, const char *src, size_t srclen,
827 pg_locale_t locale)
828{
829 return strnxfrm_icu_internal(dest, destsize, src, srclen, locale);
830}
831
832static size_t
833strxfrm_icu(char *dest, size_t destsize, const char *src,
834 pg_locale_t locale)
835{
836 return strnxfrm_icu_internal(dest, destsize, src, -1, locale);
837}
838
839static size_t
841 const char *src, ssize_t srclen,
842 pg_locale_t locale)
843{
844 size_t result;
845 UCharIterator iter;
846 uint32_t state[2];
847 UErrorCode status;
848
850
851 uiter_setUTF8(&iter, src, srclen);
852 state[0] = state[1] = 0; /* won't need that again */
853 status = U_ZERO_ERROR;
854 result = ucol_nextSortKeyPart(locale->icu.ucol,
855 &iter,
856 state,
857 (uint8_t *) dest,
858 destsize,
859 &status);
860 if (U_FAILURE(status))
862 (errmsg("sort key generation failed: %s",
863 u_errorName(status))));
864
865 return result;
866}
867
868static size_t
869strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
870 const char *src, size_t srclen,
871 pg_locale_t locale)
872{
873 return strnxfrm_prefix_icu_utf8_internal(dest, destsize, src, srclen, locale);
874}
875
876static size_t
877strxfrm_prefix_icu_utf8(char *dest, size_t destsize, const char *src,
878 pg_locale_t locale)
879{
880 return strnxfrm_prefix_icu_utf8_internal(dest, destsize, src, -1, locale);
881}
882
883char *
885{
889
891
894
896 return pstrdup(buf);
897}
898
899/*
900 * Convert a string in the database encoding into a string of UChars.
901 *
902 * The source string at buff is of length nbytes
903 * (it needn't be nul-terminated)
904 *
905 * *buff_uchar receives a pointer to the palloc'd result string, and
906 * the function's result is the number of UChars generated.
907 *
908 * The result string is nul-terminated, though most callers rely on the
909 * result length instead.
910 */
911static int32_t
912icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
913{
915
917
919
922 *buff_uchar, len_uchar + 1, buff, nbytes);
923
924 return len_uchar;
925}
926
/*
 * Convert a string of UChars into the database encoding.
 *
 * The source string at buff_uchar is of length len_uchar
 * (it needn't be nul-terminated)
 *
 * The converted string is stored in the caller-supplied buffer dest, which
 * has room for destsize bytes.  The function's result is the number of bytes
 * the conversion requires (not counting nul); if that length plus the nul
 * terminator does not fit in destsize, the length is returned without
 * storing the converted string.
 *
 * When the result fits, it is nul-terminated.
 */
938static size_t
939icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar)
940{
941 UErrorCode status;
943
945
946 status = U_ZERO_ERROR;
948 buff_uchar, len_uchar, &status);
949 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
951 (errmsg("%s failed: %s", "ucnv_fromUChars",
952 u_errorName(status))));
953
954 if (len_result + 1 > destsize)
955 return len_result;
956
957 status = U_ZERO_ERROR;
959 buff_uchar, len_uchar, &status);
960 if (U_FAILURE(status) ||
963 (errmsg("%s failed: %s", "ucnv_fromUChars",
964 u_errorName(status))));
965
966 return len_result;
967}
968
969static int32_t
972{
973 UErrorCode status;
975
976 len_dest = len_source; /* try first with same length */
978 status = U_ZERO_ERROR;
980 mylocale->icu.locale, &status);
981 if (status == U_BUFFER_OVERFLOW_ERROR)
982 {
983 /* try again with adjusted length */
986 status = U_ZERO_ERROR;
988 mylocale->icu.locale, &status);
989 }
990 if (U_FAILURE(status))
992 (errmsg("case conversion failed: %s", u_errorName(status))));
993 return len_dest;
994}
995
996static int32_t
997icu_convert_case(ICU_Convert_Func func, char *dest, size_t destsize,
998 const char *src, size_t srclen, pg_locale_t locale)
999{
1004 size_t result_len;
1005
1007 len_conv = convert_case_uchar(func, locale, &buff_conv,
1012
1013 return result_len;
1014}
1015
1016static int32_t
1018 const UChar *src, int32_t srcLength,
1019 const char *locale,
1021{
1022 return u_strToTitle(dest, destCapacity, src, srcLength,
1023 NULL, locale, pErrorCode);
1024}
1025
1026static int32_t
1028 const UChar *src, int32_t srcLength,
1029 const char *locale,
1031{
1032 return u_strFoldCase(dest, destCapacity, src, srcLength,
1033 foldcase_options(locale), pErrorCode);
1034}
1035
1036/*
1037 * Return the correct u_strFoldCase() options for the given locale.
1038 *
1039 * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
1040 * folding does not accept a locale. Instead it just supports a single option
1041 * relevant to Turkic languages 'az' and 'tr'; check for those languages.
1042 */
1043static int32_t
1044foldcase_options(const char *locale)
1045{
1047 char lang[ULOC_LANG_CAPACITY];
1048 UErrorCode status = U_ZERO_ERROR;
1049
1050 uloc_getLanguage(locale, lang, ULOC_LANG_CAPACITY, &status);
1051 if (U_SUCCESS(status) && status != U_STRING_NOT_TERMINATED_WARNING)
1052 {
1053 /*
1054 * The option name is confusing, but it causes u_strFoldCase to use
1055 * the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
1056 */
1057 if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0)
1059 }
1060
1061 return options;
1062}
1063
1064/*
1065 * strncoll_icu
1066 *
1067 * Convert the arguments from the database encoding to UChar strings, then
1068 * call ucol_strcoll().
1069 *
1070 * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
1071 * caller should call that instead.
1072 */
1073static int
1074strncoll_icu_internal(const char *arg1, ssize_t len1,
1075 const char *arg2, ssize_t len2,
1076 pg_locale_t locale)
1077{
1078 UChar sbuf[TEXTBUFLEN / sizeof(UChar)];
1079 UChar *buf = sbuf;
1080 int32_t ulen1;
1081 int32_t ulen2;
1082 size_t bufsize;
1083 UChar *uchar1,
1084 *uchar2;
1085 int result;
1086
1087 /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */
1088#ifdef HAVE_UCOL_STRCOLLUTF8
1090#endif
1091
1093
1096
1097 /* ulen1+1 or ulen2+1 doesn't risk overflow, but summing them might */
1098 bufsize = add_size(ulen1 + 1, ulen2 + 1);
1099 if (bufsize > lengthof(sbuf))
1101
1102 uchar1 = buf;
1103 uchar2 = buf + ulen1 + 1;
1104
1107
1108 result = ucol_strcoll(locale->icu.ucol,
1109 uchar1, ulen1,
1110 uchar2, ulen2);
1111
1112 if (buf != sbuf)
1113 pfree(buf);
1114
1115 return result;
1116}
1117
1118static int
1119strncoll_icu(const char *arg1, size_t len1, const char *arg2, size_t len2,
1120 pg_locale_t locale)
1121{
1122 return strncoll_icu_internal(arg1, len1, arg2, len2, locale);
1123}
1124
1125static int
1126strcoll_icu(const char *arg1, const char *arg2, pg_locale_t locale)
1127{
1128 return strncoll_icu_internal(arg1, -1, arg2, -1, locale);
1129}
1130
1131static size_t
1132strnxfrm_prefix_icu_internal(char *dest, size_t destsize,
1133 const char *src, ssize_t srclen,
1134 pg_locale_t locale)
1135{
1136 UChar sbuf[TEXTBUFLEN / sizeof(UChar)];
1137 UChar *uchar = sbuf;
1138 UCharIterator iter;
1139 uint32_t state[2];
1140 UErrorCode status;
1141 int32_t ulen;
1143
1144 /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */
1146
1148
1150
1151 if (ulen >= lengthof(sbuf))
1152 uchar = palloc_array(UChar, ulen + 1);
1153
1155
1156 uiter_setString(&iter, uchar, ulen);
1157 state[0] = state[1] = 0; /* won't need that again */
1158 status = U_ZERO_ERROR;
1159 result_bsize = ucol_nextSortKeyPart(locale->icu.ucol,
1160 &iter,
1161 state,
1162 (uint8_t *) dest,
1163 destsize,
1164 &status);
1165 if (U_FAILURE(status))
1166 ereport(ERROR,
1167 (errmsg("sort key generation failed: %s",
1168 u_errorName(status))));
1169
1170 if (uchar != sbuf)
1171 pfree(uchar);
1172
1173 return result_bsize;
1174}
1175
1176static size_t
1177strnxfrm_prefix_icu(char *dest, size_t destsize, const char *src, size_t srclen,
1178 pg_locale_t locale)
1179{
1180 return strnxfrm_prefix_icu_internal(dest, destsize, src, srclen, locale);
1181}
1182
1183static size_t
1184strxfrm_prefix_icu(char *dest, size_t destsize, const char *src,
1185 pg_locale_t locale)
1186{
1187 return strnxfrm_prefix_icu_internal(dest, destsize, src, -1, locale);
1188}
1189
1190static void
1192{
1193 const char *icu_encoding_name;
1194 UErrorCode status;
1196
1197 if (icu_converter)
1198 return; /* already done */
1199
1201 if (!icu_encoding_name)
1202 ereport(ERROR,
1204 errmsg("encoding \"%s\" not supported by ICU",
1206
1207 status = U_ZERO_ERROR;
1208 conv = ucnv_open(icu_encoding_name, &status);
1209 if (U_FAILURE(status))
1210 ereport(ERROR,
1211 (errmsg("could not open ICU converter for encoding \"%s\": %s",
1212 icu_encoding_name, u_errorName(status))));
1213
1215}
1216
1217/*
1218 * Find length, in UChars, of given string if converted to UChar string.
1219 *
1220 * A length of -1 indicates that the input string is NUL-terminated.
1221 *
1222 * Note: given the assumption that the input string fits in MaxAllocSize,
1223 * the result cannot overflow int32_t. But callers must be careful about
1224 * multiplying the result by sizeof(UChar).
1225 */
1226static int32_t
1228{
1229 UErrorCode status = U_ZERO_ERROR;
1230 int32_t ulen;
1231
1232 ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
1233 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
1234 ereport(ERROR,
1235 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
1236 return ulen;
1237}
1238
1239/*
1240 * Convert the given source string into a UChar string, stored in dest, and
1241 * return the length (in UChars).
1242 *
1243 * A srclen of -1 indicates that the input string is NUL-terminated.
1244 */
1245static int32_t
1247 const char *src, int32_t srclen)
1248{
1249 UErrorCode status = U_ZERO_ERROR;
1250 int32_t ulen;
1251
1252 ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
1253 if (U_FAILURE(status))
1254 ereport(ERROR,
1255 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
1256 return ulen;
1257}
1258
1259/*
1260 * Parse collation attributes from the given locale string and apply them to
1261 * the open collator.
1262 *
1263 * First, the locale string is canonicalized to an ICU format locale ID such
1264 * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
1265 * the key-value arguments.
1266 *
1267 * Starting with ICU version 54, the attributes are processed automatically by
1268 * ucol_open(), so this is only necessary for emulating this behavior on older
1269 * versions.
1270 */
1272static void
1274 UErrorCode *status)
1275{
1276 int32_t len;
1277 char *icu_locale_id;
1278 char *lower_str;
1279 char *str;
1280 char *token;
1281
1282 /*
1283 * The input locale may be a BCP 47 language tag, e.g.
1284 * "und-u-kc-ks-level1", which expresses the same attributes in a
1285 * different form. It will be converted to the equivalent ICU format
1286 * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
1287 * uloc_canonicalize().
1288 */
1289 *status = U_ZERO_ERROR;
1290 len = uloc_canonicalize(loc, NULL, 0, status);
1291 icu_locale_id = palloc(len + 1);
1292 *status = U_ZERO_ERROR;
1293 len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
1294 if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
1295 return;
1296
1298
1300
1301 str = strchr(lower_str, '@');
1302 if (!str)
1303 return;
1304 str++;
1305
1306 while ((token = strsep(&str, ";")))
1307 {
1308 char *e = strchr(token, '=');
1309
1310 if (e)
1311 {
1312 char *name;
1313 char *value;
1316
1317 *status = U_ZERO_ERROR;
1318
1319 *e = '\0';
1320 name = token;
1321 value = e + 1;
1322
1323 /*
1324 * See attribute name and value lists in ICU i18n/coll.cpp
1325 */
1326 if (strcmp(name, "colstrength") == 0)
1328 else if (strcmp(name, "colbackwards") == 0)
1330 else if (strcmp(name, "colcaselevel") == 0)
1332 else if (strcmp(name, "colcasefirst") == 0)
1334 else if (strcmp(name, "colalternate") == 0)
1336 else if (strcmp(name, "colnormalization") == 0)
1338 else if (strcmp(name, "colnumeric") == 0)
1340 else
1341 /* ignore if unknown */
1342 continue;
1343
1344 if (strcmp(value, "primary") == 0)
1346 else if (strcmp(value, "secondary") == 0)
1348 else if (strcmp(value, "tertiary") == 0)
1350 else if (strcmp(value, "quaternary") == 0)
1352 else if (strcmp(value, "identical") == 0)
1354 else if (strcmp(value, "no") == 0)
1355 uvalue = UCOL_OFF;
1356 else if (strcmp(value, "yes") == 0)
1357 uvalue = UCOL_ON;
1358 else if (strcmp(value, "shifted") == 0)
1360 else if (strcmp(value, "non-ignorable") == 0)
1362 else if (strcmp(value, "lower") == 0)
1364 else if (strcmp(value, "upper") == 0)
1366 else
1367 {
1368 *status = U_ILLEGAL_ARGUMENT_ERROR;
1369 break;
1370 }
1371
1373 }
1374 }
1375
1377}
1378
1379#endif /* USE_ICU */
#define TextDatumGetCString(d)
Definition builtins.h:99
#define pg_attribute_unused()
Definition c.h:149
#define IS_HIGHBIT_SET(ch)
Definition c.h:1244
#define Assert(condition)
Definition c.h:943
uint32_t uint32
Definition c.h:624
#define lengthof(array)
Definition c.h:873
size_t Size
Definition c.h:689
uint32 result
Oid collid
int errcode(int sqlerrcode)
Definition elog.c:875
#define ERROR
Definition elog.h:40
#define elog(elevel,...)
Definition elog.h:228
#define ereport(elevel,...)
Definition elog.h:152
const char * get_encoding_name_for_icu(int encoding)
Definition encnames.c:467
#define palloc_array(type, count)
Definition fe_memutils.h:91
#define MCXT_ALLOC_NO_OOM
Definition fe_memutils.h:29
#define palloc_array_extended(type, count, flags)
Definition fe_memutils.h:93
char * asc_tolower(const char *buff, size_t nbytes)
Oid MyDatabaseId
Definition globals.c:96
const char * str
size_t remainder
#define HeapTupleIsValid(tuple)
Definition htup.h:78
static void * GETSTRUCT(const HeapTupleData *tuple)
#define token
#define bufsize
static struct @177 value
int i
Definition isn.c:77
#define PG_UTF8
Definition mbprint.c:43
unsigned int pg_wchar
Definition mbprint.c:31
int GetDatabaseEncoding(void)
Definition mbutils.c:1389
int pg_database_encoding_max_length(void)
Definition mbutils.c:1673
char * MemoryContextStrdup(MemoryContext context, const char *string)
Definition mcxt.c:1897
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition mcxt.c:1269
Size add_size(Size s1, Size s2)
Definition mcxt.c:1733
char * pstrdup(const char *in)
Definition mcxt.c:1910
void pfree(void *pointer)
Definition mcxt.c:1619
void * palloc(Size size)
Definition mcxt.c:1390
static char * errmsg
END_CATALOG_STRUCT typedef FormData_pg_collation * Form_pg_collation
const void size_t len
pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context)
#define TEXTBUFLEN
void report_newlocale_failure(const char *localename)
static char buf[DEFAULT_XLOG_SEG_SIZE]
#define pg_encoding_to_char
Definition pg_wchar.h:483
char * strsep(char **stringp, const char *delim)
Definition strsep.c:50
static unsigned char pg_ascii_tolower(unsigned char ch)
Definition port.h:189
static Datum ObjectIdGetDatum(Oid X)
Definition postgres.h:252
uint64_t Datum
Definition postgres.h:70
unsigned int Oid
e
static int fb(int x)
int(* strncoll)(const char *arg1, size_t len1, const char *arg2, size_t len2, pg_locale_t locale)
Definition pg_locale.h:66
size_t(* strlower)(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale)
Definition pg_locale.h:101
void ReleaseSysCache(HeapTuple tuple)
Definition syscache.c:265
Datum SysCacheGetAttrNotNull(SysCacheIdentifier cacheId, HeapTuple tup, AttrNumber attributeNumber)
Definition syscache.c:626
HeapTuple SearchSysCache1(SysCacheIdentifier cacheId, Datum key1)
Definition syscache.c:221
Datum SysCacheGetAttr(SysCacheIdentifier cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull)
Definition syscache.c:596
static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full, const char *src, size_t srclen, size_t srcoff, char32_t *simple, const char32_t **special)
const char * name
#define locale_t
Definition win32_port.h:429
#define tolower_l
Definition win32_port.h:430
#define isupper_l
Definition win32_port.h:440