PostgreSQL Source Code git master
pg_locale_icu.c
Go to the documentation of this file.
1/*-----------------------------------------------------------------------
2 *
3 * PostgreSQL locale utilities for ICU
4 *
5 * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
6 *
7 * src/backend/utils/adt/pg_locale_icu.c
8 *
9 *-----------------------------------------------------------------------
10 */
11
12#include "postgres.h"
13
14#ifdef USE_ICU
15#include <unicode/ucnv.h>
16#include <unicode/ustring.h>
17
18/*
19 * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
20 * (see
21 * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>)
22 */
23#if U_ICU_VERSION_MAJOR_NUM >= 53
24#define HAVE_UCOL_STRCOLLUTF8 1
25#else
26#undef HAVE_UCOL_STRCOLLUTF8
27#endif
28
29#endif
30
31#include "access/htup_details.h"
32#include "catalog/pg_database.h"
34#include "mb/pg_wchar.h"
35#include "miscadmin.h"
36#include "utils/builtins.h"
37#include "utils/formatting.h"
38#include "utils/memutils.h"
39#include "utils/pg_locale.h"
40#include "utils/syscache.h"
41
/*
 * Size, in bytes, of the on-stack scratch buffer used for string
 * transformations.  Inputs whose converted form fits in it are handled
 * without a heap allocation; anything larger falls back to palloc().  It
 * should cover typical strings while staying small enough to live on the
 * stack.
 */
#define TEXTBUFLEN			1024
49
51extern size_t strlower_icu(char *dst, size_t dstsize, const char *src,
52 ssize_t srclen, pg_locale_t locale);
53extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src,
54 ssize_t srclen, pg_locale_t locale);
55extern size_t strupper_icu(char *dst, size_t dstsize, const char *src,
56 ssize_t srclen, pg_locale_t locale);
57extern size_t strfold_icu(char *dst, size_t dstsize, const char *src,
58 ssize_t srclen, pg_locale_t locale);
59
60#ifdef USE_ICU
61
62extern UCollator *pg_ucol_open(const char *loc_str);
63
64static int strncoll_icu(const char *arg1, ssize_t len1,
65 const char *arg2, ssize_t len2,
67static size_t strnxfrm_icu(char *dest, size_t destsize,
68 const char *src, ssize_t srclen,
70static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
71 const char *src, ssize_t srclen,
73extern char *get_collation_actual_version_icu(const char *collcollate);
74
75typedef int32_t (*ICU_Convert_Func) (UChar *dest, int32_t destCapacity,
76 const UChar *src, int32_t srcLength,
77 const char *locale,
78 UErrorCode *pErrorCode);
79
80/*
81 * Converter object for converting between ICU's UChar strings and C strings
82 * in database encoding. Since the database encoding doesn't change, we only
83 * need one of these per session.
84 */
85static UConverter *icu_converter = NULL;
86
87static UCollator *make_icu_collator(const char *iculocstr,
88 const char *icurules);
89static int strncoll_icu(const char *arg1, ssize_t len1,
90 const char *arg2, ssize_t len2,
92static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
93 const char *src, ssize_t srclen,
95#ifdef HAVE_UCOL_STRCOLLUTF8
96static int strncoll_icu_utf8(const char *arg1, ssize_t len1,
97 const char *arg2, ssize_t len2,
99#endif
100static size_t strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
101 const char *src, ssize_t srclen,
103static void init_icu_converter(void);
104static size_t uchar_length(UConverter *converter,
105 const char *str, int32_t len);
106static int32_t uchar_convert(UConverter *converter,
107 UChar *dest, int32_t destlen,
108 const char *src, int32_t srclen);
109static int32_t icu_to_uchar(UChar **buff_uchar, const char *buff,
110 size_t nbytes);
111static size_t icu_from_uchar(char *dest, size_t destsize,
112 const UChar *buff_uchar, int32_t len_uchar);
113static void icu_set_collation_attributes(UCollator *collator, const char *loc,
114 UErrorCode *status);
115static int32_t icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
116 UChar **buff_dest, UChar *buff_source,
117 int32_t len_source);
118static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
119 const UChar *src, int32_t srcLength,
120 const char *locale,
121 UErrorCode *pErrorCode);
122static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
123 const UChar *src, int32_t srcLength,
124 const char *locale,
125 UErrorCode *pErrorCode);
126
127static const struct collate_methods collate_methods_icu = {
128 .strncoll = strncoll_icu,
129 .strnxfrm = strnxfrm_icu,
130 .strnxfrm_prefix = strnxfrm_prefix_icu,
131 .strxfrm_is_safe = true,
132};
133
134static const struct collate_methods collate_methods_icu_utf8 = {
135#ifdef HAVE_UCOL_STRCOLLUTF8
136 .strncoll = strncoll_icu_utf8,
137#else
138 .strncoll = strncoll_icu,
139#endif
140 .strnxfrm = strnxfrm_icu,
141 .strnxfrm_prefix = strnxfrm_prefix_icu_utf8,
142 .strxfrm_is_safe = true,
143};
144
145#endif
146
149{
150#ifdef USE_ICU
151 bool deterministic;
152 const char *iculocstr;
153 const char *icurules = NULL;
154 UCollator *collator;
155 pg_locale_t result;
156
157 if (collid == DEFAULT_COLLATION_OID)
158 {
159 HeapTuple tp;
160 Datum datum;
161 bool isnull;
162
164 if (!HeapTupleIsValid(tp))
165 elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
166
167 /* default database collation is always deterministic */
168 deterministic = true;
169 datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
170 Anum_pg_database_datlocale);
171 iculocstr = TextDatumGetCString(datum);
172 datum = SysCacheGetAttr(DATABASEOID, tp,
173 Anum_pg_database_daticurules, &isnull);
174 if (!isnull)
175 icurules = TextDatumGetCString(datum);
176
177 ReleaseSysCache(tp);
178 }
179 else
180 {
181 Form_pg_collation collform;
182 HeapTuple tp;
183 Datum datum;
184 bool isnull;
185
187 if (!HeapTupleIsValid(tp))
188 elog(ERROR, "cache lookup failed for collation %u", collid);
189 collform = (Form_pg_collation) GETSTRUCT(tp);
190 deterministic = collform->collisdeterministic;
191 datum = SysCacheGetAttrNotNull(COLLOID, tp,
192 Anum_pg_collation_colllocale);
193 iculocstr = TextDatumGetCString(datum);
194 datum = SysCacheGetAttr(COLLOID, tp,
195 Anum_pg_collation_collicurules, &isnull);
196 if (!isnull)
197 icurules = TextDatumGetCString(datum);
198
199 ReleaseSysCache(tp);
200 }
201
202 collator = make_icu_collator(iculocstr, icurules);
203
204 result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
205 result->info.icu.locale = MemoryContextStrdup(context, iculocstr);
206 result->info.icu.ucol = collator;
207 result->provider = COLLPROVIDER_ICU;
208 result->deterministic = deterministic;
209 result->collate_is_c = false;
210 result->ctype_is_c = false;
212 result->collate = &collate_methods_icu_utf8;
213 else
214 result->collate = &collate_methods_icu;
215
216 return result;
217#else
218 /* could get here if a collation was created by a build with ICU */
220 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
221 errmsg("ICU is not supported in this build")));
222
223 return NULL;
224#endif
225}
226
227#ifdef USE_ICU
228
229/*
230 * Wrapper around ucol_open() to handle API differences for older ICU
231 * versions.
232 *
233 * Ensure that no path leaks a UCollator.
234 */
235UCollator *
236pg_ucol_open(const char *loc_str)
237{
238 UCollator *collator;
239 UErrorCode status;
240 const char *orig_str = loc_str;
241 char *fixed_str = NULL;
242
243 /*
244 * Must never open default collator, because it depends on the environment
245 * and may change at any time. Should not happen, but check here to catch
246 * bugs that might be hard to catch otherwise.
247 *
248 * NB: the default collator is not the same as the collator for the root
249 * locale. The root locale may be specified as the empty string, "und", or
250 * "root". The default collator is opened by passing NULL to ucol_open().
251 */
252 if (loc_str == NULL)
253 elog(ERROR, "opening default collator is not supported");
254
255 /*
256 * In ICU versions 54 and earlier, "und" is not a recognized spelling of
257 * the root locale. If the first component of the locale is "und", replace
258 * with "root" before opening.
259 */
260 if (U_ICU_VERSION_MAJOR_NUM < 55)
261 {
262 char lang[ULOC_LANG_CAPACITY];
263
264 status = U_ZERO_ERROR;
265 uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
266 if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
267 {
269 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
270 errmsg("could not get language from locale \"%s\": %s",
271 loc_str, u_errorName(status))));
272 }
273
274 if (strcmp(lang, "und") == 0)
275 {
276 const char *remainder = loc_str + strlen("und");
277
278 fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
279 strcpy(fixed_str, "root");
280 strcat(fixed_str, remainder);
281
282 loc_str = fixed_str;
283 }
284 }
285
286 status = U_ZERO_ERROR;
287 collator = ucol_open(loc_str, &status);
288 if (U_FAILURE(status))
290 /* use original string for error report */
291 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
292 errmsg("could not open collator for locale \"%s\": %s",
293 orig_str, u_errorName(status))));
294
295 if (U_ICU_VERSION_MAJOR_NUM < 54)
296 {
297 status = U_ZERO_ERROR;
298 icu_set_collation_attributes(collator, loc_str, &status);
299
300 /*
301 * Pretend the error came from ucol_open(), for consistent error
302 * message across ICU versions.
303 */
304 if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
305 {
306 ucol_close(collator);
308 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
309 errmsg("could not open collator for locale \"%s\": %s",
310 orig_str, u_errorName(status))));
311 }
312 }
313
314 if (fixed_str != NULL)
315 pfree(fixed_str);
316
317 return collator;
318}
319
320/*
321 * Create a UCollator with the given locale string and rules.
322 *
323 * Ensure that no path leaks a UCollator.
324 */
325static UCollator *
326make_icu_collator(const char *iculocstr, const char *icurules)
327{
328 if (!icurules)
329 {
330 /* simple case without rules */
331 return pg_ucol_open(iculocstr);
332 }
333 else
334 {
335 UCollator *collator_std_rules;
336 UCollator *collator_all_rules;
337 const UChar *std_rules;
338 UChar *my_rules;
339 UChar *all_rules;
340 int32_t length;
341 int32_t total;
342 UErrorCode status;
343
344 /*
345 * If rules are specified, we extract the rules of the standard
346 * collation, add our own rules, and make a new collator with the
347 * combined rules.
348 */
349 icu_to_uchar(&my_rules, icurules, strlen(icurules));
350
351 collator_std_rules = pg_ucol_open(iculocstr);
352
353 std_rules = ucol_getRules(collator_std_rules, &length);
354
355 total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
356
357 /* avoid leaking collator on OOM */
358 all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM);
359 if (!all_rules)
360 {
361 ucol_close(collator_std_rules);
363 (errcode(ERRCODE_OUT_OF_MEMORY),
364 errmsg("out of memory")));
365 }
366
367 u_strcpy(all_rules, std_rules);
368 u_strcat(all_rules, my_rules);
369
370 ucol_close(collator_std_rules);
371
372 status = U_ZERO_ERROR;
373 collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules),
374 UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,
375 NULL, &status);
376 if (U_FAILURE(status))
377 {
379 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
380 errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
381 iculocstr, icurules, u_errorName(status))));
382 }
383
384 return collator_all_rules;
385 }
386}
387
388size_t
389strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
391{
392 int32_t len_uchar;
393 int32_t len_conv;
394 UChar *buff_uchar;
395 UChar *buff_conv;
396 size_t result_len;
397
398 len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
399 len_conv = icu_convert_case(u_strToLower, locale,
400 &buff_conv, buff_uchar, len_uchar);
401 result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
402 pfree(buff_uchar);
403 pfree(buff_conv);
404
405 return result_len;
406}
407
408size_t
409strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
411{
412 int32_t len_uchar;
413 int32_t len_conv;
414 UChar *buff_uchar;
415 UChar *buff_conv;
416 size_t result_len;
417
418 len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
419 len_conv = icu_convert_case(u_strToTitle_default_BI, locale,
420 &buff_conv, buff_uchar, len_uchar);
421 result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
422 pfree(buff_uchar);
423 pfree(buff_conv);
424
425 return result_len;
426}
427
428size_t
429strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
431{
432 int32_t len_uchar;
433 int32_t len_conv;
434 UChar *buff_uchar;
435 UChar *buff_conv;
436 size_t result_len;
437
438 len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
439 len_conv = icu_convert_case(u_strToUpper, locale,
440 &buff_conv, buff_uchar, len_uchar);
441 result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
442 pfree(buff_uchar);
443 pfree(buff_conv);
444
445 return result_len;
446}
447
448size_t
449strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
451{
452 int32_t len_uchar;
453 int32_t len_conv;
454 UChar *buff_uchar;
455 UChar *buff_conv;
456 size_t result_len;
457
458 len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
459 len_conv = icu_convert_case(u_strFoldCase_default, locale,
460 &buff_conv, buff_uchar, len_uchar);
461 result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
462 pfree(buff_uchar);
463 pfree(buff_conv);
464
465 return result_len;
466}
467
/*
 * strncoll_icu_utf8
 *
 * Compare two strings with ucol_strcollUTF8(), handing the bytes to ICU
 * without first converting them to UChar.  Must only be used when the
 * database encoding is UTF-8.  An argument length of -1 means the string is
 * NUL-terminated.
 *
 * Returns the comparison result from ICU; raises an error if ICU reports a
 * failure.
 */
#ifdef HAVE_UCOL_STRCOLLUTF8
static int
strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
				  pg_locale_t locale)
{
	int			result;
	UErrorCode	status;

	Assert(locale->provider == COLLPROVIDER_ICU);

	Assert(GetDatabaseEncoding() == PG_UTF8);

	status = U_ZERO_ERROR;
	result = ucol_strcollUTF8(locale->info.icu.ucol,
							  arg1, len1,
							  arg2, len2,
							  &status);
	if (U_FAILURE(status))
		ereport(ERROR,
				(errmsg("collation failed: %s", u_errorName(status))));

	return result;
}
#endif
499
500/* 'srclen' of -1 means the strings are NUL-terminated */
501size_t
502strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
504{
505 char sbuf[TEXTBUFLEN];
506 char *buf = sbuf;
507 UChar *uchar;
508 int32_t ulen;
509 size_t uchar_bsize;
510 Size result_bsize;
511
512 Assert(locale->provider == COLLPROVIDER_ICU);
513
514 init_icu_converter();
515
516 ulen = uchar_length(icu_converter, src, srclen);
517
518 uchar_bsize = (ulen + 1) * sizeof(UChar);
519
520 if (uchar_bsize > TEXTBUFLEN)
521 buf = palloc(uchar_bsize);
522
523 uchar = (UChar *) buf;
524
525 ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
526
527 result_bsize = ucol_getSortKey(locale->info.icu.ucol,
528 uchar, ulen,
529 (uint8_t *) dest, destsize);
530
531 /*
532 * ucol_getSortKey() counts the nul-terminator in the result length, but
533 * this function should not.
534 */
535 Assert(result_bsize > 0);
536 result_bsize--;
537
538 if (buf != sbuf)
539 pfree(buf);
540
541 /* if dest is defined, it should be nul-terminated */
542 Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
543
544 return result_bsize;
545}
546
547/* 'srclen' of -1 means the strings are NUL-terminated */
548size_t
549strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
550 const char *src, ssize_t srclen,
552{
553 size_t result;
554 UCharIterator iter;
555 uint32_t state[2];
556 UErrorCode status;
557
558 Assert(locale->provider == COLLPROVIDER_ICU);
559
561
562 uiter_setUTF8(&iter, src, srclen);
563 state[0] = state[1] = 0; /* won't need that again */
564 status = U_ZERO_ERROR;
565 result = ucol_nextSortKeyPart(locale->info.icu.ucol,
566 &iter,
567 state,
568 (uint8_t *) dest,
569 destsize,
570 &status);
571 if (U_FAILURE(status))
573 (errmsg("sort key generation failed: %s",
574 u_errorName(status))));
575
576 return result;
577}
578
579char *
580get_collation_actual_version_icu(const char *collcollate)
581{
582 UCollator *collator;
583 UVersionInfo versioninfo;
584 char buf[U_MAX_VERSION_STRING_LENGTH];
585
586 collator = pg_ucol_open(collcollate);
587
588 ucol_getVersion(collator, versioninfo);
589 ucol_close(collator);
590
591 u_versionToString(versioninfo, buf);
592 return pstrdup(buf);
593}
594
595/*
596 * Convert a string in the database encoding into a string of UChars.
597 *
598 * The source string at buff is of length nbytes
599 * (it needn't be nul-terminated)
600 *
601 * *buff_uchar receives a pointer to the palloc'd result string, and
602 * the function's result is the number of UChars generated.
603 *
604 * The result string is nul-terminated, though most callers rely on the
605 * result length instead.
606 */
607static int32_t
608icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
609{
610 int32_t len_uchar;
611
612 init_icu_converter();
613
614 len_uchar = uchar_length(icu_converter, buff, nbytes);
615
616 *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
617 len_uchar = uchar_convert(icu_converter,
618 *buff_uchar, len_uchar + 1, buff, nbytes);
619
620 return len_uchar;
621}
622
623/*
624 * Convert a string of UChars into the database encoding.
625 *
626 * The source string at buff_uchar is of length len_uchar
627 * (it needn't be nul-terminated)
628 *
629 * *result receives a pointer to the palloc'd result string, and the
630 * function's result is the number of bytes generated (not counting nul).
631 *
632 * The result string is nul-terminated.
633 */
634static size_t
635icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar)
636{
637 UErrorCode status;
638 int32_t len_result;
639
640 init_icu_converter();
641
642 status = U_ZERO_ERROR;
643 len_result = ucnv_fromUChars(icu_converter, NULL, 0,
644 buff_uchar, len_uchar, &status);
645 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
647 (errmsg("%s failed: %s", "ucnv_fromUChars",
648 u_errorName(status))));
649
650 if (len_result + 1 > destsize)
651 return len_result;
652
653 status = U_ZERO_ERROR;
654 len_result = ucnv_fromUChars(icu_converter, dest, len_result + 1,
655 buff_uchar, len_uchar, &status);
656 if (U_FAILURE(status) ||
657 status == U_STRING_NOT_TERMINATED_WARNING)
659 (errmsg("%s failed: %s", "ucnv_fromUChars",
660 u_errorName(status))));
661
662 return len_result;
663}
664
665static int32_t
666icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
667 UChar **buff_dest, UChar *buff_source, int32_t len_source)
668{
669 UErrorCode status;
670 int32_t len_dest;
671
672 len_dest = len_source; /* try first with same length */
673 *buff_dest = palloc(len_dest * sizeof(**buff_dest));
674 status = U_ZERO_ERROR;
675 len_dest = func(*buff_dest, len_dest, buff_source, len_source,
676 mylocale->info.icu.locale, &status);
677 if (status == U_BUFFER_OVERFLOW_ERROR)
678 {
679 /* try again with adjusted length */
680 pfree(*buff_dest);
681 *buff_dest = palloc(len_dest * sizeof(**buff_dest));
682 status = U_ZERO_ERROR;
683 len_dest = func(*buff_dest, len_dest, buff_source, len_source,
684 mylocale->info.icu.locale, &status);
685 }
686 if (U_FAILURE(status))
688 (errmsg("case conversion failed: %s", u_errorName(status))));
689 return len_dest;
690}
691
692static int32_t
693u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
694 const UChar *src, int32_t srcLength,
695 const char *locale,
696 UErrorCode *pErrorCode)
697{
698 return u_strToTitle(dest, destCapacity, src, srcLength,
699 NULL, locale, pErrorCode);
700}
701
702static int32_t
703u_strFoldCase_default(UChar *dest, int32_t destCapacity,
704 const UChar *src, int32_t srcLength,
705 const char *locale,
706 UErrorCode *pErrorCode)
707{
708 uint32 options = U_FOLD_CASE_DEFAULT;
709 char lang[3];
710 UErrorCode status;
711
712 /*
713 * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
714 * folding does not accept a locale. Instead it just supports a single
715 * option relevant to Turkic languages 'az' and 'tr'; check for those
716 * languages to enable the option.
717 */
718 status = U_ZERO_ERROR;
719 uloc_getLanguage(locale, lang, 3, &status);
720 if (U_SUCCESS(status))
721 {
722 /*
723 * The option name is confusing, but it causes u_strFoldCase to use
724 * the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
725 */
726 if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0)
727 options = U_FOLD_CASE_EXCLUDE_SPECIAL_I;
728 }
729
730 return u_strFoldCase(dest, destCapacity, src, srcLength,
731 options, pErrorCode);
732}
733
734/*
735 * strncoll_icu
736 *
737 * Convert the arguments from the database encoding to UChar strings, then
738 * call ucol_strcoll(). An argument length of -1 means that the string is
739 * NUL-terminated.
740 *
741 * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
742 * caller should call that instead.
743 */
744static int
745strncoll_icu(const char *arg1, ssize_t len1,
746 const char *arg2, ssize_t len2, pg_locale_t locale)
747{
748 char sbuf[TEXTBUFLEN];
749 char *buf = sbuf;
750 int32_t ulen1;
751 int32_t ulen2;
752 size_t bufsize1;
753 size_t bufsize2;
754 UChar *uchar1,
755 *uchar2;
756 int result;
757
758 Assert(locale->provider == COLLPROVIDER_ICU);
759
760 /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */
761#ifdef HAVE_UCOL_STRCOLLUTF8
763#endif
764
765 init_icu_converter();
766
767 ulen1 = uchar_length(icu_converter, arg1, len1);
768 ulen2 = uchar_length(icu_converter, arg2, len2);
769
770 bufsize1 = (ulen1 + 1) * sizeof(UChar);
771 bufsize2 = (ulen2 + 1) * sizeof(UChar);
772
773 if (bufsize1 + bufsize2 > TEXTBUFLEN)
774 buf = palloc(bufsize1 + bufsize2);
775
776 uchar1 = (UChar *) buf;
777 uchar2 = (UChar *) (buf + bufsize1);
778
779 ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
780 ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
781
782 result = ucol_strcoll(locale->info.icu.ucol,
783 uchar1, ulen1,
784 uchar2, ulen2);
785
786 if (buf != sbuf)
787 pfree(buf);
788
789 return result;
790}
791
792/* 'srclen' of -1 means the strings are NUL-terminated */
793static size_t
794strnxfrm_prefix_icu(char *dest, size_t destsize,
795 const char *src, ssize_t srclen,
797{
798 char sbuf[TEXTBUFLEN];
799 char *buf = sbuf;
800 UCharIterator iter;
801 uint32_t state[2];
802 UErrorCode status;
803 int32_t ulen = -1;
804 UChar *uchar = NULL;
805 size_t uchar_bsize;
806 Size result_bsize;
807
808 Assert(locale->provider == COLLPROVIDER_ICU);
809
810 /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */
812
813 init_icu_converter();
814
815 ulen = uchar_length(icu_converter, src, srclen);
816
817 uchar_bsize = (ulen + 1) * sizeof(UChar);
818
819 if (uchar_bsize > TEXTBUFLEN)
820 buf = palloc(uchar_bsize);
821
822 uchar = (UChar *) buf;
823
824 ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
825
826 uiter_setString(&iter, uchar, ulen);
827 state[0] = state[1] = 0; /* won't need that again */
828 status = U_ZERO_ERROR;
829 result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol,
830 &iter,
831 state,
832 (uint8_t *) dest,
833 destsize,
834 &status);
835 if (U_FAILURE(status))
837 (errmsg("sort key generation failed: %s",
838 u_errorName(status))));
839
840 return result_bsize;
841}
842
843static void
844init_icu_converter(void)
845{
846 const char *icu_encoding_name;
847 UErrorCode status;
848 UConverter *conv;
849
850 if (icu_converter)
851 return; /* already done */
852
853 icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
854 if (!icu_encoding_name)
856 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
857 errmsg("encoding \"%s\" not supported by ICU",
859
860 status = U_ZERO_ERROR;
861 conv = ucnv_open(icu_encoding_name, &status);
862 if (U_FAILURE(status))
864 (errmsg("could not open ICU converter for encoding \"%s\": %s",
865 icu_encoding_name, u_errorName(status))));
866
867 icu_converter = conv;
868}
869
870/*
871 * Find length, in UChars, of given string if converted to UChar string.
872 *
873 * A length of -1 indicates that the input string is NUL-terminated.
874 */
875static size_t
876uchar_length(UConverter *converter, const char *str, int32_t len)
877{
878 UErrorCode status = U_ZERO_ERROR;
879 int32_t ulen;
880
881 ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
882 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
884 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
885 return ulen;
886}
887
888/*
889 * Convert the given source string into a UChar string, stored in dest, and
890 * return the length (in UChars).
891 *
892 * A srclen of -1 indicates that the input string is NUL-terminated.
893 */
894static int32_t
895uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
896 const char *src, int32_t srclen)
897{
898 UErrorCode status = U_ZERO_ERROR;
899 int32_t ulen;
900
901 status = U_ZERO_ERROR;
902 ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
903 if (U_FAILURE(status))
905 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
906 return ulen;
907}
908
909/*
910 * Parse collation attributes from the given locale string and apply them to
911 * the open collator.
912 *
913 * First, the locale string is canonicalized to an ICU format locale ID such
914 * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
915 * the key-value arguments.
916 *
917 * Starting with ICU version 54, the attributes are processed automatically by
918 * ucol_open(), so this is only necessary for emulating this behavior on older
919 * versions.
920 */
922static void
923icu_set_collation_attributes(UCollator *collator, const char *loc,
924 UErrorCode *status)
925{
926 int32_t len;
927 char *icu_locale_id;
928 char *lower_str;
929 char *str;
930 char *token;
931
932 /*
933 * The input locale may be a BCP 47 language tag, e.g.
934 * "und-u-kc-ks-level1", which expresses the same attributes in a
935 * different form. It will be converted to the equivalent ICU format
936 * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
937 * uloc_canonicalize().
938 */
939 *status = U_ZERO_ERROR;
940 len = uloc_canonicalize(loc, NULL, 0, status);
941 icu_locale_id = palloc(len + 1);
942 *status = U_ZERO_ERROR;
943 len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
944 if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
945 return;
946
947 lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
948
949 pfree(icu_locale_id);
950
951 str = strchr(lower_str, '@');
952 if (!str)
953 return;
954 str++;
955
956 while ((token = strsep(&str, ";")))
957 {
958 char *e = strchr(token, '=');
959
960 if (e)
961 {
962 char *name;
963 char *value;
964 UColAttribute uattr;
965 UColAttributeValue uvalue;
966
967 *status = U_ZERO_ERROR;
968
969 *e = '\0';
970 name = token;
971 value = e + 1;
972
973 /*
974 * See attribute name and value lists in ICU i18n/coll.cpp
975 */
976 if (strcmp(name, "colstrength") == 0)
977 uattr = UCOL_STRENGTH;
978 else if (strcmp(name, "colbackwards") == 0)
979 uattr = UCOL_FRENCH_COLLATION;
980 else if (strcmp(name, "colcaselevel") == 0)
981 uattr = UCOL_CASE_LEVEL;
982 else if (strcmp(name, "colcasefirst") == 0)
983 uattr = UCOL_CASE_FIRST;
984 else if (strcmp(name, "colalternate") == 0)
985 uattr = UCOL_ALTERNATE_HANDLING;
986 else if (strcmp(name, "colnormalization") == 0)
987 uattr = UCOL_NORMALIZATION_MODE;
988 else if (strcmp(name, "colnumeric") == 0)
989 uattr = UCOL_NUMERIC_COLLATION;
990 else
991 /* ignore if unknown */
992 continue;
993
994 if (strcmp(value, "primary") == 0)
995 uvalue = UCOL_PRIMARY;
996 else if (strcmp(value, "secondary") == 0)
997 uvalue = UCOL_SECONDARY;
998 else if (strcmp(value, "tertiary") == 0)
999 uvalue = UCOL_TERTIARY;
1000 else if (strcmp(value, "quaternary") == 0)
1001 uvalue = UCOL_QUATERNARY;
1002 else if (strcmp(value, "identical") == 0)
1003 uvalue = UCOL_IDENTICAL;
1004 else if (strcmp(value, "no") == 0)
1005 uvalue = UCOL_OFF;
1006 else if (strcmp(value, "yes") == 0)
1007 uvalue = UCOL_ON;
1008 else if (strcmp(value, "shifted") == 0)
1009 uvalue = UCOL_SHIFTED;
1010 else if (strcmp(value, "non-ignorable") == 0)
1011 uvalue = UCOL_NON_IGNORABLE;
1012 else if (strcmp(value, "lower") == 0)
1013 uvalue = UCOL_LOWER_FIRST;
1014 else if (strcmp(value, "upper") == 0)
1015 uvalue = UCOL_UPPER_FIRST;
1016 else
1017 {
1018 *status = U_ILLEGAL_ARGUMENT_ERROR;
1019 break;
1020 }
1021
1022 ucol_setAttribute(collator, uattr, uvalue, status);
1023 }
1024 }
1025
1026 pfree(lower_str);
1027}
1028
1029#endif /* USE_ICU */
#define TextDatumGetCString(d)
Definition: builtins.h:98
#define pg_attribute_unused()
Definition: c.h:133
#define Assert(condition)
Definition: c.h:815
uint32_t uint32
Definition: c.h:488
size_t Size
Definition: c.h:562
Oid collid
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
const char * get_encoding_name_for_icu(int encoding)
Definition: encnames.c:472
#define MCXT_ALLOC_NO_OOM
Definition: fe_memutils.h:29
char * asc_tolower(const char *buff, size_t nbytes)
Definition: formatting.c:1898
Oid MyDatabaseId
Definition: globals.c:93
const char * str
size_t remainder
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
static void * GETSTRUCT(const HeapTupleData *tuple)
Definition: htup_details.h:728
#define token
Definition: indent_globs.h:126
static struct @162 value
static char * locale
Definition: initdb.c:140
int GetDatabaseEncoding(void)
Definition: mbutils.c:1261
char * MemoryContextStrdup(MemoryContext context, const char *string)
Definition: mcxt.c:1683
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1215
char * pstrdup(const char *in)
Definition: mcxt.c:1696
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc(Size size)
Definition: mcxt.c:1317
void * palloc_extended(Size size, int flags)
Definition: mcxt.c:1368
FormData_pg_collation * Form_pg_collation
Definition: pg_collation.h:58
const void size_t len
size_t strfold_icu(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale)
size_t strupper_icu(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale)
size_t strlower_icu(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale)
size_t strtitle_icu(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale)
pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context)
#define TEXTBUFLEN
Definition: pg_locale_icu.c:48
static char * buf
Definition: pg_test_fsync.c:72
@ PG_UTF8
Definition: pg_wchar.h:232
#define pg_encoding_to_char
Definition: pg_wchar.h:630
char * strsep(char **stringp, const char *delim)
Definition: strsep.c:49
uintptr_t Datum
Definition: postgres.h:69
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:257
unsigned int Oid
Definition: postgres_ext.h:32
e
Definition: preproc-init.c:82
int(* strncoll)(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
Definition: pg_locale.h:57
const struct collate_methods * collate
Definition: pg_locale.h:104
union pg_locale_struct::@158 info
const char * locale
Definition: pg_locale.h:110
bool deterministic
Definition: pg_locale.h:99
Definition: regguts.h:323
void ReleaseSysCache(HeapTuple tuple)
Definition: syscache.c:269
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Definition: syscache.c:221
Datum SysCacheGetAttr(int cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull)
Definition: syscache.c:600
Datum SysCacheGetAttrNotNull(int cacheId, HeapTuple tup, AttrNumber attributeNumber)
Definition: syscache.c:631
const char * name