PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
pg_locale_icu.c
Go to the documentation of this file.
1/*-----------------------------------------------------------------------
2 *
3 * PostgreSQL locale utilities for ICU
4 *
5 * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
6 *
7 * src/backend/utils/adt/pg_locale_icu.c
8 *
9 *-----------------------------------------------------------------------
10 */
11
12#include "postgres.h"
13
14#ifdef USE_ICU
15#include <unicode/ucnv.h>
16#include <unicode/ustring.h>
17
18/*
19 * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
20 * (see
21 * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>)
22 */
23#if U_ICU_VERSION_MAJOR_NUM >= 53
24#define HAVE_UCOL_STRCOLLUTF8 1
25#else
26#undef HAVE_UCOL_STRCOLLUTF8
27#endif
28
29#endif
30
31#include "access/htup_details.h"
32#include "catalog/pg_database.h"
34#include "mb/pg_wchar.h"
35#include "miscadmin.h"
36#include "utils/builtins.h"
37#include "utils/formatting.h"
38#include "utils/memutils.h"
39#include "utils/pg_locale.h"
40#include "utils/syscache.h"
41
42/*
43 * Size of stack buffer to use for string transformations, used to avoid heap
44 * allocations in typical cases. This should be large enough that most strings
45 * will fit, but small enough that we feel comfortable putting it on the
46 * stack.
47 */
48#define TEXTBUFLEN 1024
49
51extern size_t strlower_icu(char *dest, size_t destsize, const char *src,
52 ssize_t srclen, pg_locale_t locale);
53extern size_t strtitle_icu(char *dest, size_t destsize, const char *src,
54 ssize_t srclen, pg_locale_t locale);
55extern size_t strupper_icu(char *dest, size_t destsize, const char *src,
56 ssize_t srclen, pg_locale_t locale);
57extern size_t strfold_icu(char *dest, size_t destsize, const char *src,
58 ssize_t srclen, pg_locale_t locale);
59
60#ifdef USE_ICU
61
62extern UCollator *pg_ucol_open(const char *loc_str);
63
64static size_t strnxfrm_icu(char *dest, size_t destsize,
65 const char *src, ssize_t srclen,
67extern char *get_collation_actual_version_icu(const char *collcollate);
68
69typedef int32_t (*ICU_Convert_Func) (UChar *dest, int32_t destCapacity,
70 const UChar *src, int32_t srcLength,
71 const char *locale,
72 UErrorCode *pErrorCode);
73
74/*
75 * Converter object for converting between ICU's UChar strings and C strings
76 * in database encoding. Since the database encoding doesn't change, we only
77 * need one of these per session.
78 */
79static UConverter *icu_converter = NULL;
80
81static UCollator *make_icu_collator(const char *iculocstr,
82 const char *icurules);
83static int strncoll_icu(const char *arg1, ssize_t len1,
84 const char *arg2, ssize_t len2,
86static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
87 const char *src, ssize_t srclen,
89#ifdef HAVE_UCOL_STRCOLLUTF8
90static int strncoll_icu_utf8(const char *arg1, ssize_t len1,
91 const char *arg2, ssize_t len2,
93#endif
94static size_t strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
95 const char *src, ssize_t srclen,
97static void init_icu_converter(void);
98static size_t uchar_length(UConverter *converter,
99 const char *str, int32_t len);
100static int32_t uchar_convert(UConverter *converter,
101 UChar *dest, int32_t destlen,
102 const char *src, int32_t srclen);
103static int32_t icu_to_uchar(UChar **buff_uchar, const char *buff,
104 size_t nbytes);
105static size_t icu_from_uchar(char *dest, size_t destsize,
106 const UChar *buff_uchar, int32_t len_uchar);
107static void icu_set_collation_attributes(UCollator *collator, const char *loc,
108 UErrorCode *status);
109static int32_t icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
110 UChar **buff_dest, UChar *buff_source,
111 int32_t len_source);
112static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
113 const UChar *src, int32_t srcLength,
114 const char *locale,
115 UErrorCode *pErrorCode);
116static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
117 const UChar *src, int32_t srcLength,
118 const char *locale,
119 UErrorCode *pErrorCode);
120
121static const struct collate_methods collate_methods_icu = {
122 .strncoll = strncoll_icu,
123 .strnxfrm = strnxfrm_icu,
124 .strnxfrm_prefix = strnxfrm_prefix_icu,
125 .strxfrm_is_safe = true,
126};
127
128static const struct collate_methods collate_methods_icu_utf8 = {
129#ifdef HAVE_UCOL_STRCOLLUTF8
130 .strncoll = strncoll_icu_utf8,
131#else
132 .strncoll = strncoll_icu,
133#endif
134 .strnxfrm = strnxfrm_icu,
135 .strnxfrm_prefix = strnxfrm_prefix_icu_utf8,
136 .strxfrm_is_safe = true,
137};
138
139#endif
140
143{
144#ifdef USE_ICU
145 bool deterministic;
146 const char *iculocstr;
147 const char *icurules = NULL;
148 UCollator *collator;
149 pg_locale_t result;
150
151 if (collid == DEFAULT_COLLATION_OID)
152 {
153 HeapTuple tp;
154 Datum datum;
155 bool isnull;
156
158 if (!HeapTupleIsValid(tp))
159 elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
160
161 /* default database collation is always deterministic */
162 deterministic = true;
163 datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
164 Anum_pg_database_datlocale);
165 iculocstr = TextDatumGetCString(datum);
166 datum = SysCacheGetAttr(DATABASEOID, tp,
167 Anum_pg_database_daticurules, &isnull);
168 if (!isnull)
169 icurules = TextDatumGetCString(datum);
170
171 ReleaseSysCache(tp);
172 }
173 else
174 {
175 Form_pg_collation collform;
176 HeapTuple tp;
177 Datum datum;
178 bool isnull;
179
181 if (!HeapTupleIsValid(tp))
182 elog(ERROR, "cache lookup failed for collation %u", collid);
183 collform = (Form_pg_collation) GETSTRUCT(tp);
184 deterministic = collform->collisdeterministic;
185 datum = SysCacheGetAttrNotNull(COLLOID, tp,
186 Anum_pg_collation_colllocale);
187 iculocstr = TextDatumGetCString(datum);
188 datum = SysCacheGetAttr(COLLOID, tp,
189 Anum_pg_collation_collicurules, &isnull);
190 if (!isnull)
191 icurules = TextDatumGetCString(datum);
192
193 ReleaseSysCache(tp);
194 }
195
196 collator = make_icu_collator(iculocstr, icurules);
197
198 result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
199 result->info.icu.locale = MemoryContextStrdup(context, iculocstr);
200 result->info.icu.ucol = collator;
201 result->provider = COLLPROVIDER_ICU;
202 result->deterministic = deterministic;
203 result->collate_is_c = false;
204 result->ctype_is_c = false;
206 result->collate = &collate_methods_icu_utf8;
207 else
208 result->collate = &collate_methods_icu;
209
210 return result;
211#else
212 /* could get here if a collation was created by a build with ICU */
214 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
215 errmsg("ICU is not supported in this build")));
216
217 return NULL;
218#endif
219}
220
221#ifdef USE_ICU
222
223/*
224 * Wrapper around ucol_open() to handle API differences for older ICU
225 * versions.
226 *
227 * Ensure that no path leaks a UCollator.
228 */
229UCollator *
230pg_ucol_open(const char *loc_str)
231{
232 UCollator *collator;
233 UErrorCode status;
234 const char *orig_str = loc_str;
235 char *fixed_str = NULL;
236
237 /*
238 * Must never open default collator, because it depends on the environment
239 * and may change at any time. Should not happen, but check here to catch
240 * bugs that might be hard to catch otherwise.
241 *
242 * NB: the default collator is not the same as the collator for the root
243 * locale. The root locale may be specified as the empty string, "und", or
244 * "root". The default collator is opened by passing NULL to ucol_open().
245 */
246 if (loc_str == NULL)
247 elog(ERROR, "opening default collator is not supported");
248
249 /*
250 * In ICU versions 54 and earlier, "und" is not a recognized spelling of
251 * the root locale. If the first component of the locale is "und", replace
252 * with "root" before opening.
253 */
254 if (U_ICU_VERSION_MAJOR_NUM < 55)
255 {
256 char lang[ULOC_LANG_CAPACITY];
257
258 status = U_ZERO_ERROR;
259 uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
260 if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
261 {
263 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
264 errmsg("could not get language from locale \"%s\": %s",
265 loc_str, u_errorName(status))));
266 }
267
268 if (strcmp(lang, "und") == 0)
269 {
270 const char *remainder = loc_str + strlen("und");
271
272 fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
273 strcpy(fixed_str, "root");
274 strcat(fixed_str, remainder);
275
276 loc_str = fixed_str;
277 }
278 }
279
280 status = U_ZERO_ERROR;
281 collator = ucol_open(loc_str, &status);
282 if (U_FAILURE(status))
284 /* use original string for error report */
285 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
286 errmsg("could not open collator for locale \"%s\": %s",
287 orig_str, u_errorName(status))));
288
289 if (U_ICU_VERSION_MAJOR_NUM < 54)
290 {
291 status = U_ZERO_ERROR;
292 icu_set_collation_attributes(collator, loc_str, &status);
293
294 /*
295 * Pretend the error came from ucol_open(), for consistent error
296 * message across ICU versions.
297 */
298 if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
299 {
300 ucol_close(collator);
302 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
303 errmsg("could not open collator for locale \"%s\": %s",
304 orig_str, u_errorName(status))));
305 }
306 }
307
308 if (fixed_str != NULL)
309 pfree(fixed_str);
310
311 return collator;
312}
313
314/*
315 * Create a UCollator with the given locale string and rules.
316 *
317 * Ensure that no path leaks a UCollator.
318 */
319static UCollator *
320make_icu_collator(const char *iculocstr, const char *icurules)
321{
322 if (!icurules)
323 {
324 /* simple case without rules */
325 return pg_ucol_open(iculocstr);
326 }
327 else
328 {
329 UCollator *collator_std_rules;
330 UCollator *collator_all_rules;
331 const UChar *std_rules;
332 UChar *my_rules;
333 UChar *all_rules;
334 int32_t length;
335 int32_t total;
336 UErrorCode status;
337
338 /*
339 * If rules are specified, we extract the rules of the standard
340 * collation, add our own rules, and make a new collator with the
341 * combined rules.
342 */
343 icu_to_uchar(&my_rules, icurules, strlen(icurules));
344
345 collator_std_rules = pg_ucol_open(iculocstr);
346
347 std_rules = ucol_getRules(collator_std_rules, &length);
348
349 total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
350
351 /* avoid leaking collator on OOM */
352 all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM);
353 if (!all_rules)
354 {
355 ucol_close(collator_std_rules);
357 (errcode(ERRCODE_OUT_OF_MEMORY),
358 errmsg("out of memory")));
359 }
360
361 u_strcpy(all_rules, std_rules);
362 u_strcat(all_rules, my_rules);
363
364 ucol_close(collator_std_rules);
365
366 status = U_ZERO_ERROR;
367 collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules),
368 UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,
369 NULL, &status);
370 if (U_FAILURE(status))
371 {
373 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
374 errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
375 iculocstr, icurules, u_errorName(status))));
376 }
377
378 return collator_all_rules;
379 }
380}
381
382size_t
383strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
385{
386 int32_t len_uchar;
387 int32_t len_conv;
388 UChar *buff_uchar;
389 UChar *buff_conv;
390 size_t result_len;
391
392 len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
393 len_conv = icu_convert_case(u_strToLower, locale,
394 &buff_conv, buff_uchar, len_uchar);
395 result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
396 pfree(buff_uchar);
397 pfree(buff_conv);
398
399 return result_len;
400}
401
402size_t
403strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
405{
406 int32_t len_uchar;
407 int32_t len_conv;
408 UChar *buff_uchar;
409 UChar *buff_conv;
410 size_t result_len;
411
412 len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
413 len_conv = icu_convert_case(u_strToTitle_default_BI, locale,
414 &buff_conv, buff_uchar, len_uchar);
415 result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
416 pfree(buff_uchar);
417 pfree(buff_conv);
418
419 return result_len;
420}
421
422size_t
423strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
425{
426 int32_t len_uchar;
427 int32_t len_conv;
428 UChar *buff_uchar;
429 UChar *buff_conv;
430 size_t result_len;
431
432 len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
433 len_conv = icu_convert_case(u_strToUpper, locale,
434 &buff_conv, buff_uchar, len_uchar);
435 result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
436 pfree(buff_uchar);
437 pfree(buff_conv);
438
439 return result_len;
440}
441
442size_t
443strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
445{
446 int32_t len_uchar;
447 int32_t len_conv;
448 UChar *buff_uchar;
449 UChar *buff_conv;
450 size_t result_len;
451
452 len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
453 len_conv = icu_convert_case(u_strFoldCase_default, locale,
454 &buff_conv, buff_uchar, len_uchar);
455 result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
456 pfree(buff_uchar);
457 pfree(buff_conv);
458
459 return result_len;
460}
461
/*
 * strncoll_icu_utf8
 *
 * Compare two strings with ucol_strcollUTF8(), passing the bytes to ICU
 * without conversion.  Only valid when the database encoding is UTF-8.  An
 * argument length of -1 means the string is NUL-terminated.
 *
 * Raises an error if ICU reports a failure.
 */
#ifdef HAVE_UCOL_STRCOLLUTF8
int
strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
				  pg_locale_t locale)
{
	int			result;
	UErrorCode	status;

	Assert(locale->provider == COLLPROVIDER_ICU);

	Assert(GetDatabaseEncoding() == PG_UTF8);

	status = U_ZERO_ERROR;
	result = ucol_strcollUTF8(locale->info.icu.ucol,
							  arg1, len1,
							  arg2, len2,
							  &status);
	if (U_FAILURE(status))
		ereport(ERROR,
				(errmsg("collation failed: %s", u_errorName(status))));

	return result;
}
#endif
493
494/* 'srclen' of -1 means the strings are NUL-terminated */
495size_t
496strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
498{
499 char sbuf[TEXTBUFLEN];
500 char *buf = sbuf;
501 UChar *uchar;
502 int32_t ulen;
503 size_t uchar_bsize;
504 Size result_bsize;
505
506 Assert(locale->provider == COLLPROVIDER_ICU);
507
508 init_icu_converter();
509
510 ulen = uchar_length(icu_converter, src, srclen);
511
512 uchar_bsize = (ulen + 1) * sizeof(UChar);
513
514 if (uchar_bsize > TEXTBUFLEN)
515 buf = palloc(uchar_bsize);
516
517 uchar = (UChar *) buf;
518
519 ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
520
521 result_bsize = ucol_getSortKey(locale->info.icu.ucol,
522 uchar, ulen,
523 (uint8_t *) dest, destsize);
524
525 /*
526 * ucol_getSortKey() counts the nul-terminator in the result length, but
527 * this function should not.
528 */
529 Assert(result_bsize > 0);
530 result_bsize--;
531
532 if (buf != sbuf)
533 pfree(buf);
534
535 /* if dest is defined, it should be nul-terminated */
536 Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
537
538 return result_bsize;
539}
540
541/* 'srclen' of -1 means the strings are NUL-terminated */
542size_t
543strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
544 const char *src, ssize_t srclen,
546{
547 size_t result;
548 UCharIterator iter;
549 uint32_t state[2];
550 UErrorCode status;
551
552 Assert(locale->provider == COLLPROVIDER_ICU);
553
555
556 uiter_setUTF8(&iter, src, srclen);
557 state[0] = state[1] = 0; /* won't need that again */
558 status = U_ZERO_ERROR;
559 result = ucol_nextSortKeyPart(locale->info.icu.ucol,
560 &iter,
561 state,
562 (uint8_t *) dest,
563 destsize,
564 &status);
565 if (U_FAILURE(status))
567 (errmsg("sort key generation failed: %s",
568 u_errorName(status))));
569
570 return result;
571}
572
573char *
574get_collation_actual_version_icu(const char *collcollate)
575{
576 UCollator *collator;
577 UVersionInfo versioninfo;
578 char buf[U_MAX_VERSION_STRING_LENGTH];
579
580 collator = pg_ucol_open(collcollate);
581
582 ucol_getVersion(collator, versioninfo);
583 ucol_close(collator);
584
585 u_versionToString(versioninfo, buf);
586 return pstrdup(buf);
587}
588
589/*
590 * Convert a string in the database encoding into a string of UChars.
591 *
592 * The source string at buff is of length nbytes
593 * (it needn't be nul-terminated)
594 *
595 * *buff_uchar receives a pointer to the palloc'd result string, and
596 * the function's result is the number of UChars generated.
597 *
598 * The result string is nul-terminated, though most callers rely on the
599 * result length instead.
600 */
601static int32_t
602icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
603{
604 int32_t len_uchar;
605
606 init_icu_converter();
607
608 len_uchar = uchar_length(icu_converter, buff, nbytes);
609
610 *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
611 len_uchar = uchar_convert(icu_converter,
612 *buff_uchar, len_uchar + 1, buff, nbytes);
613
614 return len_uchar;
615}
616
617/*
618 * Convert a string of UChars into the database encoding.
619 *
620 * The source string at buff_uchar is of length len_uchar
621 * (it needn't be nul-terminated)
622 *
623 * *result receives a pointer to the palloc'd result string, and the
624 * function's result is the number of bytes generated (not counting nul).
625 *
626 * The result string is nul-terminated.
627 */
628static size_t
629icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar)
630{
631 UErrorCode status;
632 int32_t len_result;
633
634 init_icu_converter();
635
636 status = U_ZERO_ERROR;
637 len_result = ucnv_fromUChars(icu_converter, NULL, 0,
638 buff_uchar, len_uchar, &status);
639 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
641 (errmsg("%s failed: %s", "ucnv_fromUChars",
642 u_errorName(status))));
643
644 if (len_result + 1 > destsize)
645 return len_result;
646
647 status = U_ZERO_ERROR;
648 len_result = ucnv_fromUChars(icu_converter, dest, len_result + 1,
649 buff_uchar, len_uchar, &status);
650 if (U_FAILURE(status) ||
651 status == U_STRING_NOT_TERMINATED_WARNING)
653 (errmsg("%s failed: %s", "ucnv_fromUChars",
654 u_errorName(status))));
655
656 return len_result;
657}
658
659static int32_t
660icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
661 UChar **buff_dest, UChar *buff_source, int32_t len_source)
662{
663 UErrorCode status;
664 int32_t len_dest;
665
666 len_dest = len_source; /* try first with same length */
667 *buff_dest = palloc(len_dest * sizeof(**buff_dest));
668 status = U_ZERO_ERROR;
669 len_dest = func(*buff_dest, len_dest, buff_source, len_source,
670 mylocale->info.icu.locale, &status);
671 if (status == U_BUFFER_OVERFLOW_ERROR)
672 {
673 /* try again with adjusted length */
674 pfree(*buff_dest);
675 *buff_dest = palloc(len_dest * sizeof(**buff_dest));
676 status = U_ZERO_ERROR;
677 len_dest = func(*buff_dest, len_dest, buff_source, len_source,
678 mylocale->info.icu.locale, &status);
679 }
680 if (U_FAILURE(status))
682 (errmsg("case conversion failed: %s", u_errorName(status))));
683 return len_dest;
684}
685
686static int32_t
687u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
688 const UChar *src, int32_t srcLength,
689 const char *locale,
690 UErrorCode *pErrorCode)
691{
692 return u_strToTitle(dest, destCapacity, src, srcLength,
693 NULL, locale, pErrorCode);
694}
695
696static int32_t
697u_strFoldCase_default(UChar *dest, int32_t destCapacity,
698 const UChar *src, int32_t srcLength,
699 const char *locale,
700 UErrorCode *pErrorCode)
701{
702 uint32 options = U_FOLD_CASE_DEFAULT;
703 char lang[3];
704 UErrorCode status;
705
706 /*
707 * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
708 * folding does not accept a locale. Instead it just supports a single
709 * option relevant to Turkic languages 'az' and 'tr'; check for those
710 * languages to enable the option.
711 */
712 status = U_ZERO_ERROR;
713 uloc_getLanguage(locale, lang, 3, &status);
714 if (U_SUCCESS(status))
715 {
716 /*
717 * The option name is confusing, but it causes u_strFoldCase to use
718 * the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
719 */
720 if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0)
721 options = U_FOLD_CASE_EXCLUDE_SPECIAL_I;
722 }
723
724 return u_strFoldCase(dest, destCapacity, src, srcLength,
725 options, pErrorCode);
726}
727
728/*
729 * strncoll_icu
730 *
731 * Convert the arguments from the database encoding to UChar strings, then
732 * call ucol_strcoll(). An argument length of -1 means that the string is
733 * NUL-terminated.
734 *
735 * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
736 * caller should call that instead.
737 */
738static int
739strncoll_icu(const char *arg1, ssize_t len1,
740 const char *arg2, ssize_t len2, pg_locale_t locale)
741{
742 char sbuf[TEXTBUFLEN];
743 char *buf = sbuf;
744 int32_t ulen1;
745 int32_t ulen2;
746 size_t bufsize1;
747 size_t bufsize2;
748 UChar *uchar1,
749 *uchar2;
750 int result;
751
752 Assert(locale->provider == COLLPROVIDER_ICU);
753
754 /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */
755#ifdef HAVE_UCOL_STRCOLLUTF8
757#endif
758
759 init_icu_converter();
760
761 ulen1 = uchar_length(icu_converter, arg1, len1);
762 ulen2 = uchar_length(icu_converter, arg2, len2);
763
764 bufsize1 = (ulen1 + 1) * sizeof(UChar);
765 bufsize2 = (ulen2 + 1) * sizeof(UChar);
766
767 if (bufsize1 + bufsize2 > TEXTBUFLEN)
768 buf = palloc(bufsize1 + bufsize2);
769
770 uchar1 = (UChar *) buf;
771 uchar2 = (UChar *) (buf + bufsize1);
772
773 ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
774 ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
775
776 result = ucol_strcoll(locale->info.icu.ucol,
777 uchar1, ulen1,
778 uchar2, ulen2);
779
780 if (buf != sbuf)
781 pfree(buf);
782
783 return result;
784}
785
786/* 'srclen' of -1 means the strings are NUL-terminated */
787static size_t
788strnxfrm_prefix_icu(char *dest, size_t destsize,
789 const char *src, ssize_t srclen,
791{
792 char sbuf[TEXTBUFLEN];
793 char *buf = sbuf;
794 UCharIterator iter;
795 uint32_t state[2];
796 UErrorCode status;
797 int32_t ulen = -1;
798 UChar *uchar = NULL;
799 size_t uchar_bsize;
800 Size result_bsize;
801
802 Assert(locale->provider == COLLPROVIDER_ICU);
803
804 /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */
806
807 init_icu_converter();
808
809 ulen = uchar_length(icu_converter, src, srclen);
810
811 uchar_bsize = (ulen + 1) * sizeof(UChar);
812
813 if (uchar_bsize > TEXTBUFLEN)
814 buf = palloc(uchar_bsize);
815
816 uchar = (UChar *) buf;
817
818 ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
819
820 uiter_setString(&iter, uchar, ulen);
821 state[0] = state[1] = 0; /* won't need that again */
822 status = U_ZERO_ERROR;
823 result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol,
824 &iter,
825 state,
826 (uint8_t *) dest,
827 destsize,
828 &status);
829 if (U_FAILURE(status))
831 (errmsg("sort key generation failed: %s",
832 u_errorName(status))));
833
834 return result_bsize;
835}
836
837static void
838init_icu_converter(void)
839{
840 const char *icu_encoding_name;
841 UErrorCode status;
842 UConverter *conv;
843
844 if (icu_converter)
845 return; /* already done */
846
847 icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
848 if (!icu_encoding_name)
850 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
851 errmsg("encoding \"%s\" not supported by ICU",
853
854 status = U_ZERO_ERROR;
855 conv = ucnv_open(icu_encoding_name, &status);
856 if (U_FAILURE(status))
858 (errmsg("could not open ICU converter for encoding \"%s\": %s",
859 icu_encoding_name, u_errorName(status))));
860
861 icu_converter = conv;
862}
863
864/*
865 * Find length, in UChars, of given string if converted to UChar string.
866 *
867 * A length of -1 indicates that the input string is NUL-terminated.
868 */
869static size_t
870uchar_length(UConverter *converter, const char *str, int32_t len)
871{
872 UErrorCode status = U_ZERO_ERROR;
873 int32_t ulen;
874
875 ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
876 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
878 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
879 return ulen;
880}
881
882/*
883 * Convert the given source string into a UChar string, stored in dest, and
884 * return the length (in UChars).
885 *
886 * A srclen of -1 indicates that the input string is NUL-terminated.
887 */
888static int32_t
889uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
890 const char *src, int32_t srclen)
891{
892 UErrorCode status = U_ZERO_ERROR;
893 int32_t ulen;
894
895 status = U_ZERO_ERROR;
896 ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
897 if (U_FAILURE(status))
899 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
900 return ulen;
901}
902
903/*
904 * Parse collation attributes from the given locale string and apply them to
905 * the open collator.
906 *
907 * First, the locale string is canonicalized to an ICU format locale ID such
908 * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
909 * the key-value arguments.
910 *
911 * Starting with ICU version 54, the attributes are processed automatically by
912 * ucol_open(), so this is only necessary for emulating this behavior on older
913 * versions.
914 */
916static void
917icu_set_collation_attributes(UCollator *collator, const char *loc,
918 UErrorCode *status)
919{
920 int32_t len;
921 char *icu_locale_id;
922 char *lower_str;
923 char *str;
924 char *token;
925
926 /*
927 * The input locale may be a BCP 47 language tag, e.g.
928 * "und-u-kc-ks-level1", which expresses the same attributes in a
929 * different form. It will be converted to the equivalent ICU format
930 * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
931 * uloc_canonicalize().
932 */
933 *status = U_ZERO_ERROR;
934 len = uloc_canonicalize(loc, NULL, 0, status);
935 icu_locale_id = palloc(len + 1);
936 *status = U_ZERO_ERROR;
937 len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
938 if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
939 return;
940
941 lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
942
943 pfree(icu_locale_id);
944
945 str = strchr(lower_str, '@');
946 if (!str)
947 return;
948 str++;
949
950 while ((token = strsep(&str, ";")))
951 {
952 char *e = strchr(token, '=');
953
954 if (e)
955 {
956 char *name;
957 char *value;
958 UColAttribute uattr;
959 UColAttributeValue uvalue;
960
961 *status = U_ZERO_ERROR;
962
963 *e = '\0';
964 name = token;
965 value = e + 1;
966
967 /*
968 * See attribute name and value lists in ICU i18n/coll.cpp
969 */
970 if (strcmp(name, "colstrength") == 0)
971 uattr = UCOL_STRENGTH;
972 else if (strcmp(name, "colbackwards") == 0)
973 uattr = UCOL_FRENCH_COLLATION;
974 else if (strcmp(name, "colcaselevel") == 0)
975 uattr = UCOL_CASE_LEVEL;
976 else if (strcmp(name, "colcasefirst") == 0)
977 uattr = UCOL_CASE_FIRST;
978 else if (strcmp(name, "colalternate") == 0)
979 uattr = UCOL_ALTERNATE_HANDLING;
980 else if (strcmp(name, "colnormalization") == 0)
981 uattr = UCOL_NORMALIZATION_MODE;
982 else if (strcmp(name, "colnumeric") == 0)
983 uattr = UCOL_NUMERIC_COLLATION;
984 else
985 /* ignore if unknown */
986 continue;
987
988 if (strcmp(value, "primary") == 0)
989 uvalue = UCOL_PRIMARY;
990 else if (strcmp(value, "secondary") == 0)
991 uvalue = UCOL_SECONDARY;
992 else if (strcmp(value, "tertiary") == 0)
993 uvalue = UCOL_TERTIARY;
994 else if (strcmp(value, "quaternary") == 0)
995 uvalue = UCOL_QUATERNARY;
996 else if (strcmp(value, "identical") == 0)
997 uvalue = UCOL_IDENTICAL;
998 else if (strcmp(value, "no") == 0)
999 uvalue = UCOL_OFF;
1000 else if (strcmp(value, "yes") == 0)
1001 uvalue = UCOL_ON;
1002 else if (strcmp(value, "shifted") == 0)
1003 uvalue = UCOL_SHIFTED;
1004 else if (strcmp(value, "non-ignorable") == 0)
1005 uvalue = UCOL_NON_IGNORABLE;
1006 else if (strcmp(value, "lower") == 0)
1007 uvalue = UCOL_LOWER_FIRST;
1008 else if (strcmp(value, "upper") == 0)
1009 uvalue = UCOL_UPPER_FIRST;
1010 else
1011 {
1012 *status = U_ILLEGAL_ARGUMENT_ERROR;
1013 break;
1014 }
1015
1016 ucol_setAttribute(collator, uattr, uvalue, status);
1017 }
1018 }
1019
1020 pfree(lower_str);
1021}
1022
1023#endif /* USE_ICU */
#define TextDatumGetCString(d)
Definition: builtins.h:98
#define pg_attribute_unused()
Definition: c.h:133
uint32_t uint32
Definition: c.h:502
size_t Size
Definition: c.h:576
Oid collid
int errcode(int sqlerrcode)
Definition: elog.c:854
int errmsg(const char *fmt,...)
Definition: elog.c:1071
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
const char * get_encoding_name_for_icu(int encoding)
Definition: encnames.c:472
#define MCXT_ALLOC_NO_OOM
Definition: fe_memutils.h:29
char * asc_tolower(const char *buff, size_t nbytes)
Definition: formatting.c:1898
Oid MyDatabaseId
Definition: globals.c:95
Assert(PointerIsAligned(start, uint64))
const char * str
size_t remainder
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
static void * GETSTRUCT(const HeapTupleData *tuple)
Definition: htup_details.h:728
#define token
Definition: indent_globs.h:126
static struct @165 value
static char * locale
Definition: initdb.c:140
int GetDatabaseEncoding(void)
Definition: mbutils.c:1261
char * MemoryContextStrdup(MemoryContext context, const char *string)
Definition: mcxt.c:2312
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1294
char * pstrdup(const char *in)
Definition: mcxt.c:2325
void pfree(void *pointer)
Definition: mcxt.c:2150
void * palloc(Size size)
Definition: mcxt.c:1943
void * palloc_extended(Size size, int flags)
Definition: mcxt.c:1995
FormData_pg_collation * Form_pg_collation
Definition: pg_collation.h:58
const void size_t len
size_t strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context)
size_t strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
size_t strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
size_t strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
#define TEXTBUFLEN
Definition: pg_locale_icu.c:48
static char * buf
Definition: pg_test_fsync.c:72
@ PG_UTF8
Definition: pg_wchar.h:232
#define pg_encoding_to_char
Definition: pg_wchar.h:630
char * strsep(char **stringp, const char *delim)
Definition: strsep.c:49
uintptr_t Datum
Definition: postgres.h:69
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:257
unsigned int Oid
Definition: postgres_ext.h:30
e
Definition: preproc-init.c:82
int(* strncoll)(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
Definition: pg_locale.h:57
const struct collate_methods * collate
Definition: pg_locale.h:104
const char * locale
Definition: pg_locale.h:110
bool deterministic
Definition: pg_locale.h:99
union pg_locale_struct::@161 info
Definition: regguts.h:323
void ReleaseSysCache(HeapTuple tuple)
Definition: syscache.c:269
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Definition: syscache.c:221
Datum SysCacheGetAttr(int cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull)
Definition: syscache.c:600
Datum SysCacheGetAttrNotNull(int cacheId, HeapTuple tup, AttrNumber attributeNumber)
Definition: syscache.c:631
const char * name