PostgreSQL Source Code git master
pg_locale_icu.c
Go to the documentation of this file.
1/*-----------------------------------------------------------------------
2 *
3 * PostgreSQL locale utilities for ICU
4 *
5 * Portions Copyright (c) 2002-2026, PostgreSQL Global Development Group
6 *
7 * src/backend/utils/adt/pg_locale_icu.c
8 *
9 *-----------------------------------------------------------------------
10 */
11
12#include "postgres.h"
13
14#ifdef USE_ICU
15#include <unicode/ucnv.h>
16#include <unicode/ustring.h>
17
18/*
19 * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
20 * (see
21 * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>)
22 */
23#if U_ICU_VERSION_MAJOR_NUM >= 53
24#define HAVE_UCOL_STRCOLLUTF8 1
25#else
26#undef HAVE_UCOL_STRCOLLUTF8
27#endif
28
29#endif
30
31#include "access/htup_details.h"
32#include "catalog/pg_database.h"
34#include "mb/pg_wchar.h"
35#include "miscadmin.h"
36#include "utils/builtins.h"
37#include "utils/formatting.h"
38#include "utils/memutils.h"
39#include "utils/pg_locale.h"
40#include "utils/syscache.h"
41
42/*
43 * Size of stack buffer to use for string transformations, used to avoid heap
44 * allocations in typical cases. This should be large enough that most strings
45 * will fit, but small enough that we feel comfortable putting it on the
46 * stack.
47 */
48#define TEXTBUFLEN 1024
49
51
52#ifdef USE_ICU
53
54extern UCollator *pg_ucol_open(const char *loc_str);
55static UCaseMap *pg_ucasemap_open(const char *loc_str);
56
57static size_t strlower_icu(char *dest, size_t destsize, const char *src,
58 ssize_t srclen, pg_locale_t locale);
59static size_t strtitle_icu(char *dest, size_t destsize, const char *src,
60 ssize_t srclen, pg_locale_t locale);
61static size_t strupper_icu(char *dest, size_t destsize, const char *src,
62 ssize_t srclen, pg_locale_t locale);
63static size_t strfold_icu(char *dest, size_t destsize, const char *src,
64 ssize_t srclen, pg_locale_t locale);
65static size_t strlower_icu_utf8(char *dest, size_t destsize, const char *src,
66 ssize_t srclen, pg_locale_t locale);
67static size_t strtitle_icu_utf8(char *dest, size_t destsize, const char *src,
68 ssize_t srclen, pg_locale_t locale);
69static size_t strupper_icu_utf8(char *dest, size_t destsize, const char *src,
70 ssize_t srclen, pg_locale_t locale);
71static size_t strfold_icu_utf8(char *dest, size_t destsize, const char *src,
72 ssize_t srclen, pg_locale_t locale);
73static size_t downcase_ident_icu(char *dst, size_t dstsize, const char *src,
74 ssize_t srclen, pg_locale_t locale);
75static int strncoll_icu(const char *arg1, ssize_t len1,
76 const char *arg2, ssize_t len2,
78static size_t strnxfrm_icu(char *dest, size_t destsize,
79 const char *src, ssize_t srclen,
81extern char *get_collation_actual_version_icu(const char *collcollate);
82
83typedef int32_t (*ICU_Convert_Func) (UChar *dest, int32_t destCapacity,
84 const UChar *src, int32_t srcLength,
85 const char *locale,
86 UErrorCode *pErrorCode);
87
88/*
89 * Converter object for converting between ICU's UChar strings and C strings
90 * in database encoding. Since the database encoding doesn't change, we only
91 * need one of these per session.
92 */
93static UConverter *icu_converter = NULL;
94
95static UCollator *make_icu_collator(const char *iculocstr,
96 const char *icurules);
97static int strncoll_icu(const char *arg1, ssize_t len1,
98 const char *arg2, ssize_t len2,
100static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
101 const char *src, ssize_t srclen,
103#ifdef HAVE_UCOL_STRCOLLUTF8
104static int strncoll_icu_utf8(const char *arg1, ssize_t len1,
105 const char *arg2, ssize_t len2,
107#endif
108static size_t strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
109 const char *src, ssize_t srclen,
111static void init_icu_converter(void);
112static size_t uchar_length(UConverter *converter,
113 const char *str, int32_t len);
114static int32_t uchar_convert(UConverter *converter,
115 UChar *dest, int32_t destlen,
116 const char *src, int32_t srclen);
117static int32_t icu_to_uchar(UChar **buff_uchar, const char *buff,
118 size_t nbytes);
119static size_t icu_from_uchar(char *dest, size_t destsize,
120 const UChar *buff_uchar, int32_t len_uchar);
121static void icu_set_collation_attributes(UCollator *collator, const char *loc,
122 UErrorCode *status);
123static int32_t icu_convert_case(ICU_Convert_Func func, char *dest,
124 size_t destsize, const char *src,
125 ssize_t srclen, pg_locale_t locale);
126static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
127 const UChar *src, int32_t srcLength,
128 const char *locale,
129 UErrorCode *pErrorCode);
130static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
131 const UChar *src, int32_t srcLength,
132 const char *locale,
133 UErrorCode *pErrorCode);
134static int32_t foldcase_options(const char *locale);
135
136/*
137 * XXX: many of the functions below rely on casts directly from pg_wchar to
138 * UChar32, which is correct for UTF-8 and LATIN1, but not in general.
139 */
140
141static pg_wchar
142toupper_icu(pg_wchar wc, pg_locale_t locale)
143{
144 return u_toupper(wc);
145}
146
147static pg_wchar
148tolower_icu(pg_wchar wc, pg_locale_t locale)
149{
150 return u_tolower(wc);
151}
152
153static const struct collate_methods collate_methods_icu = {
154 .strncoll = strncoll_icu,
155 .strnxfrm = strnxfrm_icu,
156 .strnxfrm_prefix = strnxfrm_prefix_icu,
157 .strxfrm_is_safe = true,
158};
159
160static const struct collate_methods collate_methods_icu_utf8 = {
161#ifdef HAVE_UCOL_STRCOLLUTF8
162 .strncoll = strncoll_icu_utf8,
163#else
164 .strncoll = strncoll_icu,
165#endif
166 .strnxfrm = strnxfrm_icu,
167 .strnxfrm_prefix = strnxfrm_prefix_icu_utf8,
168 .strxfrm_is_safe = true,
169};
170
171static bool
172wc_isdigit_icu(pg_wchar wc, pg_locale_t locale)
173{
174 return u_isdigit(wc);
175}
176
177static bool
178wc_isalpha_icu(pg_wchar wc, pg_locale_t locale)
179{
180 return u_isalpha(wc);
181}
182
183static bool
184wc_isalnum_icu(pg_wchar wc, pg_locale_t locale)
185{
186 return u_isalnum(wc);
187}
188
189static bool
190wc_isupper_icu(pg_wchar wc, pg_locale_t locale)
191{
192 return u_isupper(wc);
193}
194
195static bool
196wc_islower_icu(pg_wchar wc, pg_locale_t locale)
197{
198 return u_islower(wc);
199}
200
201static bool
202wc_isgraph_icu(pg_wchar wc, pg_locale_t locale)
203{
204 return u_isgraph(wc);
205}
206
207static bool
208wc_isprint_icu(pg_wchar wc, pg_locale_t locale)
209{
210 return u_isprint(wc);
211}
212
213static bool
214wc_ispunct_icu(pg_wchar wc, pg_locale_t locale)
215{
216 return u_ispunct(wc);
217}
218
219static bool
220wc_isspace_icu(pg_wchar wc, pg_locale_t locale)
221{
222 return u_isspace(wc);
223}
224
225static bool
226wc_isxdigit_icu(pg_wchar wc, pg_locale_t locale)
227{
228 return u_isxdigit(wc);
229}
230
231static bool
232wc_iscased_icu(pg_wchar wc, pg_locale_t locale)
233{
234 return u_hasBinaryProperty(wc, UCHAR_CASED);
235}
236
237static const struct ctype_methods ctype_methods_icu = {
238 .strlower = strlower_icu,
239 .strtitle = strtitle_icu,
240 .strupper = strupper_icu,
241 .strfold = strfold_icu,
242 .downcase_ident = downcase_ident_icu,
243 .wc_isdigit = wc_isdigit_icu,
244 .wc_isalpha = wc_isalpha_icu,
245 .wc_isalnum = wc_isalnum_icu,
246 .wc_isupper = wc_isupper_icu,
247 .wc_islower = wc_islower_icu,
248 .wc_isgraph = wc_isgraph_icu,
249 .wc_isprint = wc_isprint_icu,
250 .wc_ispunct = wc_ispunct_icu,
251 .wc_isspace = wc_isspace_icu,
252 .wc_isxdigit = wc_isxdigit_icu,
253 .wc_iscased = wc_iscased_icu,
254 .wc_toupper = toupper_icu,
255 .wc_tolower = tolower_icu,
256};
257
258static const struct ctype_methods ctype_methods_icu_utf8 = {
259 .strlower = strlower_icu_utf8,
260 .strtitle = strtitle_icu_utf8,
261 .strupper = strupper_icu_utf8,
262 .strfold = strfold_icu_utf8,
263 /* uses plain ASCII semantics for historical reasons */
264 .downcase_ident = NULL,
265 .wc_isdigit = wc_isdigit_icu,
266 .wc_isalpha = wc_isalpha_icu,
267 .wc_isalnum = wc_isalnum_icu,
268 .wc_isupper = wc_isupper_icu,
269 .wc_islower = wc_islower_icu,
270 .wc_isgraph = wc_isgraph_icu,
271 .wc_isprint = wc_isprint_icu,
272 .wc_ispunct = wc_ispunct_icu,
273 .wc_isspace = wc_isspace_icu,
274 .wc_isxdigit = wc_isxdigit_icu,
275 .wc_iscased = wc_iscased_icu,
276 .wc_toupper = toupper_icu,
277 .wc_tolower = tolower_icu,
278};
279
280/*
281 * ICU still depends on libc for compatibility with certain historical
282 * behavior for single-byte encodings. See downcase_ident_icu().
283 *
284 * XXX: consider fixing by decoding the single byte into a code point, and
285 * using u_tolower().
286 */
287static locale_t
288make_libc_ctype_locale(const char *ctype)
289{
290 locale_t loc;
291
292#ifndef WIN32
293 loc = newlocale(LC_CTYPE_MASK, ctype, NULL);
294#else
295 loc = _create_locale(LC_ALL, ctype);
296#endif
297 if (!loc)
299
300 return loc;
301}
302#endif
303
306{
307#ifdef USE_ICU
308 bool deterministic;
309 const char *iculocstr;
310 const char *icurules = NULL;
311 UCollator *collator;
312 locale_t loc = (locale_t) 0;
313 pg_locale_t result;
314
315 if (collid == DEFAULT_COLLATION_OID)
316 {
317 HeapTuple tp;
318 Datum datum;
319 bool isnull;
320
322 if (!HeapTupleIsValid(tp))
323 elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
324
325 /* default database collation is always deterministic */
326 deterministic = true;
327 datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
328 Anum_pg_database_datlocale);
329 iculocstr = TextDatumGetCString(datum);
330 datum = SysCacheGetAttr(DATABASEOID, tp,
331 Anum_pg_database_daticurules, &isnull);
332 if (!isnull)
333 icurules = TextDatumGetCString(datum);
334
335 /* libc only needed for default locale and single-byte encoding */
337 {
338 const char *ctype;
339
340 datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
341 Anum_pg_database_datctype);
342 ctype = TextDatumGetCString(datum);
343
344 loc = make_libc_ctype_locale(ctype);
345 }
346
347 ReleaseSysCache(tp);
348 }
349 else
350 {
351 Form_pg_collation collform;
352 HeapTuple tp;
353 Datum datum;
354 bool isnull;
355
357 if (!HeapTupleIsValid(tp))
358 elog(ERROR, "cache lookup failed for collation %u", collid);
359 collform = (Form_pg_collation) GETSTRUCT(tp);
360 deterministic = collform->collisdeterministic;
361 datum = SysCacheGetAttrNotNull(COLLOID, tp,
362 Anum_pg_collation_colllocale);
363 iculocstr = TextDatumGetCString(datum);
364 datum = SysCacheGetAttr(COLLOID, tp,
365 Anum_pg_collation_collicurules, &isnull);
366 if (!isnull)
367 icurules = TextDatumGetCString(datum);
368
369 ReleaseSysCache(tp);
370 }
371
372 collator = make_icu_collator(iculocstr, icurules);
373
374 result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
375 result->icu.locale = MemoryContextStrdup(context, iculocstr);
376 result->icu.ucol = collator;
377 result->icu.lt = loc;
378 result->deterministic = deterministic;
379 result->collate_is_c = false;
380 result->ctype_is_c = false;
382 {
383 result->icu.ucasemap = pg_ucasemap_open(iculocstr);
384 result->collate = &collate_methods_icu_utf8;
385 result->ctype = &ctype_methods_icu_utf8;
386 }
387 else
388 {
389 result->collate = &collate_methods_icu;
390 result->ctype = &ctype_methods_icu;
391 }
392
393 return result;
394#else
395 /* could get here if a collation was created by a build with ICU */
397 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
398 errmsg("ICU is not supported in this build")));
399
400 return NULL;
401#endif
402}
403
404#ifdef USE_ICU
405
406/*
407 * Check locale string and fix it if necessary. Returns a new palloc'd string.
408 *
409 * In ICU versions 54 and earlier, "und" is not a recognized spelling of the
410 * root locale. If the first component of the locale is "und", replace with
411 * "root" before opening.
412 */
413static char *
414fix_icu_locale_str(const char *loc_str)
415{
416 /*
417 * Must never open default collator, because it depends on the environment
418 * and may change at any time. Should not happen, but check here to catch
419 * bugs that might be hard to catch otherwise.
420 *
421 * NB: the default collator is not the same as the collator for the root
422 * locale. The root locale may be specified as the empty string, "und", or
423 * "root". The default collator is opened by passing NULL to ucol_open().
424 */
425 if (loc_str == NULL)
426 elog(ERROR, "opening default collator is not supported");
427
428 if (U_ICU_VERSION_MAJOR_NUM < 55)
429 {
430 char lang[ULOC_LANG_CAPACITY];
431 UErrorCode status = U_ZERO_ERROR;
432
433 uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
434 if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
435 {
437 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
438 errmsg("could not get language from locale \"%s\": %s",
439 loc_str, u_errorName(status))));
440 }
441
442 if (strcmp(lang, "und") == 0)
443 {
444 const char *remainder = loc_str + strlen("und");
445 char *fixed_str;
446
447 fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
448 strcpy(fixed_str, "root");
449 strcat(fixed_str, remainder);
450
451 return fixed_str;
452 }
453 }
454
455 return pstrdup(loc_str);
456}
457
458/*
459 * Wrapper around ucol_open() to handle API differences for older ICU
460 * versions.
461 *
462 * Ensure that no path leaks a UCollator.
463 */
464UCollator *
465pg_ucol_open(const char *loc_str)
466{
467 UCollator *collator;
468 UErrorCode status;
469 char *fixed_str;
470
471 fixed_str = fix_icu_locale_str(loc_str);
472
473 status = U_ZERO_ERROR;
474 collator = ucol_open(fixed_str, &status);
475 if (U_FAILURE(status))
477 /* use original string for error report */
478 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
479 errmsg("could not open collator for locale \"%s\": %s",
480 loc_str, u_errorName(status))));
481
482 if (U_ICU_VERSION_MAJOR_NUM < 54)
483 {
484 status = U_ZERO_ERROR;
485 icu_set_collation_attributes(collator, fixed_str, &status);
486
487 /*
488 * Pretend the error came from ucol_open(), for consistent error
489 * message across ICU versions.
490 */
491 if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
492 {
493 ucol_close(collator);
495 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
496 errmsg("could not open collator for locale \"%s\": %s",
497 loc_str, u_errorName(status))));
498 }
499 }
500
501 pfree(fixed_str);
502
503 return collator;
504}
505
506/*
507 * Wrapper around ucasemap_open() to handle API differences for older ICU
508 * versions.
509 *
510 * Additionally makes sure we get the right options for case folding.
511 */
512static UCaseMap *
513pg_ucasemap_open(const char *loc_str)
514{
515 UErrorCode status = U_ZERO_ERROR;
516 UCaseMap *casemap;
517 char *fixed_str;
518
519 fixed_str = fix_icu_locale_str(loc_str);
520
521 casemap = ucasemap_open(fixed_str, foldcase_options(fixed_str), &status);
522 if (U_FAILURE(status))
523 /* use original string for error report */
525 errcode(ERRCODE_INVALID_PARAMETER_VALUE),
526 errmsg("could not open casemap for locale \"%s\": %s",
527 loc_str, u_errorName(status)));
528
529 pfree(fixed_str);
530
531 return casemap;
532}
533
534/*
535 * Create a UCollator with the given locale string and rules.
536 *
537 * Ensure that no path leaks a UCollator.
538 */
539static UCollator *
540make_icu_collator(const char *iculocstr, const char *icurules)
541{
542 if (!icurules)
543 {
544 /* simple case without rules */
545 return pg_ucol_open(iculocstr);
546 }
547 else
548 {
549 UCollator *collator_std_rules;
550 UCollator *collator_all_rules;
551 const UChar *std_rules;
552 UChar *my_rules;
553 UChar *all_rules;
554 int32_t length;
555 int32_t total;
556 UErrorCode status;
557
558 /*
559 * If rules are specified, we extract the rules of the standard
560 * collation, add our own rules, and make a new collator with the
561 * combined rules.
562 */
563 icu_to_uchar(&my_rules, icurules, strlen(icurules));
564
565 collator_std_rules = pg_ucol_open(iculocstr);
566
567 std_rules = ucol_getRules(collator_std_rules, &length);
568
569 total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
570
571 /* avoid leaking collator on OOM */
572 all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM);
573 if (!all_rules)
574 {
575 ucol_close(collator_std_rules);
577 (errcode(ERRCODE_OUT_OF_MEMORY),
578 errmsg("out of memory")));
579 }
580
581 u_strcpy(all_rules, std_rules);
582 u_strcat(all_rules, my_rules);
583
584 ucol_close(collator_std_rules);
585
586 status = U_ZERO_ERROR;
587 collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules),
588 UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,
589 NULL, &status);
590 if (U_FAILURE(status))
591 {
593 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
594 errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
595 iculocstr, icurules, u_errorName(status))));
596 }
597
598 return collator_all_rules;
599 }
600}
601
602static size_t
603strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
605{
606 return icu_convert_case(u_strToLower, dest, destsize, src, srclen, locale);
607}
608
609static size_t
610strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
612{
613 return icu_convert_case(u_strToTitle_default_BI, dest, destsize, src, srclen, locale);
614}
615
616static size_t
617strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
619{
620 return icu_convert_case(u_strToUpper, dest, destsize, src, srclen, locale);
621}
622
623static size_t
624strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
626{
627 return icu_convert_case(u_strFoldCase_default, dest, destsize, src, srclen, locale);
628}
629
630static size_t
631strlower_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen,
633{
634 UErrorCode status = U_ZERO_ERROR;
635 int32_t needed;
636
637 needed = ucasemap_utf8ToLower(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
638 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
640 errmsg("case conversion failed: %s", u_errorName(status)));
641 return needed;
642}
643
644static size_t
645strtitle_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen,
647{
648 UErrorCode status = U_ZERO_ERROR;
649 int32_t needed;
650
651 needed = ucasemap_utf8ToTitle(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
652 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
654 errmsg("case conversion failed: %s", u_errorName(status)));
655 return needed;
656}
657
658static size_t
659strupper_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen,
661{
662 UErrorCode status = U_ZERO_ERROR;
663 int32_t needed;
664
665 needed = ucasemap_utf8ToUpper(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
666 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
668 errmsg("case conversion failed: %s", u_errorName(status)));
669 return needed;
670}
671
672static size_t
673strfold_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen,
675{
676 UErrorCode status = U_ZERO_ERROR;
677 int32_t needed;
678
679 needed = ucasemap_utf8FoldCase(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
680 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
682 errmsg("case conversion failed: %s", u_errorName(status)));
683 return needed;
684}
685
686/*
687 * For historical compatibility, behavior is not multibyte-aware.
688 *
689 * NB: uses libc tolower() for single-byte encodings (also for historical
690 * compatibility), and therefore relies on the global LC_CTYPE setting.
691 */
692static size_t
693downcase_ident_icu(char *dst, size_t dstsize, const char *src,
694 ssize_t srclen, pg_locale_t locale)
695{
696 int i;
697 bool libc_lower;
698 locale_t lt = locale->icu.lt;
699
700 libc_lower = lt && (pg_database_encoding_max_length() == 1);
701
702 for (i = 0; i < srclen && i < dstsize; i++)
703 {
704 unsigned char ch = (unsigned char) src[i];
705
706 if (ch >= 'A' && ch <= 'Z')
707 ch = pg_ascii_tolower(ch);
708 else if (libc_lower && IS_HIGHBIT_SET(ch) && isupper_l(ch, lt))
709 ch = tolower_l(ch, lt);
710 dst[i] = (char) ch;
711 }
712
713 if (i < dstsize)
714 dst[i] = '\0';
715
716 return srclen;
717}
718
719/*
720 * strncoll_icu_utf8
721 *
722 * Compare two UTF-8 strings by calling ucol_strcollUTF8() directly on the
723 * arguments, without converting them to UChar first. An argument length of
724 * -1 means the string is NUL-terminated.
725 */
726#ifdef HAVE_UCOL_STRCOLLUTF8
727int
728strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
730{
731 int result;
732 UErrorCode status;
733
735
736 status = U_ZERO_ERROR;
737 result = ucol_strcollUTF8(locale->icu.ucol,
738 arg1, len1,
739 arg2, len2,
740 &status);
741 if (U_FAILURE(status))
743 (errmsg("collation failed: %s", u_errorName(status))));
744
745 return result;
746}
747#endif
748
749/* 'srclen' of -1 means the string is NUL-terminated */
750size_t
751strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
753{
754 char sbuf[TEXTBUFLEN];
755 char *buf = sbuf;
756 UChar *uchar;
757 int32_t ulen;
758 size_t uchar_bsize;
759 Size result_bsize;
760
761 init_icu_converter();
762
763 ulen = uchar_length(icu_converter, src, srclen);
764
765 uchar_bsize = (ulen + 1) * sizeof(UChar);
766
767 if (uchar_bsize > TEXTBUFLEN)
768 buf = palloc(uchar_bsize);
769
770 uchar = (UChar *) buf;
771
772 ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
773
774 result_bsize = ucol_getSortKey(locale->icu.ucol,
775 uchar, ulen,
776 (uint8_t *) dest, destsize);
777
778 /*
779 * ucol_getSortKey() counts the nul-terminator in the result length, but
780 * this function should not.
781 */
782 Assert(result_bsize > 0);
783 result_bsize--;
784
785 if (buf != sbuf)
786 pfree(buf);
787
788 /* if dest is defined, it should be nul-terminated */
789 Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
790
791 return result_bsize;
792}
793
794/* 'srclen' of -1 means the string is NUL-terminated */
795size_t
796strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
797 const char *src, ssize_t srclen,
799{
800 size_t result;
801 UCharIterator iter;
802 uint32_t state[2];
803 UErrorCode status;
804
806
807 uiter_setUTF8(&iter, src, srclen);
808 state[0] = state[1] = 0; /* won't need that again */
809 status = U_ZERO_ERROR;
810 result = ucol_nextSortKeyPart(locale->icu.ucol,
811 &iter,
812 state,
813 (uint8_t *) dest,
814 destsize,
815 &status);
816 if (U_FAILURE(status))
818 (errmsg("sort key generation failed: %s",
819 u_errorName(status))));
820
821 return result;
822}
823
824char *
825get_collation_actual_version_icu(const char *collcollate)
826{
827 UCollator *collator;
828 UVersionInfo versioninfo;
829 char buf[U_MAX_VERSION_STRING_LENGTH];
830
831 collator = pg_ucol_open(collcollate);
832
833 ucol_getVersion(collator, versioninfo);
834 ucol_close(collator);
835
836 u_versionToString(versioninfo, buf);
837 return pstrdup(buf);
838}
839
840/*
841 * Convert a string in the database encoding into a string of UChars.
842 *
843 * The source string at buff is of length nbytes
844 * (it needn't be nul-terminated)
845 *
846 * *buff_uchar receives a pointer to the palloc'd result string, and
847 * the function's result is the number of UChars generated.
848 *
849 * The result string is nul-terminated, though most callers rely on the
850 * result length instead.
851 */
852static int32_t
853icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
854{
855 int32_t len_uchar;
856
857 init_icu_converter();
858
859 len_uchar = uchar_length(icu_converter, buff, nbytes);
860
861 *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
862 len_uchar = uchar_convert(icu_converter,
863 *buff_uchar, len_uchar + 1, buff, nbytes);
864
865 return len_uchar;
866}
867
868/*
869 * Convert a string of UChars into the database encoding.
870 *
871 * The source string at buff_uchar is of length len_uchar
872 * (it needn't be nul-terminated)
873 *
874 * The result is written to dest, which has room for destsize bytes, and the
875 * function's result is the number of bytes required (not counting nul).
876 *
877 * If the result fits, it is nul-terminated; otherwise dest is left untouched.
878 */
879static size_t
880icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar)
881{
882 UErrorCode status;
883 int32_t len_result;
884
885 init_icu_converter();
886
887 status = U_ZERO_ERROR;
888 len_result = ucnv_fromUChars(icu_converter, NULL, 0,
889 buff_uchar, len_uchar, &status);
890 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
892 (errmsg("%s failed: %s", "ucnv_fromUChars",
893 u_errorName(status))));
894
895 if (len_result + 1 > destsize)
896 return len_result;
897
898 status = U_ZERO_ERROR;
899 len_result = ucnv_fromUChars(icu_converter, dest, len_result + 1,
900 buff_uchar, len_uchar, &status);
901 if (U_FAILURE(status) ||
902 status == U_STRING_NOT_TERMINATED_WARNING)
904 (errmsg("%s failed: %s", "ucnv_fromUChars",
905 u_errorName(status))));
906
907 return len_result;
908}
909
910static int32_t
911convert_case_uchar(ICU_Convert_Func func, pg_locale_t mylocale,
912 UChar **buff_dest, UChar *buff_source, int32_t len_source)
913{
914 UErrorCode status;
915 int32_t len_dest;
916
917 len_dest = len_source; /* try first with same length */
918 *buff_dest = palloc(len_dest * sizeof(**buff_dest));
919 status = U_ZERO_ERROR;
920 len_dest = func(*buff_dest, len_dest, buff_source, len_source,
921 mylocale->icu.locale, &status);
922 if (status == U_BUFFER_OVERFLOW_ERROR)
923 {
924 /* try again with adjusted length */
925 pfree(*buff_dest);
926 *buff_dest = palloc(len_dest * sizeof(**buff_dest));
927 status = U_ZERO_ERROR;
928 len_dest = func(*buff_dest, len_dest, buff_source, len_source,
929 mylocale->icu.locale, &status);
930 }
931 if (U_FAILURE(status))
933 (errmsg("case conversion failed: %s", u_errorName(status))));
934 return len_dest;
935}
936
937static int32_t
938icu_convert_case(ICU_Convert_Func func, char *dest, size_t destsize,
939 const char *src, ssize_t srclen, pg_locale_t locale)
940{
941 int32_t len_uchar;
942 int32_t len_conv;
943 UChar *buff_uchar;
944 UChar *buff_conv;
945 size_t result_len;
946
947 len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
948 len_conv = convert_case_uchar(func, locale, &buff_conv,
949 buff_uchar, len_uchar);
950 result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
951 pfree(buff_uchar);
952 pfree(buff_conv);
953
954 return result_len;
955}
956
957static int32_t
958u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
959 const UChar *src, int32_t srcLength,
960 const char *locale,
961 UErrorCode *pErrorCode)
962{
963 return u_strToTitle(dest, destCapacity, src, srcLength,
964 NULL, locale, pErrorCode);
965}
966
967static int32_t
968u_strFoldCase_default(UChar *dest, int32_t destCapacity,
969 const UChar *src, int32_t srcLength,
970 const char *locale,
971 UErrorCode *pErrorCode)
972{
973 return u_strFoldCase(dest, destCapacity, src, srcLength,
974 foldcase_options(locale), pErrorCode);
975}
976
977/*
978 * Return the correct u_strFoldCase() options for the given locale.
979 *
980 * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
981 * folding does not accept a locale. Instead it just supports a single option
982 * relevant to Turkic languages 'az' and 'tr'; check for those languages.
983 */
984static int32_t
985foldcase_options(const char *locale)
986{
987 uint32 options = U_FOLD_CASE_DEFAULT;
988 char lang[3];
989 UErrorCode status = U_ZERO_ERROR;
990
991 uloc_getLanguage(locale, lang, 3, &status);
992 if (U_SUCCESS(status))
993 {
994 /*
995 * The option name is confusing, but it causes u_strFoldCase to use
996 * the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
997 */
998 if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0)
999 options = U_FOLD_CASE_EXCLUDE_SPECIAL_I;
1000 }
1001
1002 return options;
1003}
1004
1005/*
1006 * strncoll_icu
1007 *
1008 * Convert the arguments from the database encoding to UChar strings, then
1009 * call ucol_strcoll(). An argument length of -1 means that the string is
1010 * NUL-terminated.
1011 *
1012 * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
1013 * caller should call that instead.
1014 */
1015static int
1016strncoll_icu(const char *arg1, ssize_t len1,
1017 const char *arg2, ssize_t len2, pg_locale_t locale)
1018{
1019 char sbuf[TEXTBUFLEN];
1020 char *buf = sbuf;
1021 int32_t ulen1;
1022 int32_t ulen2;
1023 size_t bufsize1;
1024 size_t bufsize2;
1025 UChar *uchar1,
1026 *uchar2;
1027 int result;
1028
1029 /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */
1030#ifdef HAVE_UCOL_STRCOLLUTF8
1032#endif
1033
1034 init_icu_converter();
1035
1036 ulen1 = uchar_length(icu_converter, arg1, len1);
1037 ulen2 = uchar_length(icu_converter, arg2, len2);
1038
1039 bufsize1 = (ulen1 + 1) * sizeof(UChar);
1040 bufsize2 = (ulen2 + 1) * sizeof(UChar);
1041
1042 if (bufsize1 + bufsize2 > TEXTBUFLEN)
1043 buf = palloc(bufsize1 + bufsize2);
1044
1045 uchar1 = (UChar *) buf;
1046 uchar2 = (UChar *) (buf + bufsize1);
1047
1048 ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
1049 ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
1050
1051 result = ucol_strcoll(locale->icu.ucol,
1052 uchar1, ulen1,
1053 uchar2, ulen2);
1054
1055 if (buf != sbuf)
1056 pfree(buf);
1057
1058 return result;
1059}
1060
1061/* 'srclen' of -1 means the string is NUL-terminated */
1062static size_t
1063strnxfrm_prefix_icu(char *dest, size_t destsize,
1064 const char *src, ssize_t srclen,
1066{
1067 char sbuf[TEXTBUFLEN];
1068 char *buf = sbuf;
1069 UCharIterator iter;
1070 uint32_t state[2];
1071 UErrorCode status;
1072 int32_t ulen = -1;
1073 UChar *uchar = NULL;
1074 size_t uchar_bsize;
1075 Size result_bsize;
1076
1077 /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */
1079
1080 init_icu_converter();
1081
1082 ulen = uchar_length(icu_converter, src, srclen);
1083
1084 uchar_bsize = (ulen + 1) * sizeof(UChar);
1085
1086 if (uchar_bsize > TEXTBUFLEN)
1087 buf = palloc(uchar_bsize);
1088
1089 uchar = (UChar *) buf;
1090
1091 ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
1092
1093 uiter_setString(&iter, uchar, ulen);
1094 state[0] = state[1] = 0; /* won't need that again */
1095 status = U_ZERO_ERROR;
1096 result_bsize = ucol_nextSortKeyPart(locale->icu.ucol,
1097 &iter,
1098 state,
1099 (uint8_t *) dest,
1100 destsize,
1101 &status);
1102 if (U_FAILURE(status))
1103 ereport(ERROR,
1104 (errmsg("sort key generation failed: %s",
1105 u_errorName(status))));
1106
1107 return result_bsize;
1108}
1109
1110static void
1111init_icu_converter(void)
1112{
1113 const char *icu_encoding_name;
1114 UErrorCode status;
1115 UConverter *conv;
1116
1117 if (icu_converter)
1118 return; /* already done */
1119
1120 icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
1121 if (!icu_encoding_name)
1122 ereport(ERROR,
1123 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1124 errmsg("encoding \"%s\" not supported by ICU",
1126
1127 status = U_ZERO_ERROR;
1128 conv = ucnv_open(icu_encoding_name, &status);
1129 if (U_FAILURE(status))
1130 ereport(ERROR,
1131 (errmsg("could not open ICU converter for encoding \"%s\": %s",
1132 icu_encoding_name, u_errorName(status))));
1133
1134 icu_converter = conv;
1135}
1136
1137/*
1138 * Find length, in UChars, of given string if converted to UChar string.
1139 *
1140 * A length of -1 indicates that the input string is NUL-terminated.
1141 */
1142static size_t
1143uchar_length(UConverter *converter, const char *str, int32_t len)
1144{
1145 UErrorCode status = U_ZERO_ERROR;
1146 int32_t ulen;
1147
1148 ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
1149 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
1150 ereport(ERROR,
1151 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
1152 return ulen;
1153}
1154
1155/*
1156 * Convert the given source string into a UChar string, stored in dest, and
1157 * return the length (in UChars).
1158 *
1159 * A srclen of -1 indicates that the input string is NUL-terminated.
1160 */
1161static int32_t
1162uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
1163 const char *src, int32_t srclen)
1164{
1165 UErrorCode status = U_ZERO_ERROR;
1166 int32_t ulen;
1167
1168 status = U_ZERO_ERROR;
1169 ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
1170 if (U_FAILURE(status))
1171 ereport(ERROR,
1172 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
1173 return ulen;
1174}
1175
1176/*
1177 * Parse collation attributes from the given locale string and apply them to
1178 * the open collator.
1179 *
1180 * First, the locale string is canonicalized to an ICU format locale ID such
1181 * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
1182 * the key-value arguments.
1183 *
1184 * Starting with ICU version 54, the attributes are processed automatically by
1185 * ucol_open(), so this is only necessary for emulating this behavior on older
1186 * versions.
1187 */
1189static void
1190icu_set_collation_attributes(UCollator *collator, const char *loc,
1191 UErrorCode *status)
1192{
1193 int32_t len;
1194 char *icu_locale_id;
1195 char *lower_str;
1196 char *str;
1197 char *token;
1198
1199 /*
1200 * The input locale may be a BCP 47 language tag, e.g.
1201 * "und-u-kc-ks-level1", which expresses the same attributes in a
1202 * different form. It will be converted to the equivalent ICU format
1203 * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
1204 * uloc_canonicalize().
1205 */
1206 *status = U_ZERO_ERROR;
1207 len = uloc_canonicalize(loc, NULL, 0, status);
1208 icu_locale_id = palloc(len + 1);
1209 *status = U_ZERO_ERROR;
1210 len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
1211 if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
1212 return;
1213
1214 lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
1215
1216 pfree(icu_locale_id);
1217
1218 str = strchr(lower_str, '@');
1219 if (!str)
1220 return;
1221 str++;
1222
1223 while ((token = strsep(&str, ";")))
1224 {
1225 char *e = strchr(token, '=');
1226
1227 if (e)
1228 {
1229 char *name;
1230 char *value;
1231 UColAttribute uattr;
1232 UColAttributeValue uvalue;
1233
1234 *status = U_ZERO_ERROR;
1235
1236 *e = '\0';
1237 name = token;
1238 value = e + 1;
1239
1240 /*
1241 * See attribute name and value lists in ICU i18n/coll.cpp
1242 */
1243 if (strcmp(name, "colstrength") == 0)
1244 uattr = UCOL_STRENGTH;
1245 else if (strcmp(name, "colbackwards") == 0)
1246 uattr = UCOL_FRENCH_COLLATION;
1247 else if (strcmp(name, "colcaselevel") == 0)
1248 uattr = UCOL_CASE_LEVEL;
1249 else if (strcmp(name, "colcasefirst") == 0)
1250 uattr = UCOL_CASE_FIRST;
1251 else if (strcmp(name, "colalternate") == 0)
1252 uattr = UCOL_ALTERNATE_HANDLING;
1253 else if (strcmp(name, "colnormalization") == 0)
1254 uattr = UCOL_NORMALIZATION_MODE;
1255 else if (strcmp(name, "colnumeric") == 0)
1256 uattr = UCOL_NUMERIC_COLLATION;
1257 else
1258 /* ignore if unknown */
1259 continue;
1260
1261 if (strcmp(value, "primary") == 0)
1262 uvalue = UCOL_PRIMARY;
1263 else if (strcmp(value, "secondary") == 0)
1264 uvalue = UCOL_SECONDARY;
1265 else if (strcmp(value, "tertiary") == 0)
1266 uvalue = UCOL_TERTIARY;
1267 else if (strcmp(value, "quaternary") == 0)
1268 uvalue = UCOL_QUATERNARY;
1269 else if (strcmp(value, "identical") == 0)
1270 uvalue = UCOL_IDENTICAL;
1271 else if (strcmp(value, "no") == 0)
1272 uvalue = UCOL_OFF;
1273 else if (strcmp(value, "yes") == 0)
1274 uvalue = UCOL_ON;
1275 else if (strcmp(value, "shifted") == 0)
1276 uvalue = UCOL_SHIFTED;
1277 else if (strcmp(value, "non-ignorable") == 0)
1278 uvalue = UCOL_NON_IGNORABLE;
1279 else if (strcmp(value, "lower") == 0)
1280 uvalue = UCOL_LOWER_FIRST;
1281 else if (strcmp(value, "upper") == 0)
1282 uvalue = UCOL_UPPER_FIRST;
1283 else
1284 {
1285 *status = U_ILLEGAL_ARGUMENT_ERROR;
1286 break;
1287 }
1288
1289 ucol_setAttribute(collator, uattr, uvalue, status);
1290 }
1291 }
1292
1293 pfree(lower_str);
1294}
1295
1296#endif /* USE_ICU */
#define TextDatumGetCString(d)
Definition: builtins.h:98
#define pg_attribute_unused()
Definition: c.h:138
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1132
uint32_t uint32
Definition: c.h:552
size_t Size
Definition: c.h:624
Oid collid
int errcode(int sqlerrcode)
Definition: elog.c:863
int errmsg(const char *fmt,...)
Definition: elog.c:1080
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
const char * get_encoding_name_for_icu(int encoding)
Definition: encnames.c:472
#define MCXT_ALLOC_NO_OOM
Definition: fe_memutils.h:29
char * asc_tolower(const char *buff, size_t nbytes)
Definition: formatting.c:1888
Oid MyDatabaseId
Definition: globals.c:94
Assert(PointerIsAligned(start, uint64))
const char * str
size_t remainder
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
static void * GETSTRUCT(const HeapTupleData *tuple)
Definition: htup_details.h:728
#define token
Definition: indent_globs.h:126
static struct @171 value
static char * locale
Definition: initdb.c:140
int i
Definition: isn.c:77
unsigned int pg_wchar
Definition: mbprint.c:31
int GetDatabaseEncoding(void)
Definition: mbutils.c:1264
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1549
char * MemoryContextStrdup(MemoryContext context, const char *string)
Definition: mcxt.c:1768
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1266
char * pstrdup(const char *in)
Definition: mcxt.c:1781
void pfree(void *pointer)
Definition: mcxt.c:1616
void * palloc(Size size)
Definition: mcxt.c:1387
void * palloc_extended(Size size, int flags)
Definition: mcxt.c:1439
FormData_pg_collation * Form_pg_collation
Definition: pg_collation.h:58
const void size_t len
pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context)
#define TEXTBUFLEN
Definition: pg_locale_icu.c:48
void report_newlocale_failure(const char *localename)
static char ** options
static char buf[DEFAULT_XLOG_SEG_SIZE]
Definition: pg_test_fsync.c:71
@ PG_UTF8
Definition: pg_wchar.h:232
#define pg_encoding_to_char
Definition: pg_wchar.h:630
char * strsep(char **stringp, const char *delim)
Definition: strsep.c:49
static unsigned char pg_ascii_tolower(unsigned char ch)
Definition: port.h:188
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:262
uint64_t Datum
Definition: postgres.h:70
unsigned int Oid
Definition: postgres_ext.h:32
e
Definition: preproc-init.c:82
int(* strncoll)(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
Definition: pg_locale.h:76
size_t(* strlower)(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
Definition: pg_locale.h:102
const struct ctype_methods * ctype
Definition: pg_locale.h:156
const struct collate_methods * collate
Definition: pg_locale.h:155
const char * locale
Definition: pg_locale.h:162
Definition: regguts.h:323
void ReleaseSysCache(HeapTuple tuple)
Definition: syscache.c:264
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Definition: syscache.c:220
Datum SysCacheGetAttr(int cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull)
Definition: syscache.c:595
Datum SysCacheGetAttrNotNull(int cacheId, HeapTuple tup, AttrNumber attributeNumber)
Definition: syscache.c:625
static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full, const char *src, size_t srclen, size_t srcoff, char32_t *simple, const char32_t **special)
Definition: unicode_case.c:397
const char * name
#define locale_t
Definition: win32_port.h:429
#define tolower_l
Definition: win32_port.h:430
#define isupper_l
Definition: win32_port.h:440