PostgreSQL Source Code git master
pg_locale_icu.c
Go to the documentation of this file.
1/*-----------------------------------------------------------------------
2 *
3 * PostgreSQL locale utilities for ICU
4 *
5 * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
6 *
7 * src/backend/utils/adt/pg_locale_icu.c
8 *
9 *-----------------------------------------------------------------------
10 */
11
12#include "postgres.h"
13
14#ifdef USE_ICU
15#include <unicode/ucnv.h>
16#include <unicode/ustring.h>
17
18/*
19 * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
20 * (see
21 * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>)
22 */
23#if U_ICU_VERSION_MAJOR_NUM >= 53
24#define HAVE_UCOL_STRCOLLUTF8 1
25#else
26#undef HAVE_UCOL_STRCOLLUTF8
27#endif
28
29#endif
30
31#include "access/htup_details.h"
32#include "catalog/pg_database.h"
34#include "mb/pg_wchar.h"
35#include "miscadmin.h"
36#include "utils/builtins.h"
37#include "utils/formatting.h"
38#include "utils/memutils.h"
39#include "utils/pg_locale.h"
40#include "utils/syscache.h"
41
42/*
43 * Size of stack buffer to use for string transformations, used to avoid heap
44 * allocations in typical cases. This should be large enough that most strings
45 * will fit, but small enough that we feel comfortable putting it on the
46 * stack.
47 */
48#define TEXTBUFLEN 1024
49
51
52#ifdef USE_ICU
53
54extern UCollator *pg_ucol_open(const char *loc_str);
55
56static size_t strlower_icu(char *dest, size_t destsize, const char *src,
57 ssize_t srclen, pg_locale_t locale);
58static size_t strtitle_icu(char *dest, size_t destsize, const char *src,
59 ssize_t srclen, pg_locale_t locale);
60static size_t strupper_icu(char *dest, size_t destsize, const char *src,
61 ssize_t srclen, pg_locale_t locale);
62static size_t strfold_icu(char *dest, size_t destsize, const char *src,
63 ssize_t srclen, pg_locale_t locale);
64static int strncoll_icu(const char *arg1, ssize_t len1,
65 const char *arg2, ssize_t len2,
67static size_t strnxfrm_icu(char *dest, size_t destsize,
68 const char *src, ssize_t srclen,
70extern char *get_collation_actual_version_icu(const char *collcollate);
71
72typedef int32_t (*ICU_Convert_Func) (UChar *dest, int32_t destCapacity,
73 const UChar *src, int32_t srcLength,
74 const char *locale,
75 UErrorCode *pErrorCode);
76
77/*
78 * Converter object for converting between ICU's UChar strings and C strings
79 * in database encoding. Since the database encoding doesn't change, we only
80 * need one of these per session.
81 */
82static UConverter *icu_converter = NULL;
83
84static UCollator *make_icu_collator(const char *iculocstr,
85 const char *icurules);
86static int strncoll_icu(const char *arg1, ssize_t len1,
87 const char *arg2, ssize_t len2,
89static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
90 const char *src, ssize_t srclen,
92#ifdef HAVE_UCOL_STRCOLLUTF8
93static int strncoll_icu_utf8(const char *arg1, ssize_t len1,
94 const char *arg2, ssize_t len2,
96#endif
97static size_t strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
98 const char *src, ssize_t srclen,
100static void init_icu_converter(void);
101static size_t uchar_length(UConverter *converter,
102 const char *str, int32_t len);
103static int32_t uchar_convert(UConverter *converter,
104 UChar *dest, int32_t destlen,
105 const char *src, int32_t srclen);
106static int32_t icu_to_uchar(UChar **buff_uchar, const char *buff,
107 size_t nbytes);
108static size_t icu_from_uchar(char *dest, size_t destsize,
109 const UChar *buff_uchar, int32_t len_uchar);
110static void icu_set_collation_attributes(UCollator *collator, const char *loc,
111 UErrorCode *status);
112static int32_t icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
113 UChar **buff_dest, UChar *buff_source,
114 int32_t len_source);
115static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
116 const UChar *src, int32_t srcLength,
117 const char *locale,
118 UErrorCode *pErrorCode);
119static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
120 const UChar *src, int32_t srcLength,
121 const char *locale,
122 UErrorCode *pErrorCode);
123
124static bool
125char_is_cased_icu(char ch, pg_locale_t locale)
126{
127 return IS_HIGHBIT_SET(ch) ||
128 (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
129}
130
131/*
132 * XXX: many of the functions below rely on casts directly from pg_wchar to
133 * UChar32, which is correct for the UTF-8 encoding, but not in general.
134 */
135
136static pg_wchar
137toupper_icu(pg_wchar wc, pg_locale_t locale)
138{
139 return u_toupper(wc);
140}
141
142static pg_wchar
143tolower_icu(pg_wchar wc, pg_locale_t locale)
144{
145 return u_tolower(wc);
146}
147
148static const struct collate_methods collate_methods_icu = {
149 .strncoll = strncoll_icu,
150 .strnxfrm = strnxfrm_icu,
151 .strnxfrm_prefix = strnxfrm_prefix_icu,
152 .strxfrm_is_safe = true,
153};
154
155static const struct collate_methods collate_methods_icu_utf8 = {
156#ifdef HAVE_UCOL_STRCOLLUTF8
157 .strncoll = strncoll_icu_utf8,
158#else
159 .strncoll = strncoll_icu,
160#endif
161 .strnxfrm = strnxfrm_icu,
162 .strnxfrm_prefix = strnxfrm_prefix_icu_utf8,
163 .strxfrm_is_safe = true,
164};
165
166static bool
167wc_isdigit_icu(pg_wchar wc, pg_locale_t locale)
168{
169 return u_isdigit(wc);
170}
171
172static bool
173wc_isalpha_icu(pg_wchar wc, pg_locale_t locale)
174{
175 return u_isalpha(wc);
176}
177
178static bool
179wc_isalnum_icu(pg_wchar wc, pg_locale_t locale)
180{
181 return u_isalnum(wc);
182}
183
184static bool
185wc_isupper_icu(pg_wchar wc, pg_locale_t locale)
186{
187 return u_isupper(wc);
188}
189
190static bool
191wc_islower_icu(pg_wchar wc, pg_locale_t locale)
192{
193 return u_islower(wc);
194}
195
196static bool
197wc_isgraph_icu(pg_wchar wc, pg_locale_t locale)
198{
199 return u_isgraph(wc);
200}
201
202static bool
203wc_isprint_icu(pg_wchar wc, pg_locale_t locale)
204{
205 return u_isprint(wc);
206}
207
208static bool
209wc_ispunct_icu(pg_wchar wc, pg_locale_t locale)
210{
211 return u_ispunct(wc);
212}
213
214static bool
215wc_isspace_icu(pg_wchar wc, pg_locale_t locale)
216{
217 return u_isspace(wc);
218}
219
220static bool
221wc_isxdigit_icu(pg_wchar wc, pg_locale_t locale)
222{
223 return u_isxdigit(wc);
224}
225
226static bool
227wc_iscased_icu(pg_wchar wc, pg_locale_t locale)
228{
229 return u_hasBinaryProperty(wc, UCHAR_CASED);
230}
231
232static const struct ctype_methods ctype_methods_icu = {
233 .strlower = strlower_icu,
234 .strtitle = strtitle_icu,
235 .strupper = strupper_icu,
236 .strfold = strfold_icu,
237 .wc_isdigit = wc_isdigit_icu,
238 .wc_isalpha = wc_isalpha_icu,
239 .wc_isalnum = wc_isalnum_icu,
240 .wc_isupper = wc_isupper_icu,
241 .wc_islower = wc_islower_icu,
242 .wc_isgraph = wc_isgraph_icu,
243 .wc_isprint = wc_isprint_icu,
244 .wc_ispunct = wc_ispunct_icu,
245 .wc_isspace = wc_isspace_icu,
246 .wc_isxdigit = wc_isxdigit_icu,
247 .char_is_cased = char_is_cased_icu,
248 .wc_iscased = wc_iscased_icu,
249 .wc_toupper = toupper_icu,
250 .wc_tolower = tolower_icu,
251};
252#endif
253
256{
257#ifdef USE_ICU
258 bool deterministic;
259 const char *iculocstr;
260 const char *icurules = NULL;
261 UCollator *collator;
262 pg_locale_t result;
263
264 if (collid == DEFAULT_COLLATION_OID)
265 {
266 HeapTuple tp;
267 Datum datum;
268 bool isnull;
269
271 if (!HeapTupleIsValid(tp))
272 elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
273
274 /* default database collation is always deterministic */
275 deterministic = true;
276 datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
277 Anum_pg_database_datlocale);
278 iculocstr = TextDatumGetCString(datum);
279 datum = SysCacheGetAttr(DATABASEOID, tp,
280 Anum_pg_database_daticurules, &isnull);
281 if (!isnull)
282 icurules = TextDatumGetCString(datum);
283
284 ReleaseSysCache(tp);
285 }
286 else
287 {
288 Form_pg_collation collform;
289 HeapTuple tp;
290 Datum datum;
291 bool isnull;
292
294 if (!HeapTupleIsValid(tp))
295 elog(ERROR, "cache lookup failed for collation %u", collid);
296 collform = (Form_pg_collation) GETSTRUCT(tp);
297 deterministic = collform->collisdeterministic;
298 datum = SysCacheGetAttrNotNull(COLLOID, tp,
299 Anum_pg_collation_colllocale);
300 iculocstr = TextDatumGetCString(datum);
301 datum = SysCacheGetAttr(COLLOID, tp,
302 Anum_pg_collation_collicurules, &isnull);
303 if (!isnull)
304 icurules = TextDatumGetCString(datum);
305
306 ReleaseSysCache(tp);
307 }
308
309 collator = make_icu_collator(iculocstr, icurules);
310
311 result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
312 result->icu.locale = MemoryContextStrdup(context, iculocstr);
313 result->icu.ucol = collator;
314 result->deterministic = deterministic;
315 result->collate_is_c = false;
316 result->ctype_is_c = false;
318 result->collate = &collate_methods_icu_utf8;
319 else
320 result->collate = &collate_methods_icu;
321 result->ctype = &ctype_methods_icu;
322
323 return result;
324#else
325 /* could get here if a collation was created by a build with ICU */
327 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
328 errmsg("ICU is not supported in this build")));
329
330 return NULL;
331#endif
332}
333
334#ifdef USE_ICU
335
336/*
337 * Wrapper around ucol_open() to handle API differences for older ICU
338 * versions.
339 *
340 * Ensure that no path leaks a UCollator.
341 */
342UCollator *
343pg_ucol_open(const char *loc_str)
344{
345 UCollator *collator;
346 UErrorCode status;
347 const char *orig_str = loc_str;
348 char *fixed_str = NULL;
349
350 /*
351 * Must never open default collator, because it depends on the environment
352 * and may change at any time. Should not happen, but check here to catch
353 * bugs that might be hard to catch otherwise.
354 *
355 * NB: the default collator is not the same as the collator for the root
356 * locale. The root locale may be specified as the empty string, "und", or
357 * "root". The default collator is opened by passing NULL to ucol_open().
358 */
359 if (loc_str == NULL)
360 elog(ERROR, "opening default collator is not supported");
361
362 /*
363 * In ICU versions 54 and earlier, "und" is not a recognized spelling of
364 * the root locale. If the first component of the locale is "und", replace
365 * with "root" before opening.
366 */
367 if (U_ICU_VERSION_MAJOR_NUM < 55)
368 {
369 char lang[ULOC_LANG_CAPACITY];
370
371 status = U_ZERO_ERROR;
372 uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
373 if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
374 {
376 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
377 errmsg("could not get language from locale \"%s\": %s",
378 loc_str, u_errorName(status))));
379 }
380
381 if (strcmp(lang, "und") == 0)
382 {
383 const char *remainder = loc_str + strlen("und");
384
385 fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
386 strcpy(fixed_str, "root");
387 strcat(fixed_str, remainder);
388
389 loc_str = fixed_str;
390 }
391 }
392
393 status = U_ZERO_ERROR;
394 collator = ucol_open(loc_str, &status);
395 if (U_FAILURE(status))
397 /* use original string for error report */
398 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
399 errmsg("could not open collator for locale \"%s\": %s",
400 orig_str, u_errorName(status))));
401
402 if (U_ICU_VERSION_MAJOR_NUM < 54)
403 {
404 status = U_ZERO_ERROR;
405 icu_set_collation_attributes(collator, loc_str, &status);
406
407 /*
408 * Pretend the error came from ucol_open(), for consistent error
409 * message across ICU versions.
410 */
411 if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
412 {
413 ucol_close(collator);
415 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
416 errmsg("could not open collator for locale \"%s\": %s",
417 orig_str, u_errorName(status))));
418 }
419 }
420
421 if (fixed_str != NULL)
422 pfree(fixed_str);
423
424 return collator;
425}
426
427/*
428 * Create a UCollator with the given locale string and rules.
429 *
430 * Ensure that no path leaks a UCollator.
431 */
432static UCollator *
433make_icu_collator(const char *iculocstr, const char *icurules)
434{
435 if (!icurules)
436 {
437 /* simple case without rules */
438 return pg_ucol_open(iculocstr);
439 }
440 else
441 {
442 UCollator *collator_std_rules;
443 UCollator *collator_all_rules;
444 const UChar *std_rules;
445 UChar *my_rules;
446 UChar *all_rules;
447 int32_t length;
448 int32_t total;
449 UErrorCode status;
450
451 /*
452 * If rules are specified, we extract the rules of the standard
453 * collation, add our own rules, and make a new collator with the
454 * combined rules.
455 */
456 icu_to_uchar(&my_rules, icurules, strlen(icurules));
457
458 collator_std_rules = pg_ucol_open(iculocstr);
459
460 std_rules = ucol_getRules(collator_std_rules, &length);
461
462 total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
463
464 /* avoid leaking collator on OOM */
465 all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM);
466 if (!all_rules)
467 {
468 ucol_close(collator_std_rules);
470 (errcode(ERRCODE_OUT_OF_MEMORY),
471 errmsg("out of memory")));
472 }
473
474 u_strcpy(all_rules, std_rules);
475 u_strcat(all_rules, my_rules);
476
477 ucol_close(collator_std_rules);
478
479 status = U_ZERO_ERROR;
480 collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules),
481 UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,
482 NULL, &status);
483 if (U_FAILURE(status))
484 {
486 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
487 errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
488 iculocstr, icurules, u_errorName(status))));
489 }
490
491 return collator_all_rules;
492 }
493}
494
495static size_t
496strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
498{
499 int32_t len_uchar;
500 int32_t len_conv;
501 UChar *buff_uchar;
502 UChar *buff_conv;
503 size_t result_len;
504
505 len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
506 len_conv = icu_convert_case(u_strToLower, locale,
507 &buff_conv, buff_uchar, len_uchar);
508 result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
509 pfree(buff_uchar);
510 pfree(buff_conv);
511
512 return result_len;
513}
514
515static size_t
516strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
518{
519 int32_t len_uchar;
520 int32_t len_conv;
521 UChar *buff_uchar;
522 UChar *buff_conv;
523 size_t result_len;
524
525 len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
526 len_conv = icu_convert_case(u_strToTitle_default_BI, locale,
527 &buff_conv, buff_uchar, len_uchar);
528 result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
529 pfree(buff_uchar);
530 pfree(buff_conv);
531
532 return result_len;
533}
534
535static size_t
536strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
538{
539 int32_t len_uchar;
540 int32_t len_conv;
541 UChar *buff_uchar;
542 UChar *buff_conv;
543 size_t result_len;
544
545 len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
546 len_conv = icu_convert_case(u_strToUpper, locale,
547 &buff_conv, buff_uchar, len_uchar);
548 result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
549 pfree(buff_uchar);
550 pfree(buff_conv);
551
552 return result_len;
553}
554
555static size_t
556strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
558{
559 int32_t len_uchar;
560 int32_t len_conv;
561 UChar *buff_uchar;
562 UChar *buff_conv;
563 size_t result_len;
564
565 len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
566 len_conv = icu_convert_case(u_strFoldCase_default, locale,
567 &buff_conv, buff_uchar, len_uchar);
568 result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
569 pfree(buff_uchar);
570 pfree(buff_conv);
571
572 return result_len;
573}
574
/*
 * strncoll_icu_utf8
 *
 * Compare two strings by passing them directly to ucol_strcollUTF8(), with
 * no conversion to UChar. An argument length of -1 means the string is
 * NUL-terminated.
 */
582#ifdef HAVE_UCOL_STRCOLLUTF8
583int
584strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
586{
587 int result;
588 UErrorCode status;
589
591
592 status = U_ZERO_ERROR;
593 result = ucol_strcollUTF8(locale->icu.ucol,
594 arg1, len1,
595 arg2, len2,
596 &status);
597 if (U_FAILURE(status))
599 (errmsg("collation failed: %s", u_errorName(status))));
600
601 return result;
602}
603#endif
604
605/* 'srclen' of -1 means the strings are NUL-terminated */
606size_t
607strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
609{
610 char sbuf[TEXTBUFLEN];
611 char *buf = sbuf;
612 UChar *uchar;
613 int32_t ulen;
614 size_t uchar_bsize;
615 Size result_bsize;
616
617 init_icu_converter();
618
619 ulen = uchar_length(icu_converter, src, srclen);
620
621 uchar_bsize = (ulen + 1) * sizeof(UChar);
622
623 if (uchar_bsize > TEXTBUFLEN)
624 buf = palloc(uchar_bsize);
625
626 uchar = (UChar *) buf;
627
628 ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
629
630 result_bsize = ucol_getSortKey(locale->icu.ucol,
631 uchar, ulen,
632 (uint8_t *) dest, destsize);
633
634 /*
635 * ucol_getSortKey() counts the nul-terminator in the result length, but
636 * this function should not.
637 */
638 Assert(result_bsize > 0);
639 result_bsize--;
640
641 if (buf != sbuf)
642 pfree(buf);
643
644 /* if dest is defined, it should be nul-terminated */
645 Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
646
647 return result_bsize;
648}
649
650/* 'srclen' of -1 means the strings are NUL-terminated */
651size_t
652strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
653 const char *src, ssize_t srclen,
655{
656 size_t result;
657 UCharIterator iter;
658 uint32_t state[2];
659 UErrorCode status;
660
662
663 uiter_setUTF8(&iter, src, srclen);
664 state[0] = state[1] = 0; /* won't need that again */
665 status = U_ZERO_ERROR;
666 result = ucol_nextSortKeyPart(locale->icu.ucol,
667 &iter,
668 state,
669 (uint8_t *) dest,
670 destsize,
671 &status);
672 if (U_FAILURE(status))
674 (errmsg("sort key generation failed: %s",
675 u_errorName(status))));
676
677 return result;
678}
679
680char *
681get_collation_actual_version_icu(const char *collcollate)
682{
683 UCollator *collator;
684 UVersionInfo versioninfo;
685 char buf[U_MAX_VERSION_STRING_LENGTH];
686
687 collator = pg_ucol_open(collcollate);
688
689 ucol_getVersion(collator, versioninfo);
690 ucol_close(collator);
691
692 u_versionToString(versioninfo, buf);
693 return pstrdup(buf);
694}
695
696/*
697 * Convert a string in the database encoding into a string of UChars.
698 *
699 * The source string at buff is of length nbytes
700 * (it needn't be nul-terminated)
701 *
702 * *buff_uchar receives a pointer to the palloc'd result string, and
703 * the function's result is the number of UChars generated.
704 *
705 * The result string is nul-terminated, though most callers rely on the
706 * result length instead.
707 */
708static int32_t
709icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
710{
711 int32_t len_uchar;
712
713 init_icu_converter();
714
715 len_uchar = uchar_length(icu_converter, buff, nbytes);
716
717 *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
718 len_uchar = uchar_convert(icu_converter,
719 *buff_uchar, len_uchar + 1, buff, nbytes);
720
721 return len_uchar;
722}
723
/*
 * Convert a string of UChars into the database encoding.
 *
 * The source string at buff_uchar is of length len_uchar
 * (it needn't be nul-terminated)
 *
 * The converted, nul-terminated string is written into the caller-supplied
 * buffer dest, which has room for destsize bytes. The function's result is
 * the number of bytes required (not counting nul). If the result plus the
 * nul does not fit in destsize, nothing is written to dest and only the
 * required length is returned.
 */
735static size_t
736icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar)
737{
738 UErrorCode status;
739 int32_t len_result;
740
741 init_icu_converter();
742
743 status = U_ZERO_ERROR;
744 len_result = ucnv_fromUChars(icu_converter, NULL, 0,
745 buff_uchar, len_uchar, &status);
746 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
748 (errmsg("%s failed: %s", "ucnv_fromUChars",
749 u_errorName(status))));
750
751 if (len_result + 1 > destsize)
752 return len_result;
753
754 status = U_ZERO_ERROR;
755 len_result = ucnv_fromUChars(icu_converter, dest, len_result + 1,
756 buff_uchar, len_uchar, &status);
757 if (U_FAILURE(status) ||
758 status == U_STRING_NOT_TERMINATED_WARNING)
760 (errmsg("%s failed: %s", "ucnv_fromUChars",
761 u_errorName(status))));
762
763 return len_result;
764}
765
766static int32_t
767icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
768 UChar **buff_dest, UChar *buff_source, int32_t len_source)
769{
770 UErrorCode status;
771 int32_t len_dest;
772
773 len_dest = len_source; /* try first with same length */
774 *buff_dest = palloc(len_dest * sizeof(**buff_dest));
775 status = U_ZERO_ERROR;
776 len_dest = func(*buff_dest, len_dest, buff_source, len_source,
777 mylocale->icu.locale, &status);
778 if (status == U_BUFFER_OVERFLOW_ERROR)
779 {
780 /* try again with adjusted length */
781 pfree(*buff_dest);
782 *buff_dest = palloc(len_dest * sizeof(**buff_dest));
783 status = U_ZERO_ERROR;
784 len_dest = func(*buff_dest, len_dest, buff_source, len_source,
785 mylocale->icu.locale, &status);
786 }
787 if (U_FAILURE(status))
789 (errmsg("case conversion failed: %s", u_errorName(status))));
790 return len_dest;
791}
792
793static int32_t
794u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
795 const UChar *src, int32_t srcLength,
796 const char *locale,
797 UErrorCode *pErrorCode)
798{
799 return u_strToTitle(dest, destCapacity, src, srcLength,
800 NULL, locale, pErrorCode);
801}
802
803static int32_t
804u_strFoldCase_default(UChar *dest, int32_t destCapacity,
805 const UChar *src, int32_t srcLength,
806 const char *locale,
807 UErrorCode *pErrorCode)
808{
809 uint32 options = U_FOLD_CASE_DEFAULT;
810 char lang[3];
811 UErrorCode status;
812
813 /*
814 * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
815 * folding does not accept a locale. Instead it just supports a single
816 * option relevant to Turkic languages 'az' and 'tr'; check for those
817 * languages to enable the option.
818 */
819 status = U_ZERO_ERROR;
820 uloc_getLanguage(locale, lang, 3, &status);
821 if (U_SUCCESS(status))
822 {
823 /*
824 * The option name is confusing, but it causes u_strFoldCase to use
825 * the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
826 */
827 if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0)
828 options = U_FOLD_CASE_EXCLUDE_SPECIAL_I;
829 }
830
831 return u_strFoldCase(dest, destCapacity, src, srcLength,
832 options, pErrorCode);
833}
834
835/*
836 * strncoll_icu
837 *
838 * Convert the arguments from the database encoding to UChar strings, then
839 * call ucol_strcoll(). An argument length of -1 means that the string is
840 * NUL-terminated.
841 *
842 * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
843 * caller should call that instead.
844 */
845static int
846strncoll_icu(const char *arg1, ssize_t len1,
847 const char *arg2, ssize_t len2, pg_locale_t locale)
848{
849 char sbuf[TEXTBUFLEN];
850 char *buf = sbuf;
851 int32_t ulen1;
852 int32_t ulen2;
853 size_t bufsize1;
854 size_t bufsize2;
855 UChar *uchar1,
856 *uchar2;
857 int result;
858
859 /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */
860#ifdef HAVE_UCOL_STRCOLLUTF8
862#endif
863
864 init_icu_converter();
865
866 ulen1 = uchar_length(icu_converter, arg1, len1);
867 ulen2 = uchar_length(icu_converter, arg2, len2);
868
869 bufsize1 = (ulen1 + 1) * sizeof(UChar);
870 bufsize2 = (ulen2 + 1) * sizeof(UChar);
871
872 if (bufsize1 + bufsize2 > TEXTBUFLEN)
873 buf = palloc(bufsize1 + bufsize2);
874
875 uchar1 = (UChar *) buf;
876 uchar2 = (UChar *) (buf + bufsize1);
877
878 ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
879 ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
880
881 result = ucol_strcoll(locale->icu.ucol,
882 uchar1, ulen1,
883 uchar2, ulen2);
884
885 if (buf != sbuf)
886 pfree(buf);
887
888 return result;
889}
890
891/* 'srclen' of -1 means the strings are NUL-terminated */
892static size_t
893strnxfrm_prefix_icu(char *dest, size_t destsize,
894 const char *src, ssize_t srclen,
896{
897 char sbuf[TEXTBUFLEN];
898 char *buf = sbuf;
899 UCharIterator iter;
900 uint32_t state[2];
901 UErrorCode status;
902 int32_t ulen = -1;
903 UChar *uchar = NULL;
904 size_t uchar_bsize;
905 Size result_bsize;
906
907 /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */
909
910 init_icu_converter();
911
912 ulen = uchar_length(icu_converter, src, srclen);
913
914 uchar_bsize = (ulen + 1) * sizeof(UChar);
915
916 if (uchar_bsize > TEXTBUFLEN)
917 buf = palloc(uchar_bsize);
918
919 uchar = (UChar *) buf;
920
921 ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
922
923 uiter_setString(&iter, uchar, ulen);
924 state[0] = state[1] = 0; /* won't need that again */
925 status = U_ZERO_ERROR;
926 result_bsize = ucol_nextSortKeyPart(locale->icu.ucol,
927 &iter,
928 state,
929 (uint8_t *) dest,
930 destsize,
931 &status);
932 if (U_FAILURE(status))
934 (errmsg("sort key generation failed: %s",
935 u_errorName(status))));
936
937 return result_bsize;
938}
939
940static void
941init_icu_converter(void)
942{
943 const char *icu_encoding_name;
944 UErrorCode status;
945 UConverter *conv;
946
947 if (icu_converter)
948 return; /* already done */
949
950 icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
951 if (!icu_encoding_name)
953 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
954 errmsg("encoding \"%s\" not supported by ICU",
956
957 status = U_ZERO_ERROR;
958 conv = ucnv_open(icu_encoding_name, &status);
959 if (U_FAILURE(status))
961 (errmsg("could not open ICU converter for encoding \"%s\": %s",
962 icu_encoding_name, u_errorName(status))));
963
964 icu_converter = conv;
965}
966
967/*
968 * Find length, in UChars, of given string if converted to UChar string.
969 *
970 * A length of -1 indicates that the input string is NUL-terminated.
971 */
972static size_t
973uchar_length(UConverter *converter, const char *str, int32_t len)
974{
975 UErrorCode status = U_ZERO_ERROR;
976 int32_t ulen;
977
978 ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
979 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
981 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
982 return ulen;
983}
984
985/*
986 * Convert the given source string into a UChar string, stored in dest, and
987 * return the length (in UChars).
988 *
989 * A srclen of -1 indicates that the input string is NUL-terminated.
990 */
991static int32_t
992uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
993 const char *src, int32_t srclen)
994{
995 UErrorCode status = U_ZERO_ERROR;
996 int32_t ulen;
997
998 status = U_ZERO_ERROR;
999 ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
1000 if (U_FAILURE(status))
1001 ereport(ERROR,
1002 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
1003 return ulen;
1004}
1005
1006/*
1007 * Parse collation attributes from the given locale string and apply them to
1008 * the open collator.
1009 *
1010 * First, the locale string is canonicalized to an ICU format locale ID such
1011 * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
1012 * the key-value arguments.
1013 *
1014 * Starting with ICU version 54, the attributes are processed automatically by
1015 * ucol_open(), so this is only necessary for emulating this behavior on older
1016 * versions.
1017 */
1019static void
1020icu_set_collation_attributes(UCollator *collator, const char *loc,
1021 UErrorCode *status)
1022{
1023 int32_t len;
1024 char *icu_locale_id;
1025 char *lower_str;
1026 char *str;
1027 char *token;
1028
1029 /*
1030 * The input locale may be a BCP 47 language tag, e.g.
1031 * "und-u-kc-ks-level1", which expresses the same attributes in a
1032 * different form. It will be converted to the equivalent ICU format
1033 * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
1034 * uloc_canonicalize().
1035 */
1036 *status = U_ZERO_ERROR;
1037 len = uloc_canonicalize(loc, NULL, 0, status);
1038 icu_locale_id = palloc(len + 1);
1039 *status = U_ZERO_ERROR;
1040 len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
1041 if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
1042 return;
1043
1044 lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
1045
1046 pfree(icu_locale_id);
1047
1048 str = strchr(lower_str, '@');
1049 if (!str)
1050 return;
1051 str++;
1052
1053 while ((token = strsep(&str, ";")))
1054 {
1055 char *e = strchr(token, '=');
1056
1057 if (e)
1058 {
1059 char *name;
1060 char *value;
1061 UColAttribute uattr;
1062 UColAttributeValue uvalue;
1063
1064 *status = U_ZERO_ERROR;
1065
1066 *e = '\0';
1067 name = token;
1068 value = e + 1;
1069
1070 /*
1071 * See attribute name and value lists in ICU i18n/coll.cpp
1072 */
1073 if (strcmp(name, "colstrength") == 0)
1074 uattr = UCOL_STRENGTH;
1075 else if (strcmp(name, "colbackwards") == 0)
1076 uattr = UCOL_FRENCH_COLLATION;
1077 else if (strcmp(name, "colcaselevel") == 0)
1078 uattr = UCOL_CASE_LEVEL;
1079 else if (strcmp(name, "colcasefirst") == 0)
1080 uattr = UCOL_CASE_FIRST;
1081 else if (strcmp(name, "colalternate") == 0)
1082 uattr = UCOL_ALTERNATE_HANDLING;
1083 else if (strcmp(name, "colnormalization") == 0)
1084 uattr = UCOL_NORMALIZATION_MODE;
1085 else if (strcmp(name, "colnumeric") == 0)
1086 uattr = UCOL_NUMERIC_COLLATION;
1087 else
1088 /* ignore if unknown */
1089 continue;
1090
1091 if (strcmp(value, "primary") == 0)
1092 uvalue = UCOL_PRIMARY;
1093 else if (strcmp(value, "secondary") == 0)
1094 uvalue = UCOL_SECONDARY;
1095 else if (strcmp(value, "tertiary") == 0)
1096 uvalue = UCOL_TERTIARY;
1097 else if (strcmp(value, "quaternary") == 0)
1098 uvalue = UCOL_QUATERNARY;
1099 else if (strcmp(value, "identical") == 0)
1100 uvalue = UCOL_IDENTICAL;
1101 else if (strcmp(value, "no") == 0)
1102 uvalue = UCOL_OFF;
1103 else if (strcmp(value, "yes") == 0)
1104 uvalue = UCOL_ON;
1105 else if (strcmp(value, "shifted") == 0)
1106 uvalue = UCOL_SHIFTED;
1107 else if (strcmp(value, "non-ignorable") == 0)
1108 uvalue = UCOL_NON_IGNORABLE;
1109 else if (strcmp(value, "lower") == 0)
1110 uvalue = UCOL_LOWER_FIRST;
1111 else if (strcmp(value, "upper") == 0)
1112 uvalue = UCOL_UPPER_FIRST;
1113 else
1114 {
1115 *status = U_ILLEGAL_ARGUMENT_ERROR;
1116 break;
1117 }
1118
1119 ucol_setAttribute(collator, uattr, uvalue, status);
1120 }
1121 }
1122
1123 pfree(lower_str);
1124}
1125
1126#endif /* USE_ICU */
#define TextDatumGetCString(d)
Definition: builtins.h:98
#define pg_attribute_unused()
Definition: c.h:138
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1153
uint32_t uint32
Definition: c.h:552
size_t Size
Definition: c.h:624
Oid collid
int errcode(int sqlerrcode)
Definition: elog.c:863
int errmsg(const char *fmt,...)
Definition: elog.c:1080
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
const char * get_encoding_name_for_icu(int encoding)
Definition: encnames.c:472
#define MCXT_ALLOC_NO_OOM
Definition: fe_memutils.h:29
char * asc_tolower(const char *buff, size_t nbytes)
Definition: formatting.c:1888
Oid MyDatabaseId
Definition: globals.c:94
Assert(PointerIsAligned(start, uint64))
const char * str
size_t remainder
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
static void * GETSTRUCT(const HeapTupleData *tuple)
Definition: htup_details.h:728
#define token
Definition: indent_globs.h:126
static struct @171 value
static char * locale
Definition: initdb.c:140
unsigned int pg_wchar
Definition: mbprint.c:31
int GetDatabaseEncoding(void)
Definition: mbutils.c:1264
char * MemoryContextStrdup(MemoryContext context, const char *string)
Definition: mcxt.c:1746
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1263
char * pstrdup(const char *in)
Definition: mcxt.c:1759
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc(Size size)
Definition: mcxt.c:1365
void * palloc_extended(Size size, int flags)
Definition: mcxt.c:1417
FormData_pg_collation * Form_pg_collation
Definition: pg_collation.h:58
const void size_t len
pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context)
#define TEXTBUFLEN
Definition: pg_locale_icu.c:48
static char buf[DEFAULT_XLOG_SEG_SIZE]
Definition: pg_test_fsync.c:71
@ PG_UTF8
Definition: pg_wchar.h:232
#define pg_encoding_to_char
Definition: pg_wchar.h:630
char * strsep(char **stringp, const char *delim)
Definition: strsep.c:49
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:262
uint64_t Datum
Definition: postgres.h:70
unsigned int Oid
Definition: postgres_ext.h:32
e
Definition: preproc-init.c:82
int(* strncoll)(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
Definition: pg_locale.h:75
size_t(* strlower)(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
Definition: pg_locale.h:101
const struct ctype_methods * ctype
Definition: pg_locale.h:155
const struct collate_methods * collate
Definition: pg_locale.h:154
const char * locale
Definition: pg_locale.h:161
Definition: regguts.h:323
void ReleaseSysCache(HeapTuple tuple)
Definition: syscache.c:264
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Definition: syscache.c:220
Datum SysCacheGetAttr(int cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull)
Definition: syscache.c:595
Datum SysCacheGetAttrNotNull(int cacheId, HeapTuple tup, AttrNumber attributeNumber)
Definition: syscache.c:625
const char * name