PostgreSQL Source Code  git master
pg_locale.c
Go to the documentation of this file.
1 /*-----------------------------------------------------------------------
2  *
3  * PostgreSQL locale utilities
4  *
5  * Portions Copyright (c) 2002-2024, PostgreSQL Global Development Group
6  *
7  * src/backend/utils/adt/pg_locale.c
8  *
9  *-----------------------------------------------------------------------
10  */
11 
12 /*----------
13  * Here is how the locale stuff is handled: LC_COLLATE and LC_CTYPE
14  * are fixed at CREATE DATABASE time, stored in pg_database, and cannot
15  * be changed. Thus, the effects of strcoll(), strxfrm(), isupper(),
16  * toupper(), etc. are always in the same fixed locale.
17  *
18  * LC_MESSAGES is settable at run time and will take effect
19  * immediately.
20  *
21  * The other categories, LC_MONETARY, LC_NUMERIC, and LC_TIME are also
22  * settable at run-time. However, we don't actually set those locale
23  * categories permanently. This would have bizarre effects like no
24  * longer accepting standard floating-point literals in some locales.
25  * Instead, we only set these locale categories briefly when needed,
26  * cache the required information obtained from localeconv() or
27  * strftime(), and then set the locale categories back to "C".
28  * The cached information is only used by the formatting functions
29  * (to_char, etc.) and the money type. For the user, this should all be
30  * transparent.
31  *
32  * !!! NOW HEAR THIS !!!
33  *
34  * We've been bitten repeatedly by this bug, so let's try to keep it in
35  * mind in future: on some platforms, the locale functions return pointers
36  * to static data that will be overwritten by any later locale function.
37  * Thus, for example, the obvious-looking sequence
38  * save = setlocale(category, NULL);
39  * if (!setlocale(category, value))
40  * fail = true;
41  * setlocale(category, save);
42  * DOES NOT WORK RELIABLY: on some platforms the second setlocale() call
43  * will change the memory save is pointing at. To do this sort of thing
44  * safely, you *must* pstrdup what setlocale returns the first time.
45  *
46  * The POSIX locale standard is available here:
47  *
48  * http://www.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap07.html
49  *----------
50  */
51 
52 
53 #include "postgres.h"
54 
55 #include <time.h>
56 
57 #include "access/htup_details.h"
58 #include "catalog/pg_collation.h"
59 #include "catalog/pg_database.h"
60 #include "common/hashfn.h"
61 #include "common/string.h"
62 #include "mb/pg_wchar.h"
63 #include "miscadmin.h"
64 #include "utils/builtins.h"
65 #include "utils/formatting.h"
66 #include "utils/guc_hooks.h"
67 #include "utils/lsyscache.h"
68 #include "utils/memutils.h"
69 #include "utils/pg_locale.h"
70 #include "utils/syscache.h"
71 
72 #ifdef USE_ICU
73 #include <unicode/ucnv.h>
74 #include <unicode/ustring.h>
75 #endif
76 
77 #ifdef __GLIBC__
78 #include <gnu/libc-version.h>
79 #endif
80 
81 #ifdef WIN32
82 #include <shlwapi.h>
83 #endif
84 
85 /* Error triggered for locale-sensitive subroutines */
86 #define PGLOCALE_SUPPORT_ERROR(provider) \
87  elog(ERROR, "unsupported collprovider for %s: %c", __func__, provider)
88 
89 /*
90  * This should be large enough that most strings will fit, but small enough
91  * that we feel comfortable putting it on the stack
92  */
93 #define TEXTBUFLEN 1024
94 
95 #define MAX_L10N_DATA 80
96 
97 
98 /* GUC settings */
103 
105 
106 /*
107  * lc_time localization cache.
108  *
109  * We use only the first 7 or 12 entries of these arrays. The last array
110  * element is left as NULL for the convenience of outside code that wants
111  * to sequentially scan these arrays.
112  */
114 char *localized_full_days[7 + 1];
116 char *localized_full_months[12 + 1];
117 
118 /* is the databases's LC_CTYPE the C locale? */
119 bool database_ctype_is_c = false;
120 
121 static struct pg_locale_struct default_locale;
122 
123 /* indicates whether locale information cache is valid */
124 static bool CurrentLocaleConvValid = false;
125 static bool CurrentLCTimeValid = false;
126 
127 /* Cache for collation-related knowledge */
128 
129 typedef struct
130 {
131  Oid collid; /* hash key: pg_collation OID */
132  pg_locale_t locale; /* locale_t struct, or 0 if not valid */
133 
134  /* needed for simplehash */
136  char status;
138 
139 #define SH_PREFIX collation_cache
140 #define SH_ELEMENT_TYPE collation_cache_entry
141 #define SH_KEY_TYPE Oid
142 #define SH_KEY collid
143 #define SH_HASH_KEY(tb, key) murmurhash32((uint32) key)
144 #define SH_EQUAL(tb, a, b) (a == b)
145 #define SH_GET_HASH(tb, a) a->hash
146 #define SH_SCOPE static inline
147 #define SH_STORE_HASH
148 #define SH_DECLARE
149 #define SH_DEFINE
150 #include "lib/simplehash.h"
151 
153 static collation_cache_hash *CollationCache = NULL;
154 
155 /*
156  * The collation cache is often accessed repeatedly for the same collation, so
157  * remember the last one used.
158  */
161 
162 #if defined(WIN32) && defined(LC_MESSAGES)
163 static char *IsoLocaleName(const char *);
164 #endif
165 
166 #ifdef USE_ICU
167 /*
168  * Converter object for converting between ICU's UChar strings and C strings
169  * in database encoding. Since the database encoding doesn't change, we only
170  * need one of these per session.
171  */
172 static UConverter *icu_converter = NULL;
173 
174 static UCollator *pg_ucol_open(const char *loc_str);
175 static void init_icu_converter(void);
176 static size_t uchar_length(UConverter *converter,
177  const char *str, int32_t len);
178 static int32_t uchar_convert(UConverter *converter,
179  UChar *dest, int32_t destlen,
180  const char *src, int32_t srclen);
181 static void icu_set_collation_attributes(UCollator *collator, const char *loc,
182  UErrorCode *status);
183 #endif
184 
185 /*
186  * POSIX doesn't define _l-variants of these functions, but several systems
187  * have them. We provide our own replacements here.
188  */
189 #ifndef HAVE_MBSTOWCS_L
static size_t
mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
{
#ifdef WIN32
	/* Windows ships a native per-locale variant; just forward to it */
	return _mbstowcs_l(dest, src, n, loc);
#else
	locale_t	prev_locale;
	size_t		nconverted;

	/* Install "loc" only for the duration of the conversion */
	prev_locale = uselocale(loc);
	nconverted = mbstowcs(dest, src, n);
	uselocale(prev_locale);

	return nconverted;
#endif
}
204 #endif
205 #ifndef HAVE_WCSTOMBS_L
static size_t
wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
{
#ifdef WIN32
	/* Windows ships a native per-locale variant; just forward to it */
	return _wcstombs_l(dest, src, n, loc);
#else
	locale_t	prev_locale;
	size_t		nbytes;

	/* Install "loc" only for the duration of the conversion */
	prev_locale = uselocale(loc);
	nbytes = wcstombs(dest, src, n);
	uselocale(prev_locale);

	return nbytes;
#endif
}
220 #endif
221 
222 /*
223  * pg_perm_setlocale
224  *
225  * This wraps the libc function setlocale(), with two additions. First, when
226  * changing LC_CTYPE, update gettext's encoding for the current message
227  * domain. GNU gettext automatically tracks LC_CTYPE on most platforms, but
228  * not on Windows. Second, if the operation is successful, the corresponding
229  * LC_XXX environment variable is set to match. By setting the environment
230  * variable, we ensure that any subsequent use of setlocale(..., "") will
231  * preserve the settings made through this routine. Of course, LC_ALL must
232  * also be unset to fully ensure that, but that has to be done elsewhere after
233  * all the individual LC_XXX variables have been set correctly. (Thank you
234  * Perl for making this kluge necessary.)
235  */
236 char *
237 pg_perm_setlocale(int category, const char *locale)
238 {
239  char *result;
240  const char *envvar;
241 
242 #ifndef WIN32
243  result = setlocale(category, locale);
244 #else
245 
246  /*
247  * On Windows, setlocale(LC_MESSAGES) does not work, so just assume that
248  * the given value is good and set it in the environment variables. We
249  * must ignore attempts to set to "", which means "keep using the old
250  * environment value".
251  */
252 #ifdef LC_MESSAGES
253  if (category == LC_MESSAGES)
254  {
255  result = (char *) locale;
256  if (locale == NULL || locale[0] == '\0')
257  return result;
258  }
259  else
260 #endif
261  result = setlocale(category, locale);
262 #endif /* WIN32 */
263 
264  if (result == NULL)
265  return result; /* fall out immediately on failure */
266 
267  /*
268  * Use the right encoding in translated messages. Under ENABLE_NLS, let
269  * pg_bind_textdomain_codeset() figure it out. Under !ENABLE_NLS, message
270  * format strings are ASCII, but database-encoding strings may enter the
271  * message via %s. This makes the overall message encoding equal to the
272  * database encoding.
273  */
274  if (category == LC_CTYPE)
275  {
276  static char save_lc_ctype[LOCALE_NAME_BUFLEN];
277 
278  /* copy setlocale() return value before callee invokes it again */
279  strlcpy(save_lc_ctype, result, sizeof(save_lc_ctype));
280  result = save_lc_ctype;
281 
282 #ifdef ENABLE_NLS
283  SetMessageEncoding(pg_bind_textdomain_codeset(textdomain(NULL)));
284 #else
286 #endif
287  }
288 
289  switch (category)
290  {
291  case LC_COLLATE:
292  envvar = "LC_COLLATE";
293  break;
294  case LC_CTYPE:
295  envvar = "LC_CTYPE";
296  break;
297 #ifdef LC_MESSAGES
298  case LC_MESSAGES:
299  envvar = "LC_MESSAGES";
300 #ifdef WIN32
301  result = IsoLocaleName(locale);
302  if (result == NULL)
303  result = (char *) locale;
304  elog(DEBUG3, "IsoLocaleName() executed; locale: \"%s\"", result);
305 #endif /* WIN32 */
306  break;
307 #endif /* LC_MESSAGES */
308  case LC_MONETARY:
309  envvar = "LC_MONETARY";
310  break;
311  case LC_NUMERIC:
312  envvar = "LC_NUMERIC";
313  break;
314  case LC_TIME:
315  envvar = "LC_TIME";
316  break;
317  default:
318  elog(FATAL, "unrecognized LC category: %d", category);
319  return NULL; /* keep compiler quiet */
320  }
321 
322  if (setenv(envvar, result, 1) != 0)
323  return NULL;
324 
325  return result;
326 }
327 
328 
329 /*
330  * Is the locale name valid for the locale category?
331  *
332  * If successful, and canonname isn't NULL, a palloc'd copy of the locale's
333  * canonical name is stored there. This is especially useful for figuring out
334  * what locale name "" means (ie, the server environment value). (Actually,
335  * it seems that on most implementations that's the only thing it's good for;
336  * we could wish that setlocale gave back a canonically spelled version of
337  * the locale name, but typically it doesn't.)
338  */
339 bool
340 check_locale(int category, const char *locale, char **canonname)
341 {
342  char *save;
343  char *res;
344 
345  /* Don't let Windows' non-ASCII locale names in. */
346  if (!pg_is_ascii(locale))
347  {
349  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
350  errmsg("locale name \"%s\" contains non-ASCII characters",
351  locale)));
352  return false;
353  }
354 
355  if (canonname)
356  *canonname = NULL; /* in case of failure */
357 
358  save = setlocale(category, NULL);
359  if (!save)
360  return false; /* won't happen, we hope */
361 
362  /* save may be pointing at a modifiable scratch variable, see above. */
363  save = pstrdup(save);
364 
365  /* set the locale with setlocale, to see if it accepts it. */
366  res = setlocale(category, locale);
367 
368  /* save canonical name if requested. */
369  if (res && canonname)
370  *canonname = pstrdup(res);
371 
372  /* restore old value. */
373  if (!setlocale(category, save))
374  elog(WARNING, "failed to restore old locale \"%s\"", save);
375  pfree(save);
376 
377  /* Don't let Windows' non-ASCII locale names out. */
378  if (canonname && *canonname && !pg_is_ascii(*canonname))
379  {
381  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
382  errmsg("locale name \"%s\" contains non-ASCII characters",
383  *canonname)));
384  pfree(*canonname);
385  *canonname = NULL;
386  return false;
387  }
388 
389  return (res != NULL);
390 }
391 
392 
393 /*
394  * GUC check/assign hooks
395  *
396  * For most locale categories, the assign hook doesn't actually set the locale
397  * permanently, just reset flags so that the next use will cache the
398  * appropriate values. (See explanation at the top of this file.)
399  *
400  * Note: we accept value = "" as selecting the postmaster's environment
401  * value, whatever it was (so long as the environment setting is legal).
402  * This will have been locked down by an earlier call to pg_perm_setlocale.
403  */
404 bool
406 {
407  return check_locale(LC_MONETARY, *newval, NULL);
408 }
409 
410 void
411 assign_locale_monetary(const char *newval, void *extra)
412 {
413  CurrentLocaleConvValid = false;
414 }
415 
416 bool
418 {
419  return check_locale(LC_NUMERIC, *newval, NULL);
420 }
421 
422 void
423 assign_locale_numeric(const char *newval, void *extra)
424 {
425  CurrentLocaleConvValid = false;
426 }
427 
428 bool
429 check_locale_time(char **newval, void **extra, GucSource source)
430 {
431  return check_locale(LC_TIME, *newval, NULL);
432 }
433 
434 void
435 assign_locale_time(const char *newval, void *extra)
436 {
437  CurrentLCTimeValid = false;
438 }
439 
440 /*
441  * We allow LC_MESSAGES to actually be set globally.
442  *
443  * Note: we normally disallow value = "" because it wouldn't have consistent
444  * semantics (it'd effectively just use the previous value). However, this
445  * is the value passed for PGC_S_DEFAULT, so don't complain in that case,
446  * not even if the attempted setting fails due to invalid environment value.
447  * The idea there is just to accept the environment setting *if possible*
448  * during startup, until we can read the proper value from postgresql.conf.
449  */
450 bool
452 {
453  if (**newval == '\0')
454  {
455  if (source == PGC_S_DEFAULT)
456  return true;
457  else
458  return false;
459  }
460 
461  /*
462  * LC_MESSAGES category does not exist everywhere, but accept it anyway
463  *
464  * On Windows, we can't even check the value, so accept blindly
465  */
466 #if defined(LC_MESSAGES) && !defined(WIN32)
467  return check_locale(LC_MESSAGES, *newval, NULL);
468 #else
469  return true;
470 #endif
471 }
472 
/*
 * GUC assign hook for lc_messages: unlike the other categories, the new
 * value is installed right away via pg_perm_setlocale().
 */
void
assign_locale_messages(const char *newval, void *extra)
{
#ifdef LC_MESSAGES

	/*
	 * The LC_MESSAGES category is not available on every platform; where it
	 * is missing this hook does nothing.  A failure to apply the value is
	 * deliberately ignored, so the result is discarded.
	 */
	(void) pg_perm_setlocale(LC_MESSAGES, newval);
#endif
}
484 
485 
/*
 * Release every malloc'd string hanging off a struct lconv, leaving the
 * struct itself alone.  This must never throw elog(ERROR).
 *
 * The set of fields handled here has to stay in sync with
 * struct_lconv_is_valid().
 */
static void
free_struct_lconv(struct lconv *s)
{
	char	   *owned[] = {
		s->decimal_point,
		s->thousands_sep,
		s->grouping,
		s->int_curr_symbol,
		s->currency_symbol,
		s->mon_decimal_point,
		s->mon_thousands_sep,
		s->mon_grouping,
		s->positive_sign,
		s->negative_sign
	};
	size_t		nowned = sizeof(owned) / sizeof(owned[0]);
	size_t		idx;

	/* free(NULL) is a no-op, so unset fields need no special casing */
	for (idx = 0; idx < nowned; idx++)
		free(owned[idx]);
}
504 
/*
 * Report whether every string field of a struct lconv that we care about
 * has been filled in (is non-NULL).  The set of fields tested must match
 * the set released by free_struct_lconv().
 */
static bool
struct_lconv_is_valid(struct lconv *s)
{
	return s->decimal_point != NULL &&
		s->thousands_sep != NULL &&
		s->grouping != NULL &&
		s->int_curr_symbol != NULL &&
		s->currency_symbol != NULL &&
		s->mon_decimal_point != NULL &&
		s->mon_thousands_sep != NULL &&
		s->mon_grouping != NULL &&
		s->positive_sign != NULL &&
		s->negative_sign != NULL;
}
534 
535 
536 /*
537  * Convert the strdup'd string at *str from the specified encoding to the
538  * database encoding.
539  */
540 static void
542 {
543  char *pstr;
544  char *mstr;
545 
546  /* convert the string to the database encoding */
547  pstr = pg_any_to_server(*str, strlen(*str), encoding);
548  if (pstr == *str)
549  return; /* no conversion happened */
550 
551  /* need it malloc'd not palloc'd */
552  mstr = strdup(pstr);
553  if (mstr == NULL)
554  ereport(ERROR,
555  (errcode(ERRCODE_OUT_OF_MEMORY),
556  errmsg("out of memory")));
557 
558  /* replace old string */
559  free(*str);
560  *str = mstr;
561 
562  pfree(pstr);
563 }
564 
565 
566 /*
567  * Return the POSIX lconv struct (contains number/money formatting
568  * information) with locale information for all categories.
569  */
570 struct lconv *
572 {
573  static struct lconv CurrentLocaleConv;
574  static bool CurrentLocaleConvAllocated = false;
575  struct lconv *extlconv;
576  struct lconv worklconv;
577  char *save_lc_monetary;
578  char *save_lc_numeric;
579 #ifdef WIN32
580  char *save_lc_ctype;
581 #endif
582 
583  /* Did we do it already? */
585  return &CurrentLocaleConv;
586 
587  /* Free any already-allocated storage */
588  if (CurrentLocaleConvAllocated)
589  {
590  free_struct_lconv(&CurrentLocaleConv);
591  CurrentLocaleConvAllocated = false;
592  }
593 
594  /*
595  * This is tricky because we really don't want to risk throwing error
596  * while the locale is set to other than our usual settings. Therefore,
597  * the process is: collect the usual settings, set locale to special
598  * setting, copy relevant data into worklconv using strdup(), restore
599  * normal settings, convert data to desired encoding, and finally stash
600  * the collected data in CurrentLocaleConv. This makes it safe if we
601  * throw an error during encoding conversion or run out of memory anywhere
602  * in the process. All data pointed to by struct lconv members is
603  * allocated with strdup, to avoid premature elog(ERROR) and to allow
604  * using a single cleanup routine.
605  */
606  memset(&worklconv, 0, sizeof(worklconv));
607 
608  /* Save prevailing values of monetary and numeric locales */
609  save_lc_monetary = setlocale(LC_MONETARY, NULL);
610  if (!save_lc_monetary)
611  elog(ERROR, "setlocale(NULL) failed");
612  save_lc_monetary = pstrdup(save_lc_monetary);
613 
614  save_lc_numeric = setlocale(LC_NUMERIC, NULL);
615  if (!save_lc_numeric)
616  elog(ERROR, "setlocale(NULL) failed");
617  save_lc_numeric = pstrdup(save_lc_numeric);
618 
619 #ifdef WIN32
620 
621  /*
622  * The POSIX standard explicitly says that it is undefined what happens if
623  * LC_MONETARY or LC_NUMERIC imply an encoding (codeset) different from
624  * that implied by LC_CTYPE. In practice, all Unix-ish platforms seem to
625  * believe that localeconv() should return strings that are encoded in the
626  * codeset implied by the LC_MONETARY or LC_NUMERIC locale name. Hence,
627  * once we have successfully collected the localeconv() results, we will
628  * convert them from that codeset to the desired server encoding.
629  *
630  * Windows, of course, resolutely does things its own way; on that
631  * platform LC_CTYPE has to match LC_MONETARY/LC_NUMERIC to get sane
632  * results. Hence, we must temporarily set that category as well.
633  */
634 
635  /* Save prevailing value of ctype locale */
636  save_lc_ctype = setlocale(LC_CTYPE, NULL);
637  if (!save_lc_ctype)
638  elog(ERROR, "setlocale(NULL) failed");
639  save_lc_ctype = pstrdup(save_lc_ctype);
640 
641  /* Here begins the critical section where we must not throw error */
642 
643  /* use numeric to set the ctype */
644  setlocale(LC_CTYPE, locale_numeric);
645 #endif
646 
647  /* Get formatting information for numeric */
648  setlocale(LC_NUMERIC, locale_numeric);
649  extlconv = localeconv();
650 
651  /* Must copy data now in case setlocale() overwrites it */
652  worklconv.decimal_point = strdup(extlconv->decimal_point);
653  worklconv.thousands_sep = strdup(extlconv->thousands_sep);
654  worklconv.grouping = strdup(extlconv->grouping);
655 
656 #ifdef WIN32
657  /* use monetary to set the ctype */
658  setlocale(LC_CTYPE, locale_monetary);
659 #endif
660 
661  /* Get formatting information for monetary */
662  setlocale(LC_MONETARY, locale_monetary);
663  extlconv = localeconv();
664 
665  /* Must copy data now in case setlocale() overwrites it */
666  worklconv.int_curr_symbol = strdup(extlconv->int_curr_symbol);
667  worklconv.currency_symbol = strdup(extlconv->currency_symbol);
668  worklconv.mon_decimal_point = strdup(extlconv->mon_decimal_point);
669  worklconv.mon_thousands_sep = strdup(extlconv->mon_thousands_sep);
670  worklconv.mon_grouping = strdup(extlconv->mon_grouping);
671  worklconv.positive_sign = strdup(extlconv->positive_sign);
672  worklconv.negative_sign = strdup(extlconv->negative_sign);
673  /* Copy scalar fields as well */
674  worklconv.int_frac_digits = extlconv->int_frac_digits;
675  worklconv.frac_digits = extlconv->frac_digits;
676  worklconv.p_cs_precedes = extlconv->p_cs_precedes;
677  worklconv.p_sep_by_space = extlconv->p_sep_by_space;
678  worklconv.n_cs_precedes = extlconv->n_cs_precedes;
679  worklconv.n_sep_by_space = extlconv->n_sep_by_space;
680  worklconv.p_sign_posn = extlconv->p_sign_posn;
681  worklconv.n_sign_posn = extlconv->n_sign_posn;
682 
683  /*
684  * Restore the prevailing locale settings; failure to do so is fatal.
685  * Possibly we could limp along with nondefault LC_MONETARY or LC_NUMERIC,
686  * but proceeding with the wrong value of LC_CTYPE would certainly be bad
687  * news; and considering that the prevailing LC_MONETARY and LC_NUMERIC
688  * are almost certainly "C", there's really no reason that restoring those
689  * should fail.
690  */
691 #ifdef WIN32
692  if (!setlocale(LC_CTYPE, save_lc_ctype))
693  elog(FATAL, "failed to restore LC_CTYPE to \"%s\"", save_lc_ctype);
694 #endif
695  if (!setlocale(LC_MONETARY, save_lc_monetary))
696  elog(FATAL, "failed to restore LC_MONETARY to \"%s\"", save_lc_monetary);
697  if (!setlocale(LC_NUMERIC, save_lc_numeric))
698  elog(FATAL, "failed to restore LC_NUMERIC to \"%s\"", save_lc_numeric);
699 
700  /*
701  * At this point we've done our best to clean up, and can call functions
702  * that might possibly throw errors with a clean conscience. But let's
703  * make sure we don't leak any already-strdup'd fields in worklconv.
704  */
705  PG_TRY();
706  {
707  int encoding;
708 
709  /* Release the pstrdup'd locale names */
710  pfree(save_lc_monetary);
711  pfree(save_lc_numeric);
712 #ifdef WIN32
713  pfree(save_lc_ctype);
714 #endif
715 
716  /* If any of the preceding strdup calls failed, complain now. */
717  if (!struct_lconv_is_valid(&worklconv))
718  ereport(ERROR,
719  (errcode(ERRCODE_OUT_OF_MEMORY),
720  errmsg("out of memory")));
721 
722  /*
723  * Now we must perform encoding conversion from whatever's associated
724  * with the locales into the database encoding. If we can't identify
725  * the encoding implied by LC_NUMERIC or LC_MONETARY (ie we get -1),
726  * use PG_SQL_ASCII, which will result in just validating that the
727  * strings are OK in the database encoding.
728  */
730  if (encoding < 0)
732 
733  db_encoding_convert(encoding, &worklconv.decimal_point);
734  db_encoding_convert(encoding, &worklconv.thousands_sep);
735  /* grouping is not text and does not require conversion */
736 
738  if (encoding < 0)
740 
741  db_encoding_convert(encoding, &worklconv.int_curr_symbol);
742  db_encoding_convert(encoding, &worklconv.currency_symbol);
743  db_encoding_convert(encoding, &worklconv.mon_decimal_point);
744  db_encoding_convert(encoding, &worklconv.mon_thousands_sep);
745  /* mon_grouping is not text and does not require conversion */
746  db_encoding_convert(encoding, &worklconv.positive_sign);
747  db_encoding_convert(encoding, &worklconv.negative_sign);
748  }
749  PG_CATCH();
750  {
751  free_struct_lconv(&worklconv);
752  PG_RE_THROW();
753  }
754  PG_END_TRY();
755 
756  /*
757  * Everything is good, so save the results.
758  */
759  CurrentLocaleConv = worklconv;
760  CurrentLocaleConvAllocated = true;
761  CurrentLocaleConvValid = true;
762  return &CurrentLocaleConv;
763 }
764 
765 #ifdef WIN32
766 /*
767  * On Windows, strftime() returns its output in encoding CP_ACP (the default
768  * operating system codepage for the computer), which is likely different
769  * from SERVER_ENCODING. This is especially important in Japanese versions
770  * of Windows which will use SJIS encoding, which we don't support as a
771  * server encoding.
772  *
773  * So, instead of using strftime(), use wcsftime() to return the value in
774  * wide characters (internally UTF16) and then convert to UTF8, which we
775  * know how to handle directly.
776  *
777  * Note that this only affects the calls to strftime() in this file, which are
778  * used to get the locale-aware strings. Other parts of the backend use
779  * pg_strftime(), which isn't locale-aware and does not need to be replaced.
780  */
781 static size_t
782 strftime_win32(char *dst, size_t dstlen,
783  const char *format, const struct tm *tm)
784 {
785  size_t len;
786  wchar_t wformat[8]; /* formats used below need 3 chars */
787  wchar_t wbuf[MAX_L10N_DATA];
788 
789  /*
790  * Get a wchar_t version of the format string. We only actually use
791  * plain-ASCII formats in this file, so we can say that they're UTF8.
792  */
793  len = MultiByteToWideChar(CP_UTF8, 0, format, -1,
794  wformat, lengthof(wformat));
795  if (len == 0)
796  elog(ERROR, "could not convert format string from UTF-8: error code %lu",
797  GetLastError());
798 
799  len = wcsftime(wbuf, MAX_L10N_DATA, wformat, tm);
800  if (len == 0)
801  {
802  /*
803  * wcsftime failed, possibly because the result would not fit in
804  * MAX_L10N_DATA. Return 0 with the contents of dst unspecified.
805  */
806  return 0;
807  }
808 
809  len = WideCharToMultiByte(CP_UTF8, 0, wbuf, len, dst, dstlen - 1,
810  NULL, NULL);
811  if (len == 0)
812  elog(ERROR, "could not convert string to UTF-8: error code %lu",
813  GetLastError());
814 
815  dst[len] = '\0';
816 
817  return len;
818 }
819 
820 /* redefine strftime() */
821 #define strftime(a,b,c,d) strftime_win32(a,b,c,d)
822 #endif /* WIN32 */
823 
824 /*
825  * Subroutine for cache_locale_time().
826  * Convert the given string from encoding "encoding" to the database
827  * encoding, and store the result at *dst, replacing any previous value.
828  */
829 static void
830 cache_single_string(char **dst, const char *src, int encoding)
831 {
832  char *ptr;
833  char *olddst;
834 
835  /* Convert the string to the database encoding, or validate it's OK */
836  ptr = pg_any_to_server(src, strlen(src), encoding);
837 
838  /* Store the string in long-lived storage, replacing any previous value */
839  olddst = *dst;
841  if (olddst)
842  pfree(olddst);
843 
844  /* Might as well clean up any palloc'd conversion result, too */
845  if (ptr != src)
846  pfree(ptr);
847 }
848 
849 /*
850  * Update the lc_time localization cache variables if needed.
851  */
852 void
854 {
855  char buf[(2 * 7 + 2 * 12) * MAX_L10N_DATA];
856  char *bufptr;
857  time_t timenow;
858  struct tm *timeinfo;
859  struct tm timeinfobuf;
860  bool strftimefail = false;
861  int encoding;
862  int i;
863  char *save_lc_time;
864 #ifdef WIN32
865  char *save_lc_ctype;
866 #endif
867 
868  /* did we do this already? */
869  if (CurrentLCTimeValid)
870  return;
871 
872  elog(DEBUG3, "cache_locale_time() executed; locale: \"%s\"", locale_time);
873 
874  /*
875  * As in PGLC_localeconv(), it's critical that we not throw error while
876  * libc's locale settings have nondefault values. Hence, we just call
877  * strftime() within the critical section, and then convert and save its
878  * results afterwards.
879  */
880 
881  /* Save prevailing value of time locale */
882  save_lc_time = setlocale(LC_TIME, NULL);
883  if (!save_lc_time)
884  elog(ERROR, "setlocale(NULL) failed");
885  save_lc_time = pstrdup(save_lc_time);
886 
887 #ifdef WIN32
888 
889  /*
890  * On Windows, it appears that wcsftime() internally uses LC_CTYPE, so we
891  * must set it here. This code looks the same as what PGLC_localeconv()
892  * does, but the underlying reason is different: this does NOT determine
893  * the encoding we'll get back from strftime_win32().
894  */
895 
896  /* Save prevailing value of ctype locale */
897  save_lc_ctype = setlocale(LC_CTYPE, NULL);
898  if (!save_lc_ctype)
899  elog(ERROR, "setlocale(NULL) failed");
900  save_lc_ctype = pstrdup(save_lc_ctype);
901 
902  /* use lc_time to set the ctype */
903  setlocale(LC_CTYPE, locale_time);
904 #endif
905 
906  setlocale(LC_TIME, locale_time);
907 
908  /* We use times close to current time as data for strftime(). */
909  timenow = time(NULL);
910  timeinfo = gmtime_r(&timenow, &timeinfobuf);
911 
912  /* Store the strftime results in MAX_L10N_DATA-sized portions of buf[] */
913  bufptr = buf;
914 
915  /*
916  * MAX_L10N_DATA is sufficient buffer space for every known locale, and
917  * POSIX defines no strftime() errors. (Buffer space exhaustion is not an
918  * error.) An implementation might report errors (e.g. ENOMEM) by
919  * returning 0 (or, less plausibly, a negative value) and setting errno.
920  * Report errno just in case the implementation did that, but clear it in
921  * advance of the calls so we don't emit a stale, unrelated errno.
922  */
923  errno = 0;
924 
925  /* localized days */
926  for (i = 0; i < 7; i++)
927  {
928  timeinfo->tm_wday = i;
929  if (strftime(bufptr, MAX_L10N_DATA, "%a", timeinfo) <= 0)
930  strftimefail = true;
931  bufptr += MAX_L10N_DATA;
932  if (strftime(bufptr, MAX_L10N_DATA, "%A", timeinfo) <= 0)
933  strftimefail = true;
934  bufptr += MAX_L10N_DATA;
935  }
936 
937  /* localized months */
938  for (i = 0; i < 12; i++)
939  {
940  timeinfo->tm_mon = i;
941  timeinfo->tm_mday = 1; /* make sure we don't have invalid date */
942  if (strftime(bufptr, MAX_L10N_DATA, "%b", timeinfo) <= 0)
943  strftimefail = true;
944  bufptr += MAX_L10N_DATA;
945  if (strftime(bufptr, MAX_L10N_DATA, "%B", timeinfo) <= 0)
946  strftimefail = true;
947  bufptr += MAX_L10N_DATA;
948  }
949 
950  /*
951  * Restore the prevailing locale settings; as in PGLC_localeconv(),
952  * failure to do so is fatal.
953  */
954 #ifdef WIN32
955  if (!setlocale(LC_CTYPE, save_lc_ctype))
956  elog(FATAL, "failed to restore LC_CTYPE to \"%s\"", save_lc_ctype);
957 #endif
958  if (!setlocale(LC_TIME, save_lc_time))
959  elog(FATAL, "failed to restore LC_TIME to \"%s\"", save_lc_time);
960 
961  /*
962  * At this point we've done our best to clean up, and can throw errors, or
963  * call functions that might throw errors, with a clean conscience.
964  */
965  if (strftimefail)
966  elog(ERROR, "strftime() failed: %m");
967 
968  /* Release the pstrdup'd locale names */
969  pfree(save_lc_time);
970 #ifdef WIN32
971  pfree(save_lc_ctype);
972 #endif
973 
974 #ifndef WIN32
975 
976  /*
977  * As in PGLC_localeconv(), we must convert strftime()'s output from the
978  * encoding implied by LC_TIME to the database encoding. If we can't
979  * identify the LC_TIME encoding, just perform encoding validation.
980  */
982  if (encoding < 0)
984 
985 #else
986 
987  /*
988  * On Windows, strftime_win32() always returns UTF8 data, so convert from
989  * that if necessary.
990  */
991  encoding = PG_UTF8;
992 
993 #endif /* WIN32 */
994 
995  bufptr = buf;
996 
997  /* localized days */
998  for (i = 0; i < 7; i++)
999  {
1001  bufptr += MAX_L10N_DATA;
1003  bufptr += MAX_L10N_DATA;
1004  }
1005  localized_abbrev_days[7] = NULL;
1006  localized_full_days[7] = NULL;
1007 
1008  /* localized months */
1009  for (i = 0; i < 12; i++)
1010  {
1012  bufptr += MAX_L10N_DATA;
1014  bufptr += MAX_L10N_DATA;
1015  }
1016  localized_abbrev_months[12] = NULL;
1017  localized_full_months[12] = NULL;
1018 
1019  CurrentLCTimeValid = true;
1020 }
1021 
1022 
1023 #if defined(WIN32) && defined(LC_MESSAGES)
1024 /*
1025  * Convert a Windows setlocale() argument to a Unix-style one.
1026  *
1027  * Regardless of platform, we install message catalogs under a Unix-style
1028  * LL[_CC][.ENCODING][@VARIANT] naming convention. Only LC_MESSAGES settings
1029  * following that style will elicit localized interface strings.
1030  *
1031  * Before Visual Studio 2012 (msvcr110.dll), Windows setlocale() accepted "C"
1032  * (but not "c") and strings of the form <Language>[_<Country>][.<CodePage>],
1033  * case-insensitive. setlocale() returns the fully-qualified form; for
1034  * example, setlocale("thaI") returns "Thai_Thailand.874". Internally,
1035  * setlocale() and _create_locale() select a "locale identifier"[1] and store
1036  * it in an undocumented _locale_t field. From that LCID, we can retrieve the
1037  * ISO 639 language and the ISO 3166 country. Character encoding does not
1038  * matter, because the server and client encodings govern that.
1039  *
1040  * Windows Vista introduced the "locale name" concept[2], closely following
1041  * RFC 4646. Locale identifiers are now deprecated. Starting with Visual
1042  * Studio 2012, setlocale() accepts locale names in addition to the strings it
1043  * accepted historically. It does not standardize them; setlocale("Th-tH")
1044  * returns "Th-tH". setlocale(category, "") still returns a traditional
1045  * string. Furthermore, msvcr110.dll changed the undocumented _locale_t
1046  * content to carry locale names instead of locale identifiers.
1047  *
1048  * Visual Studio 2015 should still be able to do the same as Visual Studio
1049  * 2012, but the declaration of locale_name is missing in _locale_t, causing
1050  * this code compilation to fail, hence this falls back instead on to
1051  * enumerating all system locales by using EnumSystemLocalesEx to find the
1052  * required locale name. If the input argument is in Unix-style then we can
1053  * get ISO Locale name directly by using GetLocaleInfoEx() with LCType as
1054  * LOCALE_SNAME.
1055  *
1056  * MinGW headers declare _create_locale(), but msvcrt.dll lacks that symbol in
1057  * releases before Windows 8. IsoLocaleName() always fails in a MinGW-built
1058  * postgres.exe, so only Unix-style values of the lc_messages GUC can elicit
1059  * localized messages. In particular, every lc_messages setting that initdb
1060  * can select automatically will yield only C-locale messages. XXX This could
1061  * be fixed by running the fully-qualified locale name through a lookup table.
1062  *
1063  * This function returns a pointer to a static buffer bearing the converted
1064  * name or NULL if conversion fails.
1065  *
1066  * [1] https://docs.microsoft.com/en-us/windows/win32/intl/locale-identifiers
1067  * [2] https://docs.microsoft.com/en-us/windows/win32/intl/locale-names
1068  */
1069 
1070 #if defined(_MSC_VER)
1071 
1072 /*
1073  * Callback function for EnumSystemLocalesEx() in get_iso_localename().
1074  *
1075  * This function enumerates all system locales, searching for one that matches
1076  * an input with the format: <Language>[_<Country>], e.g.
1077  * English[_United States]
1078  *
1079  * The input is a three wchar_t array as an LPARAM. The first element is the
1080  * locale_name we want to match, the second element is an allocated buffer
1081  * where the Unix-style locale is copied if a match is found, and the third
1082  * element is the search status, 1 if a match was found, 0 otherwise.
1083  */
1084 static BOOL CALLBACK
1085 search_locale_enum(LPWSTR pStr, DWORD dwFlags, LPARAM lparam)
1086 {
1087  wchar_t test_locale[LOCALE_NAME_MAX_LENGTH];
1088  wchar_t **argv;
1089 
1090  (void) (dwFlags);
1091 
1092  argv = (wchar_t **) lparam;
1093  *argv[2] = (wchar_t) 0;
1094 
1095  memset(test_locale, 0, sizeof(test_locale));
1096 
1097  /* Get the name of the <Language> in English */
1098  if (GetLocaleInfoEx(pStr, LOCALE_SENGLISHLANGUAGENAME,
1099  test_locale, LOCALE_NAME_MAX_LENGTH))
1100  {
1101  /*
1102  * If the enumerated locale does not have a hyphen ("en") OR the
1103  * locale_name input does not have an underscore ("English"), we only
1104  * need to compare the <Language> tags.
1105  */
1106  if (wcsrchr(pStr, '-') == NULL || wcsrchr(argv[0], '_') == NULL)
1107  {
1108  if (_wcsicmp(argv[0], test_locale) == 0)
1109  {
1110  wcscpy(argv[1], pStr);
1111  *argv[2] = (wchar_t) 1;
1112  return FALSE;
1113  }
1114  }
1115 
1116  /*
1117  * We have to compare a full <Language>_<Country> tag, so we append
1118  * the underscore and name of the country/region in English, e.g.
1119  * "English_United States".
1120  */
1121  else
1122  {
1123  size_t len;
1124 
1125  wcscat(test_locale, L"_");
1126  len = wcslen(test_locale);
1127  if (GetLocaleInfoEx(pStr, LOCALE_SENGLISHCOUNTRYNAME,
1128  test_locale + len,
1129  LOCALE_NAME_MAX_LENGTH - len))
1130  {
1131  if (_wcsicmp(argv[0], test_locale) == 0)
1132  {
1133  wcscpy(argv[1], pStr);
1134  *argv[2] = (wchar_t) 1;
1135  return FALSE;
1136  }
1137  }
1138  }
1139  }
1140 
1141  return TRUE;
1142 }
1143 
1144 /*
1145  * This function converts a Windows locale name to an ISO formatted version
1146  * for Visual Studio 2015 or greater.
1147  *
1148  * Returns NULL, if no valid conversion was found.
1149  */
1150 static char *
1151 get_iso_localename(const char *winlocname)
1152 {
1153  wchar_t wc_locale_name[LOCALE_NAME_MAX_LENGTH];
1154  wchar_t buffer[LOCALE_NAME_MAX_LENGTH];
1155  static char iso_lc_messages[LOCALE_NAME_MAX_LENGTH];
1156  char *period;
1157  int len;
1158  int ret_val;
1159 
1160  /*
1161  * Valid locales have the following syntax:
1162  * <Language>[_<Country>[.<CodePage>]]
1163  *
1164  * GetLocaleInfoEx can only take locale name without code-page and for the
1165  * purpose of this API the code-page doesn't matter.
1166  */
1167  period = strchr(winlocname, '.');
1168  if (period != NULL)
1169  len = period - winlocname;
1170  else
1171  len = pg_mbstrlen(winlocname);
1172 
1173  memset(wc_locale_name, 0, sizeof(wc_locale_name));
1174  memset(buffer, 0, sizeof(buffer));
1175  MultiByteToWideChar(CP_ACP, 0, winlocname, len, wc_locale_name,
1176  LOCALE_NAME_MAX_LENGTH);
1177 
1178  /*
1179  * If the lc_messages is already a Unix-style string, we have a direct
1180  * match with LOCALE_SNAME, e.g. en-US, en_US.
1181  */
1182  ret_val = GetLocaleInfoEx(wc_locale_name, LOCALE_SNAME, (LPWSTR) &buffer,
1183  LOCALE_NAME_MAX_LENGTH);
1184  if (!ret_val)
1185  {
1186  /*
1187  * Search for a locale in the system that matches language and country
1188  * name.
1189  */
1190  wchar_t *argv[3];
1191 
1192  argv[0] = wc_locale_name;
1193  argv[1] = buffer;
1194  argv[2] = (wchar_t *) &ret_val;
1195  EnumSystemLocalesEx(search_locale_enum, LOCALE_WINDOWS, (LPARAM) argv,
1196  NULL);
1197  }
1198 
1199  if (ret_val)
1200  {
1201  size_t rc;
1202  char *hyphen;
1203 
1204  /* Locale names use only ASCII, any conversion locale suffices. */
1205  rc = wchar2char(iso_lc_messages, buffer, sizeof(iso_lc_messages), NULL);
1206  if (rc == -1 || rc == sizeof(iso_lc_messages))
1207  return NULL;
1208 
1209  /*
1210  * Since the message catalogs sit on a case-insensitive filesystem, we
1211  * need not standardize letter case here. So long as we do not ship
1212  * message catalogs for which it would matter, we also need not
1213  * translate the script/variant portion, e.g. uz-Cyrl-UZ to
1214  * uz_UZ@cyrillic. Simply replace the hyphen with an underscore.
1215  */
1216  hyphen = strchr(iso_lc_messages, '-');
1217  if (hyphen)
1218  *hyphen = '_';
1219  return iso_lc_messages;
1220  }
1221 
1222  return NULL;
1223 }
1224 
1225 static char *
1226 IsoLocaleName(const char *winlocname)
1227 {
1228  static char iso_lc_messages[LOCALE_NAME_MAX_LENGTH];
1229 
1230  if (pg_strcasecmp("c", winlocname) == 0 ||
1231  pg_strcasecmp("posix", winlocname) == 0)
1232  {
1233  strcpy(iso_lc_messages, "C");
1234  return iso_lc_messages;
1235  }
1236  else
1237  return get_iso_localename(winlocname);
1238 }
1239 
1240 #else /* !defined(_MSC_VER) */
1241 
/*
 * MinGW stub: locale-name conversion is not supported there, so the answer
 * is always "no conversion available".
 */
static char *
IsoLocaleName(const char *winlocname)
{
	(void) winlocname;			/* intentionally unused */

	return NULL;				/* Not supported on MinGW */
}
1247 
1248 #endif /* defined(_MSC_VER) */
1249 
1250 #endif /* WIN32 && LC_MESSAGES */
1251 
1252 
1253 /*
1254  * Cache mechanism for collation information.
1255  *
1256  * Note that we currently lack any way to flush the cache. Since we don't
1257  * support ALTER COLLATION, this is OK. The worst case is that someone
1258  * drops a collation, and a useless cache entry hangs around in existing
1259  * backends.
1260  */
1261 static collation_cache_entry *
1263 {
1264  collation_cache_entry *cache_entry;
1265  bool found;
1266 
1267  Assert(OidIsValid(collation));
1268  Assert(collation != DEFAULT_COLLATION_OID);
1269 
1270  if (CollationCache == NULL)
1271  {
1273  "collation cache",
1275  CollationCache = collation_cache_create(CollationCacheContext,
1276  16, NULL);
1277  }
1278 
1279  cache_entry = collation_cache_insert(CollationCache, collation, &found);
1280  if (!found)
1281  {
1282  /*
1283  * Make sure cache entry is marked invalid, in case we fail before
1284  * setting things.
1285  */
1286  cache_entry->locale = 0;
1287  }
1288 
1289  return cache_entry;
1290 }
1291 
1292 /* simple subroutine for reporting errors from newlocale() */
1293 static void
1294 report_newlocale_failure(const char *localename)
1295 {
1296  int save_errno;
1297 
1298  /*
1299  * Windows doesn't provide any useful error indication from
1300  * _create_locale(), and BSD-derived platforms don't seem to feel they
1301  * need to set errno either (even though POSIX is pretty clear that
1302  * newlocale should do so). So, if errno hasn't been set, assume ENOENT
1303  * is what to report.
1304  */
1305  if (errno == 0)
1306  errno = ENOENT;
1307 
1308  /*
1309  * ENOENT means "no such locale", not "no such file", so clarify that
1310  * errno with an errdetail message.
1311  */
1312  save_errno = errno; /* auxiliary funcs might change errno */
1313  ereport(ERROR,
1314  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1315  errmsg("could not create locale \"%s\": %m",
1316  localename),
1317  (save_errno == ENOENT ?
1318  errdetail("The operating system could not find any locale data for the locale name \"%s\".",
1319  localename) : 0)));
1320 }
1321 
1322 /*
1323  * Create a locale_t with the given collation and ctype.
1324  *
1325  * The "C" and "POSIX" locales are not actually handled by libc, so return
1326  * NULL.
1327  *
1328  * Ensure that no path leaks a locale_t.
1329  */
1330 static locale_t
1331 make_libc_collator(const char *collate, const char *ctype)
1332 {
1333  locale_t loc = 0;
1334 
1335  if (strcmp(collate, ctype) == 0)
1336  {
1337  if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
1338  {
1339  /* Normal case where they're the same */
1340  errno = 0;
1341 #ifndef WIN32
1342  loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate,
1343  NULL);
1344 #else
1345  loc = _create_locale(LC_ALL, collate);
1346 #endif
1347  if (!loc)
1348  report_newlocale_failure(collate);
1349  }
1350  }
1351  else
1352  {
1353 #ifndef WIN32
1354  /* We need two newlocale() steps */
1355  locale_t loc1 = 0;
1356 
1357  if (strcmp(collate, "C") != 0 && strcmp(collate, "POSIX") != 0)
1358  {
1359  errno = 0;
1360  loc1 = newlocale(LC_COLLATE_MASK, collate, NULL);
1361  if (!loc1)
1362  report_newlocale_failure(collate);
1363  }
1364 
1365  if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
1366  {
1367  errno = 0;
1368  loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
1369  if (!loc)
1370  {
1371  if (loc1)
1372  freelocale(loc1);
1373  report_newlocale_failure(ctype);
1374  }
1375  }
1376  else
1377  loc = loc1;
1378 #else
1379 
1380  /*
1381  * XXX The _create_locale() API doesn't appear to support this. Could
1382  * perhaps be worked around by changing pg_locale_t to contain two
1383  * separate fields.
1384  */
1385  ereport(ERROR,
1386  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1387  errmsg("collations with different collate and ctype values are not supported on this platform")));
1388 #endif
1389  }
1390 
1391  return loc;
1392 }
1393 
1394 /*
1395  * Create a UCollator with the given locale string and rules.
1396  *
1397  * Ensure that no path leaks a UCollator.
1398  */
#ifdef USE_ICU
static UCollator *
make_icu_collator(const char *iculocstr, const char *icurules)
{
	if (!icurules)
	{
		/* simple case without rules */
		return pg_ucol_open(iculocstr);
	}
	else
	{
		UCollator  *collator_std_rules;
		UCollator  *collator_all_rules;
		const UChar *std_rules;
		UChar	   *my_rules;
		UChar	   *all_rules;
		int32_t		length;
		int32_t		total;
		UErrorCode	status;

		/*
		 * If rules are specified, we extract the rules of the standard
		 * collation, add our own rules, and make a new collator with the
		 * combined rules.
		 */
		icu_to_uchar(&my_rules, icurules, strlen(icurules));

		collator_std_rules = pg_ucol_open(iculocstr);

		/* std_rules points into collator_std_rules; copy before closing. */
		std_rules = ucol_getRules(collator_std_rules, &length);

		total = u_strlen(std_rules) + u_strlen(my_rules) + 1;

		/* avoid leaking collator on OOM */
		all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM);
		if (!all_rules)
		{
			ucol_close(collator_std_rules);
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
		}

		u_strcpy(all_rules, std_rules);
		u_strcat(all_rules, my_rules);

		ucol_close(collator_std_rules);

		status = U_ZERO_ERROR;
		collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules),
											UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,
											NULL, &status);
		if (U_FAILURE(status))
		{
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
							iculocstr, icurules, u_errorName(status))));
		}

		/*
		 * The rule strings were only needed to build the collator; release
		 * them so a long-lived caller context does not accumulate them.
		 * NOTE(review): assumes icu_to_uchar() returns palloc'd storage and
		 * that ucol_openRules() keeps its own copy of the rules -- confirm.
		 */
		pfree(my_rules);
		pfree(all_rules);

		return collator_all_rules;
	}
}
#endif							/* USE_ICU */
1463 
1464 /*
1465  * Initialize default_locale with database locale settings.
1466  */
1467 void
1469 {
1470  HeapTuple tup;
1471  Form_pg_database dbform;
1472  Datum datum;
1473 
1474  /* Fetch our pg_database row normally, via syscache */
1475  tup = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
1476  if (!HeapTupleIsValid(tup))
1477  elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
1478  dbform = (Form_pg_database) GETSTRUCT(tup);
1479 
1480  if (dbform->datlocprovider == COLLPROVIDER_BUILTIN)
1481  {
1482  char *datlocale;
1483 
1484  datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datlocale);
1485  datlocale = TextDatumGetCString(datum);
1486 
1487  builtin_validate_locale(dbform->encoding, datlocale);
1488 
1490  default_locale.ctype_is_c = (strcmp(datlocale, "C") == 0);
1491 
1494  }
1495  else if (dbform->datlocprovider == COLLPROVIDER_ICU)
1496  {
1497 #ifdef USE_ICU
1498  char *datlocale;
1499  char *icurules;
1500  bool isnull;
1501 
1502  datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datlocale);
1503  datlocale = TextDatumGetCString(datum);
1504 
1505  default_locale.collate_is_c = false;
1506  default_locale.ctype_is_c = false;
1507 
1508  datum = SysCacheGetAttr(DATABASEOID, tup, Anum_pg_database_daticurules, &isnull);
1509  if (!isnull)
1510  icurules = TextDatumGetCString(datum);
1511  else
1512  icurules = NULL;
1513 
1515  default_locale.info.icu.ucol = make_icu_collator(datlocale, icurules);
1516 #else
1517  /* could get here if a collation was created by a build with ICU */
1518  ereport(ERROR,
1519  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1520  errmsg("ICU is not supported in this build")));
1521 #endif
1522  }
1523  else if (dbform->datlocprovider == COLLPROVIDER_LIBC)
1524  {
1525  const char *datcollate;
1526  const char *datctype;
1527 
1528  datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datcollate);
1529  datcollate = TextDatumGetCString(datum);
1530  datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datctype);
1531  datctype = TextDatumGetCString(datum);
1532 
1533  default_locale.collate_is_c = (strcmp(datcollate, "C") == 0) ||
1534  (strcmp(datcollate, "POSIX") == 0);
1535  default_locale.ctype_is_c = (strcmp(datctype, "C") == 0) ||
1536  (strcmp(datctype, "POSIX") == 0);
1537 
1538  default_locale.info.lt = make_libc_collator(datcollate, datctype);
1539  }
1540  else
1541  /* shouldn't happen */
1542  PGLOCALE_SUPPORT_ERROR(dbform->datlocprovider);
1543 
1544 
1545  default_locale.provider = dbform->datlocprovider;
1546 
1547  /*
1548  * Default locale is currently always deterministic. Nondeterministic
1549  * locales currently don't support pattern matching, which would break a
1550  * lot of things if applied globally.
1551  */
1553 
1554  ReleaseSysCache(tup);
1555 }
1556 
1557 /*
1558  * Create a pg_locale_t from a collation OID. Results are cached for the
1559  * lifetime of the backend. Thus, do not free the result with freelocale().
1560  *
1561  * For simplicity, we always generate COLLATE + CTYPE even though we
1562  * might only need one of them. Since this is called only once per session,
1563  * it shouldn't cost much.
1564  */
1567 {
1568  collation_cache_entry *cache_entry;
1569 
1570  if (collid == DEFAULT_COLLATION_OID)
1571  return &default_locale;
1572 
1573  if (!OidIsValid(collid))
1574  elog(ERROR, "cache lookup failed for collation %u", collid);
1575 
1578 
1579  cache_entry = lookup_collation_cache(collid);
1580 
1581  if (cache_entry->locale == 0)
1582  {
1583  /* We haven't computed this yet in this session, so do it */
1584  HeapTuple tp;
1585  Form_pg_collation collform;
1586  struct pg_locale_struct result;
1587  pg_locale_t resultp;
1588  Datum datum;
1589  bool isnull;
1590 
1591  tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
1592  if (!HeapTupleIsValid(tp))
1593  elog(ERROR, "cache lookup failed for collation %u", collid);
1594  collform = (Form_pg_collation) GETSTRUCT(tp);
1595 
1596  /* We'll fill in the result struct locally before allocating memory */
1597  memset(&result, 0, sizeof(result));
1598  result.provider = collform->collprovider;
1599  result.deterministic = collform->collisdeterministic;
1600 
1601  if (collform->collprovider == COLLPROVIDER_BUILTIN)
1602  {
1603  const char *locstr;
1604 
1605  datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
1606  locstr = TextDatumGetCString(datum);
1607 
1608  result.collate_is_c = true;
1609  result.ctype_is_c = (strcmp(locstr, "C") == 0);
1610 
1612 
1614  locstr);
1615  }
1616  else if (collform->collprovider == COLLPROVIDER_ICU)
1617  {
1618 #ifdef USE_ICU
1619  const char *iculocstr;
1620  const char *icurules;
1621 
1622  datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
1623  iculocstr = TextDatumGetCString(datum);
1624 
1625  result.collate_is_c = false;
1626  result.ctype_is_c = false;
1627 
1628  datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, &isnull);
1629  if (!isnull)
1630  icurules = TextDatumGetCString(datum);
1631  else
1632  icurules = NULL;
1633 
1634  result.info.icu.locale = MemoryContextStrdup(TopMemoryContext, iculocstr);
1635  result.info.icu.ucol = make_icu_collator(iculocstr, icurules);
1636 #else
1637  /* could get here if a collation was created by a build with ICU */
1638  ereport(ERROR,
1639  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1640  errmsg("ICU is not supported in this build")));
1641 #endif
1642  }
1643  else if (collform->collprovider == COLLPROVIDER_LIBC)
1644  {
1645  const char *collcollate;
1646  const char *collctype;
1647 
1648  datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate);
1649  collcollate = TextDatumGetCString(datum);
1650  datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype);
1651  collctype = TextDatumGetCString(datum);
1652 
1653  result.collate_is_c = (strcmp(collcollate, "C") == 0) ||
1654  (strcmp(collcollate, "POSIX") == 0);
1655  result.ctype_is_c = (strcmp(collctype, "C") == 0) ||
1656  (strcmp(collctype, "POSIX") == 0);
1657 
1658  result.info.lt = make_libc_collator(collcollate, collctype);
1659  }
1660  else
1661  /* shouldn't happen */
1662  PGLOCALE_SUPPORT_ERROR(collform->collprovider);
1663 
1664  datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion,
1665  &isnull);
1666  if (!isnull)
1667  {
1668  char *actual_versionstr;
1669  char *collversionstr;
1670 
1671  collversionstr = TextDatumGetCString(datum);
1672 
1673  if (collform->collprovider == COLLPROVIDER_LIBC)
1674  datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate);
1675  else
1676  datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
1677 
1678  actual_versionstr = get_collation_actual_version(collform->collprovider,
1679  TextDatumGetCString(datum));
1680  if (!actual_versionstr)
1681  {
1682  /*
1683  * This could happen when specifying a version in CREATE
1684  * COLLATION but the provider does not support versioning, or
1685  * manually creating a mess in the catalogs.
1686  */
1687  ereport(ERROR,
1688  (errmsg("collation \"%s\" has no actual version, but a version was recorded",
1689  NameStr(collform->collname))));
1690  }
1691 
1692  if (strcmp(actual_versionstr, collversionstr) != 0)
1693  ereport(WARNING,
1694  (errmsg("collation \"%s\" has version mismatch",
1695  NameStr(collform->collname)),
1696  errdetail("The collation in the database was created using version %s, "
1697  "but the operating system provides version %s.",
1698  collversionstr, actual_versionstr),
1699  errhint("Rebuild all objects affected by this collation and run "
1700  "ALTER COLLATION %s REFRESH VERSION, "
1701  "or build PostgreSQL with the right library version.",
1702  quote_qualified_identifier(get_namespace_name(collform->collnamespace),
1703  NameStr(collform->collname)))));
1704  }
1705 
1706  ReleaseSysCache(tp);
1707 
1708  /* We'll keep the pg_locale_t structures in TopMemoryContext */
1709  resultp = MemoryContextAlloc(TopMemoryContext, sizeof(*resultp));
1710  *resultp = result;
1711 
1712  cache_entry->locale = resultp;
1713  }
1714 
1716  last_collation_cache_locale = cache_entry->locale;
1717 
1718  return cache_entry->locale;
1719 }
1720 
1721 /*
1722  * Get provider-specific collation version string for the given collation from
1723  * the operating system/library.
1724  */
1725 char *
1726 get_collation_actual_version(char collprovider, const char *collcollate)
1727 {
1728  char *collversion = NULL;
1729 
1730  /*
1731  * The only two supported locales (C and C.UTF-8) are both based on memcmp
1732  * and are not expected to change, but track the version anyway.
1733  *
1734  * Note that the character semantics may change for some locales, but the
1735  * collation version only tracks changes to sort order.
1736  */
1737  if (collprovider == COLLPROVIDER_BUILTIN)
1738  {
1739  if (strcmp(collcollate, "C") == 0)
1740  return "1";
1741  else if (strcmp(collcollate, "C.UTF-8") == 0)
1742  return "1";
1743  else
1744  ereport(ERROR,
1745  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1746  errmsg("invalid locale name \"%s\" for builtin provider",
1747  collcollate)));
1748  }
1749 
1750 #ifdef USE_ICU
1751  if (collprovider == COLLPROVIDER_ICU)
1752  {
1753  UCollator *collator;
1754  UVersionInfo versioninfo;
1755  char buf[U_MAX_VERSION_STRING_LENGTH];
1756 
1757  collator = pg_ucol_open(collcollate);
1758 
1759  ucol_getVersion(collator, versioninfo);
1760  ucol_close(collator);
1761 
1762  u_versionToString(versioninfo, buf);
1763  collversion = pstrdup(buf);
1764  }
1765  else
1766 #endif
1767  if (collprovider == COLLPROVIDER_LIBC &&
1768  pg_strcasecmp("C", collcollate) != 0 &&
1769  pg_strncasecmp("C.", collcollate, 2) != 0 &&
1770  pg_strcasecmp("POSIX", collcollate) != 0)
1771  {
1772 #if defined(__GLIBC__)
1773  /* Use the glibc version because we don't have anything better. */
1774  collversion = pstrdup(gnu_get_libc_version());
1775 #elif defined(LC_VERSION_MASK)
1776  locale_t loc;
1777 
1778  /* Look up FreeBSD collation version. */
1779  loc = newlocale(LC_COLLATE_MASK, collcollate, NULL);
1780  if (loc)
1781  {
1782  collversion =
1783  pstrdup(querylocale(LC_COLLATE_MASK | LC_VERSION_MASK, loc));
1784  freelocale(loc);
1785  }
1786  else
1787  ereport(ERROR,
1788  (errmsg("could not load locale \"%s\"", collcollate)));
1789 #elif defined(WIN32)
1790  /*
1791  * If we are targeting Windows Vista and above, we can ask for a name
1792  * given a collation name (earlier versions required a location code
1793  * that we don't have).
1794  */
1795  NLSVERSIONINFOEX version = {sizeof(NLSVERSIONINFOEX)};
1796  WCHAR wide_collcollate[LOCALE_NAME_MAX_LENGTH];
1797 
1798  MultiByteToWideChar(CP_ACP, 0, collcollate, -1, wide_collcollate,
1799  LOCALE_NAME_MAX_LENGTH);
1800  if (!GetNLSVersionEx(COMPARE_STRING, wide_collcollate, &version))
1801  {
1802  /*
1803  * GetNLSVersionEx() wants a language tag such as "en-US", not a
1804  * locale name like "English_United States.1252". Until those
1805  * values can be prevented from entering the system, or 100%
1806  * reliably converted to the more useful tag format, tolerate the
1807  * resulting error and report that we have no version data.
1808  */
1809  if (GetLastError() == ERROR_INVALID_PARAMETER)
1810  return NULL;
1811 
1812  ereport(ERROR,
1813  (errmsg("could not get collation version for locale \"%s\": error code %lu",
1814  collcollate,
1815  GetLastError())));
1816  }
1817  collversion = psprintf("%lu.%lu,%lu.%lu",
1818  (version.dwNLSVersion >> 8) & 0xFFFF,
1819  version.dwNLSVersion & 0xFF,
1820  (version.dwDefinedVersion >> 8) & 0xFFFF,
1821  version.dwDefinedVersion & 0xFF);
1822 #endif
1823  }
1824 
1825  return collversion;
1826 }
1827 
1828 /*
1829  * strncoll_libc_win32_utf8
1830  *
1831  * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
1832  * invoke wcscoll_l().
1833  *
1834  * An input string length of -1 means that it's NUL-terminated.
1835  */
1836 #ifdef WIN32
1837 static int
1838 strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2,
1839  ssize_t len2, pg_locale_t locale)
1840 {
1841  char sbuf[TEXTBUFLEN];
1842  char *buf = sbuf;
1843  char *a1p,
1844  *a2p;
1845  int a1len;
1846  int a2len;
1847  int r;
1848  int result;
1849 
1850  Assert(locale->provider == COLLPROVIDER_LIBC);
1852 
1853  if (len1 == -1)
1854  len1 = strlen(arg1);
1855  if (len2 == -1)
1856  len2 = strlen(arg2);
1857 
1858  a1len = len1 * 2 + 2;
1859  a2len = len2 * 2 + 2;
1860 
1861  if (a1len + a2len > TEXTBUFLEN)
1862  buf = palloc(a1len + a2len);
1863 
1864  a1p = buf;
1865  a2p = buf + a1len;
1866 
1867  /* API does not work for zero-length input */
1868  if (len1 == 0)
1869  r = 0;
1870  else
1871  {
1872  r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1873  (LPWSTR) a1p, a1len / 2);
1874  if (!r)
1875  ereport(ERROR,
1876  (errmsg("could not convert string to UTF-16: error code %lu",
1877  GetLastError())));
1878  }
1879  ((LPWSTR) a1p)[r] = 0;
1880 
1881  if (len2 == 0)
1882  r = 0;
1883  else
1884  {
1885  r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1886  (LPWSTR) a2p, a2len / 2);
1887  if (!r)
1888  ereport(ERROR,
1889  (errmsg("could not convert string to UTF-16: error code %lu",
1890  GetLastError())));
1891  }
1892  ((LPWSTR) a2p)[r] = 0;
1893 
1894  errno = 0;
1895  result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt);
1896  if (result == 2147483647) /* _NLSCMPERROR; missing from mingw headers */
1897  ereport(ERROR,
1898  (errmsg("could not compare Unicode strings: %m")));
1899 
1900  if (buf != sbuf)
1901  pfree(buf);
1902 
1903  return result;
1904 }
1905 #endif /* WIN32 */
1906 
1907 /*
1908  * strncoll_libc
1909  *
1910  * NUL-terminate arguments, if necessary, and pass to strcoll_l().
1911  *
1912  * An input string length of -1 means that it's already NUL-terminated.
1913  */
1914 static int
1915 strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
1917 {
1918  char sbuf[TEXTBUFLEN];
1919  char *buf = sbuf;
1920  size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1;
1921  size_t bufsize2 = (len2 == -1) ? 0 : len2 + 1;
1922  const char *arg1n;
1923  const char *arg2n;
1924  int result;
1925 
1926  Assert(locale->provider == COLLPROVIDER_LIBC);
1927 
1928 #ifdef WIN32
1929  /* check for this case before doing the work for nul-termination */
1930  if (GetDatabaseEncoding() == PG_UTF8)
1931  return strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
1932 #endif /* WIN32 */
1933 
1934  if (bufsize1 + bufsize2 > TEXTBUFLEN)
1935  buf = palloc(bufsize1 + bufsize2);
1936 
1937  /* nul-terminate arguments if necessary */
1938  if (len1 == -1)
1939  {
1940  arg1n = arg1;
1941  }
1942  else
1943  {
1944  char *buf1 = buf;
1945 
1946  memcpy(buf1, arg1, len1);
1947  buf1[len1] = '\0';
1948  arg1n = buf1;
1949  }
1950 
1951  if (len2 == -1)
1952  {
1953  arg2n = arg2;
1954  }
1955  else
1956  {
1957  char *buf2 = buf + bufsize1;
1958 
1959  memcpy(buf2, arg2, len2);
1960  buf2[len2] = '\0';
1961  arg2n = buf2;
1962  }
1963 
1964  result = strcoll_l(arg1n, arg2n, locale->info.lt);
1965 
1966  if (buf != sbuf)
1967  pfree(buf);
1968 
1969  return result;
1970 }
1971 
1972 #ifdef USE_ICU
1973 
1974 /*
1975  * strncoll_icu_no_utf8
1976  *
1977  * Convert the arguments from the database encoding to UChar strings, then
1978  * call ucol_strcoll(). An argument length of -1 means that the string is
1979  * NUL-terminated.
1980  *
1981  * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
1982  * caller should call that instead.
1983  */
1984 static int
1985 strncoll_icu_no_utf8(const char *arg1, ssize_t len1,
1986  const char *arg2, ssize_t len2, pg_locale_t locale)
1987 {
1988  char sbuf[TEXTBUFLEN];
1989  char *buf = sbuf;
1990  int32_t ulen1;
1991  int32_t ulen2;
1992  size_t bufsize1;
1993  size_t bufsize2;
1994  UChar *uchar1,
1995  *uchar2;
1996  int result;
1997 
1998  Assert(locale->provider == COLLPROVIDER_ICU);
1999 #ifdef HAVE_UCOL_STRCOLLUTF8
2001 #endif
2002 
2003  init_icu_converter();
2004 
2005  ulen1 = uchar_length(icu_converter, arg1, len1);
2006  ulen2 = uchar_length(icu_converter, arg2, len2);
2007 
2008  bufsize1 = (ulen1 + 1) * sizeof(UChar);
2009  bufsize2 = (ulen2 + 1) * sizeof(UChar);
2010 
2011  if (bufsize1 + bufsize2 > TEXTBUFLEN)
2012  buf = palloc(bufsize1 + bufsize2);
2013 
2014  uchar1 = (UChar *) buf;
2015  uchar2 = (UChar *) (buf + bufsize1);
2016 
2017  ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
2018  ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
2019 
2020  result = ucol_strcoll(locale->info.icu.ucol,
2021  uchar1, ulen1,
2022  uchar2, ulen2);
2023 
2024  if (buf != sbuf)
2025  pfree(buf);
2026 
2027  return result;
2028 }
2029 
2030 /*
2031  * strncoll_icu
2032  *
2033  * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given
2034  * database encoding. An argument length of -1 means the string is
2035  * NUL-terminated.
2036  */
2037 static int
2038 strncoll_icu(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
2040 {
2041  int result;
2042 
2043  Assert(locale->provider == COLLPROVIDER_ICU);
2044 
2045 #ifdef HAVE_UCOL_STRCOLLUTF8
2046  if (GetDatabaseEncoding() == PG_UTF8)
2047  {
2048  UErrorCode status;
2049 
2050  status = U_ZERO_ERROR;
2051  result = ucol_strcollUTF8(locale->info.icu.ucol,
2052  arg1, len1,
2053  arg2, len2,
2054  &status);
2055  if (U_FAILURE(status))
2056  ereport(ERROR,
2057  (errmsg("collation failed: %s", u_errorName(status))));
2058  }
2059  else
2060 #endif
2061  {
2062  result = strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale);
2063  }
2064 
2065  return result;
2066 }
2067 
2068 #endif /* USE_ICU */
2069 
2070 /*
2071  * pg_strcoll
2072  *
2073  * Like pg_strncoll for NUL-terminated input strings.
2074  */
2075 int
2076 pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
2077 {
2078  int result;
2079 
2080  if (locale->provider == COLLPROVIDER_LIBC)
2081  result = strncoll_libc(arg1, -1, arg2, -1, locale);
2082 #ifdef USE_ICU
2083  else if (locale->provider == COLLPROVIDER_ICU)
2084  result = strncoll_icu(arg1, -1, arg2, -1, locale);
2085 #endif
2086  else
2087  /* shouldn't happen */
2088  PGLOCALE_SUPPORT_ERROR(locale->provider);
2089 
2090  return result;
2091 }
2092 
2093 /*
2094  * pg_strncoll
2095  *
2096  * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll_l() or wcscoll_l() as
2097  * appropriate for the given locale, platform, and database encoding. If the
2098  * locale is not specified, use the database collation.
2099  *
2100  * The input strings must be encoded in the database encoding. If an input
2101  * string is NUL-terminated, its length may be specified as -1.
2102  *
2103  * The caller is responsible for breaking ties if the collation is
2104  * deterministic; this maintains consistency with pg_strnxfrm(), which cannot
2105  * easily account for deterministic collations.
2106  */
2107 int
2108 pg_strncoll(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
2110 {
2111  int result;
2112 
2113  if (locale->provider == COLLPROVIDER_LIBC)
2114  result = strncoll_libc(arg1, len1, arg2, len2, locale);
2115 #ifdef USE_ICU
2116  else if (locale->provider == COLLPROVIDER_ICU)
2117  result = strncoll_icu(arg1, len1, arg2, len2, locale);
2118 #endif
2119  else
2120  /* shouldn't happen */
2121  PGLOCALE_SUPPORT_ERROR(locale->provider);
2122 
2123  return result;
2124 }
2125 
2126 /*
2127  * strnxfrm_libc
2128  *
2129  * NUL-terminate src, if necessary, and pass to strxfrm_l().
2130  *
2131  * A source length of -1 means that it's already NUL-terminated.
2132  */
2133 static size_t
2134 strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen,
2136 {
2137  char sbuf[TEXTBUFLEN];
2138  char *buf = sbuf;
2139  size_t bufsize = srclen + 1;
2140  size_t result;
2141 
2142  Assert(locale->provider == COLLPROVIDER_LIBC);
2143 
2144  if (srclen == -1)
2145  return strxfrm_l(dest, src, destsize, locale->info.lt);
2146 
2147  if (bufsize > TEXTBUFLEN)
2148  buf = palloc(bufsize);
2149 
2150  /* nul-terminate argument */
2151  memcpy(buf, src, srclen);
2152  buf[srclen] = '\0';
2153 
2154  result = strxfrm_l(dest, buf, destsize, locale->info.lt);
2155 
2156  if (buf != sbuf)
2157  pfree(buf);
2158 
2159  /* if dest is defined, it should be nul-terminated */
2160  Assert(result >= destsize || dest[result] == '\0');
2161 
2162  return result;
2163 }
2164 
2165 #ifdef USE_ICU
2166 
2167 /* 'srclen' of -1 means the strings are NUL-terminated */
2168 static size_t
2169 strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
2171 {
2172  char sbuf[TEXTBUFLEN];
2173  char *buf = sbuf;
2174  UChar *uchar;
2175  int32_t ulen;
2176  size_t uchar_bsize;
2177  Size result_bsize;
2178 
2179  Assert(locale->provider == COLLPROVIDER_ICU);
2180 
2181  init_icu_converter();
2182 
2183  ulen = uchar_length(icu_converter, src, srclen);
2184 
2185  uchar_bsize = (ulen + 1) * sizeof(UChar);
2186 
2187  if (uchar_bsize > TEXTBUFLEN)
2188  buf = palloc(uchar_bsize);
2189 
2190  uchar = (UChar *) buf;
2191 
2192  ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
2193 
2194  result_bsize = ucol_getSortKey(locale->info.icu.ucol,
2195  uchar, ulen,
2196  (uint8_t *) dest, destsize);
2197 
2198  /*
2199  * ucol_getSortKey() counts the nul-terminator in the result length, but
2200  * this function should not.
2201  */
2202  Assert(result_bsize > 0);
2203  result_bsize--;
2204 
2205  if (buf != sbuf)
2206  pfree(buf);
2207 
2208  /* if dest is defined, it should be nul-terminated */
2209  Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
2210 
2211  return result_bsize;
2212 }
2213 
2214 /* 'srclen' of -1 means the strings are NUL-terminated */
2215 static size_t
2216 strnxfrm_prefix_icu_no_utf8(char *dest, size_t destsize,
2217  const char *src, ssize_t srclen,
2219 {
2220  char sbuf[TEXTBUFLEN];
2221  char *buf = sbuf;
2222  UCharIterator iter;
2223  uint32_t state[2];
2224  UErrorCode status;
2225  int32_t ulen = -1;
2226  UChar *uchar = NULL;
2227  size_t uchar_bsize;
2228  Size result_bsize;
2229 
2230  Assert(locale->provider == COLLPROVIDER_ICU);
2232 
2233  init_icu_converter();
2234 
2235  ulen = uchar_length(icu_converter, src, srclen);
2236 
2237  uchar_bsize = (ulen + 1) * sizeof(UChar);
2238 
2239  if (uchar_bsize > TEXTBUFLEN)
2240  buf = palloc(uchar_bsize);
2241 
2242  uchar = (UChar *) buf;
2243 
2244  ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
2245 
2246  uiter_setString(&iter, uchar, ulen);
2247  state[0] = state[1] = 0; /* won't need that again */
2248  status = U_ZERO_ERROR;
2249  result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol,
2250  &iter,
2251  state,
2252  (uint8_t *) dest,
2253  destsize,
2254  &status);
2255  if (U_FAILURE(status))
2256  ereport(ERROR,
2257  (errmsg("sort key generation failed: %s",
2258  u_errorName(status))));
2259 
2260  return result_bsize;
2261 }
2262 
2263 /* 'srclen' of -1 means the strings are NUL-terminated */
2264 static size_t
2265 strnxfrm_prefix_icu(char *dest, size_t destsize,
2266  const char *src, ssize_t srclen,
2268 {
2269  size_t result;
2270 
2271  Assert(locale->provider == COLLPROVIDER_ICU);
2272 
2273  if (GetDatabaseEncoding() == PG_UTF8)
2274  {
2275  UCharIterator iter;
2276  uint32_t state[2];
2277  UErrorCode status;
2278 
2279  uiter_setUTF8(&iter, src, srclen);
2280  state[0] = state[1] = 0; /* won't need that again */
2281  status = U_ZERO_ERROR;
2282  result = ucol_nextSortKeyPart(locale->info.icu.ucol,
2283  &iter,
2284  state,
2285  (uint8_t *) dest,
2286  destsize,
2287  &status);
2288  if (U_FAILURE(status))
2289  ereport(ERROR,
2290  (errmsg("sort key generation failed: %s",
2291  u_errorName(status))));
2292  }
2293  else
2294  result = strnxfrm_prefix_icu_no_utf8(dest, destsize, src, srclen,
2295  locale);
2296 
2297  return result;
2298 }
2299 
2300 #endif
2301 
2302 /*
2303  * Return true if the collation provider supports pg_strxfrm() and
2304  * pg_strnxfrm(); otherwise false.
2305  *
2306  * Unfortunately, it seems that strxfrm() for non-C collations is broken on
2307  * many common platforms; testing of multiple versions of glibc reveals that,
2308  * for many locales, strcoll() and strxfrm() do not return consistent
2309  * results. While no other libc other than Cygwin has so far been shown to
2310  * have a problem, we take the conservative course of action for right now and
2311  * disable this categorically. (Users who are certain this isn't a problem on
2312  * their system can define TRUST_STRXFRM.)
2313  *
2314  * No similar problem is known for the ICU provider.
2315  */
2316 bool
2318 {
2319  if (locale->provider == COLLPROVIDER_LIBC)
2320 #ifdef TRUST_STRXFRM
2321  return true;
2322 #else
2323  return false;
2324 #endif
2325  else if (locale->provider == COLLPROVIDER_ICU)
2326  return true;
2327  else
2328  /* shouldn't happen */
2329  PGLOCALE_SUPPORT_ERROR(locale->provider);
2330 
2331  return false; /* keep compiler quiet */
2332 }
2333 
2334 /*
2335  * pg_strxfrm
2336  *
2337  * Like pg_strnxfrm for a NUL-terminated input string.
2338  */
2339 size_t
2340 pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
2341 {
2342  size_t result = 0; /* keep compiler quiet */
2343 
2344  if (locale->provider == COLLPROVIDER_LIBC)
2345  result = strnxfrm_libc(dest, destsize, src, -1, locale);
2346 #ifdef USE_ICU
2347  else if (locale->provider == COLLPROVIDER_ICU)
2348  result = strnxfrm_icu(dest, destsize, src, -1, locale);
2349 #endif
2350  else
2351  /* shouldn't happen */
2352  PGLOCALE_SUPPORT_ERROR(locale->provider);
2353 
2354  return result;
2355 }
2356 
2357 /*
2358  * pg_strnxfrm
2359  *
2360  * Transforms 'src' to a nul-terminated string stored in 'dest' such that
2361  * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on
2362  * untransformed strings.
2363  *
2364  * The input string must be encoded in the database encoding. If the input
2365  * string is NUL-terminated, its length may be specified as -1. If 'destsize'
2366  * is zero, 'dest' may be NULL.
2367  *
2368  * Not all providers support pg_strnxfrm() safely. The caller should check
2369  * pg_strxfrm_enabled() first, otherwise this function may return wrong
2370  * results or an error.
2371  *
2372  * Returns the number of bytes needed (or more) to store the transformed
2373  * string, excluding the terminating nul byte. If the value returned is
2374  * 'destsize' or greater, the resulting contents of 'dest' are undefined.
2375  */
2376 size_t
2377 pg_strnxfrm(char *dest, size_t destsize, const char *src, ssize_t srclen,
2379 {
2380  size_t result = 0; /* keep compiler quiet */
2381 
2382  if (locale->provider == COLLPROVIDER_LIBC)
2383  result = strnxfrm_libc(dest, destsize, src, srclen, locale);
2384 #ifdef USE_ICU
2385  else if (locale->provider == COLLPROVIDER_ICU)
2386  result = strnxfrm_icu(dest, destsize, src, srclen, locale);
2387 #endif
2388  else
2389  /* shouldn't happen */
2390  PGLOCALE_SUPPORT_ERROR(locale->provider);
2391 
2392  return result;
2393 }
2394 
2395 /*
2396  * Return true if the collation provider supports pg_strxfrm_prefix() and
2397  * pg_strnxfrm_prefix(); otherwise false.
2398  */
2399 bool
2401 {
2402  if (locale->provider == COLLPROVIDER_LIBC)
2403  return false;
2404  else if (locale->provider == COLLPROVIDER_ICU)
2405  return true;
2406  else
2407  /* shouldn't happen */
2408  PGLOCALE_SUPPORT_ERROR(locale->provider);
2409 
2410  return false; /* keep compiler quiet */
2411 }
2412 
2413 /*
2414  * pg_strxfrm_prefix
2415  *
2416  * Like pg_strnxfrm_prefix for a NUL-terminated input string.
2417  */
2418 size_t
2419 pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
2421 {
2422  return pg_strnxfrm_prefix(dest, destsize, src, -1, locale);
2423 }
2424 
2425 /*
2426  * pg_strnxfrm_prefix
2427  *
2428  * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
2429  * memcmp() on the byte sequence is equivalent to pg_strncoll() on
2430  * untransformed strings. The result is not nul-terminated.
2431  *
2432  * The input string must be encoded in the database encoding. If the input
2433  * string is NUL-terminated, its length may be specified as -1.
2434  *
2435  * Not all providers support pg_strnxfrm_prefix() safely. The caller should
2436  * check pg_strxfrm_prefix_enabled() first, otherwise this function may return
2437  * wrong results or an error.
2438  *
2439  * If destsize is not large enough to hold the resulting byte sequence, stores
2440  * only the first destsize bytes in 'dest'. Returns the number of bytes
2441  * actually copied to 'dest'.
2442  */
2443 size_t
2444 pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
2445  ssize_t srclen, pg_locale_t locale)
2446 {
2447  size_t result = 0; /* keep compiler quiet */
2448 
2449 #ifdef USE_ICU
2450  if (locale->provider == COLLPROVIDER_ICU)
2451  result = strnxfrm_prefix_icu(dest, destsize, src, -1, locale);
2452  else
2453 #endif
2454  PGLOCALE_SUPPORT_ERROR(locale->provider);
2455 
2456  return result;
2457 }
2458 
2459 /*
2460  * Return required encoding ID for the given locale, or -1 if any encoding is
2461  * valid for the locale.
2462  */
2463 int
2465 {
2466  if (strcmp(locale, "C") == 0)
2467  return -1;
2468  if (strcmp(locale, "C.UTF-8") == 0)
2469  return PG_UTF8;
2470 
2471  ereport(ERROR,
2472  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
2473  errmsg("invalid locale name \"%s\" for builtin provider",
2474  locale)));
2475 
2476  return 0; /* keep compiler quiet */
2477 }
2478 
2479 
2480 /*
2481  * Validate the locale and encoding combination, and return the canonical form
2482  * of the locale name.
2483  */
2484 const char *
2486 {
2487  const char *canonical_name = NULL;
2488  int required_encoding;
2489 
2490  if (strcmp(locale, "C") == 0)
2491  canonical_name = "C";
2492  else if (strcmp(locale, "C.UTF-8") == 0 || strcmp(locale, "C.UTF8") == 0)
2493  canonical_name = "C.UTF-8";
2494 
2495  if (!canonical_name)
2496  ereport(ERROR,
2497  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
2498  errmsg("invalid locale name \"%s\" for builtin provider",
2499  locale)));
2500 
2501  required_encoding = builtin_locale_encoding(canonical_name);
2502  if (required_encoding >= 0 && encoding != required_encoding)
2503  ereport(ERROR,
2504  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
2505  errmsg("encoding \"%s\" does not match locale \"%s\"",
2507 
2508  return canonical_name;
2509 }
2510 
2511 
2512 #ifdef USE_ICU
2513 
2514 /*
2515  * Wrapper around ucol_open() to handle API differences for older ICU
2516  * versions.
2517  *
2518  * Ensure that no path leaks a UCollator.
2519  */
2520 static UCollator *
2521 pg_ucol_open(const char *loc_str)
2522 {
2523  UCollator *collator;
2524  UErrorCode status;
2525  const char *orig_str = loc_str;
2526  char *fixed_str = NULL;
2527 
2528  /*
2529  * Must never open default collator, because it depends on the environment
2530  * and may change at any time. Should not happen, but check here to catch
2531  * bugs that might be hard to catch otherwise.
2532  *
2533  * NB: the default collator is not the same as the collator for the root
2534  * locale. The root locale may be specified as the empty string, "und", or
2535  * "root". The default collator is opened by passing NULL to ucol_open().
2536  */
2537  if (loc_str == NULL)
2538  elog(ERROR, "opening default collator is not supported");
2539 
2540  /*
2541  * In ICU versions 54 and earlier, "und" is not a recognized spelling of
2542  * the root locale. If the first component of the locale is "und", replace
2543  * with "root" before opening.
2544  */
2545  if (U_ICU_VERSION_MAJOR_NUM < 55)
2546  {
2547  char lang[ULOC_LANG_CAPACITY];
2548 
2549  status = U_ZERO_ERROR;
2550  uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
2551  if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
2552  {
2553  ereport(ERROR,
2554  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2555  errmsg("could not get language from locale \"%s\": %s",
2556  loc_str, u_errorName(status))));
2557  }
2558 
2559  if (strcmp(lang, "und") == 0)
2560  {
2561  const char *remainder = loc_str + strlen("und");
2562 
2563  fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
2564  strcpy(fixed_str, "root");
2565  strcat(fixed_str, remainder);
2566 
2567  loc_str = fixed_str;
2568  }
2569  }
2570 
2571  status = U_ZERO_ERROR;
2572  collator = ucol_open(loc_str, &status);
2573  if (U_FAILURE(status))
2574  ereport(ERROR,
2575  /* use original string for error report */
2576  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2577  errmsg("could not open collator for locale \"%s\": %s",
2578  orig_str, u_errorName(status))));
2579 
2580  if (U_ICU_VERSION_MAJOR_NUM < 54)
2581  {
2582  status = U_ZERO_ERROR;
2583  icu_set_collation_attributes(collator, loc_str, &status);
2584 
2585  /*
2586  * Pretend the error came from ucol_open(), for consistent error
2587  * message across ICU versions.
2588  */
2589  if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
2590  {
2591  ucol_close(collator);
2592  ereport(ERROR,
2593  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2594  errmsg("could not open collator for locale \"%s\": %s",
2595  orig_str, u_errorName(status))));
2596  }
2597  }
2598 
2599  if (fixed_str != NULL)
2600  pfree(fixed_str);
2601 
2602  return collator;
2603 }
2604 
2605 static void
2606 init_icu_converter(void)
2607 {
2608  const char *icu_encoding_name;
2609  UErrorCode status;
2610  UConverter *conv;
2611 
2612  if (icu_converter)
2613  return; /* already done */
2614 
2615  icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
2616  if (!icu_encoding_name)
2617  ereport(ERROR,
2618  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
2619  errmsg("encoding \"%s\" not supported by ICU",
2621 
2622  status = U_ZERO_ERROR;
2623  conv = ucnv_open(icu_encoding_name, &status);
2624  if (U_FAILURE(status))
2625  ereport(ERROR,
2626  (errmsg("could not open ICU converter for encoding \"%s\": %s",
2627  icu_encoding_name, u_errorName(status))));
2628 
2629  icu_converter = conv;
2630 }
2631 
2632 /*
2633  * Find length, in UChars, of given string if converted to UChar string.
2634  *
2635  * A length of -1 indicates that the input string is NUL-terminated.
2636  */
2637 static size_t
2638 uchar_length(UConverter *converter, const char *str, int32_t len)
2639 {
2640  UErrorCode status = U_ZERO_ERROR;
2641  int32_t ulen;
2642 
2643  ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
2644  if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
2645  ereport(ERROR,
2646  (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
2647  return ulen;
2648 }
2649 
2650 /*
2651  * Convert the given source string into a UChar string, stored in dest, and
2652  * return the length (in UChars).
2653  *
2654  * A srclen of -1 indicates that the input string is NUL-terminated.
2655  */
2656 static int32_t
2657 uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
2658  const char *src, int32_t srclen)
2659 {
2660  UErrorCode status = U_ZERO_ERROR;
2661  int32_t ulen;
2662 
2663  status = U_ZERO_ERROR;
2664  ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
2665  if (U_FAILURE(status))
2666  ereport(ERROR,
2667  (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
2668  return ulen;
2669 }
2670 
2671 /*
2672  * Convert a string in the database encoding into a string of UChars.
2673  *
2674  * The source string at buff is of length nbytes
2675  * (it needn't be nul-terminated)
2676  *
2677  * *buff_uchar receives a pointer to the palloc'd result string, and
2678  * the function's result is the number of UChars generated.
2679  *
2680  * The result string is nul-terminated, though most callers rely on the
2681  * result length instead.
2682  */
2683 int32_t
2684 icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
2685 {
2686  int32_t len_uchar;
2687 
2688  init_icu_converter();
2689 
2690  len_uchar = uchar_length(icu_converter, buff, nbytes);
2691 
2692  *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
2693  len_uchar = uchar_convert(icu_converter,
2694  *buff_uchar, len_uchar + 1, buff, nbytes);
2695 
2696  return len_uchar;
2697 }
2698 
2699 /*
2700  * Convert a string of UChars into the database encoding.
2701  *
2702  * The source string at buff_uchar is of length len_uchar
2703  * (it needn't be nul-terminated)
2704  *
2705  * *result receives a pointer to the palloc'd result string, and the
2706  * function's result is the number of bytes generated (not counting nul).
2707  *
2708  * The result string is nul-terminated.
2709  */
2710 int32_t
2711 icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar)
2712 {
2713  UErrorCode status;
2714  int32_t len_result;
2715 
2716  init_icu_converter();
2717 
2718  status = U_ZERO_ERROR;
2719  len_result = ucnv_fromUChars(icu_converter, NULL, 0,
2720  buff_uchar, len_uchar, &status);
2721  if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
2722  ereport(ERROR,
2723  (errmsg("%s failed: %s", "ucnv_fromUChars",
2724  u_errorName(status))));
2725 
2726  *result = palloc(len_result + 1);
2727 
2728  status = U_ZERO_ERROR;
2729  len_result = ucnv_fromUChars(icu_converter, *result, len_result + 1,
2730  buff_uchar, len_uchar, &status);
2731  if (U_FAILURE(status) ||
2732  status == U_STRING_NOT_TERMINATED_WARNING)
2733  ereport(ERROR,
2734  (errmsg("%s failed: %s", "ucnv_fromUChars",
2735  u_errorName(status))));
2736 
2737  return len_result;
2738 }
2739 
2740 /*
2741  * Parse collation attributes from the given locale string and apply them to
2742  * the open collator.
2743  *
2744  * First, the locale string is canonicalized to an ICU format locale ID such
2745  * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
2746  * the key-value arguments.
2747  *
2748  * Starting with ICU version 54, the attributes are processed automatically by
2749  * ucol_open(), so this is only necessary for emulating this behavior on older
2750  * versions.
2751  */
2753 static void
2754 icu_set_collation_attributes(UCollator *collator, const char *loc,
2755  UErrorCode *status)
2756 {
2757  int32_t len;
2758  char *icu_locale_id;
2759  char *lower_str;
2760  char *str;
2761  char *token;
2762 
2763  /*
2764  * The input locale may be a BCP 47 language tag, e.g.
2765  * "und-u-kc-ks-level1", which expresses the same attributes in a
2766  * different form. It will be converted to the equivalent ICU format
2767  * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
2768  * uloc_canonicalize().
2769  */
2770  *status = U_ZERO_ERROR;
2771  len = uloc_canonicalize(loc, NULL, 0, status);
2772  icu_locale_id = palloc(len + 1);
2773  *status = U_ZERO_ERROR;
2774  len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
2775  if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
2776  return;
2777 
2778  lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
2779 
2780  pfree(icu_locale_id);
2781 
2782  str = strchr(lower_str, '@');
2783  if (!str)
2784  return;
2785  str++;
2786 
2787  while ((token = strsep(&str, ";")))
2788  {
2789  char *e = strchr(token, '=');
2790 
2791  if (e)
2792  {
2793  char *name;
2794  char *value;
2795  UColAttribute uattr;
2796  UColAttributeValue uvalue;
2797 
2798  *status = U_ZERO_ERROR;
2799 
2800  *e = '\0';
2801  name = token;
2802  value = e + 1;
2803 
2804  /*
2805  * See attribute name and value lists in ICU i18n/coll.cpp
2806  */
2807  if (strcmp(name, "colstrength") == 0)
2808  uattr = UCOL_STRENGTH;
2809  else if (strcmp(name, "colbackwards") == 0)
2810  uattr = UCOL_FRENCH_COLLATION;
2811  else if (strcmp(name, "colcaselevel") == 0)
2812  uattr = UCOL_CASE_LEVEL;
2813  else if (strcmp(name, "colcasefirst") == 0)
2814  uattr = UCOL_CASE_FIRST;
2815  else if (strcmp(name, "colalternate") == 0)
2816  uattr = UCOL_ALTERNATE_HANDLING;
2817  else if (strcmp(name, "colnormalization") == 0)
2818  uattr = UCOL_NORMALIZATION_MODE;
2819  else if (strcmp(name, "colnumeric") == 0)
2820  uattr = UCOL_NUMERIC_COLLATION;
2821  else
2822  /* ignore if unknown */
2823  continue;
2824 
2825  if (strcmp(value, "primary") == 0)
2826  uvalue = UCOL_PRIMARY;
2827  else if (strcmp(value, "secondary") == 0)
2828  uvalue = UCOL_SECONDARY;
2829  else if (strcmp(value, "tertiary") == 0)
2830  uvalue = UCOL_TERTIARY;
2831  else if (strcmp(value, "quaternary") == 0)
2832  uvalue = UCOL_QUATERNARY;
2833  else if (strcmp(value, "identical") == 0)
2834  uvalue = UCOL_IDENTICAL;
2835  else if (strcmp(value, "no") == 0)
2836  uvalue = UCOL_OFF;
2837  else if (strcmp(value, "yes") == 0)
2838  uvalue = UCOL_ON;
2839  else if (strcmp(value, "shifted") == 0)
2840  uvalue = UCOL_SHIFTED;
2841  else if (strcmp(value, "non-ignorable") == 0)
2842  uvalue = UCOL_NON_IGNORABLE;
2843  else if (strcmp(value, "lower") == 0)
2844  uvalue = UCOL_LOWER_FIRST;
2845  else if (strcmp(value, "upper") == 0)
2846  uvalue = UCOL_UPPER_FIRST;
2847  else
2848  {
2849  *status = U_ILLEGAL_ARGUMENT_ERROR;
2850  break;
2851  }
2852 
2853  ucol_setAttribute(collator, uattr, uvalue, status);
2854  }
2855  }
2856 
2857  pfree(lower_str);
2858 }
2859 #endif
2860 
2861 /*
2862  * Return the BCP47 language tag representation of the requested locale.
2863  *
2864  * This function should be called before passing the string to ucol_open(),
2865  * because conversion to a language tag also performs "level 2
2866  * canonicalization". In addition to producing a consistent format, level 2
2867  * canonicalization is able to more accurately interpret different input
2868  * locale string formats, such as POSIX and .NET IDs.
2869  */
2870 char *
2871 icu_language_tag(const char *loc_str, int elevel)
2872 {
2873 #ifdef USE_ICU
2874  UErrorCode status;
2875  char *langtag;
2876  size_t buflen = 32; /* arbitrary starting buffer size */
2877  const bool strict = true;
2878 
2879  /*
2880  * A BCP47 language tag doesn't have a clearly-defined upper limit (cf.
2881  * RFC5646 section 4.4). Additionally, in older ICU versions,
2882  * uloc_toLanguageTag() doesn't always return the ultimate length on the
2883  * first call, necessitating a loop.
2884  */
2885  langtag = palloc(buflen);
2886  while (true)
2887  {
2888  status = U_ZERO_ERROR;
2889  uloc_toLanguageTag(loc_str, langtag, buflen, strict, &status);
2890 
2891  /* try again if the buffer is not large enough */
2892  if ((status == U_BUFFER_OVERFLOW_ERROR ||
2893  status == U_STRING_NOT_TERMINATED_WARNING) &&
2894  buflen < MaxAllocSize)
2895  {
2896  buflen = Min(buflen * 2, MaxAllocSize);
2897  langtag = repalloc(langtag, buflen);
2898  continue;
2899  }
2900 
2901  break;
2902  }
2903 
2904  if (U_FAILURE(status))
2905  {
2906  pfree(langtag);
2907 
2908  if (elevel > 0)
2909  ereport(elevel,
2910  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2911  errmsg("could not convert locale name \"%s\" to language tag: %s",
2912  loc_str, u_errorName(status))));
2913  return NULL;
2914  }
2915 
2916  return langtag;
2917 #else /* not USE_ICU */
2918  ereport(ERROR,
2919  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
2920  errmsg("ICU is not supported in this build")));
2921  return NULL; /* keep compiler quiet */
2922 #endif /* not USE_ICU */
2923 }
2924 
2925 /*
2926  * Perform best-effort check that the locale is a valid one.
2927  */
2928 void
2929 icu_validate_locale(const char *loc_str)
2930 {
2931 #ifdef USE_ICU
2932  UCollator *collator;
2933  UErrorCode status;
2934  char lang[ULOC_LANG_CAPACITY];
2935  bool found = false;
2936  int elevel = icu_validation_level;
2937 
2938  /* no validation */
2939  if (elevel < 0)
2940  return;
2941 
2942  /* downgrade to WARNING during pg_upgrade */
2943  if (IsBinaryUpgrade && elevel > WARNING)
2944  elevel = WARNING;
2945 
2946  /* validate that we can extract the language */
2947  status = U_ZERO_ERROR;
2948  uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
2949  if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
2950  {
2951  ereport(elevel,
2952  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2953  errmsg("could not get language from ICU locale \"%s\": %s",
2954  loc_str, u_errorName(status)),
2955  errhint("To disable ICU locale validation, set the parameter \"%s\" to \"%s\".",
2956  "icu_validation_level", "disabled")));
2957  return;
2958  }
2959 
2960  /* check for special language name */
2961  if (strcmp(lang, "") == 0 ||
2962  strcmp(lang, "root") == 0 || strcmp(lang, "und") == 0)
2963  found = true;
2964 
2965  /* search for matching language within ICU */
2966  for (int32_t i = 0; !found && i < uloc_countAvailable(); i++)
2967  {
2968  const char *otherloc = uloc_getAvailable(i);
2969  char otherlang[ULOC_LANG_CAPACITY];
2970 
2971  status = U_ZERO_ERROR;
2972  uloc_getLanguage(otherloc, otherlang, ULOC_LANG_CAPACITY, &status);
2973  if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
2974  continue;
2975 
2976  if (strcmp(lang, otherlang) == 0)
2977  found = true;
2978  }
2979 
2980  if (!found)
2981  ereport(elevel,
2982  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2983  errmsg("ICU locale \"%s\" has unknown language \"%s\"",
2984  loc_str, lang),
2985  errhint("To disable ICU locale validation, set the parameter \"%s\" to \"%s\".",
2986  "icu_validation_level", "disabled")));
2987 
2988  /* check that it can be opened */
2989  collator = pg_ucol_open(loc_str);
2990  ucol_close(collator);
2991 #else /* not USE_ICU */
2992  /* could get here if a collation was created by a build with ICU */
2993  ereport(ERROR,
2994  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
2995  errmsg("ICU is not supported in this build")));
2996 #endif /* not USE_ICU */
2997 }
2998 
2999 /*
3000  * These functions convert from/to libc's wchar_t, *not* pg_wchar_t.
3001  * Therefore we keep them here rather than with the mbutils code.
3002  */
3003 
3004 /*
3005  * wchar2char --- convert wide characters to multibyte format
3006  *
3007  * This has the same API as the standard wcstombs_l() function; in particular,
3008  * tolen is the maximum number of bytes to store at *to, and *from must be
3009  * zero-terminated. The output will be zero-terminated iff there is room.
3010  */
3011 size_t
3012 wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
3013 {
3014  size_t result;
3015 
3016  Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
3017 
3018  if (tolen == 0)
3019  return 0;
3020 
3021 #ifdef WIN32
3022 
3023  /*
3024  * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
3025  * for some reason mbstowcs and wcstombs won't do this for us, so we use
3026  * MultiByteToWideChar().
3027  */
3028  if (GetDatabaseEncoding() == PG_UTF8)
3029  {
3030  result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
3031  NULL, NULL);
3032  /* A zero return is failure */
3033  if (result <= 0)
3034  result = -1;
3035  else
3036  {
3037  Assert(result <= tolen);
3038  /* Microsoft counts the zero terminator in the result */
3039  result--;
3040  }
3041  }
3042  else
3043 #endif /* WIN32 */
3044  if (locale == (pg_locale_t) 0)
3045  {
3046  /* Use wcstombs directly for the default locale */
3047  result = wcstombs(to, from, tolen);
3048  }
3049  else
3050  {
3051  /* Use wcstombs_l for nondefault locales */
3052  result = wcstombs_l(to, from, tolen, locale->info.lt);
3053  }
3054 
3055  return result;
3056 }
3057 
3058 /*
3059  * char2wchar --- convert multibyte characters to wide characters
3060  *
3061  * This has almost the API of mbstowcs_l(), except that *from need not be
3062  * null-terminated; instead, the number of input bytes is specified as
3063  * fromlen. Also, we ereport() rather than returning -1 for invalid
3064  * input encoding. tolen is the maximum number of wchar_t's to store at *to.
3065  * The output will be zero-terminated iff there is room.
3066  */
3067 size_t
3068 char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
3070 {
3071  size_t result;
3072 
3073  Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
3074 
3075  if (tolen == 0)
3076  return 0;
3077 
3078 #ifdef WIN32
3079  /* See WIN32 "Unicode" comment above */
3080  if (GetDatabaseEncoding() == PG_UTF8)
3081  {
3082  /* Win32 API does not work for zero-length input */
3083  if (fromlen == 0)
3084  result = 0;
3085  else
3086  {
3087  result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
3088  /* A zero return is failure */
3089  if (result == 0)
3090  result = -1;
3091  }
3092 
3093  if (result != -1)
3094  {
3095  Assert(result < tolen);
3096  /* Append trailing null wchar (MultiByteToWideChar() does not) */
3097  to[result] = 0;
3098  }
3099  }
3100  else
3101 #endif /* WIN32 */
3102  {
3103  /* mbstowcs requires ending '\0' */
3104  char *str = pnstrdup(from, fromlen);
3105 
3106  if (locale == (pg_locale_t) 0)
3107  {
3108  /* Use mbstowcs directly for the default locale */
3109  result = mbstowcs(to, str, tolen);
3110  }
3111  else
3112  {
3113  /* Use mbstowcs_l for nondefault locales */
3114  result = mbstowcs_l(to, str, tolen, locale->info.lt);
3115  }
3116 
3117  pfree(str);
3118  }
3119 
3120  if (result == -1)
3121  {
3122  /*
3123  * Invalid multibyte character encountered. We try to give a useful
3124  * error message by letting pg_verifymbstr check the string. But it's
3125  * possible that the string is OK to us, and not OK to mbstowcs ---
3126  * this suggests that the LC_CTYPE locale is different from the
3127  * database encoding. Give a generic error message if pg_verifymbstr
3128  * can't find anything wrong.
3129  */
3130  pg_verifymbstr(from, fromlen, false); /* might not return */
3131  /* but if it does ... */
3132  ereport(ERROR,
3133  (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
3134  errmsg("invalid multibyte character for locale"),
3135  errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
3136  }
3137 
3138  return result;
3139 }
#define TextDatumGetCString(d)
Definition: builtins.h:98
#define NameStr(name)
Definition: c.h:749
unsigned int uint32
Definition: c.h:509
#define Min(x, y)
Definition: c.h:1007
#define pg_attribute_unused()
Definition: c.h:126
#define Assert(condition)
Definition: c.h:861
#define lengthof(array)
Definition: c.h:791
#define OidIsValid(objectId)
Definition: c.h:778
size_t Size
Definition: c.h:608
Oid collid
int errdetail(const char *fmt,...)
Definition: elog.c:1203
int errhint(const char *fmt,...)
Definition: elog.c:1317
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define PG_RE_THROW()
Definition: elog.h:412
#define DEBUG3
Definition: elog.h:28
#define FATAL
Definition: elog.h:41
#define PG_TRY(...)
Definition: elog.h:371
#define WARNING
Definition: elog.h:36
#define PG_END_TRY(...)
Definition: elog.h:396
#define ERROR
Definition: elog.h:39
#define PG_CATCH(...)
Definition: elog.h:381
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
const char * get_encoding_name_for_icu(int encoding)
Definition: encnames.c:472
#define MCXT_ALLOC_NO_OOM
Definition: fe_memutils.h:17
char * asc_tolower(const char *buff, size_t nbytes)
Definition: formatting.c:2113
bool IsBinaryUpgrade
Definition: globals.c:120
Oid MyDatabaseId
Definition: globals.c:93
#define newval
GucSource
Definition: guc.h:108
@ PGC_S_DEFAULT
Definition: guc.h:109
const char * str
#define free(a)
Definition: header.h:65
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
#define GETSTRUCT(TUP)
Definition: htup_details.h:653
#define period
Definition: indent_codes.h:66
#define token
Definition: indent_globs.h:126
#define bufsize
Definition: indent_globs.h:36
static struct @157 value
static char * datlocale
Definition: initdb.c:149
static char * locale
Definition: initdb.c:140
int i
Definition: isn.c:73
static struct pg_tm tm
Definition: localtime.c:104
char * get_namespace_name(Oid nspid)
Definition: lsyscache.c:3366
char * pg_any_to_server(const char *s, int len, int encoding)
Definition: mbutils.c:676
int GetDatabaseEncoding(void)
Definition: mbutils.c:1261
int pg_mbstrlen(const char *mbstr)
Definition: mbutils.c:1037
bool pg_verifymbstr(const char *mbstr, int len, bool noError)
Definition: mbutils.c:1556
void SetMessageEncoding(int encoding)
Definition: mbutils.c:1171
char * pnstrdup(const char *in, Size len)
Definition: mcxt.c:1707
char * pstrdup(const char *in)
Definition: mcxt.c:1696
void pfree(void *pointer)
Definition: mcxt.c:1521
MemoryContext TopMemoryContext
Definition: mcxt.c:149
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1541
void * palloc_extended(Size size, int flags)
Definition: mcxt.c:1368
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1181
char * MemoryContextStrdup(MemoryContext context, const char *string)
Definition: mcxt.c:1683
void * palloc(Size size)
Definition: mcxt.c:1317
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:160
#define MaxAllocSize
Definition: memutils.h:40
static char format
FormData_pg_collation * Form_pg_collation
Definition: pg_collation.h:58
const void size_t len
FormData_pg_database * Form_pg_database
Definition: pg_database.h:96
int32 encoding
Definition: pg_database.h:41
int icu_validation_level
Definition: pg_locale.c:104
static pg_locale_t last_collation_cache_locale
Definition: pg_locale.c:160
void cache_locale_time(void)
Definition: pg_locale.c:853
size_t pg_strnxfrm(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
Definition: pg_locale.c:2377
bool pg_strxfrm_enabled(pg_locale_t locale)
Definition: pg_locale.c:2317
char * localized_full_months[12+1]
Definition: pg_locale.c:116
size_t wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
Definition: pg_locale.c:3012
struct lconv * PGLC_localeconv(void)
Definition: pg_locale.c:571
static struct pg_locale_struct default_locale
Definition: pg_locale.c:121
void icu_validate_locale(const char *loc_str)
Definition: pg_locale.c:2929
static bool CurrentLCTimeValid
Definition: pg_locale.c:125
void assign_locale_time(const char *newval, void *extra)
Definition: pg_locale.c:435
bool check_locale_time(char **newval, void **extra, GucSource source)
Definition: pg_locale.c:429
char * locale_messages
Definition: pg_locale.c:99
char * locale_numeric
Definition: pg_locale.c:101
pg_locale_t pg_newlocale_from_collation(Oid collid)
Definition: pg_locale.c:1566
int builtin_locale_encoding(const char *locale)
Definition: pg_locale.c:2464
size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
Definition: pg_locale.c:2444
bool database_ctype_is_c
Definition: pg_locale.c:119
#define PGLOCALE_SUPPORT_ERROR(provider)
Definition: pg_locale.c:86
char * locale_time
Definition: pg_locale.c:102
static void cache_single_string(char **dst, const char *src, int encoding)
Definition: pg_locale.c:830
static locale_t make_libc_collator(const char *collate, const char *ctype)
Definition: pg_locale.c:1331
bool check_locale_numeric(char **newval, void **extra, GucSource source)
Definition: pg_locale.c:417
static void db_encoding_convert(int encoding, char **str)
Definition: pg_locale.c:541
static size_t wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
Definition: pg_locale.c:207
void assign_locale_numeric(const char *newval, void *extra)
Definition: pg_locale.c:423
bool check_locale_messages(char **newval, void **extra, GucSource source)
Definition: pg_locale.c:451
#define MAX_L10N_DATA
Definition: pg_locale.c:95
char * get_collation_actual_version(char collprovider, const char *collcollate)
Definition: pg_locale.c:1726
static void free_struct_lconv(struct lconv *s)
Definition: pg_locale.c:491
char * pg_perm_setlocale(int category, const char *locale)
Definition: pg_locale.c:237
static int strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
Definition: pg_locale.c:1915
static MemoryContext CollationCacheContext
Definition: pg_locale.c:152
void assign_locale_messages(const char *newval, void *extra)
Definition: pg_locale.c:474
static bool CurrentLocaleConvValid
Definition: pg_locale.c:124
char * icu_language_tag(const char *loc_str, int elevel)
Definition: pg_locale.c:2871
static size_t strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
Definition: pg_locale.c:2134
int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
Definition: pg_locale.c:2076
bool pg_strxfrm_prefix_enabled(pg_locale_t locale)
Definition: pg_locale.c:2400
static void report_newlocale_failure(const char *localename)
Definition: pg_locale.c:1294
char * localized_abbrev_months[12+1]
Definition: pg_locale.c:115
static collation_cache_hash * CollationCache
Definition: pg_locale.c:153
int pg_strncoll(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
Definition: pg_locale.c:2108
static bool struct_lconv_is_valid(struct lconv *s)
Definition: pg_locale.c:510
void init_database_collation(void)
Definition: pg_locale.c:1468
char * localized_full_days[7+1]
Definition: pg_locale.c:114
static collation_cache_entry * lookup_collation_cache(Oid collation)
Definition: pg_locale.c:1262
size_t pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
Definition: pg_locale.c:2340
const char * builtin_validate_locale(int encoding, const char *locale)
Definition: pg_locale.c:2485
void assign_locale_monetary(const char *newval, void *extra)
Definition: pg_locale.c:411
#define TEXTBUFLEN
Definition: pg_locale.c:93
bool check_locale(int category, const char *locale, char **canonname)
Definition: pg_locale.c:340
char * localized_abbrev_days[7+1]
Definition: pg_locale.c:113
size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize, pg_locale_t locale)
Definition: pg_locale.c:2419
char * locale_monetary
Definition: pg_locale.c:100
size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, pg_locale_t locale)
Definition: pg_locale.c:3068
bool check_locale_monetary(char **newval, void **extra, GucSource source)
Definition: pg_locale.c:405
static size_t mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
Definition: pg_locale.c:191
static Oid last_collation_cache_oid
Definition: pg_locale.c:159
#define LOCALE_NAME_BUFLEN
Definition: pg_locale.h:33
static rewind_source * source
Definition: pg_rewind.c:89
static char * buf
Definition: pg_test_fsync.c:73
@ PG_SQL_ASCII
Definition: pg_wchar.h:226
@ PG_UTF8
Definition: pg_wchar.h:232
#define pg_encoding_to_char
Definition: pg_wchar.h:630
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
char * strsep(char **stringp, const char *delim)
Definition: strsep.c:49
int pg_get_encoding_from_locale(const char *ctype, bool write_message)
Definition: chklocale.c:301
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
int pg_strncasecmp(const char *s1, const char *s2, size_t n)
Definition: pgstrcasecmp.c:69
uintptr_t Datum
Definition: postgres.h:64
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:252
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
e
Definition: preproc-init.c:82
char * psprintf(const char *fmt,...)
Definition: psprintf.c:46
char * quote_qualified_identifier(const char *qualifier, const char *ident)
Definition: ruleutils.c:12924
bool pg_is_ascii(const char *str)
Definition: string.c:133
Definition: pg_locale.c:130
char status
Definition: pg_locale.c:136
Oid collid
Definition: pg_locale.c:131
pg_locale_t locale
Definition: pg_locale.c:132
uint32 hash
Definition: pg_locale.c:135
union pg_locale_struct::@153 info
struct pg_locale_struct::@153::@154 builtin
locale_t lt
Definition: pg_locale.h:91
const char * locale
Definition: pg_locale.h:89
bool deterministic
Definition: pg_locale.h:82
Definition: regguts.h:323
void ReleaseSysCache(HeapTuple tuple)
Definition: syscache.c:269
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Definition: syscache.c:221
Datum SysCacheGetAttr(int cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull)
Definition: syscache.c:596
Datum SysCacheGetAttrNotNull(int cacheId, HeapTuple tup, AttrNumber attributeNumber)
Definition: syscache.c:627
const char * name
#define locale_t
Definition: win32_port.h:442
#define strcoll_l
Definition: win32_port.h:465
#define strxfrm_l
Definition: win32_port.h:466
#define wcscoll_l
Definition: win32_port.h:467
#define setenv(x, y, z)
Definition: win32_port.h:555
#define setlocale(a, b)
Definition: win32_port.h:485