PostgreSQL Source Code  git master
pg_locale.c
Go to the documentation of this file.
1 /*-----------------------------------------------------------------------
2  *
3  * PostgreSQL locale utilities
4  *
5  * Portions Copyright (c) 2002-2023, PostgreSQL Global Development Group
6  *
7  * src/backend/utils/adt/pg_locale.c
8  *
9  *-----------------------------------------------------------------------
10  */
11 
12 /*----------
13  * Here is how the locale stuff is handled: LC_COLLATE and LC_CTYPE
14  * are fixed at CREATE DATABASE time, stored in pg_database, and cannot
15  * be changed. Thus, the effects of strcoll(), strxfrm(), isupper(),
16  * toupper(), etc. are always in the same fixed locale.
17  *
18  * LC_MESSAGES is settable at run time and will take effect
19  * immediately.
20  *
21  * The other categories, LC_MONETARY, LC_NUMERIC, and LC_TIME are also
22  * settable at run-time. However, we don't actually set those locale
23  * categories permanently. This would have bizarre effects like no
24  * longer accepting standard floating-point literals in some locales.
25  * Instead, we only set these locale categories briefly when needed,
26  * cache the required information obtained from localeconv() or
27  * strftime(), and then set the locale categories back to "C".
28  * The cached information is only used by the formatting functions
29  * (to_char, etc.) and the money type. For the user, this should all be
30  * transparent.
31  *
32  * !!! NOW HEAR THIS !!!
33  *
34  * We've been bitten repeatedly by this bug, so let's try to keep it in
35  * mind in future: on some platforms, the locale functions return pointers
36  * to static data that will be overwritten by any later locale function.
37  * Thus, for example, the obvious-looking sequence
38  * save = setlocale(category, NULL);
39  * if (!setlocale(category, value))
40  * fail = true;
41  * setlocale(category, save);
42  * DOES NOT WORK RELIABLY: on some platforms the second setlocale() call
43  * will change the memory save is pointing at. To do this sort of thing
44  * safely, you *must* pstrdup what setlocale returns the first time.
45  *
46  * The POSIX locale standard is available here:
47  *
48  * http://www.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap07.html
49  *----------
50  */
51 
52 
53 #include "postgres.h"
54 
55 #include <time.h>
56 
57 #include "access/htup_details.h"
58 #include "catalog/pg_collation.h"
59 #include "catalog/pg_control.h"
60 #include "mb/pg_wchar.h"
61 #include "miscadmin.h"
62 #include "utils/builtins.h"
63 #include "utils/formatting.h"
64 #include "utils/guc_hooks.h"
65 #include "utils/hsearch.h"
66 #include "utils/lsyscache.h"
67 #include "utils/memutils.h"
68 #include "utils/pg_locale.h"
69 #include "utils/syscache.h"
70 
71 #ifdef USE_ICU
72 #include <unicode/ucnv.h>
73 #include <unicode/ustring.h>
74 #endif
75 
76 #ifdef __GLIBC__
77 #include <gnu/libc-version.h>
78 #endif
79 
80 #ifdef WIN32
81 #include <shlwapi.h>
82 #endif
83 
84 /*
85  * This should be large enough that most strings will fit, but small enough
86  * that we feel comfortable putting it on the stack
87  */
88 #define TEXTBUFLEN 1024
89 
90 #define MAX_L10N_DATA 80
91 
92 
/*
 * GUC settings
 *
 * Current values of the lc_messages, lc_monetary, lc_numeric and lc_time
 * settings.  The monetary/numeric/time values are only applied to libc
 * briefly, while the corresponding caches below are being filled.
 *
 * NOTE(review): the listing this text was recovered from dropped one further
 * single-line declaration right after these four; confirm against upstream.
 */
char	   *locale_messages;
char	   *locale_monetary;
char	   *locale_numeric;
char	   *locale_time;

/*
 * lc_time localization cache.
 *
 * We use only the first 7 or 12 entries of these arrays.  The last array
 * element is left as NULL for the convenience of outside code that wants
 * to sequentially scan these arrays.
 */
char	   *localized_abbrev_days[7 + 1];
char	   *localized_full_days[7 + 1];
char	   *localized_abbrev_months[12 + 1];
char	   *localized_full_months[12 + 1];

/* is the database's LC_CTYPE the C locale? */
bool		database_ctype_is_c = false;

/*
 * Indicate whether the cached locale information is valid.  The GUC assign
 * hooks clear these flags; the cache-filling functions set them again.
 */
static bool CurrentLocaleConvValid = false;
static bool CurrentLCTimeValid = false;
119 
120 /* Cache for collation-related knowledge */
121 
122 typedef struct
123 {
124  Oid collid; /* hash key: pg_collation OID */
125  bool collate_is_c; /* is collation's LC_COLLATE C? */
126  bool ctype_is_c; /* is collation's LC_CTYPE C? */
127  bool flags_valid; /* true if above flags are valid */
128  pg_locale_t locale; /* locale_t struct, or 0 if not valid */
130 
131 static HTAB *collation_cache = NULL;
132 
133 
/* Forward declarations of file-local helpers */

#if defined(WIN32) && defined(LC_MESSAGES)
static char *IsoLocaleName(const char *);
#endif

#ifdef USE_ICU
/*
 * Converter object for converting between ICU's UChar strings and C strings
 * in database encoding.  Since the database encoding doesn't change, we only
 * need one of these per session.
 */
static UConverter *icu_converter = NULL;

static UCollator *pg_ucol_open(const char *loc_str);
static void init_icu_converter(void);
static size_t uchar_length(UConverter *converter,
						   const char *str, int32_t len);
static int32_t uchar_convert(UConverter *converter,
							 UChar *dest, int32_t destlen,
							 const char *str, int32_t srclen);
static void icu_set_collation_attributes(UCollator *collator, const char *loc,
										 UErrorCode *status);
#endif
156 
157 /*
158  * pg_perm_setlocale
159  *
160  * This wraps the libc function setlocale(), with two additions. First, when
161  * changing LC_CTYPE, update gettext's encoding for the current message
162  * domain. GNU gettext automatically tracks LC_CTYPE on most platforms, but
163  * not on Windows. Second, if the operation is successful, the corresponding
164  * LC_XXX environment variable is set to match. By setting the environment
165  * variable, we ensure that any subsequent use of setlocale(..., "") will
166  * preserve the settings made through this routine. Of course, LC_ALL must
167  * also be unset to fully ensure that, but that has to be done elsewhere after
168  * all the individual LC_XXX variables have been set correctly. (Thank you
169  * Perl for making this kluge necessary.)
170  */
171 char *
172 pg_perm_setlocale(int category, const char *locale)
173 {
174  char *result;
175  const char *envvar;
176 
177 #ifndef WIN32
178  result = setlocale(category, locale);
179 #else
180 
181  /*
182  * On Windows, setlocale(LC_MESSAGES) does not work, so just assume that
183  * the given value is good and set it in the environment variables. We
184  * must ignore attempts to set to "", which means "keep using the old
185  * environment value".
186  */
187 #ifdef LC_MESSAGES
188  if (category == LC_MESSAGES)
189  {
190  result = (char *) locale;
191  if (locale == NULL || locale[0] == '\0')
192  return result;
193  }
194  else
195 #endif
196  result = setlocale(category, locale);
197 #endif /* WIN32 */
198 
199  if (result == NULL)
200  return result; /* fall out immediately on failure */
201 
202  /*
203  * Use the right encoding in translated messages. Under ENABLE_NLS, let
204  * pg_bind_textdomain_codeset() figure it out. Under !ENABLE_NLS, message
205  * format strings are ASCII, but database-encoding strings may enter the
206  * message via %s. This makes the overall message encoding equal to the
207  * database encoding.
208  */
209  if (category == LC_CTYPE)
210  {
211  static char save_lc_ctype[LOCALE_NAME_BUFLEN];
212 
213  /* copy setlocale() return value before callee invokes it again */
214  strlcpy(save_lc_ctype, result, sizeof(save_lc_ctype));
215  result = save_lc_ctype;
216 
217 #ifdef ENABLE_NLS
218  SetMessageEncoding(pg_bind_textdomain_codeset(textdomain(NULL)));
219 #else
221 #endif
222  }
223 
224  switch (category)
225  {
226  case LC_COLLATE:
227  envvar = "LC_COLLATE";
228  break;
229  case LC_CTYPE:
230  envvar = "LC_CTYPE";
231  break;
232 #ifdef LC_MESSAGES
233  case LC_MESSAGES:
234  envvar = "LC_MESSAGES";
235 #ifdef WIN32
236  result = IsoLocaleName(locale);
237  if (result == NULL)
238  result = (char *) locale;
239  elog(DEBUG3, "IsoLocaleName() executed; locale: \"%s\"", result);
240 #endif /* WIN32 */
241  break;
242 #endif /* LC_MESSAGES */
243  case LC_MONETARY:
244  envvar = "LC_MONETARY";
245  break;
246  case LC_NUMERIC:
247  envvar = "LC_NUMERIC";
248  break;
249  case LC_TIME:
250  envvar = "LC_TIME";
251  break;
252  default:
253  elog(FATAL, "unrecognized LC category: %d", category);
254  return NULL; /* keep compiler quiet */
255  }
256 
257  if (setenv(envvar, result, 1) != 0)
258  return NULL;
259 
260  return result;
261 }
262 
263 
264 /*
265  * Is the locale name valid for the locale category?
266  *
267  * If successful, and canonname isn't NULL, a palloc'd copy of the locale's
268  * canonical name is stored there. This is especially useful for figuring out
269  * what locale name "" means (ie, the server environment value). (Actually,
270  * it seems that on most implementations that's the only thing it's good for;
271  * we could wish that setlocale gave back a canonically spelled version of
272  * the locale name, but typically it doesn't.)
273  */
274 bool
275 check_locale(int category, const char *locale, char **canonname)
276 {
277  char *save;
278  char *res;
279 
280  if (canonname)
281  *canonname = NULL; /* in case of failure */
282 
283  save = setlocale(category, NULL);
284  if (!save)
285  return false; /* won't happen, we hope */
286 
287  /* save may be pointing at a modifiable scratch variable, see above. */
288  save = pstrdup(save);
289 
290  /* set the locale with setlocale, to see if it accepts it. */
291  res = setlocale(category, locale);
292 
293  /* save canonical name if requested. */
294  if (res && canonname)
295  *canonname = pstrdup(res);
296 
297  /* restore old value. */
298  if (!setlocale(category, save))
299  elog(WARNING, "failed to restore old locale \"%s\"", save);
300  pfree(save);
301 
302  return (res != NULL);
303 }
304 
305 
306 /*
307  * GUC check/assign hooks
308  *
309  * For most locale categories, the assign hook doesn't actually set the locale
310  * permanently, just reset flags so that the next use will cache the
311  * appropriate values. (See explanation at the top of this file.)
312  *
313  * Note: we accept value = "" as selecting the postmaster's environment
314  * value, whatever it was (so long as the environment setting is legal).
315  * This will have been locked down by an earlier call to pg_perm_setlocale.
316  */
317 bool
319 {
320  return check_locale(LC_MONETARY, *newval, NULL);
321 }
322 
323 void
324 assign_locale_monetary(const char *newval, void *extra)
325 {
326  CurrentLocaleConvValid = false;
327 }
328 
329 bool
331 {
332  return check_locale(LC_NUMERIC, *newval, NULL);
333 }
334 
335 void
336 assign_locale_numeric(const char *newval, void *extra)
337 {
338  CurrentLocaleConvValid = false;
339 }
340 
341 bool
342 check_locale_time(char **newval, void **extra, GucSource source)
343 {
344  return check_locale(LC_TIME, *newval, NULL);
345 }
346 
347 void
348 assign_locale_time(const char *newval, void *extra)
349 {
350  CurrentLCTimeValid = false;
351 }
352 
353 /*
354  * We allow LC_MESSAGES to actually be set globally.
355  *
356  * Note: we normally disallow value = "" because it wouldn't have consistent
357  * semantics (it'd effectively just use the previous value). However, this
358  * is the value passed for PGC_S_DEFAULT, so don't complain in that case,
359  * not even if the attempted setting fails due to invalid environment value.
360  * The idea there is just to accept the environment setting *if possible*
361  * during startup, until we can read the proper value from postgresql.conf.
362  */
363 bool
365 {
366  if (**newval == '\0')
367  {
368  if (source == PGC_S_DEFAULT)
369  return true;
370  else
371  return false;
372  }
373 
374  /*
375  * LC_MESSAGES category does not exist everywhere, but accept it anyway
376  *
377  * On Windows, we can't even check the value, so accept blindly
378  */
379 #if defined(LC_MESSAGES) && !defined(WIN32)
380  return check_locale(LC_MESSAGES, *newval, NULL);
381 #else
382  return true;
383 #endif
384 }
385 
/*
 * Assign hook for the messages locale setting.  Unlike the other categories,
 * this one is installed permanently via pg_perm_setlocale().  extra is
 * unused.
 */
void
assign_locale_messages(const char *newval, void *extra)
{
	/*
	 * LC_MESSAGES category does not exist everywhere, but accept it anyway.
	 * We ignore failure, as per comment above.
	 */
#ifdef LC_MESSAGES
	(void) pg_perm_setlocale(LC_MESSAGES, newval);
#endif
}
397 
398 
/*
 * Frees the malloced content of a struct lconv.  (But not the struct
 * itself.)  It's important that this not throw elog(ERROR).
 *
 * Fields that are NULL are harmless here, since free(NULL) is a no-op.  The
 * pointers are not reset, so the caller must not use or free them again.
 */
static void
free_struct_lconv(struct lconv *s)
{
	/* numeric fields */
	free(s->decimal_point);
	free(s->thousands_sep);
	free(s->grouping);
	/* monetary fields */
	free(s->int_curr_symbol);
	free(s->currency_symbol);
	free(s->mon_decimal_point);
	free(s->mon_thousands_sep);
	free(s->mon_grouping);
	free(s->positive_sign);
	free(s->negative_sign);
}
417 
/*
 * Check that all fields of a struct lconv (or at least, the ones we care
 * about) are non-NULL.  The field list must match free_struct_lconv().
 *
 * Returns true only if every checked string field is set.
 */
static bool
struct_lconv_is_valid(struct lconv *s)
{
	/* numeric fields */
	if (s->decimal_point == NULL)
		return false;
	if (s->thousands_sep == NULL)
		return false;
	if (s->grouping == NULL)
		return false;
	/* monetary fields */
	if (s->int_curr_symbol == NULL)
		return false;
	if (s->currency_symbol == NULL)
		return false;
	if (s->mon_decimal_point == NULL)
		return false;
	if (s->mon_thousands_sep == NULL)
		return false;
	if (s->mon_grouping == NULL)
		return false;
	if (s->positive_sign == NULL)
		return false;
	if (s->negative_sign == NULL)
		return false;
	return true;
}
447 
448 
449 /*
450  * Convert the strdup'd string at *str from the specified encoding to the
451  * database encoding.
452  */
453 static void
455 {
456  char *pstr;
457  char *mstr;
458 
459  /* convert the string to the database encoding */
460  pstr = pg_any_to_server(*str, strlen(*str), encoding);
461  if (pstr == *str)
462  return; /* no conversion happened */
463 
464  /* need it malloc'd not palloc'd */
465  mstr = strdup(pstr);
466  if (mstr == NULL)
467  ereport(ERROR,
468  (errcode(ERRCODE_OUT_OF_MEMORY),
469  errmsg("out of memory")));
470 
471  /* replace old string */
472  free(*str);
473  *str = mstr;
474 
475  pfree(pstr);
476 }
477 
478 
479 /*
480  * Return the POSIX lconv struct (contains number/money formatting
481  * information) with locale information for all categories.
482  */
483 struct lconv *
485 {
486  static struct lconv CurrentLocaleConv;
487  static bool CurrentLocaleConvAllocated = false;
488  struct lconv *extlconv;
489  struct lconv worklconv;
490  char *save_lc_monetary;
491  char *save_lc_numeric;
492 #ifdef WIN32
493  char *save_lc_ctype;
494 #endif
495 
496  /* Did we do it already? */
498  return &CurrentLocaleConv;
499 
500  /* Free any already-allocated storage */
501  if (CurrentLocaleConvAllocated)
502  {
503  free_struct_lconv(&CurrentLocaleConv);
504  CurrentLocaleConvAllocated = false;
505  }
506 
507  /*
508  * This is tricky because we really don't want to risk throwing error
509  * while the locale is set to other than our usual settings. Therefore,
510  * the process is: collect the usual settings, set locale to special
511  * setting, copy relevant data into worklconv using strdup(), restore
512  * normal settings, convert data to desired encoding, and finally stash
513  * the collected data in CurrentLocaleConv. This makes it safe if we
514  * throw an error during encoding conversion or run out of memory anywhere
515  * in the process. All data pointed to by struct lconv members is
516  * allocated with strdup, to avoid premature elog(ERROR) and to allow
517  * using a single cleanup routine.
518  */
519  memset(&worklconv, 0, sizeof(worklconv));
520 
521  /* Save prevailing values of monetary and numeric locales */
522  save_lc_monetary = setlocale(LC_MONETARY, NULL);
523  if (!save_lc_monetary)
524  elog(ERROR, "setlocale(NULL) failed");
525  save_lc_monetary = pstrdup(save_lc_monetary);
526 
527  save_lc_numeric = setlocale(LC_NUMERIC, NULL);
528  if (!save_lc_numeric)
529  elog(ERROR, "setlocale(NULL) failed");
530  save_lc_numeric = pstrdup(save_lc_numeric);
531 
532 #ifdef WIN32
533 
534  /*
535  * The POSIX standard explicitly says that it is undefined what happens if
536  * LC_MONETARY or LC_NUMERIC imply an encoding (codeset) different from
537  * that implied by LC_CTYPE. In practice, all Unix-ish platforms seem to
538  * believe that localeconv() should return strings that are encoded in the
539  * codeset implied by the LC_MONETARY or LC_NUMERIC locale name. Hence,
540  * once we have successfully collected the localeconv() results, we will
541  * convert them from that codeset to the desired server encoding.
542  *
543  * Windows, of course, resolutely does things its own way; on that
544  * platform LC_CTYPE has to match LC_MONETARY/LC_NUMERIC to get sane
545  * results. Hence, we must temporarily set that category as well.
546  */
547 
548  /* Save prevailing value of ctype locale */
549  save_lc_ctype = setlocale(LC_CTYPE, NULL);
550  if (!save_lc_ctype)
551  elog(ERROR, "setlocale(NULL) failed");
552  save_lc_ctype = pstrdup(save_lc_ctype);
553 
554  /* Here begins the critical section where we must not throw error */
555 
556  /* use numeric to set the ctype */
557  setlocale(LC_CTYPE, locale_numeric);
558 #endif
559 
560  /* Get formatting information for numeric */
561  setlocale(LC_NUMERIC, locale_numeric);
562  extlconv = localeconv();
563 
564  /* Must copy data now in case setlocale() overwrites it */
565  worklconv.decimal_point = strdup(extlconv->decimal_point);
566  worklconv.thousands_sep = strdup(extlconv->thousands_sep);
567  worklconv.grouping = strdup(extlconv->grouping);
568 
569 #ifdef WIN32
570  /* use monetary to set the ctype */
571  setlocale(LC_CTYPE, locale_monetary);
572 #endif
573 
574  /* Get formatting information for monetary */
575  setlocale(LC_MONETARY, locale_monetary);
576  extlconv = localeconv();
577 
578  /* Must copy data now in case setlocale() overwrites it */
579  worklconv.int_curr_symbol = strdup(extlconv->int_curr_symbol);
580  worklconv.currency_symbol = strdup(extlconv->currency_symbol);
581  worklconv.mon_decimal_point = strdup(extlconv->mon_decimal_point);
582  worklconv.mon_thousands_sep = strdup(extlconv->mon_thousands_sep);
583  worklconv.mon_grouping = strdup(extlconv->mon_grouping);
584  worklconv.positive_sign = strdup(extlconv->positive_sign);
585  worklconv.negative_sign = strdup(extlconv->negative_sign);
586  /* Copy scalar fields as well */
587  worklconv.int_frac_digits = extlconv->int_frac_digits;
588  worklconv.frac_digits = extlconv->frac_digits;
589  worklconv.p_cs_precedes = extlconv->p_cs_precedes;
590  worklconv.p_sep_by_space = extlconv->p_sep_by_space;
591  worklconv.n_cs_precedes = extlconv->n_cs_precedes;
592  worklconv.n_sep_by_space = extlconv->n_sep_by_space;
593  worklconv.p_sign_posn = extlconv->p_sign_posn;
594  worklconv.n_sign_posn = extlconv->n_sign_posn;
595 
596  /*
597  * Restore the prevailing locale settings; failure to do so is fatal.
598  * Possibly we could limp along with nondefault LC_MONETARY or LC_NUMERIC,
599  * but proceeding with the wrong value of LC_CTYPE would certainly be bad
600  * news; and considering that the prevailing LC_MONETARY and LC_NUMERIC
601  * are almost certainly "C", there's really no reason that restoring those
602  * should fail.
603  */
604 #ifdef WIN32
605  if (!setlocale(LC_CTYPE, save_lc_ctype))
606  elog(FATAL, "failed to restore LC_CTYPE to \"%s\"", save_lc_ctype);
607 #endif
608  if (!setlocale(LC_MONETARY, save_lc_monetary))
609  elog(FATAL, "failed to restore LC_MONETARY to \"%s\"", save_lc_monetary);
610  if (!setlocale(LC_NUMERIC, save_lc_numeric))
611  elog(FATAL, "failed to restore LC_NUMERIC to \"%s\"", save_lc_numeric);
612 
613  /*
614  * At this point we've done our best to clean up, and can call functions
615  * that might possibly throw errors with a clean conscience. But let's
616  * make sure we don't leak any already-strdup'd fields in worklconv.
617  */
618  PG_TRY();
619  {
620  int encoding;
621 
622  /* Release the pstrdup'd locale names */
623  pfree(save_lc_monetary);
624  pfree(save_lc_numeric);
625 #ifdef WIN32
626  pfree(save_lc_ctype);
627 #endif
628 
629  /* If any of the preceding strdup calls failed, complain now. */
630  if (!struct_lconv_is_valid(&worklconv))
631  ereport(ERROR,
632  (errcode(ERRCODE_OUT_OF_MEMORY),
633  errmsg("out of memory")));
634 
635  /*
636  * Now we must perform encoding conversion from whatever's associated
637  * with the locales into the database encoding. If we can't identify
638  * the encoding implied by LC_NUMERIC or LC_MONETARY (ie we get -1),
639  * use PG_SQL_ASCII, which will result in just validating that the
640  * strings are OK in the database encoding.
641  */
643  if (encoding < 0)
645 
646  db_encoding_convert(encoding, &worklconv.decimal_point);
647  db_encoding_convert(encoding, &worklconv.thousands_sep);
648  /* grouping is not text and does not require conversion */
649 
651  if (encoding < 0)
653 
654  db_encoding_convert(encoding, &worklconv.int_curr_symbol);
655  db_encoding_convert(encoding, &worklconv.currency_symbol);
656  db_encoding_convert(encoding, &worklconv.mon_decimal_point);
657  db_encoding_convert(encoding, &worklconv.mon_thousands_sep);
658  /* mon_grouping is not text and does not require conversion */
659  db_encoding_convert(encoding, &worklconv.positive_sign);
660  db_encoding_convert(encoding, &worklconv.negative_sign);
661  }
662  PG_CATCH();
663  {
664  free_struct_lconv(&worklconv);
665  PG_RE_THROW();
666  }
667  PG_END_TRY();
668 
669  /*
670  * Everything is good, so save the results.
671  */
672  CurrentLocaleConv = worklconv;
673  CurrentLocaleConvAllocated = true;
674  CurrentLocaleConvValid = true;
675  return &CurrentLocaleConv;
676 }
677 
#ifdef WIN32
/*
 * On Windows, strftime() returns its output in encoding CP_ACP (the default
 * operating system codepage for the computer), which is likely different
 * from SERVER_ENCODING.  This is especially important in Japanese versions
 * of Windows which will use SJIS encoding, which we don't support as a
 * server encoding.
 *
 * So, instead of using strftime(), use wcsftime() to return the value in
 * wide characters (internally UTF16) and then convert to UTF8, which we
 * know how to handle directly.
 *
 * Note that this only affects the calls to strftime() in this file, which are
 * used to get the locale-aware strings.  Other parts of the backend use
 * pg_strftime(), which isn't locale-aware and does not need to be replaced.
 *
 * dst/dstlen: output buffer and its size in bytes; on success the result is
 *		NUL-terminated UTF8.
 * format: plain-ASCII strftime format string, short enough to fit wformat.
 * tm: broken-down time to format.
 *
 * Returns the number of bytes stored (excluding the terminator), or 0 if
 * wcsftime() failed, in which case the contents of dst are unspecified.
 * Conversion failures are reported with elog(ERROR).
 */
static size_t
strftime_win32(char *dst, size_t dstlen,
			   const char *format, const struct tm *tm)
{
	size_t		len;
	wchar_t		wformat[8];		/* formats used below need 3 chars */
	wchar_t		wbuf[MAX_L10N_DATA];

	/*
	 * Get a wchar_t version of the format string.  We only actually use
	 * plain-ASCII formats in this file, so we can say that they're UTF8.
	 */
	len = MultiByteToWideChar(CP_UTF8, 0, format, -1,
							  wformat, lengthof(wformat));
	if (len == 0)
		elog(ERROR, "could not convert format string from UTF-8: error code %lu",
			 GetLastError());

	len = wcsftime(wbuf, MAX_L10N_DATA, wformat, tm);
	if (len == 0)
	{
		/*
		 * wcsftime failed, possibly because the result would not fit in
		 * MAX_L10N_DATA.  Return 0 with the contents of dst unspecified.
		 */
		return 0;
	}

	/* leave room in dst for the terminator added below */
	len = WideCharToMultiByte(CP_UTF8, 0, wbuf, len, dst, dstlen - 1,
							  NULL, NULL);
	if (len == 0)
		elog(ERROR, "could not convert string to UTF-8: error code %lu",
			 GetLastError());

	dst[len] = '\0';

	return len;
}

/* redefine strftime() */
#define strftime(a,b,c,d) strftime_win32(a,b,c,d)
#endif							/* WIN32 */
736 
737 /*
738  * Subroutine for cache_locale_time().
739  * Convert the given string from encoding "encoding" to the database
740  * encoding, and store the result at *dst, replacing any previous value.
741  */
742 static void
743 cache_single_string(char **dst, const char *src, int encoding)
744 {
745  char *ptr;
746  char *olddst;
747 
748  /* Convert the string to the database encoding, or validate it's OK */
749  ptr = pg_any_to_server(src, strlen(src), encoding);
750 
751  /* Store the string in long-lived storage, replacing any previous value */
752  olddst = *dst;
754  if (olddst)
755  pfree(olddst);
756 
757  /* Might as well clean up any palloc'd conversion result, too */
758  if (ptr != src)
759  pfree(ptr);
760 }
761 
762 /*
763  * Update the lc_time localization cache variables if needed.
764  */
765 void
767 {
768  char buf[(2 * 7 + 2 * 12) * MAX_L10N_DATA];
769  char *bufptr;
770  time_t timenow;
771  struct tm *timeinfo;
772  bool strftimefail = false;
773  int encoding;
774  int i;
775  char *save_lc_time;
776 #ifdef WIN32
777  char *save_lc_ctype;
778 #endif
779 
780  /* did we do this already? */
781  if (CurrentLCTimeValid)
782  return;
783 
784  elog(DEBUG3, "cache_locale_time() executed; locale: \"%s\"", locale_time);
785 
786  /*
787  * As in PGLC_localeconv(), it's critical that we not throw error while
788  * libc's locale settings have nondefault values. Hence, we just call
789  * strftime() within the critical section, and then convert and save its
790  * results afterwards.
791  */
792 
793  /* Save prevailing value of time locale */
794  save_lc_time = setlocale(LC_TIME, NULL);
795  if (!save_lc_time)
796  elog(ERROR, "setlocale(NULL) failed");
797  save_lc_time = pstrdup(save_lc_time);
798 
799 #ifdef WIN32
800 
801  /*
802  * On Windows, it appears that wcsftime() internally uses LC_CTYPE, so we
803  * must set it here. This code looks the same as what PGLC_localeconv()
804  * does, but the underlying reason is different: this does NOT determine
805  * the encoding we'll get back from strftime_win32().
806  */
807 
808  /* Save prevailing value of ctype locale */
809  save_lc_ctype = setlocale(LC_CTYPE, NULL);
810  if (!save_lc_ctype)
811  elog(ERROR, "setlocale(NULL) failed");
812  save_lc_ctype = pstrdup(save_lc_ctype);
813 
814  /* use lc_time to set the ctype */
815  setlocale(LC_CTYPE, locale_time);
816 #endif
817 
818  setlocale(LC_TIME, locale_time);
819 
820  /* We use times close to current time as data for strftime(). */
821  timenow = time(NULL);
822  timeinfo = localtime(&timenow);
823 
824  /* Store the strftime results in MAX_L10N_DATA-sized portions of buf[] */
825  bufptr = buf;
826 
827  /*
828  * MAX_L10N_DATA is sufficient buffer space for every known locale, and
829  * POSIX defines no strftime() errors. (Buffer space exhaustion is not an
830  * error.) An implementation might report errors (e.g. ENOMEM) by
831  * returning 0 (or, less plausibly, a negative value) and setting errno.
832  * Report errno just in case the implementation did that, but clear it in
833  * advance of the calls so we don't emit a stale, unrelated errno.
834  */
835  errno = 0;
836 
837  /* localized days */
838  for (i = 0; i < 7; i++)
839  {
840  timeinfo->tm_wday = i;
841  if (strftime(bufptr, MAX_L10N_DATA, "%a", timeinfo) <= 0)
842  strftimefail = true;
843  bufptr += MAX_L10N_DATA;
844  if (strftime(bufptr, MAX_L10N_DATA, "%A", timeinfo) <= 0)
845  strftimefail = true;
846  bufptr += MAX_L10N_DATA;
847  }
848 
849  /* localized months */
850  for (i = 0; i < 12; i++)
851  {
852  timeinfo->tm_mon = i;
853  timeinfo->tm_mday = 1; /* make sure we don't have invalid date */
854  if (strftime(bufptr, MAX_L10N_DATA, "%b", timeinfo) <= 0)
855  strftimefail = true;
856  bufptr += MAX_L10N_DATA;
857  if (strftime(bufptr, MAX_L10N_DATA, "%B", timeinfo) <= 0)
858  strftimefail = true;
859  bufptr += MAX_L10N_DATA;
860  }
861 
862  /*
863  * Restore the prevailing locale settings; as in PGLC_localeconv(),
864  * failure to do so is fatal.
865  */
866 #ifdef WIN32
867  if (!setlocale(LC_CTYPE, save_lc_ctype))
868  elog(FATAL, "failed to restore LC_CTYPE to \"%s\"", save_lc_ctype);
869 #endif
870  if (!setlocale(LC_TIME, save_lc_time))
871  elog(FATAL, "failed to restore LC_TIME to \"%s\"", save_lc_time);
872 
873  /*
874  * At this point we've done our best to clean up, and can throw errors, or
875  * call functions that might throw errors, with a clean conscience.
876  */
877  if (strftimefail)
878  elog(ERROR, "strftime() failed: %m");
879 
880  /* Release the pstrdup'd locale names */
881  pfree(save_lc_time);
882 #ifdef WIN32
883  pfree(save_lc_ctype);
884 #endif
885 
886 #ifndef WIN32
887 
888  /*
889  * As in PGLC_localeconv(), we must convert strftime()'s output from the
890  * encoding implied by LC_TIME to the database encoding. If we can't
891  * identify the LC_TIME encoding, just perform encoding validation.
892  */
894  if (encoding < 0)
896 
897 #else
898 
899  /*
900  * On Windows, strftime_win32() always returns UTF8 data, so convert from
901  * that if necessary.
902  */
903  encoding = PG_UTF8;
904 
905 #endif /* WIN32 */
906 
907  bufptr = buf;
908 
909  /* localized days */
910  for (i = 0; i < 7; i++)
911  {
913  bufptr += MAX_L10N_DATA;
915  bufptr += MAX_L10N_DATA;
916  }
917  localized_abbrev_days[7] = NULL;
918  localized_full_days[7] = NULL;
919 
920  /* localized months */
921  for (i = 0; i < 12; i++)
922  {
924  bufptr += MAX_L10N_DATA;
926  bufptr += MAX_L10N_DATA;
927  }
928  localized_abbrev_months[12] = NULL;
929  localized_full_months[12] = NULL;
930 
931  CurrentLCTimeValid = true;
932 }
933 
934 
935 #if defined(WIN32) && defined(LC_MESSAGES)
936 /*
937  * Convert a Windows setlocale() argument to a Unix-style one.
938  *
939  * Regardless of platform, we install message catalogs under a Unix-style
940  * LL[_CC][.ENCODING][@VARIANT] naming convention. Only LC_MESSAGES settings
941  * following that style will elicit localized interface strings.
942  *
943  * Before Visual Studio 2012 (msvcr110.dll), Windows setlocale() accepted "C"
944  * (but not "c") and strings of the form <Language>[_<Country>][.<CodePage>],
945  * case-insensitive. setlocale() returns the fully-qualified form; for
946  * example, setlocale("thaI") returns "Thai_Thailand.874". Internally,
947  * setlocale() and _create_locale() select a "locale identifier"[1] and store
948  * it in an undocumented _locale_t field. From that LCID, we can retrieve the
949  * ISO 639 language and the ISO 3166 country. Character encoding does not
950  * matter, because the server and client encodings govern that.
951  *
952  * Windows Vista introduced the "locale name" concept[2], closely following
953  * RFC 4646. Locale identifiers are now deprecated. Starting with Visual
954  * Studio 2012, setlocale() accepts locale names in addition to the strings it
955  * accepted historically. It does not standardize them; setlocale("Th-tH")
956  * returns "Th-tH". setlocale(category, "") still returns a traditional
957  * string. Furthermore, msvcr110.dll changed the undocumented _locale_t
958  * content to carry locale names instead of locale identifiers.
959  *
960  * Visual Studio 2015 should still be able to do the same as Visual Studio
961  * 2012, but the declaration of locale_name is missing in _locale_t, causing
962  * this code compilation to fail, hence this falls back instead on to
963  * enumerating all system locales by using EnumSystemLocalesEx to find the
964  * required locale name. If the input argument is in Unix-style then we can
965  * get ISO Locale name directly by using GetLocaleInfoEx() with LCType as
966  * LOCALE_SNAME.
967  *
968  * MinGW headers declare _create_locale(), but msvcrt.dll lacks that symbol in
969  * releases before Windows 8. IsoLocaleName() always fails in a MinGW-built
970  * postgres.exe, so only Unix-style values of the lc_messages GUC can elicit
971  * localized messages. In particular, every lc_messages setting that initdb
972  * can select automatically will yield only C-locale messages. XXX This could
973  * be fixed by running the fully-qualified locale name through a lookup table.
974  *
975  * This function returns a pointer to a static buffer bearing the converted
976  * name or NULL if conversion fails.
977  *
978  * [1] https://docs.microsoft.com/en-us/windows/win32/intl/locale-identifiers
979  * [2] https://docs.microsoft.com/en-us/windows/win32/intl/locale-names
980  */
981 
982 #if defined(_MSC_VER)
983 
984 /*
985  * Callback function for EnumSystemLocalesEx() in get_iso_localename().
986  *
987  * This function enumerates all system locales, searching for one that matches
988  * an input with the format: <Language>[_<Country>], e.g.
989  * English[_United States]
990  *
991  * The input is a three wchar_t array as an LPARAM. The first element is the
992  * locale_name we want to match, the second element is an allocated buffer
993  * where the Unix-style locale is copied if a match is found, and the third
994  * element is the search status, 1 if a match was found, 0 otherwise.
995  */
996 static BOOL CALLBACK
997 search_locale_enum(LPWSTR pStr, DWORD dwFlags, LPARAM lparam)
998 {
999  wchar_t test_locale[LOCALE_NAME_MAX_LENGTH];
1000  wchar_t **argv;
1001 
1002  (void) (dwFlags);
1003 
1004  argv = (wchar_t **) lparam;
1005  *argv[2] = (wchar_t) 0;
1006 
1007  memset(test_locale, 0, sizeof(test_locale));
1008 
1009  /* Get the name of the <Language> in English */
1010  if (GetLocaleInfoEx(pStr, LOCALE_SENGLISHLANGUAGENAME,
1011  test_locale, LOCALE_NAME_MAX_LENGTH))
1012  {
1013  /*
1014  * If the enumerated locale does not have a hyphen ("en") OR the
1015  * lc_message input does not have an underscore ("English"), we only
1016  * need to compare the <Language> tags.
1017  */
1018  if (wcsrchr(pStr, '-') == NULL || wcsrchr(argv[0], '_') == NULL)
1019  {
1020  if (_wcsicmp(argv[0], test_locale) == 0)
1021  {
1022  wcscpy(argv[1], pStr);
1023  *argv[2] = (wchar_t) 1;
1024  return FALSE;
1025  }
1026  }
1027 
1028  /*
1029  * We have to compare a full <Language>_<Country> tag, so we append
1030  * the underscore and name of the country/region in English, e.g.
1031  * "English_United States".
1032  */
1033  else
1034  {
1035  size_t len;
1036 
1037  wcscat(test_locale, L"_");
1038  len = wcslen(test_locale);
1039  if (GetLocaleInfoEx(pStr, LOCALE_SENGLISHCOUNTRYNAME,
1040  test_locale + len,
1041  LOCALE_NAME_MAX_LENGTH - len))
1042  {
1043  if (_wcsicmp(argv[0], test_locale) == 0)
1044  {
1045  wcscpy(argv[1], pStr);
1046  *argv[2] = (wchar_t) 1;
1047  return FALSE;
1048  }
1049  }
1050  }
1051  }
1052 
1053  return TRUE;
1054 }
1055 
1056 /*
1057  * This function converts a Windows locale name to an ISO formatted version
1058  * for Visual Studio 2015 or greater.
1059  *
1060  * Returns NULL, if no valid conversion was found.
1061  */
1062 static char *
1063 get_iso_localename(const char *winlocname)
1064 {
1065  wchar_t wc_locale_name[LOCALE_NAME_MAX_LENGTH];
1066  wchar_t buffer[LOCALE_NAME_MAX_LENGTH];
1067  static char iso_lc_messages[LOCALE_NAME_MAX_LENGTH];
1068  char *period;
1069  int len;
1070  int ret_val;
1071 
1072  /*
1073  * Valid locales have the following syntax:
1074  * <Language>[_<Country>[.<CodePage>]]
1075  *
1076  * GetLocaleInfoEx can only take locale name without code-page and for the
1077  * purpose of this API the code-page doesn't matter.
1078  */
1079  period = strchr(winlocname, '.');
1080  if (period != NULL)
1081  len = period - winlocname;
1082  else
1083  len = pg_mbstrlen(winlocname);
1084 
1085  memset(wc_locale_name, 0, sizeof(wc_locale_name));
1086  memset(buffer, 0, sizeof(buffer));
1087  MultiByteToWideChar(CP_ACP, 0, winlocname, len, wc_locale_name,
1088  LOCALE_NAME_MAX_LENGTH);
1089 
1090  /*
1091  * If the lc_messages is already a Unix-style string, we have a direct
1092  * match with LOCALE_SNAME, e.g. en-US, en_US.
1093  */
1094  ret_val = GetLocaleInfoEx(wc_locale_name, LOCALE_SNAME, (LPWSTR) &buffer,
1095  LOCALE_NAME_MAX_LENGTH);
1096  if (!ret_val)
1097  {
1098  /*
1099  * Search for a locale in the system that matches language and country
1100  * name.
1101  */
1102  wchar_t *argv[3];
1103 
1104  argv[0] = wc_locale_name;
1105  argv[1] = buffer;
1106  argv[2] = (wchar_t *) &ret_val;
1107  EnumSystemLocalesEx(search_locale_enum, LOCALE_WINDOWS, (LPARAM) argv,
1108  NULL);
1109  }
1110 
1111  if (ret_val)
1112  {
1113  size_t rc;
1114  char *hyphen;
1115 
1116  /* Locale names use only ASCII, any conversion locale suffices. */
1117  rc = wchar2char(iso_lc_messages, buffer, sizeof(iso_lc_messages), NULL);
1118  if (rc == -1 || rc == sizeof(iso_lc_messages))
1119  return NULL;
1120 
1121  /*
1122  * Since the message catalogs sit on a case-insensitive filesystem, we
1123  * need not standardize letter case here. So long as we do not ship
1124  * message catalogs for which it would matter, we also need not
1125  * translate the script/variant portion, e.g. uz-Cyrl-UZ to
1126  * uz_UZ@cyrillic. Simply replace the hyphen with an underscore.
1127  */
1128  hyphen = strchr(iso_lc_messages, '-');
1129  if (hyphen)
1130  *hyphen = '_';
1131  return iso_lc_messages;
1132  }
1133 
1134  return NULL;
1135 }
1136 
1137 static char *
1138 IsoLocaleName(const char *winlocname)
1139 {
1140  static char iso_lc_messages[LOCALE_NAME_MAX_LENGTH];
1141 
1142  if (pg_strcasecmp("c", winlocname) == 0 ||
1143  pg_strcasecmp("posix", winlocname) == 0)
1144  {
1145  strcpy(iso_lc_messages, "C");
1146  return iso_lc_messages;
1147  }
1148  else
1149  return get_iso_localename(winlocname);
1150 }
1151 
1152 #else /* !defined(_MSC_VER) */
1153 
/*
 * Non-MSVC variant: locale name conversion is unavailable, so every request
 * is answered with NULL regardless of winlocname.
 */
static char *
IsoLocaleName(const char *winlocname)
{
	char	   *no_conversion = NULL;	/* Not supported on MinGW */

	(void) winlocname;
	return no_conversion;
}
1159 
1160 #endif /* defined(_MSC_VER) */
1161 
1162 #endif /* WIN32 && LC_MESSAGES */
1163 
1164 
1165 /*
1166  * Detect aging strxfrm() implementations that, in a subset of locales, write
1167  * past the specified buffer length. Affected users must update OS packages
1168  * before using PostgreSQL 9.5 or later.
1169  *
1170  * Assume that the bug can come and go from one postmaster startup to another
1171  * due to physical replication among diverse machines. Assume that the bug's
1172  * presence will not change during the life of a particular postmaster. Given
1173  * those assumptions, call this no less than once per postmaster startup per
1174  * LC_COLLATE setting used. No known-affected system offers strxfrm_l(), so
1175  * there is no need to consider pg_collation locales.
1176  */
1177 void
1179 {
1180  char buf[32];
1181  const int canary = 0x7F;
1182  bool ok = true;
1183 
1184  /*
1185  * Given a two-byte ASCII string and length limit 7, 8 or 9, Solaris 10
1186  * 05/08 returns 18 and modifies 10 bytes. It respects limits above or
1187  * below that range.
1188  *
1189  * The bug is present in Solaris 8 as well; it is absent in Solaris 10
1190  * 01/13 and Solaris 11.2. Affected locales include is_IS.ISO8859-1,
1191  * en_US.UTF-8, en_US.ISO8859-1, and ru_RU.KOI8-R. Unaffected locales
1192  * include de_DE.UTF-8, de_DE.ISO8859-1, zh_TW.UTF-8, and C.
1193  */
1194  buf[7] = canary;
1195  (void) strxfrm(buf, "ab", 7);
1196  if (buf[7] != canary)
1197  ok = false;
1198 
1199  /*
1200  * illumos bug #1594 was present in the source tree from 2010-10-11 to
1201  * 2012-02-01. Given an ASCII string of any length and length limit 1,
1202  * affected systems ignore the length limit and modify a number of bytes
1203  * one less than the return value. The problem inputs for this bug do not
1204  * overlap those for the Solaris bug, hence a distinct test.
1205  *
1206  * Affected systems include smartos-20110926T021612Z. Affected locales
1207  * include en_US.ISO8859-1 and en_US.UTF-8. Unaffected locales include C.
1208  */
1209  buf[1] = canary;
1210  (void) strxfrm(buf, "a", 1);
1211  if (buf[1] != canary)
1212  ok = false;
1213 
1214  if (!ok)
1215  ereport(ERROR,
1216  (errcode(ERRCODE_SYSTEM_ERROR),
1217  errmsg_internal("strxfrm(), in locale \"%s\", writes past the specified array length",
1218  setlocale(LC_COLLATE, NULL)),
1219  errhint("Apply system library package updates.")));
1220 }
1221 
1222 
1223 /*
1224  * Cache mechanism for collation information.
1225  *
1226  * We cache two flags: whether the collation's LC_COLLATE or LC_CTYPE is C
1227  * (or POSIX), so we can optimize a few code paths in various places.
1228  * For the built-in C and POSIX collations, we can know that without even
1229  * doing a cache lookup, but we want to support aliases for C/POSIX too.
1230  * For the "default" collation, there are separate static cache variables,
1231  * since consulting the pg_collation catalog doesn't tell us what we need.
1232  *
1233  * Also, if a pg_locale_t has been requested for a collation, we cache that
1234  * for the life of a backend.
1235  *
1236  * Note that some code relies on the flags not reporting false negatives
1237  * (that is, saying it's not C when it is). For example, char2wchar()
1238  * could fail if the locale is C, so str_tolower() shouldn't call it
1239  * in that case.
1240  *
1241  * Note that we currently lack any way to flush the cache. Since we don't
1242  * support ALTER COLLATION, this is OK. The worst case is that someone
1243  * drops a collation, and a useless cache entry hangs around in existing
1244  * backends.
1245  */
1246 
1247 static collation_cache_entry *
1248 lookup_collation_cache(Oid collation, bool set_flags)
1249 {
1250  collation_cache_entry *cache_entry;
1251  bool found;
1252 
1253  Assert(OidIsValid(collation));
1254  Assert(collation != DEFAULT_COLLATION_OID);
1255 
1256  if (collation_cache == NULL)
1257  {
1258  /* First time through, initialize the hash table */
1259  HASHCTL ctl;
1260 
1261  ctl.keysize = sizeof(Oid);
1262  ctl.entrysize = sizeof(collation_cache_entry);
1263  collation_cache = hash_create("Collation cache", 100, &ctl,
1264  HASH_ELEM | HASH_BLOBS);
1265  }
1266 
1267  cache_entry = hash_search(collation_cache, &collation, HASH_ENTER, &found);
1268  if (!found)
1269  {
1270  /*
1271  * Make sure cache entry is marked invalid, in case we fail before
1272  * setting things.
1273  */
1274  cache_entry->flags_valid = false;
1275  cache_entry->locale = 0;
1276  }
1277 
1278  if (set_flags && !cache_entry->flags_valid)
1279  {
1280  /* Attempt to set the flags */
1281  HeapTuple tp;
1282  Form_pg_collation collform;
1283 
1284  tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collation));
1285  if (!HeapTupleIsValid(tp))
1286  elog(ERROR, "cache lookup failed for collation %u", collation);
1287  collform = (Form_pg_collation) GETSTRUCT(tp);
1288 
1289  if (collform->collprovider == COLLPROVIDER_LIBC)
1290  {
1291  Datum datum;
1292  const char *collcollate;
1293  const char *collctype;
1294 
1295  datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate);
1296  collcollate = TextDatumGetCString(datum);
1297  datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype);
1298  collctype = TextDatumGetCString(datum);
1299 
1300  cache_entry->collate_is_c = ((strcmp(collcollate, "C") == 0) ||
1301  (strcmp(collcollate, "POSIX") == 0));
1302  cache_entry->ctype_is_c = ((strcmp(collctype, "C") == 0) ||
1303  (strcmp(collctype, "POSIX") == 0));
1304  }
1305  else
1306  {
1307  cache_entry->collate_is_c = false;
1308  cache_entry->ctype_is_c = false;
1309  }
1310 
1311  cache_entry->flags_valid = true;
1312 
1313  ReleaseSysCache(tp);
1314  }
1315 
1316  return cache_entry;
1317 }
1318 
1319 
1320 /*
1321  * Detect whether collation's LC_COLLATE property is C
1322  */
1323 bool
1325 {
1326  /*
1327  * If we're asked about "collation 0", return false, so that the code will
1328  * go into the non-C path and report that the collation is bogus.
1329  */
1330  if (!OidIsValid(collation))
1331  return false;
1332 
1333  /*
1334  * If we're asked about the default collation, we have to inquire of the C
1335  * library. Cache the result so we only have to compute it once.
1336  */
1337  if (collation == DEFAULT_COLLATION_OID)
1338  {
1339  static int result = -1;
1340  char *localeptr;
1341 
1342  if (default_locale.provider == COLLPROVIDER_ICU)
1343  return false;
1344 
1345  if (result >= 0)
1346  return (bool) result;
1347  localeptr = setlocale(LC_COLLATE, NULL);
1348  if (!localeptr)
1349  elog(ERROR, "invalid LC_COLLATE setting");
1350 
1351  if (strcmp(localeptr, "C") == 0)
1352  result = true;
1353  else if (strcmp(localeptr, "POSIX") == 0)
1354  result = true;
1355  else
1356  result = false;
1357  return (bool) result;
1358  }
1359 
1360  /*
1361  * If we're asked about the built-in C/POSIX collations, we know that.
1362  */
1363  if (collation == C_COLLATION_OID ||
1364  collation == POSIX_COLLATION_OID)
1365  return true;
1366 
1367  /*
1368  * Otherwise, we have to consult pg_collation, but we cache that.
1369  */
1370  return (lookup_collation_cache(collation, true))->collate_is_c;
1371 }
1372 
1373 /*
1374  * Detect whether collation's LC_CTYPE property is C
1375  */
1376 bool
1377 lc_ctype_is_c(Oid collation)
1378 {
1379  /*
1380  * If we're asked about "collation 0", return false, so that the code will
1381  * go into the non-C path and report that the collation is bogus.
1382  */
1383  if (!OidIsValid(collation))
1384  return false;
1385 
1386  /*
1387  * If we're asked about the default collation, we have to inquire of the C
1388  * library. Cache the result so we only have to compute it once.
1389  */
1390  if (collation == DEFAULT_COLLATION_OID)
1391  {
1392  static int result = -1;
1393  char *localeptr;
1394 
1395  if (default_locale.provider == COLLPROVIDER_ICU)
1396  return false;
1397 
1398  if (result >= 0)
1399  return (bool) result;
1400  localeptr = setlocale(LC_CTYPE, NULL);
1401  if (!localeptr)
1402  elog(ERROR, "invalid LC_CTYPE setting");
1403 
1404  if (strcmp(localeptr, "C") == 0)
1405  result = true;
1406  else if (strcmp(localeptr, "POSIX") == 0)
1407  result = true;
1408  else
1409  result = false;
1410  return (bool) result;
1411  }
1412 
1413  /*
1414  * If we're asked about the built-in C/POSIX collations, we know that.
1415  */
1416  if (collation == C_COLLATION_OID ||
1417  collation == POSIX_COLLATION_OID)
1418  return true;
1419 
1420  /*
1421  * Otherwise, we have to consult pg_collation, but we cache that.
1422  */
1423  return (lookup_collation_cache(collation, true))->ctype_is_c;
1424 }
1425 
1427 
1428 void
1429 make_icu_collator(const char *iculocstr,
1430  const char *icurules,
1431  struct pg_locale_struct *resultp)
1432 {
1433 #ifdef USE_ICU
1434  UCollator *collator;
1435 
1436  collator = pg_ucol_open(iculocstr);
1437 
1438  /*
1439  * If rules are specified, we extract the rules of the standard collation,
1440  * add our own rules, and make a new collator with the combined rules.
1441  */
1442  if (icurules)
1443  {
1444  const UChar *default_rules;
1445  UChar *agg_rules;
1446  UChar *my_rules;
1447  UErrorCode status;
1448  int32_t length;
1449 
1450  default_rules = ucol_getRules(collator, &length);
1451  icu_to_uchar(&my_rules, icurules, strlen(icurules));
1452 
1453  agg_rules = palloc_array(UChar, u_strlen(default_rules) + u_strlen(my_rules) + 1);
1454  u_strcpy(agg_rules, default_rules);
1455  u_strcat(agg_rules, my_rules);
1456 
1457  ucol_close(collator);
1458 
1459  status = U_ZERO_ERROR;
1460  collator = ucol_openRules(agg_rules, u_strlen(agg_rules),
1461  UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH, NULL, &status);
1462  if (U_FAILURE(status))
1463  ereport(ERROR,
1464  (errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
1465  iculocstr, icurules, u_errorName(status))));
1466  }
1467 
1468  /* We will leak this string if the caller errors later :-( */
1469  resultp->info.icu.locale = MemoryContextStrdup(TopMemoryContext, iculocstr);
1470  resultp->info.icu.ucol = collator;
1471 #else /* not USE_ICU */
1472  /* could get here if a collation was created by a build with ICU */
1473  ereport(ERROR,
1474  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1475  errmsg("ICU is not supported in this build")));
1476 #endif /* not USE_ICU */
1477 }
1478 
1479 
/* helper that raises the ERROR for a failed newlocale()/_create_locale() */
#ifdef HAVE_LOCALE_T
static void
report_newlocale_failure(const char *localename)
{
	int			save_errno;

	/*
	 * _create_locale() on Windows gives no useful error indication, and
	 * BSD-derived platforms don't reliably set errno either (although POSIX
	 * says newlocale should).  Treat an unset errno as ENOENT.
	 *
	 * Keep a private copy as well, since the auxiliary functions called
	 * while building the report might change errno.
	 */
	save_errno = (errno != 0) ? errno : ENOENT;
	errno = save_errno;

	/*
	 * ENOENT here means "no such locale", not "no such file"; the errdetail
	 * spells that out.
	 */
	ereport(ERROR,
			(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
			 errmsg("could not create locale \"%s\": %m",
					localename),
			 (save_errno == ENOENT ?
			  errdetail("The operating system could not find any locale data for the locale name \"%s\".",
						localename) : 0)));
}
#endif							/* HAVE_LOCALE_T */
1511 
1512 bool
1514 {
1515  /* default locale must always be deterministic */
1516  if (locale == NULL)
1517  return true;
1518  else
1519  return locale->deterministic;
1520 }
1521 
1522 /*
1523  * Create a locale_t from a collation OID. Results are cached for the
1524  * lifetime of the backend. Thus, do not free the result with freelocale().
1525  *
1526  * As a special optimization, the default/database collation returns 0.
1527  * Callers should then revert to the non-locale_t-enabled code path.
1528  * Also, callers should avoid calling this before going down a C/POSIX
1529  * fastpath, because such a fastpath should work even on platforms without
1530  * locale_t support in the C library.
1531  *
1532  * For simplicity, we always generate COLLATE + CTYPE even though we
1533  * might only need one of them. Since this is called only once per session,
1534  * it shouldn't cost much.
1535  */
1538 {
1539  collation_cache_entry *cache_entry;
1540 
1541  /* Callers must pass a valid OID */
1543 
1544  if (collid == DEFAULT_COLLATION_OID)
1545  {
1546  if (default_locale.provider == COLLPROVIDER_ICU)
1547  return &default_locale;
1548  else
1549  return (pg_locale_t) 0;
1550  }
1551 
1552  cache_entry = lookup_collation_cache(collid, false);
1553 
1554  if (cache_entry->locale == 0)
1555  {
1556  /* We haven't computed this yet in this session, so do it */
1557  HeapTuple tp;
1558  Form_pg_collation collform;
1559  struct pg_locale_struct result;
1560  pg_locale_t resultp;
1561  Datum datum;
1562  bool isnull;
1563 
1565  if (!HeapTupleIsValid(tp))
1566  elog(ERROR, "cache lookup failed for collation %u", collid);
1567  collform = (Form_pg_collation) GETSTRUCT(tp);
1568 
1569  /* We'll fill in the result struct locally before allocating memory */
1570  memset(&result, 0, sizeof(result));
1571  result.provider = collform->collprovider;
1572  result.deterministic = collform->collisdeterministic;
1573 
1574  if (collform->collprovider == COLLPROVIDER_LIBC)
1575  {
1576 #ifdef HAVE_LOCALE_T
1577  const char *collcollate;
1578  const char *collctype pg_attribute_unused();
1579  locale_t loc;
1580 
1581  datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate);
1582  collcollate = TextDatumGetCString(datum);
1583  datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype);
1584  collctype = TextDatumGetCString(datum);
1585 
1586  if (strcmp(collcollate, collctype) == 0)
1587  {
1588  /* Normal case where they're the same */
1589  errno = 0;
1590 #ifndef WIN32
1591  loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collcollate,
1592  NULL);
1593 #else
1594  loc = _create_locale(LC_ALL, collcollate);
1595 #endif
1596  if (!loc)
1597  report_newlocale_failure(collcollate);
1598  }
1599  else
1600  {
1601 #ifndef WIN32
1602  /* We need two newlocale() steps */
1603  locale_t loc1;
1604 
1605  errno = 0;
1606  loc1 = newlocale(LC_COLLATE_MASK, collcollate, NULL);
1607  if (!loc1)
1608  report_newlocale_failure(collcollate);
1609  errno = 0;
1610  loc = newlocale(LC_CTYPE_MASK, collctype, loc1);
1611  if (!loc)
1612  report_newlocale_failure(collctype);
1613 #else
1614 
1615  /*
1616  * XXX The _create_locale() API doesn't appear to support
1617  * this. Could perhaps be worked around by changing
1618  * pg_locale_t to contain two separate fields.
1619  */
1620  ereport(ERROR,
1621  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1622  errmsg("collations with different collate and ctype values are not supported on this platform")));
1623 #endif
1624  }
1625 
1626  result.info.lt = loc;
1627 #else /* not HAVE_LOCALE_T */
1628  /* platform that doesn't support locale_t */
1629  ereport(ERROR,
1630  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1631  errmsg("collation provider LIBC is not supported on this platform")));
1632 #endif /* not HAVE_LOCALE_T */
1633  }
1634  else if (collform->collprovider == COLLPROVIDER_ICU)
1635  {
1636  const char *iculocstr;
1637  const char *icurules;
1638 
1639  datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colliculocale);
1640  iculocstr = TextDatumGetCString(datum);
1641 
1642  datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, &isnull);
1643  if (!isnull)
1644  icurules = TextDatumGetCString(datum);
1645  else
1646  icurules = NULL;
1647 
1648  make_icu_collator(iculocstr, icurules, &result);
1649  }
1650 
1651  datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion,
1652  &isnull);
1653  if (!isnull)
1654  {
1655  char *actual_versionstr;
1656  char *collversionstr;
1657 
1658  collversionstr = TextDatumGetCString(datum);
1659 
1660  datum = SysCacheGetAttrNotNull(COLLOID, tp, collform->collprovider == COLLPROVIDER_ICU ? Anum_pg_collation_colliculocale : Anum_pg_collation_collcollate);
1661 
1662  actual_versionstr = get_collation_actual_version(collform->collprovider,
1663  TextDatumGetCString(datum));
1664  if (!actual_versionstr)
1665  {
1666  /*
1667  * This could happen when specifying a version in CREATE
1668  * COLLATION but the provider does not support versioning, or
1669  * manually creating a mess in the catalogs.
1670  */
1671  ereport(ERROR,
1672  (errmsg("collation \"%s\" has no actual version, but a version was recorded",
1673  NameStr(collform->collname))));
1674  }
1675 
1676  if (strcmp(actual_versionstr, collversionstr) != 0)
1677  ereport(WARNING,
1678  (errmsg("collation \"%s\" has version mismatch",
1679  NameStr(collform->collname)),
1680  errdetail("The collation in the database was created using version %s, "
1681  "but the operating system provides version %s.",
1682  collversionstr, actual_versionstr),
1683  errhint("Rebuild all objects affected by this collation and run "
1684  "ALTER COLLATION %s REFRESH VERSION, "
1685  "or build PostgreSQL with the right library version.",
1686  quote_qualified_identifier(get_namespace_name(collform->collnamespace),
1687  NameStr(collform->collname)))));
1688  }
1689 
1690  ReleaseSysCache(tp);
1691 
1692  /* We'll keep the pg_locale_t structures in TopMemoryContext */
1693  resultp = MemoryContextAlloc(TopMemoryContext, sizeof(*resultp));
1694  *resultp = result;
1695 
1696  cache_entry->locale = resultp;
1697  }
1698 
1699  return cache_entry->locale;
1700 }
1701 
1702 /*
1703  * Get provider-specific collation version string for the given collation from
1704  * the operating system/library.
1705  */
1706 char *
1707 get_collation_actual_version(char collprovider, const char *collcollate)
1708 {
1709  char *collversion = NULL;
1710 
1711 #ifdef USE_ICU
1712  if (collprovider == COLLPROVIDER_ICU)
1713  {
1714  UCollator *collator;
1715  UVersionInfo versioninfo;
1716  char buf[U_MAX_VERSION_STRING_LENGTH];
1717 
1718  collator = pg_ucol_open(collcollate);
1719 
1720  ucol_getVersion(collator, versioninfo);
1721  ucol_close(collator);
1722 
1723  u_versionToString(versioninfo, buf);
1724  collversion = pstrdup(buf);
1725  }
1726  else
1727 #endif
1728  if (collprovider == COLLPROVIDER_LIBC &&
1729  pg_strcasecmp("C", collcollate) != 0 &&
1730  pg_strncasecmp("C.", collcollate, 2) != 0 &&
1731  pg_strcasecmp("POSIX", collcollate) != 0)
1732  {
1733 #if defined(__GLIBC__)
1734  /* Use the glibc version because we don't have anything better. */
1735  collversion = pstrdup(gnu_get_libc_version());
1736 #elif defined(LC_VERSION_MASK)
1737  locale_t loc;
1738 
1739  /* Look up FreeBSD collation version. */
1740  loc = newlocale(LC_COLLATE, collcollate, NULL);
1741  if (loc)
1742  {
1743  collversion =
1744  pstrdup(querylocale(LC_COLLATE_MASK | LC_VERSION_MASK, loc));
1745  freelocale(loc);
1746  }
1747  else
1748  ereport(ERROR,
1749  (errmsg("could not load locale \"%s\"", collcollate)));
1750 #elif defined(WIN32)
1751  /*
1752  * If we are targeting Windows Vista and above, we can ask for a name
1753  * given a collation name (earlier versions required a location code
1754  * that we don't have).
1755  */
1756  NLSVERSIONINFOEX version = {sizeof(NLSVERSIONINFOEX)};
1757  WCHAR wide_collcollate[LOCALE_NAME_MAX_LENGTH];
1758 
1759  MultiByteToWideChar(CP_ACP, 0, collcollate, -1, wide_collcollate,
1760  LOCALE_NAME_MAX_LENGTH);
1761  if (!GetNLSVersionEx(COMPARE_STRING, wide_collcollate, &version))
1762  {
1763  /*
1764  * GetNLSVersionEx() wants a language tag such as "en-US", not a
1765  * locale name like "English_United States.1252". Until those
1766  * values can be prevented from entering the system, or 100%
1767  * reliably converted to the more useful tag format, tolerate the
1768  * resulting error and report that we have no version data.
1769  */
1770  if (GetLastError() == ERROR_INVALID_PARAMETER)
1771  return NULL;
1772 
1773  ereport(ERROR,
1774  (errmsg("could not get collation version for locale \"%s\": error code %lu",
1775  collcollate,
1776  GetLastError())));
1777  }
1778  collversion = psprintf("%lu.%lu,%lu.%lu",
1779  (version.dwNLSVersion >> 8) & 0xFFFF,
1780  version.dwNLSVersion & 0xFF,
1781  (version.dwDefinedVersion >> 8) & 0xFFFF,
1782  version.dwDefinedVersion & 0xFF);
1783 #endif
1784  }
1785 
1786  return collversion;
1787 }
1788 
/*
 * pg_strncoll_libc_win32_utf8
 *
 * Win32 does not have UTF-8.  Convert UTF8 arguments to wide characters and
 * invoke wcscoll() or wcscoll_l().
 *
 * arg1/len1 and arg2/len2 are the strings to compare, given with explicit
 * byte lengths.  If locale is NULL, wcscoll() is used; otherwise
 * wcscoll_l() with the locale's locale_t.  Returns the comparison result;
 * raises an ERROR if conversion or comparison fails.
 */
#ifdef WIN32
static int
pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
							size_t len2, pg_locale_t locale)
{
	char		sbuf[TEXTBUFLEN];
	char	   *buf = sbuf;
	char	   *a1p,
			   *a2p;
	int			a1len = len1 * 2 + 2;	/* room for len1 WCHARs + terminator */
	int			a2len = len2 * 2 + 2;
	int			r;
	int			result;

	Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
	/* The callers in this file only come here for UTF-8 databases */
	Assert(GetDatabaseEncoding() == PG_UTF8);

	/* Use the stack buffer when both converted strings fit in it */
	if (a1len + a2len > TEXTBUFLEN)
		buf = palloc(a1len + a2len);

	a1p = buf;
	a2p = buf + a1len;

	/* API does not work for zero-length input */
	if (len1 == 0)
		r = 0;
	else
	{
		r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
								(LPWSTR) a1p, a1len / 2);
		if (!r)
			ereport(ERROR,
					(errmsg("could not convert string to UTF-16: error code %lu",
							GetLastError())));
	}
	((LPWSTR) a1p)[r] = 0;

	if (len2 == 0)
		r = 0;
	else
	{
		r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
								(LPWSTR) a2p, a2len / 2);
		if (!r)
			ereport(ERROR,
					(errmsg("could not convert string to UTF-16: error code %lu",
							GetLastError())));
	}
	((LPWSTR) a2p)[r] = 0;

	/* Clear errno so that %m below reflects this comparison only */
	errno = 0;
#ifdef HAVE_LOCALE_T
	if (locale)
		result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt);
	else
#endif
		result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
	if (result == 2147483647)	/* _NLSCMPERROR; missing from mingw
								 * headers */
		ereport(ERROR,
				(errmsg("could not compare Unicode strings: %m")));

	if (buf != sbuf)
		pfree(buf);

	return result;
}
#endif							/* WIN32 */
1866 
1867 /*
1868  * pg_strcoll_libc
1869  *
1870  * Call strcoll(), strcoll_l(), wcscoll(), or wcscoll_l() as appropriate for
1871  * the given locale, platform, and database encoding. If the locale is NULL,
1872  * use the database collation.
1873  *
1874  * Arguments must be encoded in the database encoding and nul-terminated.
1875  */
1876 static int
1877 pg_strcoll_libc(const char *arg1, const char *arg2, pg_locale_t locale)
1878 {
1879  int result;
1880 
1881  Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
1882 #ifdef WIN32
1883  if (GetDatabaseEncoding() == PG_UTF8)
1884  {
1885  size_t len1 = strlen(arg1);
1886  size_t len2 = strlen(arg2);
1887  result = pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
1888  }
1889  else
1890 #endif /* WIN32 */
1891  if (locale)
1892  {
1893 #ifdef HAVE_LOCALE_T
1894  result = strcoll_l(arg1, arg2, locale->info.lt);
1895 #else
1896  /* shouldn't happen */
1897  elog(ERROR, "unsupported collprovider: %c", locale->provider);
1898 #endif
1899  }
1900  else
1901  result = strcoll(arg1, arg2);
1902 
1903  return result;
1904 }
1905 
1906 /*
1907  * pg_strncoll_libc
1908  *
1909  * Nul-terminate the arguments and call pg_strcoll_libc().
1910  */
1911 static int
1912 pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
1914 {
1915  char sbuf[TEXTBUFLEN];
1916  char *buf = sbuf;
1917  size_t bufsize1 = len1 + 1;
1918  size_t bufsize2 = len2 + 1;
1919  char *arg1n;
1920  char *arg2n;
1921  int result;
1922 
1923  Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
1924 
1925 #ifdef WIN32
1926  /* check for this case before doing the work for nul-termination */
1927  if (GetDatabaseEncoding() == PG_UTF8)
1928  return pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
1929 #endif /* WIN32 */
1930 
1931  if (bufsize1 + bufsize2 > TEXTBUFLEN)
1932  buf = palloc(bufsize1 + bufsize2);
1933 
1934  arg1n = buf;
1935  arg2n = buf + bufsize1;
1936 
1937  /* nul-terminate arguments */
1938  memcpy(arg1n, arg1, len1);
1939  arg1n[len1] = '\0';
1940  memcpy(arg2n, arg2, len2);
1941  arg2n[len2] = '\0';
1942 
1943  result = pg_strcoll_libc(arg1n, arg2n, locale);
1944 
1945  if (buf != sbuf)
1946  pfree(buf);
1947 
1948  return result;
1949 }
1950 
1951 #ifdef USE_ICU
1952 
1953 /*
1954  * pg_strncoll_icu_no_utf8
1955  *
1956  * Convert the arguments from the database encoding to UChar strings, then
1957  * call ucol_strcoll(). An argument length of -1 means that the string is
1958  * NUL-terminated.
1959  *
1960  * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
1961  * caller should call that instead.
1962  */
1963 static int
1964 pg_strncoll_icu_no_utf8(const char *arg1, int32_t len1,
1965  const char *arg2, int32_t len2, pg_locale_t locale)
1966 {
1967  char sbuf[TEXTBUFLEN];
1968  char *buf = sbuf;
1969  int32_t ulen1;
1970  int32_t ulen2;
1971  size_t bufsize1;
1972  size_t bufsize2;
1973  UChar *uchar1,
1974  *uchar2;
1975  int result;
1976 
1977  Assert(locale->provider == COLLPROVIDER_ICU);
1978 #ifdef HAVE_UCOL_STRCOLLUTF8
1980 #endif
1981 
1982  init_icu_converter();
1983 
1984  ulen1 = uchar_length(icu_converter, arg1, len1);
1985  ulen2 = uchar_length(icu_converter, arg2, len2);
1986 
1987  bufsize1 = (ulen1 + 1) * sizeof(UChar);
1988  bufsize2 = (ulen2 + 1) * sizeof(UChar);
1989 
1990  if (bufsize1 + bufsize2 > TEXTBUFLEN)
1991  buf = palloc(bufsize1 + bufsize2);
1992 
1993  uchar1 = (UChar *) buf;
1994  uchar2 = (UChar *) (buf + bufsize1);
1995 
1996  ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
1997  ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
1998 
1999  result = ucol_strcoll(locale->info.icu.ucol,
2000  uchar1, ulen1,
2001  uchar2, ulen2);
2002 
2003  if (buf != sbuf)
2004  pfree(buf);
2005 
2006  return result;
2007 }
2008 
2009 /*
2010  * pg_strncoll_icu
2011  *
2012  * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given
2013  * database encoding. An argument length of -1 means the string is
2014  * NUL-terminated.
2015  *
2016  * Arguments must be encoded in the database encoding.
2017  */
2018 static int
2019 pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2,
2021 {
2022  int result;
2023 
2024  Assert(locale->provider == COLLPROVIDER_ICU);
2025 
2026 #ifdef HAVE_UCOL_STRCOLLUTF8
2027  if (GetDatabaseEncoding() == PG_UTF8)
2028  {
2029  UErrorCode status;
2030 
2031  status = U_ZERO_ERROR;
2032  result = ucol_strcollUTF8(locale->info.icu.ucol,
2033  arg1, len1,
2034  arg2, len2,
2035  &status);
2036  if (U_FAILURE(status))
2037  ereport(ERROR,
2038  (errmsg("collation failed: %s", u_errorName(status))));
2039  }
2040  else
2041 #endif
2042  {
2043  result = pg_strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale);
2044  }
2045 
2046  return result;
2047 }
2048 
2049 #endif /* USE_ICU */
2050 
2051 /*
2052  * pg_strcoll
2053  *
2054  * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll(), strcoll_l(), wcscoll(),
2055  * or wcscoll_l() as appropriate for the given locale, platform, and database
2056  * encoding. If the locale is not specified, use the database collation.
2057  *
2058  * Arguments must be encoded in the database encoding and nul-terminated.
2059  *
2060  * The caller is responsible for breaking ties if the collation is
2061  * deterministic; this maintains consistency with pg_strxfrm(), which cannot
2062  * easily account for deterministic collations.
2063  */
2064 int
2065 pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
2066 {
2067  int result;
2068 
2069  if (!locale || locale->provider == COLLPROVIDER_LIBC)
2070  result = pg_strcoll_libc(arg1, arg2, locale);
2071 #ifdef USE_ICU
2072  else if (locale->provider == COLLPROVIDER_ICU)
2073  result = pg_strncoll_icu(arg1, -1, arg2, -1, locale);
2074 #endif
2075  else
2076  /* shouldn't happen */
2077  elog(ERROR, "unsupported collprovider: %c", locale->provider);
2078 
2079  return result;
2080 }
2081 
2082 /*
2083  * pg_strncoll
2084  *
2085  * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll(), strcoll_l(), wcscoll(),
2086  * or wcscoll_l() as appropriate for the given locale, platform, and database
2087  * encoding. If the locale is not specified, use the database collation.
2088  *
2089  * Arguments must be encoded in the database encoding.
2090  *
2091  * This function may need to nul-terminate the arguments for libc functions;
2092  * so if the caller already has nul-terminated strings, it should call
2093  * pg_strcoll() instead.
2094  *
2095  * The caller is responsible for breaking ties if the collation is
2096  * deterministic; this maintains consistency with pg_strnxfrm(), which cannot
2097  * easily account for deterministic collations.
2098  */
2099 int
2100 pg_strncoll(const char *arg1, size_t len1, const char *arg2, size_t len2,
2102 {
2103  int result;
2104 
2105  if (!locale || locale->provider == COLLPROVIDER_LIBC)
2106  result = pg_strncoll_libc(arg1, len1, arg2, len2, locale);
2107 #ifdef USE_ICU
2108  else if (locale->provider == COLLPROVIDER_ICU)
2109  result = pg_strncoll_icu(arg1, len1, arg2, len2, locale);
2110 #endif
2111  else
2112  /* shouldn't happen */
2113  elog(ERROR, "unsupported collprovider: %c", locale->provider);
2114 
2115  return result;
2116 }
2117 
2118 
2119 static size_t
2120 pg_strxfrm_libc(char *dest, const char *src, size_t destsize,
2122 {
2123  Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
2124 
2125 #ifdef TRUST_STRXFRM
2126 #ifdef HAVE_LOCALE_T
2127  if (locale)
2128  return strxfrm_l(dest, src, destsize, locale->info.lt);
2129  else
2130 #endif
2131  return strxfrm(dest, src, destsize);
2132 #else
2133  /* shouldn't happen */
2134  elog(ERROR, "unsupported collprovider: %c", locale->provider);
2135  return 0; /* keep compiler quiet */
2136 #endif
2137 }
2138 
2139 static size_t
2140 pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize,
2142 {
2143  char sbuf[TEXTBUFLEN];
2144  char *buf = sbuf;
2145  size_t bufsize = srclen + 1;
2146  size_t result;
2147 
2148  Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
2149 
2150  if (bufsize > TEXTBUFLEN)
2151  buf = palloc(bufsize);
2152 
2153  /* nul-terminate arguments */
2154  memcpy(buf, src, srclen);
2155  buf[srclen] = '\0';
2156 
2157  result = pg_strxfrm_libc(dest, buf, destsize, locale);
2158 
2159  if (buf != sbuf)
2160  pfree(buf);
2161 
2162  /* if dest is defined, it should be nul-terminated */
2163  Assert(result >= destsize || dest[result] == '\0');
2164 
2165  return result;
2166 }
2167 
2168 #ifdef USE_ICU
2169 
2170 /* 'srclen' of -1 means the strings are NUL-terminated */
2171 static size_t
2172 pg_strnxfrm_icu(char *dest, const char *src, int32_t srclen, int32_t destsize,
2174 {
2175  char sbuf[TEXTBUFLEN];
2176  char *buf = sbuf;
2177  UChar *uchar;
2178  int32_t ulen;
2179  size_t uchar_bsize;
2180  Size result_bsize;
2181 
2182  Assert(locale->provider == COLLPROVIDER_ICU);
2183 
2184  init_icu_converter();
2185 
2186  ulen = uchar_length(icu_converter, src, srclen);
2187 
2188  uchar_bsize = (ulen + 1) * sizeof(UChar);
2189 
2190  if (uchar_bsize > TEXTBUFLEN)
2191  buf = palloc(uchar_bsize);
2192 
2193  uchar = (UChar *) buf;
2194 
2195  ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
2196 
2197  result_bsize = ucol_getSortKey(locale->info.icu.ucol,
2198  uchar, ulen,
2199  (uint8_t *) dest, destsize);
2200 
2201  /*
2202  * ucol_getSortKey() counts the nul-terminator in the result length, but
2203  * this function should not.
2204  */
2205  Assert(result_bsize > 0);
2206  result_bsize--;
2207 
2208  if (buf != sbuf)
2209  pfree(buf);
2210 
2211  /* if dest is defined, it should be nul-terminated */
2212  Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
2213 
2214  return result_bsize;
2215 }
2216 
2217 /* 'srclen' of -1 means the strings are NUL-terminated */
2218 static size_t
2219 pg_strnxfrm_prefix_icu_no_utf8(char *dest, const char *src, int32_t srclen,
2220  int32_t destsize, pg_locale_t locale)
2221 {
2222  char sbuf[TEXTBUFLEN];
2223  char *buf = sbuf;
2224  UCharIterator iter;
2225  uint32_t state[2];
2226  UErrorCode status;
2227  int32_t ulen = -1;
2228  UChar *uchar = NULL;
2229  size_t uchar_bsize;
2230  Size result_bsize;
2231 
2232  Assert(locale->provider == COLLPROVIDER_ICU);
2234 
2235  init_icu_converter();
2236 
2237  ulen = uchar_length(icu_converter, src, srclen);
2238 
2239  uchar_bsize = (ulen + 1) * sizeof(UChar);
2240 
2241  if (uchar_bsize > TEXTBUFLEN)
2242  buf = palloc(uchar_bsize);
2243 
2244  uchar = (UChar *) buf;
2245 
2246  ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
2247 
2248  uiter_setString(&iter, uchar, ulen);
2249  state[0] = state[1] = 0; /* won't need that again */
2250  status = U_ZERO_ERROR;
2251  result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol,
2252  &iter,
2253  state,
2254  (uint8_t *) dest,
2255  destsize,
2256  &status);
2257  if (U_FAILURE(status))
2258  ereport(ERROR,
2259  (errmsg("sort key generation failed: %s",
2260  u_errorName(status))));
2261 
2262  return result_bsize;
2263 }
2264 
2265 /* 'srclen' of -1 means the strings are NUL-terminated */
2266 static size_t
2267 pg_strnxfrm_prefix_icu(char *dest, const char *src, int32_t srclen,
2268  int32_t destsize, pg_locale_t locale)
2269 {
2270  size_t result;
2271 
2272  Assert(locale->provider == COLLPROVIDER_ICU);
2273 
2274  if (GetDatabaseEncoding() == PG_UTF8)
2275  {
2276  UCharIterator iter;
2277  uint32_t state[2];
2278  UErrorCode status;
2279 
2280  uiter_setUTF8(&iter, src, srclen);
2281  state[0] = state[1] = 0; /* won't need that again */
2282  status = U_ZERO_ERROR;
2283  result = ucol_nextSortKeyPart(locale->info.icu.ucol,
2284  &iter,
2285  state,
2286  (uint8_t *) dest,
2287  destsize,
2288  &status);
2289  if (U_FAILURE(status))
2290  ereport(ERROR,
2291  (errmsg("sort key generation failed: %s",
2292  u_errorName(status))));
2293  }
2294  else
2295  result = pg_strnxfrm_prefix_icu_no_utf8(dest, src, srclen, destsize,
2296  locale);
2297 
2298  return result;
2299 }
2300 
2301 #endif
2302 
2303 /*
2304  * Return true if the collation provider supports pg_strxfrm() and
2305  * pg_strnxfrm(); otherwise false.
2306  *
2307  * Unfortunately, it seems that strxfrm() for non-C collations is broken on
2308  * many common platforms; testing of multiple versions of glibc reveals that,
2309  * for many locales, strcoll() and strxfrm() do not return consistent
2310  * results. While no other libc other than Cygwin has so far been shown to
2311  * have a problem, we take the conservative course of action for right now and
2312  * disable this categorically. (Users who are certain this isn't a problem on
2313  * their system can define TRUST_STRXFRM.)
2314  *
2315  * No similar problem is known for the ICU provider.
2316  */
2317 bool
2319 {
2320  if (!locale || locale->provider == COLLPROVIDER_LIBC)
2321 #ifdef TRUST_STRXFRM
2322  return true;
2323 #else
2324  return false;
2325 #endif
2326  else if (locale->provider == COLLPROVIDER_ICU)
2327  return true;
2328  else
2329  /* shouldn't happen */
2330  elog(ERROR, "unsupported collprovider: %c", locale->provider);
2331 
2332  return false; /* keep compiler quiet */
2333 }
2334 
2335 /*
2336  * pg_strxfrm
2337  *
2338  * Transforms 'src' to a nul-terminated string stored in 'dest' such that
2339  * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on
2340  * untransformed strings.
2341  *
2342  * The provided 'src' must be nul-terminated. If 'destsize' is zero, 'dest'
2343  * may be NULL.
2344  *
2345  * Returns the number of bytes needed to store the transformed string,
2346  * excluding the terminating nul byte. If the value returned is 'destsize' or
2347  * greater, the resulting contents of 'dest' are undefined.
2348  */
2349 size_t
2350 pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
2351 {
2352  size_t result = 0; /* keep compiler quiet */
2353 
2354  if (!locale || locale->provider == COLLPROVIDER_LIBC)
2355  result = pg_strxfrm_libc(dest, src, destsize, locale);
2356 #ifdef USE_ICU
2357  else if (locale->provider == COLLPROVIDER_ICU)
2358  result = pg_strnxfrm_icu(dest, src, -1, destsize, locale);
2359 #endif
2360  else
2361  /* shouldn't happen */
2362  elog(ERROR, "unsupported collprovider: %c", locale->provider);
2363 
2364  return result;
2365 }
2366 
2367 /*
2368  * pg_strnxfrm
2369  *
2370  * Transforms 'src' to a nul-terminated string stored in 'dest' such that
2371  * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on
2372  * untransformed strings.
2373  *
2374  * 'src' does not need to be nul-terminated. If 'destsize' is zero, 'dest' may
2375  * be NULL.
2376  *
2377  * Returns the number of bytes needed to store the transformed string,
2378  * excluding the terminating nul byte. If the value returned is 'destsize' or
2379  * greater, the resulting contents of 'dest' are undefined.
2380  *
2381  * This function may need to nul-terminate the argument for libc functions;
2382  * so if the caller already has a nul-terminated string, it should call
2383  * pg_strxfrm() instead.
2384  */
2385 size_t
2386 pg_strnxfrm(char *dest, size_t destsize, const char *src, size_t srclen,
2388 {
2389  size_t result = 0; /* keep compiler quiet */
2390 
2391  if (!locale || locale->provider == COLLPROVIDER_LIBC)
2392  result = pg_strnxfrm_libc(dest, src, srclen, destsize, locale);
2393 #ifdef USE_ICU
2394  else if (locale->provider == COLLPROVIDER_ICU)
2395  result = pg_strnxfrm_icu(dest, src, srclen, destsize, locale);
2396 #endif
2397  else
2398  /* shouldn't happen */
2399  elog(ERROR, "unsupported collprovider: %c", locale->provider);
2400 
2401  return result;
2402 }
2403 
2404 /*
2405  * Return true if the collation provider supports pg_strxfrm_prefix() and
2406  * pg_strnxfrm_prefix(); otherwise false.
2407  */
2408 bool
2410 {
2411  if (!locale || locale->provider == COLLPROVIDER_LIBC)
2412  return false;
2413  else if (locale->provider == COLLPROVIDER_ICU)
2414  return true;
2415  else
2416  /* shouldn't happen */
2417  elog(ERROR, "unsupported collprovider: %c", locale->provider);
2418 
2419  return false; /* keep compiler quiet */
2420 }
2421 
2422 /*
2423  * pg_strxfrm_prefix
2424  *
2425  * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
2426  * memcmp() on the byte sequence is equivalent to pg_strcoll() on
2427  * untransformed strings. The result is not nul-terminated.
2428  *
2429  * The provided 'src' must be nul-terminated.
2430  *
2431  * If destsize is not large enough to hold the resulting byte sequence, stores
2432  * only the first destsize bytes in 'dest'. Returns the number of bytes
2433  * actually copied to 'dest'.
2434  */
2435 size_t
2436 pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
2438 {
2439  size_t result = 0; /* keep compiler quiet */
2440 
2441  if (!locale || locale->provider == COLLPROVIDER_LIBC)
2442  elog(ERROR, "collprovider '%c' does not support pg_strxfrm_prefix()",
2443  locale->provider);
2444 #ifdef USE_ICU
2445  else if (locale->provider == COLLPROVIDER_ICU)
2446  result = pg_strnxfrm_prefix_icu(dest, src, -1, destsize, locale);
2447 #endif
2448  else
2449  /* shouldn't happen */
2450  elog(ERROR, "unsupported collprovider: %c", locale->provider);
2451 
2452  return result;
2453 }
2454 
2455 /*
2456  * pg_strnxfrm_prefix
2457  *
2458  * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
2459  * memcmp() on the byte sequence is equivalent to pg_strcoll() on
2460  * untransformed strings. The result is not nul-terminated.
2461  *
2462  * The provided 'src' must be nul-terminated.
2463  *
2464  * If destsize is not large enough to hold the resulting byte sequence, stores
2465  * only the first destsize bytes in 'dest'. Returns the number of bytes
2466  * actually copied to 'dest'.
2467  *
2468  * This function may need to nul-terminate the argument for libc functions;
2469  * so if the caller already has a nul-terminated string, it should call
2470  * pg_strxfrm_prefix() instead.
2471  */
2472 size_t
2473 pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
2474  size_t srclen, pg_locale_t locale)
2475 {
2476  size_t result = 0; /* keep compiler quiet */
2477 
2478  if (!locale || locale->provider == COLLPROVIDER_LIBC)
2479  elog(ERROR, "collprovider '%c' does not support pg_strnxfrm_prefix()",
2480  locale->provider);
2481 #ifdef USE_ICU
2482  else if (locale->provider == COLLPROVIDER_ICU)
2483  result = pg_strnxfrm_prefix_icu(dest, src, -1, destsize, locale);
2484 #endif
2485  else
2486  /* shouldn't happen */
2487  elog(ERROR, "unsupported collprovider: %c", locale->provider);
2488 
2489  return result;
2490 }
2491 
2492 #ifdef USE_ICU
2493 
2494 /*
2495  * Wrapper around ucol_open() to handle API differences for older ICU
2496  * versions.
2497  */
2498 static UCollator *
2499 pg_ucol_open(const char *loc_str)
2500 {
2501  UCollator *collator;
2502  UErrorCode status;
2503  const char *orig_str = loc_str;
2504  char *fixed_str = NULL;
2505 
2506  /*
2507  * Must never open default collator, because it depends on the environment
2508  * and may change at any time. Should not happen, but check here to catch
2509  * bugs that might be hard to catch otherwise.
2510  *
2511  * NB: the default collator is not the same as the collator for the root
2512  * locale. The root locale may be specified as the empty string, "und", or
2513  * "root". The default collator is opened by passing NULL to ucol_open().
2514  */
2515  if (loc_str == NULL)
2516  elog(ERROR, "opening default collator is not supported");
2517 
2518  /*
2519  * In ICU versions 54 and earlier, "und" is not a recognized spelling of
2520  * the root locale. If the first component of the locale is "und", replace
2521  * with "root" before opening.
2522  */
2523  if (U_ICU_VERSION_MAJOR_NUM < 55)
2524  {
2525  char lang[ULOC_LANG_CAPACITY];
2526 
2527  status = U_ZERO_ERROR;
2528  uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
2529  if (U_FAILURE(status))
2530  {
2531  ereport(ERROR,
2532  (errmsg("could not get language from locale \"%s\": %s",
2533  loc_str, u_errorName(status))));
2534  }
2535 
2536  if (strcmp(lang, "und") == 0)
2537  {
2538  const char *remainder = loc_str + strlen("und");
2539 
2540  fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
2541  strcpy(fixed_str, "root");
2542  strcat(fixed_str, remainder);
2543 
2544  loc_str = fixed_str;
2545  }
2546  }
2547 
2548  status = U_ZERO_ERROR;
2549  collator = ucol_open(loc_str, &status);
2550  if (U_FAILURE(status))
2551  ereport(ERROR,
2552  /* use original string for error report */
2553  (errmsg("could not open collator for locale \"%s\": %s",
2554  orig_str, u_errorName(status))));
2555 
2556  if (U_ICU_VERSION_MAJOR_NUM < 54)
2557  {
2558  status = U_ZERO_ERROR;
2559  icu_set_collation_attributes(collator, loc_str, &status);
2560 
2561  /*
2562  * Pretend the error came from ucol_open(), for consistent error
2563  * message across ICU versions.
2564  */
2565  if (U_FAILURE(status))
2566  {
2567  ucol_close(collator);
2568  ereport(ERROR,
2569  (errmsg("could not open collator for locale \"%s\": %s",
2570  orig_str, u_errorName(status))));
2571  }
2572  }
2573 
2574  if (fixed_str != NULL)
2575  pfree(fixed_str);
2576 
2577  return collator;
2578 }
2579 
2580 static void
2581 init_icu_converter(void)
2582 {
2583  const char *icu_encoding_name;
2584  UErrorCode status;
2585  UConverter *conv;
2586 
2587  if (icu_converter)
2588  return; /* already done */
2589 
2590  icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
2591  if (!icu_encoding_name)
2592  ereport(ERROR,
2593  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
2594  errmsg("encoding \"%s\" not supported by ICU",
2596 
2597  status = U_ZERO_ERROR;
2598  conv = ucnv_open(icu_encoding_name, &status);
2599  if (U_FAILURE(status))
2600  ereport(ERROR,
2601  (errmsg("could not open ICU converter for encoding \"%s\": %s",
2602  icu_encoding_name, u_errorName(status))));
2603 
2604  icu_converter = conv;
2605 }
2606 
2607 /*
2608  * Find length, in UChars, of given string if converted to UChar string.
2609  */
2610 static size_t
2611 uchar_length(UConverter *converter, const char *str, int32_t len)
2612 {
2613  UErrorCode status = U_ZERO_ERROR;
2614  int32_t ulen;
2615  ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
2616  if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
2617  ereport(ERROR,
2618  (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
2619  return ulen;
2620 }
2621 
2622 /*
2623  * Convert the given source string into a UChar string, stored in dest, and
2624  * return the length (in UChars).
2625  */
2626 static int32_t
2627 uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
2628  const char *src, int32_t srclen)
2629 {
2630  UErrorCode status = U_ZERO_ERROR;
2631  int32_t ulen;
2632  status = U_ZERO_ERROR;
2633  ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
2634  if (U_FAILURE(status))
2635  ereport(ERROR,
2636  (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
2637  return ulen;
2638 }
2639 
2640 /*
2641  * Convert a string in the database encoding into a string of UChars.
2642  *
2643  * The source string at buff is of length nbytes
2644  * (it needn't be nul-terminated)
2645  *
2646  * *buff_uchar receives a pointer to the palloc'd result string, and
2647  * the function's result is the number of UChars generated.
2648  *
2649  * The result string is nul-terminated, though most callers rely on the
2650  * result length instead.
2651  */
2652 int32_t
2653 icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
2654 {
2655  int32_t len_uchar;
2656 
2657  init_icu_converter();
2658 
2659  len_uchar = uchar_length(icu_converter, buff, nbytes);
2660 
2661  *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
2662  len_uchar = uchar_convert(icu_converter,
2663  *buff_uchar, len_uchar + 1, buff, nbytes);
2664 
2665  return len_uchar;
2666 }
2667 
2668 /*
2669  * Convert a string of UChars into the database encoding.
2670  *
2671  * The source string at buff_uchar is of length len_uchar
2672  * (it needn't be nul-terminated)
2673  *
2674  * *result receives a pointer to the palloc'd result string, and the
2675  * function's result is the number of bytes generated (not counting nul).
2676  *
2677  * The result string is nul-terminated.
2678  */
2679 int32_t
2680 icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar)
2681 {
2682  UErrorCode status;
2683  int32_t len_result;
2684 
2685  init_icu_converter();
2686 
2687  status = U_ZERO_ERROR;
2688  len_result = ucnv_fromUChars(icu_converter, NULL, 0,
2689  buff_uchar, len_uchar, &status);
2690  if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
2691  ereport(ERROR,
2692  (errmsg("%s failed: %s", "ucnv_fromUChars",
2693  u_errorName(status))));
2694 
2695  *result = palloc(len_result + 1);
2696 
2697  status = U_ZERO_ERROR;
2698  len_result = ucnv_fromUChars(icu_converter, *result, len_result + 1,
2699  buff_uchar, len_uchar, &status);
2700  if (U_FAILURE(status))
2701  ereport(ERROR,
2702  (errmsg("%s failed: %s", "ucnv_fromUChars",
2703  u_errorName(status))));
2704 
2705  return len_result;
2706 }
2707 
2708 /*
2709  * Parse collation attributes from the given locale string and apply them to
2710  * the open collator.
2711  *
2712  * First, the locale string is canonicalized to an ICU format locale ID such
2713  * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
2714  * the key-value arguments.
2715  *
2716  * Starting with ICU version 54, the attributes are processed automatically by
2717  * ucol_open(), so this is only necessary for emulating this behavior on older
2718  * versions.
2719  */
2721 static void
2722 icu_set_collation_attributes(UCollator *collator, const char *loc,
2723  UErrorCode *status)
2724 {
2725  int32_t len;
2726  char *icu_locale_id;
2727  char *lower_str;
2728  char *str;
2729 
2730  /*
2731  * The input locale may be a BCP 47 language tag, e.g.
2732  * "und-u-kc-ks-level1", which expresses the same attributes in a
2733  * different form. It will be converted to the equivalent ICU format
2734  * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
2735  * uloc_canonicalize().
2736  */
2737  *status = U_ZERO_ERROR;
2738  len = uloc_canonicalize(loc, NULL, 0, status);
2739  icu_locale_id = palloc(len + 1);
2740  *status = U_ZERO_ERROR;
2741  len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
2742  if (U_FAILURE(*status))
2743  return;
2744 
2745  lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
2746 
2747  pfree(icu_locale_id);
2748 
2749  str = strchr(lower_str, '@');
2750  if (!str)
2751  return;
2752  str++;
2753 
2754  for (char *token = strtok(str, ";"); token; token = strtok(NULL, ";"))
2755  {
2756  char *e = strchr(token, '=');
2757 
2758  if (e)
2759  {
2760  char *name;
2761  char *value;
2762  UColAttribute uattr;
2763  UColAttributeValue uvalue;
2764 
2765  *status = U_ZERO_ERROR;
2766 
2767  *e = '\0';
2768  name = token;
2769  value = e + 1;
2770 
2771  /*
2772  * See attribute name and value lists in ICU i18n/coll.cpp
2773  */
2774  if (strcmp(name, "colstrength") == 0)
2775  uattr = UCOL_STRENGTH;
2776  else if (strcmp(name, "colbackwards") == 0)
2777  uattr = UCOL_FRENCH_COLLATION;
2778  else if (strcmp(name, "colcaselevel") == 0)
2779  uattr = UCOL_CASE_LEVEL;
2780  else if (strcmp(name, "colcasefirst") == 0)
2781  uattr = UCOL_CASE_FIRST;
2782  else if (strcmp(name, "colalternate") == 0)
2783  uattr = UCOL_ALTERNATE_HANDLING;
2784  else if (strcmp(name, "colnormalization") == 0)
2785  uattr = UCOL_NORMALIZATION_MODE;
2786  else if (strcmp(name, "colnumeric") == 0)
2787  uattr = UCOL_NUMERIC_COLLATION;
2788  else
2789  /* ignore if unknown */
2790  continue;
2791 
2792  if (strcmp(value, "primary") == 0)
2793  uvalue = UCOL_PRIMARY;
2794  else if (strcmp(value, "secondary") == 0)
2795  uvalue = UCOL_SECONDARY;
2796  else if (strcmp(value, "tertiary") == 0)
2797  uvalue = UCOL_TERTIARY;
2798  else if (strcmp(value, "quaternary") == 0)
2799  uvalue = UCOL_QUATERNARY;
2800  else if (strcmp(value, "identical") == 0)
2801  uvalue = UCOL_IDENTICAL;
2802  else if (strcmp(value, "no") == 0)
2803  uvalue = UCOL_OFF;
2804  else if (strcmp(value, "yes") == 0)
2805  uvalue = UCOL_ON;
2806  else if (strcmp(value, "shifted") == 0)
2807  uvalue = UCOL_SHIFTED;
2808  else if (strcmp(value, "non-ignorable") == 0)
2809  uvalue = UCOL_NON_IGNORABLE;
2810  else if (strcmp(value, "lower") == 0)
2811  uvalue = UCOL_LOWER_FIRST;
2812  else if (strcmp(value, "upper") == 0)
2813  uvalue = UCOL_UPPER_FIRST;
2814  else
2815  {
2816  *status = U_ILLEGAL_ARGUMENT_ERROR;
2817  break;
2818  }
2819 
2820  ucol_setAttribute(collator, uattr, uvalue, status);
2821  }
2822  }
2823 
2824  pfree(lower_str);
2825 }
2826 
2827 #endif
2828 
2829 /*
2830  * Perform best-effort check that the locale is a valid one.
2831  */
2832 void
2833 icu_validate_locale(const char *loc_str)
2834 {
2835 #ifdef USE_ICU
2836  UCollator *collator;
2837  UErrorCode status;
2838  char lang[ULOC_LANG_CAPACITY];
2839  bool found = false;
2840  int elevel = icu_validation_level;
2841 
2842  /* no validation */
2843  if (elevel < 0)
2844  return;
2845 
2846  /* downgrade to WARNING during pg_upgrade */
2847  if (IsBinaryUpgrade && elevel > WARNING)
2848  elevel = WARNING;
2849 
2850  /* validate that we can extract the language */
2851  status = U_ZERO_ERROR;
2852  uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
2853  if (U_FAILURE(status))
2854  {
2855  ereport(elevel,
2856  (errmsg("could not get language from ICU locale \"%s\": %s",
2857  loc_str, u_errorName(status)),
2858  errhint("To disable ICU locale validation, set parameter icu_validation_level to DISABLED.")));
2859  return;
2860  }
2861 
2862  /* check for special language name */
2863  if (strcmp(lang, "") == 0 ||
2864  strcmp(lang, "root") == 0 || strcmp(lang, "und") == 0 ||
2865  strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0)
2866  found = true;
2867 
2868  /* search for matching language within ICU */
2869  for (int32_t i = 0; !found && i < uloc_countAvailable(); i++)
2870  {
2871  const char *otherloc = uloc_getAvailable(i);
2872  char otherlang[ULOC_LANG_CAPACITY];
2873 
2874  status = U_ZERO_ERROR;
2875  uloc_getLanguage(otherloc, otherlang, ULOC_LANG_CAPACITY, &status);
2876  if (U_FAILURE(status))
2877  continue;
2878 
2879  if (strcmp(lang, otherlang) == 0)
2880  found = true;
2881  }
2882 
2883  if (!found)
2884  ereport(elevel,
2885  (errmsg("ICU locale \"%s\" has unknown language \"%s\"",
2886  loc_str, lang),
2887  errhint("To disable ICU locale validation, set parameter icu_validation_level to DISABLED.")));
2888 
2889  /* check that it can be opened */
2890  collator = pg_ucol_open(loc_str);
2891  ucol_close(collator);
2892 #else /* not USE_ICU */
2893  /* could get here if a collation was created by a build with ICU */
2894  ereport(ERROR,
2895  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
2896  errmsg("ICU is not supported in this build")));
2897 #endif /* not USE_ICU */
2898 }
2899 
2900 /*
2901  * These functions convert from/to libc's wchar_t, *not* pg_wchar_t.
2902  * Therefore we keep them here rather than with the mbutils code.
2903  */
2904 
2905 /*
2906  * wchar2char --- convert wide characters to multibyte format
2907  *
2908  * This has the same API as the standard wcstombs_l() function; in particular,
2909  * tolen is the maximum number of bytes to store at *to, and *from must be
2910  * zero-terminated. The output will be zero-terminated iff there is room.
2911  */
2912 size_t
2913 wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
2914 {
2915  size_t result;
2916 
2917  Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
2918 
2919  if (tolen == 0)
2920  return 0;
2921 
2922 #ifdef WIN32
2923 
2924  /*
2925  * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
2926  * for some reason mbstowcs and wcstombs won't do this for us, so we use
2927  * MultiByteToWideChar().
2928  */
2929  if (GetDatabaseEncoding() == PG_UTF8)
2930  {
2931  result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
2932  NULL, NULL);
2933  /* A zero return is failure */
2934  if (result <= 0)
2935  result = -1;
2936  else
2937  {
2938  Assert(result <= tolen);
2939  /* Microsoft counts the zero terminator in the result */
2940  result--;
2941  }
2942  }
2943  else
2944 #endif /* WIN32 */
2945  if (locale == (pg_locale_t) 0)
2946  {
2947  /* Use wcstombs directly for the default locale */
2948  result = wcstombs(to, from, tolen);
2949  }
2950  else
2951  {
2952 #ifdef HAVE_LOCALE_T
2953 #ifdef HAVE_WCSTOMBS_L
2954  /* Use wcstombs_l for nondefault locales */
2955  result = wcstombs_l(to, from, tolen, locale->info.lt);
2956 #else /* !HAVE_WCSTOMBS_L */
2957  /* We have to temporarily set the locale as current ... ugh */
2958  locale_t save_locale = uselocale(locale->info.lt);
2959 
2960  result = wcstombs(to, from, tolen);
2961 
2962  uselocale(save_locale);
2963 #endif /* HAVE_WCSTOMBS_L */
2964 #else /* !HAVE_LOCALE_T */
2965  /* Can't have locale != 0 without HAVE_LOCALE_T */
2966  elog(ERROR, "wcstombs_l is not available");
2967  result = 0; /* keep compiler quiet */
2968 #endif /* HAVE_LOCALE_T */
2969  }
2970 
2971  return result;
2972 }
2973 
2974 /*
2975  * char2wchar --- convert multibyte characters to wide characters
2976  *
2977  * This has almost the API of mbstowcs_l(), except that *from need not be
2978  * null-terminated; instead, the number of input bytes is specified as
2979  * fromlen. Also, we ereport() rather than returning -1 for invalid
2980  * input encoding. tolen is the maximum number of wchar_t's to store at *to.
2981  * The output will be zero-terminated iff there is room.
2982  */
2983 size_t
2984 char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
2986 {
2987  size_t result;
2988 
2989  Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
2990 
2991  if (tolen == 0)
2992  return 0;
2993 
2994 #ifdef WIN32
2995  /* See WIN32 "Unicode" comment above */
2996  if (GetDatabaseEncoding() == PG_UTF8)
2997  {
2998  /* Win32 API does not work for zero-length input */
2999  if (fromlen == 0)
3000  result = 0;
3001  else
3002  {
3003  result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
3004  /* A zero return is failure */
3005  if (result == 0)
3006  result = -1;
3007  }
3008 
3009  if (result != -1)
3010  {
3011  Assert(result < tolen);
3012  /* Append trailing null wchar (MultiByteToWideChar() does not) */
3013  to[result] = 0;
3014  }
3015  }
3016  else
3017 #endif /* WIN32 */
3018  {
3019  /* mbstowcs requires ending '\0' */
3020  char *str = pnstrdup(from, fromlen);
3021 
3022  if (locale == (pg_locale_t) 0)
3023  {
3024  /* Use mbstowcs directly for the default locale */
3025  result = mbstowcs(to, str, tolen);
3026  }
3027  else
3028  {
3029 #ifdef HAVE_LOCALE_T
3030 #ifdef HAVE_MBSTOWCS_L
3031  /* Use mbstowcs_l for nondefault locales */
3032  result = mbstowcs_l(to, str, tolen, locale->info.lt);
3033 #else /* !HAVE_MBSTOWCS_L */
3034  /* We have to temporarily set the locale as current ... ugh */
3035  locale_t save_locale = uselocale(locale->info.lt);
3036 
3037  result = mbstowcs(to, str, tolen);
3038 
3039  uselocale(save_locale);
3040 #endif /* HAVE_MBSTOWCS_L */
3041 #else /* !HAVE_LOCALE_T */
3042  /* Can't have locale != 0 without HAVE_LOCALE_T */
3043  elog(ERROR, "mbstowcs_l is not available");
3044  result = 0; /* keep compiler quiet */
3045 #endif /* HAVE_LOCALE_T */
3046  }
3047 
3048  pfree(str);
3049  }
3050 
3051  if (result == -1)
3052  {
3053  /*
3054  * Invalid multibyte character encountered. We try to give a useful
3055  * error message by letting pg_verifymbstr check the string. But it's
3056  * possible that the string is OK to us, and not OK to mbstowcs ---
3057  * this suggests that the LC_CTYPE locale is different from the
3058  * database encoding. Give a generic error message if pg_verifymbstr
3059  * can't find anything wrong.
3060  */
3061  pg_verifymbstr(from, fromlen, false); /* might not return */
3062  /* but if it does ... */
3063  ereport(ERROR,
3064  (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
3065  errmsg("invalid multibyte character for locale"),
3066  errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
3067  }
3068 
3069  return result;
3070 }
#define TextDatumGetCString(d)
Definition: builtins.h:95
#define NameStr(name)
Definition: c.h:730
#define pg_attribute_unused()
Definition: c.h:120
#define lengthof(array)
Definition: c.h:772
#define OidIsValid(objectId)
Definition: c.h:759
size_t Size
Definition: c.h:589
Oid collid
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:953
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:350
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1156
int errdetail(const char *fmt,...)
Definition: elog.c:1202
int errhint(const char *fmt,...)
Definition: elog.c:1316
int errcode(int sqlerrcode)
Definition: elog.c:858
int errmsg(const char *fmt,...)
Definition: elog.c:1069
#define PG_RE_THROW()
Definition: elog.h:411
#define DEBUG3
Definition: elog.h:28
#define FATAL
Definition: elog.h:41
#define PG_TRY(...)
Definition: elog.h:370
#define WARNING
Definition: elog.h:36
#define PG_END_TRY(...)
Definition: elog.h:395
#define ERROR
Definition: elog.h:39
#define PG_CATCH(...)
Definition: elog.h:380
#define ereport(elevel,...)
Definition: elog.h:149
const char * get_encoding_name_for_icu(int encoding)
Definition: encnames.c:473
const char * pg_encoding_to_char(int encoding)
Definition: encnames.c:588
const char * name
Definition: encode.c:571
#define palloc_array(type, count)
Definition: fe_memutils.h:64
char * asc_tolower(const char *buff, size_t nbytes)
Definition: formatting.c:2022
bool IsBinaryUpgrade
Definition: globals.c:114
#define newval
GucSource
Definition: guc.h:108
@ PGC_S_DEFAULT
Definition: guc.h:109
#define free(a)
Definition: header.h:65
@ HASH_ENTER
Definition: hsearch.h:114
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
#define GETSTRUCT(TUP)
Definition: htup_details.h:653
#define period
Definition: indent_codes.h:66
#define token
Definition: indent_globs.h:126
#define bufsize
Definition: indent_globs.h:36
static struct @145 value
static char * locale
Definition: initdb.c:139
int i
Definition: isn.c:73
Assert(fmt[strlen(fmt) - 1] !='\n')
static struct pg_tm tm
Definition: localtime.c:104
char * get_namespace_name(Oid nspid)
Definition: lsyscache.c:3324
char * pg_any_to_server(const char *s, int len, int encoding)
Definition: mbutils.c:677
int GetDatabaseEncoding(void)
Definition: mbutils.c:1268
int pg_mbstrlen(const char *mbstr)
Definition: mbutils.c:1038
bool pg_verifymbstr(const char *mbstr, int len, bool noError)
Definition: mbutils.c:1563
void SetMessageEncoding(int encoding)
Definition: mbutils.c:1172
char * pnstrdup(const char *in, Size len)
Definition: mcxt.c:1635
char * pstrdup(const char *in)
Definition: mcxt.c:1624
void pfree(void *pointer)
Definition: mcxt.c:1436
MemoryContext TopMemoryContext
Definition: mcxt.c:141
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1005
char * MemoryContextStrdup(MemoryContext context, const char *string)
Definition: mcxt.c:1611
void * palloc(Size size)
Definition: mcxt.c:1210
static char format
FormData_pg_collation * Form_pg_collation
Definition: pg_collation.h:58
const void size_t len
int32 encoding
Definition: pg_database.h:41
size_t pg_strnxfrm(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale)
Definition: pg_locale.c:2386
static size_t pg_strxfrm_libc(char *dest, const char *src, size_t destsize, pg_locale_t locale)
Definition: pg_locale.c:2120
int icu_validation_level
Definition: pg_locale.c:99
void cache_locale_time(void)
Definition: pg_locale.c:766
bool pg_strxfrm_enabled(pg_locale_t locale)
Definition: pg_locale.c:2318
char * localized_full_months[12+1]
Definition: pg_locale.c:111
size_t wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
Definition: pg_locale.c:2913
struct lconv * PGLC_localeconv(void)
Definition: pg_locale.c:484
int pg_strncoll(const char *arg1, size_t len1, const char *arg2, size_t len2, pg_locale_t locale)
Definition: pg_locale.c:2100
void make_icu_collator(const char *iculocstr, const char *icurules, struct pg_locale_struct *resultp)
Definition: pg_locale.c:1429
bool lc_collate_is_c(Oid collation)
Definition: pg_locale.c:1324
struct pg_locale_struct default_locale
Definition: pg_locale.c:1426
void icu_validate_locale(const char *loc_str)
Definition: pg_locale.c:2833
void check_strxfrm_bug(void)
Definition: pg_locale.c:1178
static bool CurrentLCTimeValid
Definition: pg_locale.c:118
void assign_locale_time(const char *newval, void *extra)
Definition: pg_locale.c:348
bool check_locale_time(char **newval, void **extra, GucSource source)
Definition: pg_locale.c:342
char * locale_messages
Definition: pg_locale.c:94
char * locale_numeric
Definition: pg_locale.c:96
pg_locale_t pg_newlocale_from_collation(Oid collid)
Definition: pg_locale.c:1537
bool database_ctype_is_c
Definition: pg_locale.c:114
char * locale_time
Definition: pg_locale.c:97
static void cache_single_string(char **dst, const char *src, int encoding)
Definition: pg_locale.c:743
bool lc_ctype_is_c(Oid collation)
Definition: pg_locale.c:1377
bool check_locale_numeric(char **newval, void **extra, GucSource source)
Definition: pg_locale.c:330
bool pg_locale_deterministic(pg_locale_t locale)
Definition: pg_locale.c:1513
static void db_encoding_convert(int encoding, char **str)
Definition: pg_locale.c:454
void assign_locale_numeric(const char *newval, void *extra)
Definition: pg_locale.c:336
bool check_locale_messages(char **newval, void **extra, GucSource source)
Definition: pg_locale.c:364
#define MAX_L10N_DATA
Definition: pg_locale.c:90
char * get_collation_actual_version(char collprovider, const char *collcollate)
Definition: pg_locale.c:1707
static void free_struct_lconv(struct lconv *s)
Definition: pg_locale.c:404
char * pg_perm_setlocale(int category, const char *locale)
Definition: pg_locale.c:172
void assign_locale_messages(const char *newval, void *extra)
Definition: pg_locale.c:387
static bool CurrentLocaleConvValid
Definition: pg_locale.c:117
int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
Definition: pg_locale.c:2065
static int pg_strcoll_libc(const char *arg1, const char *arg2, pg_locale_t locale)
Definition: pg_locale.c:1877
static HTAB * collation_cache
Definition: pg_locale.c:131
bool pg_strxfrm_prefix_enabled(pg_locale_t locale)
Definition: pg_locale.c:2409
char * localized_abbrev_months[12+1]
Definition: pg_locale.c:110
static int pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2, pg_locale_t locale)
Definition: pg_locale.c:1912
size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale)
Definition: pg_locale.c:2473
static bool struct_lconv_is_valid(struct lconv *s)
Definition: pg_locale.c:423
char * localized_full_days[7+1]
Definition: pg_locale.c:109
size_t pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
Definition: pg_locale.c:2350
static collation_cache_entry * lookup_collation_cache(Oid collation, bool set_flags)
Definition: pg_locale.c:1248
void assign_locale_monetary(const char *newval, void *extra)
Definition: pg_locale.c:324
#define TEXTBUFLEN
Definition: pg_locale.c:88
bool check_locale(int category, const char *locale, char **canonname)
Definition: pg_locale.c:275
char * localized_abbrev_days[7+1]
Definition: pg_locale.c:108
size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize, pg_locale_t locale)
Definition: pg_locale.c:2436
char * locale_monetary
Definition: pg_locale.c:95
size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, pg_locale_t locale)
Definition: pg_locale.c:2984
bool check_locale_monetary(char **newval, void **extra, GucSource source)
Definition: pg_locale.c:318
static size_t pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize, pg_locale_t locale)
Definition: pg_locale.c:2140
#define LOCALE_NAME_BUFLEN
Definition: pg_locale.h:36
static rewind_source * source
Definition: pg_rewind.c:87
static char * buf
Definition: pg_test_fsync.c:67
@ PG_SQL_ASCII
Definition: pg_wchar.h:226
@ PG_UTF8
Definition: pg_wchar.h:232
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
int pg_get_encoding_from_locale(const char *ctype, bool write_message)
Definition: chklocale.c:428
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
int pg_strncasecmp(const char *s1, const char *s2, size_t n)
Definition: pgstrcasecmp.c:69
uintptr_t Datum
Definition: postgres.h:64
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:252
unsigned int Oid
Definition: postgres_ext.h:31
e
Definition: preproc-init.c:82
char * psprintf(const char *fmt,...)
Definition: psprintf.c:46
char * quote_qualified_identifier(const char *qualifier, const char *ident)
Definition: ruleutils.c:11835
Size keysize
Definition: hsearch.h:75
Size entrysize
Definition: hsearch.h:76
Definition: dynahash.c:220
Definition: pg_locale.c:123
bool collate_is_c
Definition: pg_locale.c:125
Oid collid
Definition: pg_locale.c:124
pg_locale_t locale
Definition: pg_locale.c:128
bool flags_valid
Definition: pg_locale.c:127
bool ctype_is_c
Definition: pg_locale.c:126
union pg_locale_struct::@142 info
bool deterministic
Definition: pg_locale.h:79
Definition: regguts.h:318
void ReleaseSysCache(HeapTuple tuple)
Definition: syscache.c:866
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Definition: syscache.c:818
Datum SysCacheGetAttr(int cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull)
Definition: syscache.c:1079
Datum SysCacheGetAttrNotNull(int cacheId, HeapTuple tup, AttrNumber attributeNumber)
Definition: syscache.c:1110
@ COLLOID
Definition: syscache.h:50
#define locale_t
Definition: win32_port.h:426
#define strcoll_l
Definition: win32_port.h:449
#define wcstombs_l
Definition: win32_port.h:452
#define strxfrm_l
Definition: win32_port.h:450
#define wcscoll_l
Definition: win32_port.h:451
#define mbstowcs_l
Definition: win32_port.h:453
#define setenv(x, y, z)
Definition: win32_port.h:541
#define setlocale(a, b)
Definition: win32_port.h:471