PostgreSQL Source Code  git master
pg_locale_libc.c
Go to the documentation of this file.
/*-----------------------------------------------------------------------
 *
 * PostgreSQL locale utilities for libc
 *
 * Portions Copyright (c) 2002-2024, PostgreSQL Global Development Group
 *
 * src/backend/utils/adt/pg_locale_libc.c
 *
 *-----------------------------------------------------------------------
 */
11 
12 #include "postgres.h"
13 
14 #include "catalog/pg_collation.h"
15 #include "mb/pg_wchar.h"
16 #include "utils/formatting.h"
17 #include "utils/pg_locale.h"
18 
/*
 * Size of the on-stack scratch buffer used by the string comparison and
 * transformation routines in this file.  Inputs that fit are copied (or
 * converted) into this buffer, which avoids a heap allocation in the
 * typical case; larger inputs fall back to palloc().  The value should be
 * large enough that most strings fit, yet small enough that we are
 * comfortable placing it on the stack.
 */
#define TEXTBUFLEN			1024
26 
27 extern locale_t make_libc_collator(const char *collate,
28  const char *ctype);
29 extern int strncoll_libc(const char *arg1, ssize_t len1,
30  const char *arg2, ssize_t len2,
32 extern size_t strnxfrm_libc(char *dest, size_t destsize,
33  const char *src, ssize_t srclen,
35 
36 static void report_newlocale_failure(const char *localename);
37 
38 #ifdef WIN32
39 static int strncoll_libc_win32_utf8(const char *arg1, ssize_t len1,
40  const char *arg2, ssize_t len2,
42 #endif
43 
44 /*
45  * Create a locale_t with the given collation and ctype.
46  *
47  * The "C" and "POSIX" locales are not actually handled by libc, so return
48  * NULL.
49  *
50  * Ensure that no path leaks a locale_t.
51  */
53 make_libc_collator(const char *collate, const char *ctype)
54 {
55  locale_t loc = 0;
56 
57  if (strcmp(collate, ctype) == 0)
58  {
59  if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
60  {
61  /* Normal case where they're the same */
62  errno = 0;
63 #ifndef WIN32
64  loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate,
65  NULL);
66 #else
67  loc = _create_locale(LC_ALL, collate);
68 #endif
69  if (!loc)
70  report_newlocale_failure(collate);
71  }
72  }
73  else
74  {
75 #ifndef WIN32
76  /* We need two newlocale() steps */
77  locale_t loc1 = 0;
78 
79  if (strcmp(collate, "C") != 0 && strcmp(collate, "POSIX") != 0)
80  {
81  errno = 0;
82  loc1 = newlocale(LC_COLLATE_MASK, collate, NULL);
83  if (!loc1)
84  report_newlocale_failure(collate);
85  }
86 
87  if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
88  {
89  errno = 0;
90  loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
91  if (!loc)
92  {
93  if (loc1)
94  freelocale(loc1);
96  }
97  }
98  else
99  loc = loc1;
100 #else
101 
102  /*
103  * XXX The _create_locale() API doesn't appear to support this. Could
104  * perhaps be worked around by changing pg_locale_t to contain two
105  * separate fields.
106  */
107  ereport(ERROR,
108  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
109  errmsg("collations with different collate and ctype values are not supported on this platform")));
110 #endif
111  }
112 
113  return loc;
114 }
115 
116 /*
117  * strncoll_libc
118  *
119  * NUL-terminate arguments, if necessary, and pass to strcoll_l().
120  *
121  * An input string length of -1 means that it's already NUL-terminated.
122  */
123 int
124 strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
126 {
127  char sbuf[TEXTBUFLEN];
128  char *buf = sbuf;
129  size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1;
130  size_t bufsize2 = (len2 == -1) ? 0 : len2 + 1;
131  const char *arg1n;
132  const char *arg2n;
133  int result;
134 
135  Assert(locale->provider == COLLPROVIDER_LIBC);
136 
137 #ifdef WIN32
138  /* check for this case before doing the work for nul-termination */
139  if (GetDatabaseEncoding() == PG_UTF8)
140  return strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
141 #endif /* WIN32 */
142 
143  if (bufsize1 + bufsize2 > TEXTBUFLEN)
144  buf = palloc(bufsize1 + bufsize2);
145 
146  /* nul-terminate arguments if necessary */
147  if (len1 == -1)
148  {
149  arg1n = arg1;
150  }
151  else
152  {
153  char *buf1 = buf;
154 
155  memcpy(buf1, arg1, len1);
156  buf1[len1] = '\0';
157  arg1n = buf1;
158  }
159 
160  if (len2 == -1)
161  {
162  arg2n = arg2;
163  }
164  else
165  {
166  char *buf2 = buf + bufsize1;
167 
168  memcpy(buf2, arg2, len2);
169  buf2[len2] = '\0';
170  arg2n = buf2;
171  }
172 
173  result = strcoll_l(arg1n, arg2n, locale->info.lt);
174 
175  if (buf != sbuf)
176  pfree(buf);
177 
178  return result;
179 }
180 
181 /*
182  * strnxfrm_libc
183  *
184  * NUL-terminate src, if necessary, and pass to strxfrm_l().
185  *
186  * A source length of -1 means that it's already NUL-terminated.
187  */
188 size_t
189 strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen,
191 {
192  char sbuf[TEXTBUFLEN];
193  char *buf = sbuf;
194  size_t bufsize = srclen + 1;
195  size_t result;
196 
197  Assert(locale->provider == COLLPROVIDER_LIBC);
198 
199  if (srclen == -1)
200  return strxfrm_l(dest, src, destsize, locale->info.lt);
201 
202  if (bufsize > TEXTBUFLEN)
203  buf = palloc(bufsize);
204 
205  /* nul-terminate argument */
206  memcpy(buf, src, srclen);
207  buf[srclen] = '\0';
208 
209  result = strxfrm_l(dest, buf, destsize, locale->info.lt);
210 
211  if (buf != sbuf)
212  pfree(buf);
213 
214  /* if dest is defined, it should be nul-terminated */
215  Assert(result >= destsize || dest[result] == '\0');
216 
217  return result;
218 }
219 
/*
 * strncoll_libc_win32_utf8
 *
 * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
 * invoke wcscoll_l().
 *
 * arg1/len1, arg2/len2: the strings and their lengths in bytes.  An input
 * string length of -1 means that it's NUL-terminated.
 * locale: must use the libc provider (asserted).
 *
 * Returns the result of wcscoll_l().  Conversion or comparison failures are
 * reported as ERRORs.
 */
#ifdef WIN32
static int
strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2,
						 ssize_t len2, pg_locale_t locale)
{
	char		sbuf[TEXTBUFLEN];
	char	   *buf = sbuf;
	char	   *a1p,
			   *a2p;
	int			a1len;
	int			a2len;
	int			r;
	int			result;

	Assert(locale->provider == COLLPROVIDER_LIBC);
	/* the visible caller only takes this path for UTF-8 databases */
	Assert(GetDatabaseEncoding() == PG_UTF8);

	if (len1 == -1)
		len1 = strlen(arg1);
	if (len2 == -1)
		len2 = strlen(arg2);

	/* two bytes per input byte, plus two bytes for the wide terminator */
	a1len = len1 * 2 + 2;
	a2len = len2 * 2 + 2;

	if (a1len + a2len > TEXTBUFLEN)
		buf = palloc(a1len + a2len);

	a1p = buf;
	a2p = buf + a1len;

	/* API does not work for zero-length input */
	if (len1 == 0)
		r = 0;
	else
	{
		r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
								(LPWSTR) a1p, a1len / 2);
		if (!r)
			ereport(ERROR,
					(errmsg("could not convert string to UTF-16: error code %lu",
							GetLastError())));
	}
	/* r is the number of wide characters written; terminate after them */
	((LPWSTR) a1p)[r] = 0;

	if (len2 == 0)
		r = 0;
	else
	{
		r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
								(LPWSTR) a2p, a2len / 2);
		if (!r)
			ereport(ERROR,
					(errmsg("could not convert string to UTF-16: error code %lu",
							GetLastError())));
	}
	((LPWSTR) a2p)[r] = 0;

	/* clear errno so that %m below reflects only wcscoll_l()'s failure */
	errno = 0;
	result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt);
	if (result == 2147483647)	/* _NLSCMPERROR; missing from mingw headers */
		ereport(ERROR,
				(errmsg("could not compare Unicode strings: %m")));

	if (buf != sbuf)
		pfree(buf);

	return result;
}
#endif							/* WIN32 */
298 
299 /* simple subroutine for reporting errors from newlocale() */
300 static void
301 report_newlocale_failure(const char *localename)
302 {
303  int save_errno;
304 
305  /*
306  * Windows doesn't provide any useful error indication from
307  * _create_locale(), and BSD-derived platforms don't seem to feel they
308  * need to set errno either (even though POSIX is pretty clear that
309  * newlocale should do so). So, if errno hasn't been set, assume ENOENT
310  * is what to report.
311  */
312  if (errno == 0)
313  errno = ENOENT;
314 
315  /*
316  * ENOENT means "no such locale", not "no such file", so clarify that
317  * errno with an errdetail message.
318  */
319  save_errno = errno; /* auxiliary funcs might change errno */
320  ereport(ERROR,
321  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
322  errmsg("could not create locale \"%s\": %m",
323  localename),
324  (save_errno == ENOENT ?
325  errdetail("The operating system could not find any locale data for the locale name \"%s\".",
326  localename) : 0)));
327 }
328 
329 /*
330  * POSIX doesn't define _l-variants of these functions, but several systems
331  * have them. We provide our own replacements here.
332  */
333 #ifndef HAVE_MBSTOWCS_L
334 static size_t
335 mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
336 {
337 #ifdef WIN32
338  return _mbstowcs_l(dest, src, n, loc);
339 #else
340  size_t result;
341  locale_t save_locale = uselocale(loc);
342 
343  result = mbstowcs(dest, src, n);
344  uselocale(save_locale);
345  return result;
346 #endif
347 }
348 #endif
349 #ifndef HAVE_WCSTOMBS_L
350 static size_t
351 wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
352 {
353 #ifdef WIN32
354  return _wcstombs_l(dest, src, n, loc);
355 #else
356  size_t result;
357  locale_t save_locale = uselocale(loc);
358 
359  result = wcstombs(dest, src, n);
360  uselocale(save_locale);
361  return result;
362 #endif
363 }
364 #endif
365 
/*
 * These functions convert from/to libc's wchar_t, *not* pg_wchar_t.
 * Therefore we keep them here rather than with the mbutils code.
 */
370 
371 /*
372  * wchar2char --- convert wide characters to multibyte format
373  *
374  * This has the same API as the standard wcstombs_l() function; in particular,
375  * tolen is the maximum number of bytes to store at *to, and *from must be
376  * zero-terminated. The output will be zero-terminated iff there is room.
377  */
378 size_t
379 wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
380 {
381  size_t result;
382 
383  if (tolen == 0)
384  return 0;
385 
386 #ifdef WIN32
387 
388  /*
389  * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
390  * for some reason mbstowcs and wcstombs won't do this for us, so we use
391  * MultiByteToWideChar().
392  */
393  if (GetDatabaseEncoding() == PG_UTF8)
394  {
395  result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
396  NULL, NULL);
397  /* A zero return is failure */
398  if (result <= 0)
399  result = -1;
400  else
401  {
402  Assert(result <= tolen);
403  /* Microsoft counts the zero terminator in the result */
404  result--;
405  }
406  }
407  else
408 #endif /* WIN32 */
409  if (locale == (pg_locale_t) 0)
410  {
411  /* Use wcstombs directly for the default locale */
412  result = wcstombs(to, from, tolen);
413  }
414  else
415  {
416  /* Use wcstombs_l for nondefault locales */
417  result = wcstombs_l(to, from, tolen, locale->info.lt);
418  }
419 
420  return result;
421 }
422 
423 /*
424  * char2wchar --- convert multibyte characters to wide characters
425  *
426  * This has almost the API of mbstowcs_l(), except that *from need not be
427  * null-terminated; instead, the number of input bytes is specified as
428  * fromlen. Also, we ereport() rather than returning -1 for invalid
429  * input encoding. tolen is the maximum number of wchar_t's to store at *to.
430  * The output will be zero-terminated iff there is room.
431  */
432 size_t
433 char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
435 {
436  size_t result;
437 
438  if (tolen == 0)
439  return 0;
440 
441 #ifdef WIN32
442  /* See WIN32 "Unicode" comment above */
443  if (GetDatabaseEncoding() == PG_UTF8)
444  {
445  /* Win32 API does not work for zero-length input */
446  if (fromlen == 0)
447  result = 0;
448  else
449  {
450  result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
451  /* A zero return is failure */
452  if (result == 0)
453  result = -1;
454  }
455 
456  if (result != -1)
457  {
458  Assert(result < tolen);
459  /* Append trailing null wchar (MultiByteToWideChar() does not) */
460  to[result] = 0;
461  }
462  }
463  else
464 #endif /* WIN32 */
465  {
466  /* mbstowcs requires ending '\0' */
467  char *str = pnstrdup(from, fromlen);
468 
469  if (locale == (pg_locale_t) 0)
470  {
471  /* Use mbstowcs directly for the default locale */
472  result = mbstowcs(to, str, tolen);
473  }
474  else
475  {
476  /* Use mbstowcs_l for nondefault locales */
477  result = mbstowcs_l(to, str, tolen, locale->info.lt);
478  }
479 
480  pfree(str);
481  }
482 
483  if (result == -1)
484  {
485  /*
486  * Invalid multibyte character encountered. We try to give a useful
487  * error message by letting pg_verifymbstr check the string. But it's
488  * possible that the string is OK to us, and not OK to mbstowcs ---
489  * this suggests that the LC_CTYPE locale is different from the
490  * database encoding. Give a generic error message if pg_verifymbstr
491  * can't find anything wrong.
492  */
493  pg_verifymbstr(from, fromlen, false); /* might not return */
494  /* but if it does ... */
495  ereport(ERROR,
496  (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
497  errmsg("invalid multibyte character for locale"),
498  errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
499  }
500 
501  return result;
502 }
#define Assert(condition)
Definition: c.h:849
int errdetail(const char *fmt,...)
Definition: elog.c:1203
int errhint(const char *fmt,...)
Definition: elog.c:1317
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
const char * str
#define bufsize
Definition: indent_globs.h:36
static char * locale
Definition: initdb.c:140
int GetDatabaseEncoding(void)
Definition: mbutils.c:1261
bool pg_verifymbstr(const char *mbstr, int len, bool noError)
Definition: mbutils.c:1556
char * pnstrdup(const char *in, Size len)
Definition: mcxt.c:1707
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc(Size size)
Definition: mcxt.c:1317
size_t wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
locale_t make_libc_collator(const char *collate, const char *ctype)
static size_t wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
int strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
static void report_newlocale_failure(const char *localename)
size_t strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
#define TEXTBUFLEN
size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, pg_locale_t locale)
static size_t mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
static char * buf
Definition: pg_test_fsync.c:73
@ PG_UTF8
Definition: pg_wchar.h:232
#define locale_t
Definition: win32_port.h:442
#define strcoll_l
Definition: win32_port.h:465
#define strxfrm_l
Definition: win32_port.h:466
#define wcscoll_l
Definition: win32_port.h:467