PostgreSQL Source Code  git master
pg_locale_libc.c
Go to the documentation of this file.
1 /*-----------------------------------------------------------------------
2  *
3  * PostgreSQL locale utilities for libc
4  *
5  * Portions Copyright (c) 2002-2024, PostgreSQL Global Development Group
6  *
7  * src/backend/utils/adt/pg_locale_libc.c
8  *
9  *-----------------------------------------------------------------------
10  */
11 
12 #include "postgres.h"
13 
14 #include "access/htup_details.h"
15 #include "catalog/pg_database.h"
16 #include "catalog/pg_collation.h"
17 #include "mb/pg_wchar.h"
18 #include "miscadmin.h"
19 #include "utils/builtins.h"
20 #include "utils/formatting.h"
21 #include "utils/memutils.h"
22 #include "utils/pg_locale.h"
23 #include "utils/syscache.h"
24 
/*
 * Size of the on-stack scratch buffer used for string transformations.
 *
 * It should be large enough that most strings fit (so the common case needs
 * no heap allocation), yet small enough that we are comfortable placing it
 * on the stack.
 */
#define TEXTBUFLEN			1024
32 
34 
35 extern int strncoll_libc(const char *arg1, ssize_t len1,
36  const char *arg2, ssize_t len2,
38 extern size_t strnxfrm_libc(char *dest, size_t destsize,
39  const char *src, ssize_t srclen,
41 static locale_t make_libc_collator(const char *collate,
42  const char *ctype);
43 static void report_newlocale_failure(const char *localename);
44 
45 #ifdef WIN32
46 static int strncoll_libc_win32_utf8(const char *arg1, ssize_t len1,
47  const char *arg2, ssize_t len2,
49 #endif
50 
53 {
54  const char *collate;
55  const char *ctype;
56  locale_t loc;
57  pg_locale_t result;
58 
59  if (collid == DEFAULT_COLLATION_OID)
60  {
61  HeapTuple tp;
62  Datum datum;
63 
64  tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
65  if (!HeapTupleIsValid(tp))
66  elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
67  datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
68  Anum_pg_database_datcollate);
69  collate = TextDatumGetCString(datum);
70  datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
71  Anum_pg_database_datctype);
72  ctype = TextDatumGetCString(datum);
73 
74  ReleaseSysCache(tp);
75  }
76  else
77  {
78  HeapTuple tp;
79  Datum datum;
80 
81  tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
82  if (!HeapTupleIsValid(tp))
83  elog(ERROR, "cache lookup failed for collation %u", collid);
84 
85  datum = SysCacheGetAttrNotNull(COLLOID, tp,
86  Anum_pg_collation_collcollate);
87  collate = TextDatumGetCString(datum);
88  datum = SysCacheGetAttrNotNull(COLLOID, tp,
89  Anum_pg_collation_collctype);
90  ctype = TextDatumGetCString(datum);
91 
92  ReleaseSysCache(tp);
93  }
94 
95 
96  loc = make_libc_collator(collate, ctype);
97 
98  result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
99  result->provider = COLLPROVIDER_LIBC;
100  result->deterministic = true;
101  result->collate_is_c = (strcmp(collate, "C") == 0) ||
102  (strcmp(collate, "POSIX") == 0);
103  result->ctype_is_c = (strcmp(ctype, "C") == 0) ||
104  (strcmp(ctype, "POSIX") == 0);
105  result->info.lt = loc;
106 
107  return result;
108 }
109 
110 /*
111  * Create a locale_t with the given collation and ctype.
112  *
113  * The "C" and "POSIX" locales are not actually handled by libc, so return
114  * NULL.
115  *
116  * Ensure that no path leaks a locale_t.
117  */
118 static locale_t
119 make_libc_collator(const char *collate, const char *ctype)
120 {
121  locale_t loc = 0;
122 
123  if (strcmp(collate, ctype) == 0)
124  {
125  if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
126  {
127  /* Normal case where they're the same */
128  errno = 0;
129 #ifndef WIN32
130  loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate,
131  NULL);
132 #else
133  loc = _create_locale(LC_ALL, collate);
134 #endif
135  if (!loc)
136  report_newlocale_failure(collate);
137  }
138  }
139  else
140  {
141 #ifndef WIN32
142  /* We need two newlocale() steps */
143  locale_t loc1 = 0;
144 
145  if (strcmp(collate, "C") != 0 && strcmp(collate, "POSIX") != 0)
146  {
147  errno = 0;
148  loc1 = newlocale(LC_COLLATE_MASK, collate, NULL);
149  if (!loc1)
150  report_newlocale_failure(collate);
151  }
152 
153  if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
154  {
155  errno = 0;
156  loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
157  if (!loc)
158  {
159  if (loc1)
160  freelocale(loc1);
162  }
163  }
164  else
165  loc = loc1;
166 #else
167 
168  /*
169  * XXX The _create_locale() API doesn't appear to support this. Could
170  * perhaps be worked around by changing pg_locale_t to contain two
171  * separate fields.
172  */
173  ereport(ERROR,
174  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
175  errmsg("collations with different collate and ctype values are not supported on this platform")));
176 #endif
177  }
178 
179  return loc;
180 }
181 
182 /*
183  * strncoll_libc
184  *
185  * NUL-terminate arguments, if necessary, and pass to strcoll_l().
186  *
187  * An input string length of -1 means that it's already NUL-terminated.
188  */
189 int
190 strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
192 {
193  char sbuf[TEXTBUFLEN];
194  char *buf = sbuf;
195  size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1;
196  size_t bufsize2 = (len2 == -1) ? 0 : len2 + 1;
197  const char *arg1n;
198  const char *arg2n;
199  int result;
200 
201  Assert(locale->provider == COLLPROVIDER_LIBC);
202 
203 #ifdef WIN32
204  /* check for this case before doing the work for nul-termination */
205  if (GetDatabaseEncoding() == PG_UTF8)
206  return strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
207 #endif /* WIN32 */
208 
209  if (bufsize1 + bufsize2 > TEXTBUFLEN)
210  buf = palloc(bufsize1 + bufsize2);
211 
212  /* nul-terminate arguments if necessary */
213  if (len1 == -1)
214  {
215  arg1n = arg1;
216  }
217  else
218  {
219  char *buf1 = buf;
220 
221  memcpy(buf1, arg1, len1);
222  buf1[len1] = '\0';
223  arg1n = buf1;
224  }
225 
226  if (len2 == -1)
227  {
228  arg2n = arg2;
229  }
230  else
231  {
232  char *buf2 = buf + bufsize1;
233 
234  memcpy(buf2, arg2, len2);
235  buf2[len2] = '\0';
236  arg2n = buf2;
237  }
238 
239  result = strcoll_l(arg1n, arg2n, locale->info.lt);
240 
241  if (buf != sbuf)
242  pfree(buf);
243 
244  return result;
245 }
246 
247 /*
248  * strnxfrm_libc
249  *
250  * NUL-terminate src, if necessary, and pass to strxfrm_l().
251  *
252  * A source length of -1 means that it's already NUL-terminated.
253  */
254 size_t
255 strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen,
257 {
258  char sbuf[TEXTBUFLEN];
259  char *buf = sbuf;
260  size_t bufsize = srclen + 1;
261  size_t result;
262 
263  Assert(locale->provider == COLLPROVIDER_LIBC);
264 
265  if (srclen == -1)
266  return strxfrm_l(dest, src, destsize, locale->info.lt);
267 
268  if (bufsize > TEXTBUFLEN)
269  buf = palloc(bufsize);
270 
271  /* nul-terminate argument */
272  memcpy(buf, src, srclen);
273  buf[srclen] = '\0';
274 
275  result = strxfrm_l(dest, buf, destsize, locale->info.lt);
276 
277  if (buf != sbuf)
278  pfree(buf);
279 
280  /* if dest is defined, it should be nul-terminated */
281  Assert(result >= destsize || dest[result] == '\0');
282 
283  return result;
284 }
285 
/*
 * strncoll_libc_win32_utf8
 *
 * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
 * invoke wcscoll_l().
 *
 * An input string length of -1 means that it's NUL-terminated.
 *
 * Raises an ERROR if either string cannot be converted, or if wcscoll_l()
 * reports a comparison failure.
 */
#ifdef WIN32
static int
strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2,
						 ssize_t len2, pg_locale_t locale)
{
	char		sbuf[TEXTBUFLEN];
	char	   *buf = sbuf;
	char	   *a1p,
			   *a2p;
	int			a1len;
	int			a2len;
	int			r;
	int			result;

	Assert(locale->provider == COLLPROVIDER_LIBC);
	/* the caller only routes UTF-8 databases here */
	Assert(GetDatabaseEncoding() == PG_UTF8);

	if (len1 == -1)
		len1 = strlen(arg1);
	if (len2 == -1)
		len2 = strlen(arg2);

	/* two bytes per input byte, plus room for a wide terminator */
	a1len = len1 * 2 + 2;
	a2len = len2 * 2 + 2;

	if (a1len + a2len > TEXTBUFLEN)
		buf = palloc(a1len + a2len);

	a1p = buf;
	a2p = buf + a1len;

	/* API does not work for zero-length input */
	if (len1 == 0)
		r = 0;
	else
	{
		r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
								(LPWSTR) a1p, a1len / 2);
		if (!r)
			ereport(ERROR,
					(errmsg("could not convert string to UTF-16: error code %lu",
							GetLastError())));
	}
	((LPWSTR) a1p)[r] = 0;

	if (len2 == 0)
		r = 0;
	else
	{
		r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
								(LPWSTR) a2p, a2len / 2);
		if (!r)
			ereport(ERROR,
					(errmsg("could not convert string to UTF-16: error code %lu",
							GetLastError())));
	}
	((LPWSTR) a2p)[r] = 0;

	/* clear errno so that %m below reflects only this call */
	errno = 0;
	result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt);
	if (result == 2147483647)	/* _NLSCMPERROR; missing from mingw headers */
		ereport(ERROR,
				(errmsg("could not compare Unicode strings: %m")));

	if (buf != sbuf)
		pfree(buf);

	return result;
}
#endif							/* WIN32 */
364 
365 /* simple subroutine for reporting errors from newlocale() */
366 static void
367 report_newlocale_failure(const char *localename)
368 {
369  int save_errno;
370 
371  /*
372  * Windows doesn't provide any useful error indication from
373  * _create_locale(), and BSD-derived platforms don't seem to feel they
374  * need to set errno either (even though POSIX is pretty clear that
375  * newlocale should do so). So, if errno hasn't been set, assume ENOENT
376  * is what to report.
377  */
378  if (errno == 0)
379  errno = ENOENT;
380 
381  /*
382  * ENOENT means "no such locale", not "no such file", so clarify that
383  * errno with an errdetail message.
384  */
385  save_errno = errno; /* auxiliary funcs might change errno */
386  ereport(ERROR,
387  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
388  errmsg("could not create locale \"%s\": %m",
389  localename),
390  (save_errno == ENOENT ?
391  errdetail("The operating system could not find any locale data for the locale name \"%s\".",
392  localename) : 0)));
393 }
394 
/*
 * POSIX doesn't define _l-variants of these functions, but several systems
 * have them.  We provide our own replacements here.
 */
#ifndef HAVE_MBSTOWCS_L
/*
 * mbstowcs_l
 *
 * Same contract as mbstowcs(), but the conversion is done under "loc".  On
 * Windows this maps onto _mbstowcs_l(); elsewhere the locale is installed
 * with uselocale() for the duration of the call and the previous one is put
 * back afterwards.
 */
static size_t
mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
{
#ifdef WIN32
	return _mbstowcs_l(dest, src, n, loc);
#else
	size_t		result;
	locale_t	save_locale = uselocale(loc);

	result = mbstowcs(dest, src, n);
	uselocale(save_locale);
	return result;
#endif
}
#endif
#ifndef HAVE_WCSTOMBS_L
/*
 * wcstombs_l
 *
 * Replacement for systems lacking wcstombs_l(): same contract as
 * wcstombs(), but the conversion is done under "loc".  On Windows this maps
 * onto _wcstombs_l(); elsewhere the locale is installed with uselocale() for
 * the duration of the call and the previous one is put back afterwards.
 */
static size_t
wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
{
#ifdef WIN32
	return _wcstombs_l(dest, src, n, loc);
#else
	size_t		result;
	locale_t	save_locale = uselocale(loc);

	result = wcstombs(dest, src, n);
	uselocale(save_locale);
	return result;
#endif
}
#endif
431 
432 /*
433  * These functions convert from/to libc's wchar_t, *not* pg_wchar_t.
434  * Therefore we keep them here rather than with the mbutils code.
435  */
436 
437 /*
438  * wchar2char --- convert wide characters to multibyte format
439  *
440  * This has the same API as the standard wcstombs_l() function; in particular,
441  * tolen is the maximum number of bytes to store at *to, and *from must be
442  * zero-terminated. The output will be zero-terminated iff there is room.
443  */
444 size_t
445 wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
446 {
447  size_t result;
448 
449  if (tolen == 0)
450  return 0;
451 
452 #ifdef WIN32
453 
454  /*
455  * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
456  * for some reason mbstowcs and wcstombs won't do this for us, so we use
457  * MultiByteToWideChar().
458  */
459  if (GetDatabaseEncoding() == PG_UTF8)
460  {
461  result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
462  NULL, NULL);
463  /* A zero return is failure */
464  if (result <= 0)
465  result = -1;
466  else
467  {
468  Assert(result <= tolen);
469  /* Microsoft counts the zero terminator in the result */
470  result--;
471  }
472  }
473  else
474 #endif /* WIN32 */
475  if (locale == (pg_locale_t) 0)
476  {
477  /* Use wcstombs directly for the default locale */
478  result = wcstombs(to, from, tolen);
479  }
480  else
481  {
482  /* Use wcstombs_l for nondefault locales */
483  result = wcstombs_l(to, from, tolen, locale->info.lt);
484  }
485 
486  return result;
487 }
488 
489 /*
490  * char2wchar --- convert multibyte characters to wide characters
491  *
492  * This has almost the API of mbstowcs_l(), except that *from need not be
493  * null-terminated; instead, the number of input bytes is specified as
494  * fromlen. Also, we ereport() rather than returning -1 for invalid
495  * input encoding. tolen is the maximum number of wchar_t's to store at *to.
496  * The output will be zero-terminated iff there is room.
497  */
498 size_t
499 char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
501 {
502  size_t result;
503 
504  if (tolen == 0)
505  return 0;
506 
507 #ifdef WIN32
508  /* See WIN32 "Unicode" comment above */
509  if (GetDatabaseEncoding() == PG_UTF8)
510  {
511  /* Win32 API does not work for zero-length input */
512  if (fromlen == 0)
513  result = 0;
514  else
515  {
516  result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
517  /* A zero return is failure */
518  if (result == 0)
519  result = -1;
520  }
521 
522  if (result != -1)
523  {
524  Assert(result < tolen);
525  /* Append trailing null wchar (MultiByteToWideChar() does not) */
526  to[result] = 0;
527  }
528  }
529  else
530 #endif /* WIN32 */
531  {
532  /* mbstowcs requires ending '\0' */
533  char *str = pnstrdup(from, fromlen);
534 
535  if (locale == (pg_locale_t) 0)
536  {
537  /* Use mbstowcs directly for the default locale */
538  result = mbstowcs(to, str, tolen);
539  }
540  else
541  {
542  /* Use mbstowcs_l for nondefault locales */
543  result = mbstowcs_l(to, str, tolen, locale->info.lt);
544  }
545 
546  pfree(str);
547  }
548 
549  if (result == -1)
550  {
551  /*
552  * Invalid multibyte character encountered. We try to give a useful
553  * error message by letting pg_verifymbstr check the string. But it's
554  * possible that the string is OK to us, and not OK to mbstowcs ---
555  * this suggests that the LC_CTYPE locale is different from the
556  * database encoding. Give a generic error message if pg_verifymbstr
557  * can't find anything wrong.
558  */
559  pg_verifymbstr(from, fromlen, false); /* might not return */
560  /* but if it does ... */
561  ereport(ERROR,
562  (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
563  errmsg("invalid multibyte character for locale"),
564  errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
565  }
566 
567  return result;
568 }
#define TextDatumGetCString(d)
Definition: builtins.h:98
#define Assert(condition)
Definition: c.h:812
Oid collid
int errdetail(const char *fmt,...)
Definition: elog.c:1203
int errhint(const char *fmt,...)
Definition: elog.c:1317
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
Oid MyDatabaseId
Definition: globals.c:93
const char * str
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
#define bufsize
Definition: indent_globs.h:36
static char * locale
Definition: initdb.c:140
int GetDatabaseEncoding(void)
Definition: mbutils.c:1261
bool pg_verifymbstr(const char *mbstr, int len, bool noError)
Definition: mbutils.c:1556
char * pnstrdup(const char *in, Size len)
Definition: mcxt.c:1707
void pfree(void *pointer)
Definition: mcxt.c:1521
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1215
void * palloc(Size size)
Definition: mcxt.c:1317
size_t wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context)
static locale_t make_libc_collator(const char *collate, const char *ctype)
static size_t wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
int strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
static void report_newlocale_failure(const char *localename)
size_t strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
#define TEXTBUFLEN
size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, pg_locale_t locale)
static size_t mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
static char * buf
Definition: pg_test_fsync.c:72
@ PG_UTF8
Definition: pg_wchar.h:232
uintptr_t Datum
Definition: postgres.h:64
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:252
unsigned int Oid
Definition: postgres_ext.h:31
tree context
Definition: radixtree.h:1837
locale_t lt
Definition: pg_locale.h:79
union pg_locale_struct::@156 info
bool deterministic
Definition: pg_locale.h:69
void ReleaseSysCache(HeapTuple tuple)
Definition: syscache.c:269
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Definition: syscache.c:221
Datum SysCacheGetAttrNotNull(int cacheId, HeapTuple tup, AttrNumber attributeNumber)
Definition: syscache.c:631
#define locale_t
Definition: win32_port.h:442
#define strcoll_l
Definition: win32_port.h:465
#define strxfrm_l
Definition: win32_port.h:466
#define wcscoll_l
Definition: win32_port.h:467