PostgreSQL Source Code  git master
ts_locale.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * ts_locale.c
4  * locale compatibility layer for tsearch
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  * src/backend/tsearch/ts_locale.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15 
16 #include "common/string.h"
17 #include "storage/fd.h"
18 #include "tsearch/ts_locale.h"
19 
20 static void tsearch_readline_callback(void *arg);
21 
22 
23 /*
24  * The reason these functions use a 3-wchar_t output buffer, not 2 as you
25  * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
26  * getting from char2wchar() is UTF16 not UTF32. A single input character
27  * may therefore produce a surrogate pair rather than just one wchar_t;
28  * we also need room for a trailing null. When we do get a surrogate pair,
29  * we pass just the first code to iswdigit() etc, so that these functions will
30  * always return false for characters outside the Basic Multilingual Plane.
31  */
32 #define WC_BUF_LEN 3 /* up to two UTF16 code units plus a trailing null */
33 
34 int
35 t_isdigit(const char *ptr)
36 {
37  int clen = pg_mblen(ptr);
38  wchar_t character[WC_BUF_LEN];
39  pg_locale_t mylocale = 0; /* TODO */
40 
41  if (clen == 1 || database_ctype_is_c)
42  return isdigit(TOUCHAR(ptr));
43 
44  char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
45 
46  return iswdigit((wint_t) character[0]);
47 }
48 
49 int
50 t_isspace(const char *ptr)
51 {
52  int clen = pg_mblen(ptr);
53  wchar_t character[WC_BUF_LEN];
54  pg_locale_t mylocale = 0; /* TODO */
55 
56  if (clen == 1 || database_ctype_is_c)
57  return isspace(TOUCHAR(ptr));
58 
59  char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
60 
61  return iswspace((wint_t) character[0]);
62 }
63 
64 int
65 t_isalpha(const char *ptr)
66 {
67  int clen = pg_mblen(ptr);
68  wchar_t character[WC_BUF_LEN];
69  pg_locale_t mylocale = 0; /* TODO */
70 
71  if (clen == 1 || database_ctype_is_c)
72  return isalpha(TOUCHAR(ptr));
73 
74  char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
75 
76  return iswalpha((wint_t) character[0]);
77 }
78 
79 int
80 t_isalnum(const char *ptr)
81 {
82  int clen = pg_mblen(ptr);
83  wchar_t character[WC_BUF_LEN];
84  pg_locale_t mylocale = 0; /* TODO */
85 
86  if (clen == 1 || database_ctype_is_c)
87  return isalnum(TOUCHAR(ptr));
88 
89  char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
90 
91  return iswalnum((wint_t) character[0]);
92 }
93 
94 int
95 t_isprint(const char *ptr)
96 {
97  int clen = pg_mblen(ptr);
98  wchar_t character[WC_BUF_LEN];
99  pg_locale_t mylocale = 0; /* TODO */
100 
101  if (clen == 1 || database_ctype_is_c)
102  return isprint(TOUCHAR(ptr));
103 
104  char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
105 
106  return iswprint((wint_t) character[0]);
107 }
108 
109 
110 /*
111  * Set up to read a file using tsearch_readline(). This facility is
112  * better than just reading the file directly because it provides error
113  * context pointing to the specific line where a problem is detected.
114  *
115  * Expected usage is:
116  *
117  * tsearch_readline_state trst;
118  *
119  * if (!tsearch_readline_begin(&trst, filename))
120  * ereport(ERROR,
121  * (errcode(ERRCODE_CONFIG_FILE_ERROR),
122  * errmsg("could not open stop-word file \"%s\": %m",
123  * filename)));
124  * while ((line = tsearch_readline(&trst)) != NULL)
125  * process line;
126  * tsearch_readline_end(&trst);
127  *
128  * Note that the caller supplies the ereport() for file open failure;
129  * this is so that a custom message can be provided. The filename string
130  * passed to tsearch_readline_begin() must remain valid through
131  * tsearch_readline_end().
132  */
133 bool
135  const char *filename)
136 {
137  if ((stp->fp = AllocateFile(filename, "r")) == NULL)
138  return false;
139  stp->filename = filename;
140  stp->lineno = 0;
141  initStringInfo(&stp->buf);
142  stp->curline = NULL;
143  /* Setup error traceback support for ereport() */
145  stp->cb.arg = (void *) stp;
147  error_context_stack = &stp->cb;
148  return true;
149 }
150 
151 /*
152  * Read the next line from a tsearch data file (expected to be in UTF-8), and
153  * convert it to database encoding if needed. The returned string is palloc'd.
154  * NULL return means EOF.
155  */
156 char *
158 {
159  char *recoded;
160 
161  /* Advance line number to use in error reports */
162  stp->lineno++;
163 
164  /* Clear curline, it's no longer relevant */
165  if (stp->curline)
166  {
167  if (stp->curline != stp->buf.data)
168  pfree(stp->curline);
169  stp->curline = NULL;
170  }
171 
172  /* Collect next line, if there is one */
173  if (!pg_get_line_buf(stp->fp, &stp->buf))
174  return NULL;
175 
176  /* Validate the input as UTF-8, then convert to DB encoding if needed */
177  recoded = pg_any_to_server(stp->buf.data, stp->buf.len, PG_UTF8);
178 
179  /* Save the correctly-encoded string for possible error reports */
180  stp->curline = recoded; /* might be equal to buf.data */
181 
182  /*
183  * We always return a freshly pstrdup'd string. This is clearly necessary
184  * if pg_any_to_server() returned buf.data, and we need a second copy even
185  * if encoding conversion did occur. The caller is entitled to pfree the
186  * returned string at any time, which would leave curline pointing to
187  * recycled storage, causing problems if an error occurs after that point.
188  * (It's preferable to return the result of pstrdup instead of the output
189  * of pg_any_to_server, because the conversion result tends to be
190  * over-allocated. Since callers might save the result string directly
191  * into a long-lived dictionary structure, we don't want it to be a larger
192  * palloc chunk than necessary. We'll reclaim the conversion result on
193  * the next call.)
194  */
195  return pstrdup(recoded);
196 }
197 
198 /*
199  * Close down after reading a file with tsearch_readline()
200  */
201 void
203 {
204  /* Suppress use of curline in any error reported below */
205  if (stp->curline)
206  {
207  if (stp->curline != stp->buf.data)
208  pfree(stp->curline);
209  stp->curline = NULL;
210  }
211 
212  /* Release other resources */
213  pfree(stp->buf.data);
214  FreeFile(stp->fp);
215 
216  /* Pop the error context stack */
218 }
219 
220 /*
221  * Error context callback for errors occurring while reading a tsearch
222  * configuration file.
223  */
224 static void
226 {
228 
229  /*
230  * We can't include the text of the config line for errors that occur
231  * during tsearch_readline() itself. The major cause of such errors is
232  * encoding violations, and we daren't try to print error messages
233  * containing badly-encoded data.
234  */
235  if (stp->curline)
236  errcontext("line %d of configuration file \"%s\": \"%s\"",
237  stp->lineno,
238  stp->filename,
239  stp->curline);
240  else
241  errcontext("line %d of configuration file \"%s\"",
242  stp->lineno,
243  stp->filename);
244 }
245 
246 
/*
 * lowerstr --- lower-case a null-terminated string
 *
 * Measures str with strlen() and delegates to lowerstr_with_len().
 *
 * The result is palloc'd.
 */
char *
lowerstr(const char *str)
{
	int			nbytes = strlen(str);

	return lowerstr_with_len(str, nbytes);
}
257 
258 /*
259  * lowerstr_with_len --- fold string to lower case
260  *
261  * Input string need not be null-terminated.
262  *
263  * Returned string is palloc'd
264  */
265 char *
266 lowerstr_with_len(const char *str, int len)
267 {
268  char *out;
269  pg_locale_t mylocale = 0; /* TODO */
270 
271  if (len == 0)
272  return pstrdup("");
273 
274  /*
275  * Use wide char code only when max encoding length > 1 and ctype != C.
276  * Some operating systems fail with multi-byte encodings and a C locale.
277  * Also, for a C locale there is no need to process as multibyte. From
278  * backend/utils/adt/oracle_compat.c Teodor
279  */
281  {
282  wchar_t *wstr,
283  *wptr;
284  int wlen;
285 
286  /*
287  * alloc number of wchar_t for worst case, len contains number of
288  * bytes >= number of characters and alloc 1 wchar_t for 0, because
289  * wchar2char wants zero-terminated string
290  */
291  wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
292 
293  wlen = char2wchar(wstr, len + 1, str, len, mylocale);
294  Assert(wlen <= len);
295 
296  while (*wptr)
297  {
298  *wptr = towlower((wint_t) *wptr);
299  wptr++;
300  }
301 
302  /*
303  * Alloc result string for worst case + '\0'
304  */
305  len = pg_database_encoding_max_length() * wlen + 1;
306  out = (char *) palloc(len);
307 
308  wlen = wchar2char(out, wstr, len, mylocale);
309 
310  pfree(wstr);
311 
312  if (wlen < 0)
313  ereport(ERROR,
314  (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
315  errmsg("conversion from wchar_t to server encoding failed: %m")));
316  Assert(wlen < len);
317  }
318  else
319  {
320  const char *ptr = str;
321  char *outptr;
322 
323  outptr = out = (char *) palloc(sizeof(char) * (len + 1));
324  while ((ptr - str) < len && *ptr)
325  {
326  *outptr++ = tolower(TOUCHAR(ptr));
327  ptr++;
328  }
329  *outptr = '\0';
330  }
331 
332  return out;
333 }
#define Assert(condition)
Definition: c.h:858
ErrorContextCallback * error_context_stack
Definition: elog.c:94
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define errcontext
Definition: elog.h:196
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2606
int FreeFile(FILE *file)
Definition: fd.c:2804
const char * str
char * pg_any_to_server(const char *s, int len, int encoding)
Definition: mbutils.c:676
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1546
int pg_mblen(const char *mbstr)
Definition: mbutils.c:1023
char * pstrdup(const char *in)
Definition: mcxt.c:1696
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc(Size size)
Definition: mcxt.c:1317
void * arg
const void size_t len
static char * filename
Definition: pg_dumpall.c:119
bool pg_get_line_buf(FILE *stream, StringInfo buf)
Definition: pg_get_line.c:95
size_t wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
Definition: pg_locale.c:2992
bool database_ctype_is_c
Definition: pg_locale.c:118
size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, pg_locale_t locale)
Definition: pg_locale.c:3048
@ PG_UTF8
Definition: pg_wchar.h:232
void initStringInfo(StringInfo str)
Definition: stringinfo.c:59
struct ErrorContextCallback * previous
Definition: elog.h:296
void(* callback)(void *arg)
Definition: elog.h:297
StringInfoData buf
Definition: ts_locale.h:29
ErrorContextCallback cb
Definition: ts_locale.h:32
const char * filename
Definition: ts_locale.h:27
bool tsearch_readline_begin(tsearch_readline_state *stp, const char *filename)
Definition: ts_locale.c:134
char * tsearch_readline(tsearch_readline_state *stp)
Definition: ts_locale.c:157
char * lowerstr_with_len(const char *str, int len)
Definition: ts_locale.c:266
int t_isspace(const char *ptr)
Definition: ts_locale.c:50
int t_isalnum(const char *ptr)
Definition: ts_locale.c:80
int t_isdigit(const char *ptr)
Definition: ts_locale.c:35
int t_isalpha(const char *ptr)
Definition: ts_locale.c:65
int t_isprint(const char *ptr)
Definition: ts_locale.c:95
void tsearch_readline_end(tsearch_readline_state *stp)
Definition: ts_locale.c:202
char * lowerstr(const char *str)
Definition: ts_locale.c:253
static void tsearch_readline_callback(void *arg)
Definition: ts_locale.c:225
#define WC_BUF_LEN
Definition: ts_locale.c:32
#define TOUCHAR(x)
Definition: ts_locale.h:35