PostgreSQL Source Code  git master
ts_locale.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * ts_locale.c
4  * locale compatibility layer for tsearch
5  *
6  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  * src/backend/tsearch/ts_locale.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15 
16 #include "catalog/pg_collation.h"
17 #include "common/string.h"
18 #include "storage/fd.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_public.h"
21 
22 static void tsearch_readline_callback(void *arg);
23 
24 
/*
 * Size of the wchar_t scratch buffer used by the t_isXXX() functions below.
 *
 * One might expect 2 entries (one character plus a trailing null) to be
 * enough.  However, on Windows "wchar_t" is only 16 bits wide, and what
 * char2wchar() hands back is UTF16 rather than UTF32.  A single input
 * character can therefore turn into a surrogate pair, so the buffer needs
 * two entries for the character and one more for the trailing null.
 *
 * When a surrogate pair does come back, only its first code is passed to
 * iswdigit() and friends, so those functions always return false for
 * characters outside the Basic Multilingual Plane.
 */
#define WC_BUF_LEN 3
35 
36 int
37 t_isdigit(const char *ptr)
38 {
39  int clen = pg_mblen(ptr);
40  wchar_t character[WC_BUF_LEN];
41  pg_locale_t mylocale = 0; /* TODO */
42 
43  if (clen == 1 || database_ctype_is_c)
44  return isdigit(TOUCHAR(ptr));
45 
46  char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
47 
48  return iswdigit((wint_t) character[0]);
49 }
50 
51 int
52 t_isspace(const char *ptr)
53 {
54  int clen = pg_mblen(ptr);
55  wchar_t character[WC_BUF_LEN];
56  pg_locale_t mylocale = 0; /* TODO */
57 
58  if (clen == 1 || database_ctype_is_c)
59  return isspace(TOUCHAR(ptr));
60 
61  char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
62 
63  return iswspace((wint_t) character[0]);
64 }
65 
66 int
67 t_isalpha(const char *ptr)
68 {
69  int clen = pg_mblen(ptr);
70  wchar_t character[WC_BUF_LEN];
71  pg_locale_t mylocale = 0; /* TODO */
72 
73  if (clen == 1 || database_ctype_is_c)
74  return isalpha(TOUCHAR(ptr));
75 
76  char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
77 
78  return iswalpha((wint_t) character[0]);
79 }
80 
81 int
82 t_isalnum(const char *ptr)
83 {
84  int clen = pg_mblen(ptr);
85  wchar_t character[WC_BUF_LEN];
86  pg_locale_t mylocale = 0; /* TODO */
87 
88  if (clen == 1 || database_ctype_is_c)
89  return isalnum(TOUCHAR(ptr));
90 
91  char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
92 
93  return iswalnum((wint_t) character[0]);
94 }
95 
96 int
97 t_isprint(const char *ptr)
98 {
99  int clen = pg_mblen(ptr);
100  wchar_t character[WC_BUF_LEN];
101  pg_locale_t mylocale = 0; /* TODO */
102 
103  if (clen == 1 || database_ctype_is_c)
104  return isprint(TOUCHAR(ptr));
105 
106  char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
107 
108  return iswprint((wint_t) character[0]);
109 }
110 
111 
112 /*
113  * Set up to read a file using tsearch_readline(). This facility is
114  * better than just reading the file directly because it provides error
115  * context pointing to the specific line where a problem is detected.
116  *
117  * Expected usage is:
118  *
119  * tsearch_readline_state trst;
120  *
121  * if (!tsearch_readline_begin(&trst, filename))
122  * ereport(ERROR,
123  * (errcode(ERRCODE_CONFIG_FILE_ERROR),
124  * errmsg("could not open stop-word file \"%s\": %m",
125  * filename)));
126  * while ((line = tsearch_readline(&trst)) != NULL)
127  * process line;
128  * tsearch_readline_end(&trst);
129  *
130  * Note that the caller supplies the ereport() for file open failure;
131  * this is so that a custom message can be provided. The filename string
132  * passed to tsearch_readline_begin() must remain valid through
133  * tsearch_readline_end().
134  */
135 bool
137  const char *filename)
138 {
139  if ((stp->fp = AllocateFile(filename, "r")) == NULL)
140  return false;
141  stp->filename = filename;
142  stp->lineno = 0;
143  initStringInfo(&stp->buf);
144  stp->curline = NULL;
145  /* Setup error traceback support for ereport() */
147  stp->cb.arg = (void *) stp;
149  error_context_stack = &stp->cb;
150  return true;
151 }
152 
153 /*
154  * Read the next line from a tsearch data file (expected to be in UTF-8), and
155  * convert it to database encoding if needed. The returned string is palloc'd.
156  * NULL return means EOF.
157  */
158 char *
160 {
161  char *recoded;
162 
163  /* Advance line number to use in error reports */
164  stp->lineno++;
165 
166  /* Clear curline, it's no longer relevant */
167  if (stp->curline)
168  {
169  if (stp->curline != stp->buf.data)
170  pfree(stp->curline);
171  stp->curline = NULL;
172  }
173 
174  /* Collect next line, if there is one */
175  if (!pg_get_line_buf(stp->fp, &stp->buf))
176  return NULL;
177 
178  /* Validate the input as UTF-8, then convert to DB encoding if needed */
179  recoded = pg_any_to_server(stp->buf.data, stp->buf.len, PG_UTF8);
180 
181  /* Save the correctly-encoded string for possible error reports */
182  stp->curline = recoded; /* might be equal to buf.data */
183 
184  /*
185  * We always return a freshly pstrdup'd string. This is clearly necessary
186  * if pg_any_to_server() returned buf.data, and we need a second copy even
187  * if encoding conversion did occur. The caller is entitled to pfree the
188  * returned string at any time, which would leave curline pointing to
189  * recycled storage, causing problems if an error occurs after that point.
190  * (It's preferable to return the result of pstrdup instead of the output
191  * of pg_any_to_server, because the conversion result tends to be
192  * over-allocated. Since callers might save the result string directly
193  * into a long-lived dictionary structure, we don't want it to be a larger
194  * palloc chunk than necessary. We'll reclaim the conversion result on
195  * the next call.)
196  */
197  return pstrdup(recoded);
198 }
199 
200 /*
201  * Close down after reading a file with tsearch_readline()
202  */
203 void
205 {
206  /* Suppress use of curline in any error reported below */
207  if (stp->curline)
208  {
209  if (stp->curline != stp->buf.data)
210  pfree(stp->curline);
211  stp->curline = NULL;
212  }
213 
214  /* Release other resources */
215  pfree(stp->buf.data);
216  FreeFile(stp->fp);
217 
218  /* Pop the error context stack */
220 }
221 
222 /*
223  * Error context callback for errors occurring while reading a tsearch
224  * configuration file.
225  */
226 static void
228 {
230 
231  /*
232  * We can't include the text of the config line for errors that occur
233  * during tsearch_readline() itself. The major cause of such errors is
234  * encoding violations, and we daren't try to print error messages
235  * containing badly-encoded data.
236  */
237  if (stp->curline)
238  errcontext("line %d of configuration file \"%s\": \"%s\"",
239  stp->lineno,
240  stp->filename,
241  stp->curline);
242  else
243  errcontext("line %d of configuration file \"%s\"",
244  stp->lineno,
245  stp->filename);
246 }
247 
248 
/*
 * lowerstr --- fold null-terminated string to lower case
 *
 * Convenience wrapper that measures the string with strlen() and hands it
 * to lowerstr_with_len().
 *
 * Returned string is palloc'd
 */
char *
lowerstr(const char *str)
{
	int			nbytes = strlen(str);

	return lowerstr_with_len(str, nbytes);
}
259 
260 /*
261  * lowerstr_with_len --- fold string to lower case
262  *
263  * Input string need not be null-terminated.
264  *
265  * Returned string is palloc'd
266  */
267 char *
268 lowerstr_with_len(const char *str, int len)
269 {
270  char *out;
271  pg_locale_t mylocale = 0; /* TODO */
272 
273  if (len == 0)
274  return pstrdup("");
275 
276  /*
277  * Use wide char code only when max encoding length > 1 and ctype != C.
278  * Some operating systems fail with multi-byte encodings and a C locale.
279  * Also, for a C locale there is no need to process as multibyte. From
280  * backend/utils/adt/oracle_compat.c Teodor
281  */
283  {
284  wchar_t *wstr,
285  *wptr;
286  int wlen;
287 
288  /*
289  * alloc number of wchar_t for worst case, len contains number of
290  * bytes >= number of characters and alloc 1 wchar_t for 0, because
291  * wchar2char wants zero-terminated string
292  */
293  wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
294 
295  wlen = char2wchar(wstr, len + 1, str, len, mylocale);
296  Assert(wlen <= len);
297 
298  while (*wptr)
299  {
300  *wptr = towlower((wint_t) *wptr);
301  wptr++;
302  }
303 
304  /*
305  * Alloc result string for worst case + '\0'
306  */
307  len = pg_database_encoding_max_length() * wlen + 1;
308  out = (char *) palloc(len);
309 
310  wlen = wchar2char(out, wstr, len, mylocale);
311 
312  pfree(wstr);
313 
314  if (wlen < 0)
315  ereport(ERROR,
316  (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
317  errmsg("conversion from wchar_t to server encoding failed: %m")));
318  Assert(wlen < len);
319  }
320  else
321  {
322  const char *ptr = str;
323  char *outptr;
324 
325  outptr = out = (char *) palloc(sizeof(char) * (len + 1));
326  while ((ptr - str) < len && *ptr)
327  {
328  *outptr++ = tolower(TOUCHAR(ptr));
329  ptr++;
330  }
331  *outptr = '\0';
332  }
333 
334  return out;
335 }
ErrorContextCallback * error_context_stack
Definition: elog.c:95
int errcode(int sqlerrcode)
Definition: elog.c:858
int errmsg(const char *fmt,...)
Definition: elog.c:1069
#define errcontext
Definition: elog.h:196
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2553
int FreeFile(FILE *file)
Definition: fd.c:2751
Assert(fmt[strlen(fmt) - 1] !='\n')
char * pg_any_to_server(const char *s, int len, int encoding)
Definition: mbutils.c:677
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1553
int pg_mblen(const char *mbstr)
Definition: mbutils.c:1024
char * pstrdup(const char *in)
Definition: mcxt.c:1644
void pfree(void *pointer)
Definition: mcxt.c:1456
void * palloc(Size size)
Definition: mcxt.c:1226
void * arg
const void size_t len
static char * filename
Definition: pg_dumpall.c:121
bool pg_get_line_buf(FILE *stream, StringInfo buf)
Definition: pg_get_line.c:95
size_t wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
Definition: pg_locale.c:2934
bool database_ctype_is_c
Definition: pg_locale.c:118
size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, pg_locale_t locale)
Definition: pg_locale.c:2990
@ PG_UTF8
Definition: pg_wchar.h:235
void initStringInfo(StringInfo str)
Definition: stringinfo.c:59
struct ErrorContextCallback * previous
Definition: elog.h:295
void(* callback)(void *arg)
Definition: elog.h:296
StringInfoData buf
Definition: ts_locale.h:29
ErrorContextCallback cb
Definition: ts_locale.h:32
const char * filename
Definition: ts_locale.h:27
bool tsearch_readline_begin(tsearch_readline_state *stp, const char *filename)
Definition: ts_locale.c:136
char * tsearch_readline(tsearch_readline_state *stp)
Definition: ts_locale.c:159
char * lowerstr_with_len(const char *str, int len)
Definition: ts_locale.c:268
int t_isspace(const char *ptr)
Definition: ts_locale.c:52
int t_isalnum(const char *ptr)
Definition: ts_locale.c:82
int t_isdigit(const char *ptr)
Definition: ts_locale.c:37
int t_isalpha(const char *ptr)
Definition: ts_locale.c:67
int t_isprint(const char *ptr)
Definition: ts_locale.c:97
void tsearch_readline_end(tsearch_readline_state *stp)
Definition: ts_locale.c:204
char * lowerstr(const char *str)
Definition: ts_locale.c:255
static void tsearch_readline_callback(void *arg)
Definition: ts_locale.c:227
#define WC_BUF_LEN
Definition: ts_locale.c:34
#define TOUCHAR(x)
Definition: ts_locale.h:35