PostgreSQL Source Code  git master
ts_locale.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * ts_locale.c
4  * locale compatibility layer for tsearch
5  *
6  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  * src/backend/tsearch/ts_locale.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15 
16 #include "catalog/pg_collation.h"
17 #include "common/string.h"
18 #include "storage/fd.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_public.h"
21 
22 static void tsearch_readline_callback(void *arg);
23 
24 
25 /*
26  * The reason these functions use a 3-wchar_t output buffer, not 2 as you
27  * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
28  * getting from char2wchar() is UTF16 not UTF32. A single input character
29  * may therefore produce a surrogate pair rather than just one wchar_t;
30  * we also need room for a trailing null. When we do get a surrogate pair,
31  * we pass just the first code to iswdigit() etc, so that these functions will
32  * always return false for characters outside the Basic Multilingual Plane.
33  */
34 #define WC_BUF_LEN 3
35 
36 int
37 t_isdigit(const char *ptr)
38 {
39  int clen = pg_mblen(ptr);
40  wchar_t character[WC_BUF_LEN];
41  Oid collation = DEFAULT_COLLATION_OID; /* TODO */
42  pg_locale_t mylocale = 0; /* TODO */
43 
44  if (clen == 1 || lc_ctype_is_c(collation))
45  return isdigit(TOUCHAR(ptr));
46 
47  char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
48 
49  return iswdigit((wint_t) character[0]);
50 }
51 
52 int
53 t_isspace(const char *ptr)
54 {
55  int clen = pg_mblen(ptr);
56  wchar_t character[WC_BUF_LEN];
57  Oid collation = DEFAULT_COLLATION_OID; /* TODO */
58  pg_locale_t mylocale = 0; /* TODO */
59 
60  if (clen == 1 || lc_ctype_is_c(collation))
61  return isspace(TOUCHAR(ptr));
62 
63  char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
64 
65  return iswspace((wint_t) character[0]);
66 }
67 
68 int
69 t_isalpha(const char *ptr)
70 {
71  int clen = pg_mblen(ptr);
72  wchar_t character[WC_BUF_LEN];
73  Oid collation = DEFAULT_COLLATION_OID; /* TODO */
74  pg_locale_t mylocale = 0; /* TODO */
75 
76  if (clen == 1 || lc_ctype_is_c(collation))
77  return isalpha(TOUCHAR(ptr));
78 
79  char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
80 
81  return iswalpha((wint_t) character[0]);
82 }
83 
84 int
85 t_isalnum(const char *ptr)
86 {
87  int clen = pg_mblen(ptr);
88  wchar_t character[WC_BUF_LEN];
89  Oid collation = DEFAULT_COLLATION_OID; /* TODO */
90  pg_locale_t mylocale = 0; /* TODO */
91 
92  if (clen == 1 || lc_ctype_is_c(collation))
93  return isalnum(TOUCHAR(ptr));
94 
95  char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
96 
97  return iswalnum((wint_t) character[0]);
98 }
99 
100 int
101 t_isprint(const char *ptr)
102 {
103  int clen = pg_mblen(ptr);
104  wchar_t character[WC_BUF_LEN];
105  Oid collation = DEFAULT_COLLATION_OID; /* TODO */
106  pg_locale_t mylocale = 0; /* TODO */
107 
108  if (clen == 1 || lc_ctype_is_c(collation))
109  return isprint(TOUCHAR(ptr));
110 
111  char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
112 
113  return iswprint((wint_t) character[0]);
114 }
115 
116 
117 /*
118  * Set up to read a file using tsearch_readline(). This facility is
119  * better than just reading the file directly because it provides error
120  * context pointing to the specific line where a problem is detected.
121  *
122  * Expected usage is:
123  *
124  * tsearch_readline_state trst;
125  *
126  * if (!tsearch_readline_begin(&trst, filename))
127  * ereport(ERROR,
128  * (errcode(ERRCODE_CONFIG_FILE_ERROR),
129  * errmsg("could not open stop-word file \"%s\": %m",
130  * filename)));
131  * while ((line = tsearch_readline(&trst)) != NULL)
132  * process line;
133  * tsearch_readline_end(&trst);
134  *
135  * Note that the caller supplies the ereport() for file open failure;
136  * this is so that a custom message can be provided. The filename string
137  * passed to tsearch_readline_begin() must remain valid through
138  * tsearch_readline_end().
139  */
140 bool
142  const char *filename)
143 {
144  if ((stp->fp = AllocateFile(filename, "r")) == NULL)
145  return false;
146  stp->filename = filename;
147  stp->lineno = 0;
148  initStringInfo(&stp->buf);
149  stp->curline = NULL;
150  /* Setup error traceback support for ereport() */
152  stp->cb.arg = (void *) stp;
154  error_context_stack = &stp->cb;
155  return true;
156 }
157 
158 /*
159  * Read the next line from a tsearch data file (expected to be in UTF-8), and
160  * convert it to database encoding if needed. The returned string is palloc'd.
161  * NULL return means EOF.
162  */
163 char *
165 {
166  char *recoded;
167 
168  /* Advance line number to use in error reports */
169  stp->lineno++;
170 
171  /* Clear curline, it's no longer relevant */
172  if (stp->curline)
173  {
174  if (stp->curline != stp->buf.data)
175  pfree(stp->curline);
176  stp->curline = NULL;
177  }
178 
179  /* Collect next line, if there is one */
180  if (!pg_get_line_buf(stp->fp, &stp->buf))
181  return NULL;
182 
183  /* Validate the input as UTF-8, then convert to DB encoding if needed */
184  recoded = pg_any_to_server(stp->buf.data, stp->buf.len, PG_UTF8);
185 
186  /* Save the correctly-encoded string for possible error reports */
187  stp->curline = recoded; /* might be equal to buf.data */
188 
189  /*
190  * We always return a freshly pstrdup'd string. This is clearly necessary
191  * if pg_any_to_server() returned buf.data, and we need a second copy even
192  * if encoding conversion did occur. The caller is entitled to pfree the
193  * returned string at any time, which would leave curline pointing to
194  * recycled storage, causing problems if an error occurs after that point.
195  * (It's preferable to return the result of pstrdup instead of the output
196  * of pg_any_to_server, because the conversion result tends to be
197  * over-allocated. Since callers might save the result string directly
198  * into a long-lived dictionary structure, we don't want it to be a larger
199  * palloc chunk than necessary. We'll reclaim the conversion result on
200  * the next call.)
201  */
202  return pstrdup(recoded);
203 }
204 
205 /*
206  * Close down after reading a file with tsearch_readline()
207  */
208 void
210 {
211  /* Suppress use of curline in any error reported below */
212  if (stp->curline)
213  {
214  if (stp->curline != stp->buf.data)
215  pfree(stp->curline);
216  stp->curline = NULL;
217  }
218 
219  /* Release other resources */
220  pfree(stp->buf.data);
221  FreeFile(stp->fp);
222 
223  /* Pop the error context stack */
225 }
226 
227 /*
228  * Error context callback for errors occurring while reading a tsearch
229  * configuration file.
230  */
231 static void
233 {
235 
236  /*
237  * We can't include the text of the config line for errors that occur
238  * during tsearch_readline() itself. The major cause of such errors is
239  * encoding violations, and we daren't try to print error messages
240  * containing badly-encoded data.
241  */
242  if (stp->curline)
243  errcontext("line %d of configuration file \"%s\": \"%s\"",
244  stp->lineno,
245  stp->filename,
246  stp->curline);
247  else
248  errcontext("line %d of configuration file \"%s\"",
249  stp->lineno,
250  stp->filename);
251 }
252 
253 
/*
 * lowerstr --- lower-case a null-terminated string
 *
 * Convenience wrapper: measures str with strlen() and delegates to
 * lowerstr_with_len().
 *
 * Returned string is palloc'd
 */
char *
lowerstr(const char *str)
{
	int			nbytes = strlen(str);

	return lowerstr_with_len(str, nbytes);
}
264 
265 /*
266  * lowerstr_with_len --- fold string to lower case
267  *
268  * Input string need not be null-terminated.
269  *
270  * Returned string is palloc'd
271  */
272 char *
273 lowerstr_with_len(const char *str, int len)
274 {
275  char *out;
276  Oid collation = DEFAULT_COLLATION_OID; /* TODO */
277  pg_locale_t mylocale = 0; /* TODO */
278 
279  if (len == 0)
280  return pstrdup("");
281 
282  /*
283  * Use wide char code only when max encoding length > 1 and ctype != C.
284  * Some operating systems fail with multi-byte encodings and a C locale.
285  * Also, for a C locale there is no need to process as multibyte. From
286  * backend/utils/adt/oracle_compat.c Teodor
287  */
288  if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collation))
289  {
290  wchar_t *wstr,
291  *wptr;
292  int wlen;
293 
294  /*
295  * alloc number of wchar_t for worst case, len contains number of
296  * bytes >= number of characters and alloc 1 wchar_t for 0, because
297  * wchar2char wants zero-terminated string
298  */
299  wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
300 
301  wlen = char2wchar(wstr, len + 1, str, len, mylocale);
302  Assert(wlen <= len);
303 
304  while (*wptr)
305  {
306  *wptr = towlower((wint_t) *wptr);
307  wptr++;
308  }
309 
310  /*
311  * Alloc result string for worst case + '\0'
312  */
313  len = pg_database_encoding_max_length() * wlen + 1;
314  out = (char *) palloc(len);
315 
316  wlen = wchar2char(out, wstr, len, mylocale);
317 
318  pfree(wstr);
319 
320  if (wlen < 0)
321  ereport(ERROR,
322  (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
323  errmsg("conversion from wchar_t to server encoding failed: %m")));
324  Assert(wlen < len);
325  }
326  else
327  {
328  const char *ptr = str;
329  char *outptr;
330 
331  outptr = out = (char *) palloc(sizeof(char) * (len + 1));
332  while ((ptr - str) < len && *ptr)
333  {
334  *outptr++ = tolower(TOUCHAR(ptr));
335  ptr++;
336  }
337  *outptr = '\0';
338  }
339 
340  return out;
341 }
ErrorContextCallback * error_context_stack
Definition: elog.c:95
int errcode(int sqlerrcode)
Definition: elog.c:858
int errmsg(const char *fmt,...)
Definition: elog.c:1069
#define errcontext
Definition: elog.h:196
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2383
int FreeFile(FILE *file)
Definition: fd.c:2581
Assert(fmt[strlen(fmt) - 1] !='\n')
char * pg_any_to_server(const char *s, int len, int encoding)
Definition: mbutils.c:677
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1553
int pg_mblen(const char *mbstr)
Definition: mbutils.c:1024
char * pstrdup(const char *in)
Definition: mcxt.c:1624
void pfree(void *pointer)
Definition: mcxt.c:1436
void * palloc(Size size)
Definition: mcxt.c:1210
void * arg
const void size_t len
static char * filename
Definition: pg_dumpall.c:119
bool pg_get_line_buf(FILE *stream, StringInfo buf)
Definition: pg_get_line.c:95
size_t wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
Definition: pg_locale.c:1990
bool lc_ctype_is_c(Oid collation)
Definition: pg_locale.c:1352
size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, pg_locale_t locale)
Definition: pg_locale.c:2061
@ PG_UTF8
Definition: pg_wchar.h:232
unsigned int Oid
Definition: postgres_ext.h:31
void initStringInfo(StringInfo str)
Definition: stringinfo.c:59
struct ErrorContextCallback * previous
Definition: elog.h:295
void(* callback)(void *arg)
Definition: elog.h:296
StringInfoData buf
Definition: ts_locale.h:29
ErrorContextCallback cb
Definition: ts_locale.h:32
const char * filename
Definition: ts_locale.h:27
bool tsearch_readline_begin(tsearch_readline_state *stp, const char *filename)
Definition: ts_locale.c:141
char * tsearch_readline(tsearch_readline_state *stp)
Definition: ts_locale.c:164
char * lowerstr_with_len(const char *str, int len)
Definition: ts_locale.c:273
int t_isspace(const char *ptr)
Definition: ts_locale.c:53
int t_isalnum(const char *ptr)
Definition: ts_locale.c:85
int t_isdigit(const char *ptr)
Definition: ts_locale.c:37
int t_isalpha(const char *ptr)
Definition: ts_locale.c:69
int t_isprint(const char *ptr)
Definition: ts_locale.c:101
void tsearch_readline_end(tsearch_readline_state *stp)
Definition: ts_locale.c:209
char * lowerstr(const char *str)
Definition: ts_locale.c:260
static void tsearch_readline_callback(void *arg)
Definition: ts_locale.c:232
#define WC_BUF_LEN
Definition: ts_locale.c:34
#define TOUCHAR(x)
Definition: ts_locale.h:35