PostgreSQL Source Code  git master
ts_locale.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * ts_locale.c
4  * locale compatibility layer for tsearch
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  * src/backend/tsearch/ts_locale.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15 
16 #include "catalog/pg_collation.h"
17 #include "common/string.h"
18 #include "storage/fd.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_public.h"
21 
22 static void tsearch_readline_callback(void *arg);
23 
24 
/*
 * The reason these functions use a 3-wchar_t output buffer, not 2 as you
 * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
 * getting from char2wchar() is UTF16 not UTF32.  A single input character
 * may therefore produce a surrogate pair rather than just one wchar_t;
 * we also need room for a trailing null.  When we do get a surrogate pair,
 * we pass just the first code to iswdigit() etc, so that these functions will
 * always return false for characters outside the Basic Multilingual Plane.
 */
#define WC_BUF_LEN	3
35 
36 int
37 t_isdigit(const char *ptr)
38 {
39  int clen = pg_mblen(ptr);
40  wchar_t character[WC_BUF_LEN];
41  Oid collation = DEFAULT_COLLATION_OID; /* TODO */
42  pg_locale_t mylocale = 0; /* TODO */
43 
44  if (clen == 1 || lc_ctype_is_c(collation))
45  return isdigit(TOUCHAR(ptr));
46 
47  char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
48 
49  return iswdigit((wint_t) character[0]);
50 }
51 
52 int
53 t_isspace(const char *ptr)
54 {
55  int clen = pg_mblen(ptr);
56  wchar_t character[WC_BUF_LEN];
57  Oid collation = DEFAULT_COLLATION_OID; /* TODO */
58  pg_locale_t mylocale = 0; /* TODO */
59 
60  if (clen == 1 || lc_ctype_is_c(collation))
61  return isspace(TOUCHAR(ptr));
62 
63  char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
64 
65  return iswspace((wint_t) character[0]);
66 }
67 
68 int
69 t_isalpha(const char *ptr)
70 {
71  int clen = pg_mblen(ptr);
72  wchar_t character[WC_BUF_LEN];
73  Oid collation = DEFAULT_COLLATION_OID; /* TODO */
74  pg_locale_t mylocale = 0; /* TODO */
75 
76  if (clen == 1 || lc_ctype_is_c(collation))
77  return isalpha(TOUCHAR(ptr));
78 
79  char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
80 
81  return iswalpha((wint_t) character[0]);
82 }
83 
84 int
85 t_isprint(const char *ptr)
86 {
87  int clen = pg_mblen(ptr);
88  wchar_t character[WC_BUF_LEN];
89  Oid collation = DEFAULT_COLLATION_OID; /* TODO */
90  pg_locale_t mylocale = 0; /* TODO */
91 
92  if (clen == 1 || lc_ctype_is_c(collation))
93  return isprint(TOUCHAR(ptr));
94 
95  char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
96 
97  return iswprint((wint_t) character[0]);
98 }
99 
100 
101 /*
102  * Set up to read a file using tsearch_readline(). This facility is
103  * better than just reading the file directly because it provides error
104  * context pointing to the specific line where a problem is detected.
105  *
106  * Expected usage is:
107  *
108  * tsearch_readline_state trst;
109  *
110  * if (!tsearch_readline_begin(&trst, filename))
111  * ereport(ERROR,
112  * (errcode(ERRCODE_CONFIG_FILE_ERROR),
113  * errmsg("could not open stop-word file \"%s\": %m",
114  * filename)));
115  * while ((line = tsearch_readline(&trst)) != NULL)
116  * process line;
117  * tsearch_readline_end(&trst);
118  *
119  * Note that the caller supplies the ereport() for file open failure;
120  * this is so that a custom message can be provided. The filename string
121  * passed to tsearch_readline_begin() must remain valid through
122  * tsearch_readline_end().
123  */
124 bool
126  const char *filename)
127 {
128  if ((stp->fp = AllocateFile(filename, "r")) == NULL)
129  return false;
130  stp->filename = filename;
131  stp->lineno = 0;
132  initStringInfo(&stp->buf);
133  stp->curline = NULL;
134  /* Setup error traceback support for ereport() */
136  stp->cb.arg = (void *) stp;
138  error_context_stack = &stp->cb;
139  return true;
140 }
141 
142 /*
143  * Read the next line from a tsearch data file (expected to be in UTF-8), and
144  * convert it to database encoding if needed. The returned string is palloc'd.
145  * NULL return means EOF.
146  */
147 char *
149 {
150  char *recoded;
151 
152  /* Advance line number to use in error reports */
153  stp->lineno++;
154 
155  /* Clear curline, it's no longer relevant */
156  if (stp->curline)
157  {
158  if (stp->curline != stp->buf.data)
159  pfree(stp->curline);
160  stp->curline = NULL;
161  }
162 
163  /* Collect next line, if there is one */
164  if (!pg_get_line_buf(stp->fp, &stp->buf))
165  return NULL;
166 
167  /* Validate the input as UTF-8, then convert to DB encoding if needed */
168  recoded = pg_any_to_server(stp->buf.data, stp->buf.len, PG_UTF8);
169 
170  /* Save the correctly-encoded string for possible error reports */
171  stp->curline = recoded; /* might be equal to buf.data */
172 
173  /*
174  * We always return a freshly pstrdup'd string. This is clearly necessary
175  * if pg_any_to_server() returned buf.data, and we need a second copy even
176  * if encoding conversion did occur. The caller is entitled to pfree the
177  * returned string at any time, which would leave curline pointing to
178  * recycled storage, causing problems if an error occurs after that point.
179  * (It's preferable to return the result of pstrdup instead of the output
180  * of pg_any_to_server, because the conversion result tends to be
181  * over-allocated. Since callers might save the result string directly
182  * into a long-lived dictionary structure, we don't want it to be a larger
183  * palloc chunk than necessary. We'll reclaim the conversion result on
184  * the next call.)
185  */
186  return pstrdup(recoded);
187 }
188 
189 /*
190  * Close down after reading a file with tsearch_readline()
191  */
192 void
194 {
195  /* Suppress use of curline in any error reported below */
196  if (stp->curline)
197  {
198  if (stp->curline != stp->buf.data)
199  pfree(stp->curline);
200  stp->curline = NULL;
201  }
202 
203  /* Release other resources */
204  pfree(stp->buf.data);
205  FreeFile(stp->fp);
206 
207  /* Pop the error context stack */
209 }
210 
211 /*
212  * Error context callback for errors occurring while reading a tsearch
213  * configuration file.
214  */
215 static void
217 {
219 
220  /*
221  * We can't include the text of the config line for errors that occur
222  * during tsearch_readline() itself. The major cause of such errors is
223  * encoding violations, and we daren't try to print error messages
224  * containing badly-encoded data.
225  */
226  if (stp->curline)
227  errcontext("line %d of configuration file \"%s\": \"%s\"",
228  stp->lineno,
229  stp->filename,
230  stp->curline);
231  else
232  errcontext("line %d of configuration file \"%s\"",
233  stp->lineno,
234  stp->filename);
235 }
236 
237 
/*
 * lowerstr --- fold null-terminated string to lower case
 *
 * Measures the input with strlen() and hands it to lowerstr_with_len().
 *
 * Returned string is palloc'd
 */
char *
lowerstr(const char *str)
{
	int			nbytes = strlen(str);

	return lowerstr_with_len(str, nbytes);
}
248 
249 /*
250  * lowerstr_with_len --- fold string to lower case
251  *
252  * Input string need not be null-terminated.
253  *
254  * Returned string is palloc'd
255  */
256 char *
257 lowerstr_with_len(const char *str, int len)
258 {
259  char *out;
260  Oid collation = DEFAULT_COLLATION_OID; /* TODO */
261  pg_locale_t mylocale = 0; /* TODO */
262 
263  if (len == 0)
264  return pstrdup("");
265 
266  /*
267  * Use wide char code only when max encoding length > 1 and ctype != C.
268  * Some operating systems fail with multi-byte encodings and a C locale.
269  * Also, for a C locale there is no need to process as multibyte. From
270  * backend/utils/adt/oracle_compat.c Teodor
271  */
272  if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collation))
273  {
274  wchar_t *wstr,
275  *wptr;
276  int wlen;
277 
278  /*
279  * alloc number of wchar_t for worst case, len contains number of
280  * bytes >= number of characters and alloc 1 wchar_t for 0, because
281  * wchar2char wants zero-terminated string
282  */
283  wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
284 
285  wlen = char2wchar(wstr, len + 1, str, len, mylocale);
286  Assert(wlen <= len);
287 
288  while (*wptr)
289  {
290  *wptr = towlower((wint_t) *wptr);
291  wptr++;
292  }
293 
294  /*
295  * Alloc result string for worst case + '\0'
296  */
297  len = pg_database_encoding_max_length() * wlen + 1;
298  out = (char *) palloc(len);
299 
300  wlen = wchar2char(out, wstr, len, mylocale);
301 
302  pfree(wstr);
303 
304  if (wlen < 0)
305  ereport(ERROR,
306  (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
307  errmsg("conversion from wchar_t to server encoding failed: %m")));
308  Assert(wlen < len);
309  }
310  else
311  {
312  const char *ptr = str;
313  char *outptr;
314 
315  outptr = out = (char *) palloc(sizeof(char) * (len + 1));
316  while ((ptr - str) < len && *ptr)
317  {
318  *outptr++ = tolower(TOUCHAR(ptr));
319  ptr++;
320  }
321  *outptr = '\0';
322  }
323 
324  return out;
325 }
static void tsearch_readline_callback(void *arg)
Definition: ts_locale.c:216
bool pg_get_line_buf(FILE *stream, StringInfo buf)
Definition: pg_get_line.c:88
int t_isprint(const char *ptr)
Definition: ts_locale.c:85
#define WC_BUF_LEN
Definition: ts_locale.c:34
char * pstrdup(const char *in)
Definition: mcxt.c:1187
char * lowerstr_with_len(const char *str, int len)
Definition: ts_locale.c:257
int errcode(int sqlerrcode)
Definition: elog.c:610
char * lowerstr(const char *str)
Definition: ts_locale.c:244
unsigned int Oid
Definition: postgres_ext.h:31
void(* callback)(void *arg)
Definition: elog.h:229
struct ErrorContextCallback * previous
Definition: elog.h:228
ErrorContextCallback * error_context_stack
Definition: elog.c:92
int t_isdigit(const char *ptr)
Definition: ts_locale.c:37
void pfree(void *pointer)
Definition: mcxt.c:1057
#define ERROR
Definition: elog.h:43
int t_isspace(const char *ptr)
Definition: ts_locale.c:53
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2322
ErrorContextCallback cb
Definition: ts_locale.h:40
StringInfoData buf
Definition: ts_locale.h:37
size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, pg_locale_t locale)
Definition: pg_locale.c:2052
void initStringInfo(StringInfo str)
Definition: stringinfo.c:59
const char * filename
Definition: ts_locale.h:35
#define ereport(elevel,...)
Definition: elog.h:144
#define Assert(condition)
Definition: c.h:745
#define TOUCHAR(x)
Definition: ts_locale.h:43
int pg_mblen(const char *mbstr)
Definition: mbutils.c:907
void tsearch_readline_end(tsearch_readline_state *stp)
Definition: ts_locale.c:193
char * tsearch_readline(tsearch_readline_state *stp)
Definition: ts_locale.c:148
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1436
bool tsearch_readline_begin(tsearch_readline_state *stp, const char *filename)
Definition: ts_locale.c:125
int FreeFile(FILE *file)
Definition: fd.c:2521
static char * filename
Definition: pg_dumpall.c:91
void * palloc(Size size)
Definition: mcxt.c:950
int errmsg(const char *fmt,...)
Definition: elog.c:824
size_t wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
Definition: pg_locale.c:1981
#define errcontext
Definition: elog.h:185
void * arg
int t_isalpha(const char *ptr)
Definition: ts_locale.c:69
bool lc_ctype_is_c(Oid collation)
Definition: pg_locale.c:1397
char * pg_any_to_server(const char *s, int len, int encoding)
Definition: mbutils.c:619