PostgreSQL Source Code  git master
ts_locale.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * ts_locale.c
4  * locale compatibility layer for tsearch
5  *
6  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  * src/backend/tsearch/ts_locale.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15 
16 #include "catalog/pg_collation.h"
17 #include "storage/fd.h"
18 #include "tsearch/ts_locale.h"
19 #include "tsearch/ts_public.h"
20 
21 static void tsearch_readline_callback(void *arg);
22 
23 
24 /*
25  * The reason these functions use a 3-wchar_t output buffer, not 2 as you
26  * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
27  * getting from char2wchar() is UTF16 not UTF32. A single input character
28  * may therefore produce a surrogate pair rather than just one wchar_t;
29  * we also need room for a trailing null. When we do get a surrogate pair,
30  * we pass just the first code to iswdigit() etc, so that these functions will
31  * always return false for characters outside the Basic Multilingual Plane.
32  */
33 #define WC_BUF_LEN 3
34 
35 int
36 t_isdigit(const char *ptr)
37 {
38  int clen = pg_mblen(ptr);
39  wchar_t character[WC_BUF_LEN];
40  Oid collation = DEFAULT_COLLATION_OID; /* TODO */
41  pg_locale_t mylocale = 0; /* TODO */
42 
43  if (clen == 1 || lc_ctype_is_c(collation))
44  return isdigit(TOUCHAR(ptr));
45 
46  char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
47 
48  return iswdigit((wint_t) character[0]);
49 }
50 
51 int
52 t_isspace(const char *ptr)
53 {
54  int clen = pg_mblen(ptr);
55  wchar_t character[WC_BUF_LEN];
56  Oid collation = DEFAULT_COLLATION_OID; /* TODO */
57  pg_locale_t mylocale = 0; /* TODO */
58 
59  if (clen == 1 || lc_ctype_is_c(collation))
60  return isspace(TOUCHAR(ptr));
61 
62  char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
63 
64  return iswspace((wint_t) character[0]);
65 }
66 
67 int
68 t_isalpha(const char *ptr)
69 {
70  int clen = pg_mblen(ptr);
71  wchar_t character[WC_BUF_LEN];
72  Oid collation = DEFAULT_COLLATION_OID; /* TODO */
73  pg_locale_t mylocale = 0; /* TODO */
74 
75  if (clen == 1 || lc_ctype_is_c(collation))
76  return isalpha(TOUCHAR(ptr));
77 
78  char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
79 
80  return iswalpha((wint_t) character[0]);
81 }
82 
83 int
84 t_isprint(const char *ptr)
85 {
86  int clen = pg_mblen(ptr);
87  wchar_t character[WC_BUF_LEN];
88  Oid collation = DEFAULT_COLLATION_OID; /* TODO */
89  pg_locale_t mylocale = 0; /* TODO */
90 
91  if (clen == 1 || lc_ctype_is_c(collation))
92  return isprint(TOUCHAR(ptr));
93 
94  char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
95 
96  return iswprint((wint_t) character[0]);
97 }
98 
99 
100 /*
101  * Set up to read a file using tsearch_readline(). This facility is
102  * better than just reading the file directly because it provides error
103  * context pointing to the specific line where a problem is detected.
104  *
105  * Expected usage is:
106  *
107  * tsearch_readline_state trst;
108  *
109  * if (!tsearch_readline_begin(&trst, filename))
110  * ereport(ERROR,
111  * (errcode(ERRCODE_CONFIG_FILE_ERROR),
112  * errmsg("could not open stop-word file \"%s\": %m",
113  * filename)));
114  * while ((line = tsearch_readline(&trst)) != NULL)
115  * process line;
116  * tsearch_readline_end(&trst);
117  *
118  * Note that the caller supplies the ereport() for file open failure;
119  * this is so that a custom message can be provided. The filename string
120  * passed to tsearch_readline_begin() must remain valid through
121  * tsearch_readline_end().
122  */
123 bool
125  const char *filename)
126 {
127  if ((stp->fp = AllocateFile(filename, "r")) == NULL)
128  return false;
129  stp->filename = filename;
130  stp->lineno = 0;
131  stp->curline = NULL;
132  /* Setup error traceback support for ereport() */
134  stp->cb.arg = (void *) stp;
136  error_context_stack = &stp->cb;
137  return true;
138 }
139 
140 /*
141  * Read the next line from a tsearch data file (expected to be in UTF-8), and
142  * convert it to database encoding if needed. The returned string is palloc'd.
143  * NULL return means EOF.
144  */
145 char *
147 {
148  char *result;
149 
150  stp->lineno++;
151  stp->curline = NULL;
152  result = t_readline(stp->fp);
153  stp->curline = result;
154  return result;
155 }
156 
157 /*
158  * Close down after reading a file with tsearch_readline()
159  */
160 void
162 {
163  FreeFile(stp->fp);
164  /* Pop the error context stack */
166 }
167 
168 /*
169  * Error context callback for errors occurring while reading a tsearch
170  * configuration file.
171  */
172 static void
174 {
176 
177  /*
178  * We can't include the text of the config line for errors that occur
179  * during t_readline() itself. This is only partly a consequence of our
180  * arms-length use of that routine: the major cause of such errors is
181  * encoding violations, and we daren't try to print error messages
182  * containing badly-encoded data.
183  */
184  if (stp->curline)
185  errcontext("line %d of configuration file \"%s\": \"%s\"",
186  stp->lineno,
187  stp->filename,
188  stp->curline);
189  else
190  errcontext("line %d of configuration file \"%s\"",
191  stp->lineno,
192  stp->filename);
193 }
194 
195 
196 /*
197  * Read the next line from a tsearch data file (expected to be in UTF-8), and
198  * convert it to database encoding if needed. The returned string is palloc'd.
199  * NULL return means EOF.
200  *
201  * Note: direct use of this function is now deprecated. Go through
202  * tsearch_readline() to provide better error reporting.
203  */
204 char *
205 t_readline(FILE *fp)
206 {
207  int len;
208  char *recoded;
209  char buf[4096]; /* lines must not be longer than this */
210 
211  if (fgets(buf, sizeof(buf), fp) == NULL)
212  return NULL;
213 
214  len = strlen(buf);
215 
216  /* Make sure the input is valid UTF-8 */
217  (void) pg_verify_mbstr(PG_UTF8, buf, len, false);
218 
219  /* And convert */
220  recoded = pg_any_to_server(buf, len, PG_UTF8);
221  if (recoded == buf)
222  {
223  /*
224  * conversion didn't pstrdup, so we must. We can use the length of the
225  * original string, because no conversion was done.
226  */
227  recoded = pnstrdup(recoded, len);
228  }
229 
230  return recoded;
231 }
232 
/*
 * lowerstr --- lower-case a null-terminated string
 *
 * Thin wrapper around lowerstr_with_len() that measures the input with
 * strlen().  The result is a freshly palloc'd string.
 */
char *
lowerstr(const char *str)
{
	int			nbytes = strlen(str);

	return lowerstr_with_len(str, nbytes);
}
243 
244 /*
245  * lowerstr_with_len --- fold string to lower case
246  *
247  * Input string need not be null-terminated.
248  *
249  * Returned string is palloc'd
250  */
251 char *
252 lowerstr_with_len(const char *str, int len)
253 {
254  char *out;
255  Oid collation = DEFAULT_COLLATION_OID; /* TODO */
256  pg_locale_t mylocale = 0; /* TODO */
257 
258  if (len == 0)
259  return pstrdup("");
260 
261  /*
262  * Use wide char code only when max encoding length > 1 and ctype != C.
263  * Some operating systems fail with multi-byte encodings and a C locale.
264  * Also, for a C locale there is no need to process as multibyte. From
265  * backend/utils/adt/oracle_compat.c Teodor
266  */
267  if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collation))
268  {
269  wchar_t *wstr,
270  *wptr;
271  int wlen;
272 
273  /*
274  * alloc number of wchar_t for worst case, len contains number of
275  * bytes >= number of characters and alloc 1 wchar_t for 0, because
276  * wchar2char wants zero-terminated string
277  */
278  wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
279 
280  wlen = char2wchar(wstr, len + 1, str, len, mylocale);
281  Assert(wlen <= len);
282 
283  while (*wptr)
284  {
285  *wptr = towlower((wint_t) *wptr);
286  wptr++;
287  }
288 
289  /*
290  * Alloc result string for worst case + '\0'
291  */
292  len = pg_database_encoding_max_length() * wlen + 1;
293  out = (char *) palloc(len);
294 
295  wlen = wchar2char(out, wstr, len, mylocale);
296 
297  pfree(wstr);
298 
299  if (wlen < 0)
300  ereport(ERROR,
301  (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
302  errmsg("conversion from wchar_t to server encoding failed: %m")));
303  Assert(wlen < len);
304  }
305  else
306  {
307  const char *ptr = str;
308  char *outptr;
309 
310  outptr = out = (char *) palloc(sizeof(char) * (len + 1));
311  while ((ptr - str) < len && *ptr)
312  {
313  *outptr++ = tolower(TOUCHAR(ptr));
314  ptr++;
315  }
316  *outptr = '\0';
317  }
318 
319  return out;
320 }
static void tsearch_readline_callback(void *arg)
Definition: ts_locale.c:173
int t_isprint(const char *ptr)
Definition: ts_locale.c:84
char * pnstrdup(const char *in, Size len)
Definition: mcxt.c:1197
#define WC_BUF_LEN
Definition: ts_locale.c:33
char * pstrdup(const char *in)
Definition: mcxt.c:1186
char * lowerstr_with_len(const char *str, int len)
Definition: ts_locale.c:252
int errcode(int sqlerrcode)
Definition: elog.c:570
char * lowerstr(const char *str)
Definition: ts_locale.c:239
unsigned int Oid
Definition: postgres_ext.h:31
void(* callback)(void *arg)
Definition: elog.h:254
struct ErrorContextCallback * previous
Definition: elog.h:253
char * t_readline(FILE *fp)
Definition: ts_locale.c:205
ErrorContextCallback * error_context_stack
Definition: elog.c:88
int t_isdigit(const char *ptr)
Definition: ts_locale.c:36
void pfree(void *pointer)
Definition: mcxt.c:1056
#define ERROR
Definition: elog.h:43
int t_isspace(const char *ptr)
Definition: ts_locale.c:52
static char * buf
Definition: pg_test_fsync.c:68
int pg_database_encoding_max_length(void)
Definition: wchar.c:1881
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2205
ErrorContextCallback cb
Definition: ts_locale.h:39
bool pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
Definition: wchar.c:1925
#define ereport(elevel, rest)
Definition: elog.h:141
size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, pg_locale_t locale)
Definition: pg_locale.c:1836
const char * filename
Definition: ts_locale.h:36
#define Assert(condition)
Definition: c.h:732
#define TOUCHAR(x)
Definition: ts_locale.h:42
int pg_mblen(const char *mbstr)
Definition: mbutils.c:802
void tsearch_readline_end(tsearch_readline_state *stp)
Definition: ts_locale.c:161
char * tsearch_readline(tsearch_readline_state *stp)
Definition: ts_locale.c:146
bool tsearch_readline_begin(tsearch_readline_state *stp, const char *filename)
Definition: ts_locale.c:124
int FreeFile(FILE *file)
Definition: fd.c:2404
static char * filename
Definition: pg_dumpall.c:91
void * palloc(Size size)
Definition: mcxt.c:949
int errmsg(const char *fmt,...)
Definition: elog.c:784
size_t wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
Definition: pg_locale.c:1765
#define errcontext
Definition: elog.h:183
void * arg
int t_isalpha(const char *ptr)
Definition: ts_locale.c:68
bool lc_ctype_is_c(Oid collation)
Definition: pg_locale.c:1226
char * pg_any_to_server(const char *s, int len, int encoding)
Definition: mbutils.c:581