PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
unaccent.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * unaccent.c
4  * Text search unaccent dictionary
5  *
6  * Copyright (c) 2009-2017, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  * contrib/unaccent/unaccent.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 
14 #include "postgres.h"
15 
16 #include "catalog/namespace.h"
17 #include "commands/defrem.h"
18 #include "lib/stringinfo.h"
19 #include "tsearch/ts_cache.h"
20 #include "tsearch/ts_locale.h"
21 #include "tsearch/ts_public.h"
22 #include "utils/builtins.h"
23 #include "utils/regproc.h"

/* Module magic block; must appear exactly once in a loadable module. */
PG_MODULE_MAGIC;

/*
 * An unaccent dictionary uses a trie to find a string to replace.  Each node
 * of the trie is an array of 256 TrieChar structs; the N-th element of the
 * array corresponds to next byte value N.  That element can contain both a
 * replacement string (to be used if the source string ends with this byte)
 * and a link to another trie node (to be followed if there are more bytes).
 *
 * Note that the trie search logic pays no attention to multibyte character
 * boundaries.  This is OK as long as both the data entered into the trie and
 * the data we're trying to look up are validly encoded; no partial-character
 * matches will occur.
 */
typedef struct TrieChar
{
	struct TrieChar *nextChar;	/* child node (array of 256), or NULL */
	char	   *replaceTo;		/* replacement bytes, or NULL if no rule ends
								 * at this byte; not NUL-terminated */
	int			replacelen;		/* number of bytes in replaceTo */
} TrieChar;
45 
46 /*
47  * placeChar - put str into trie's structure, byte by byte.
48  *
49  * If node is NULL, we need to make a new node, which will be returned;
50  * otherwise the return value is the same as node.
51  */
52 static TrieChar *
53 placeChar(TrieChar *node, const unsigned char *str, int lenstr,
54  const char *replaceTo, int replacelen)
55 {
56  TrieChar *curnode;
57 
58  if (!node)
59  node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);
60 
61  Assert(lenstr > 0); /* else str[0] doesn't exist */
62 
63  curnode = node + *str;
64 
65  if (lenstr <= 1)
66  {
67  if (curnode->replaceTo)
69  (errcode(ERRCODE_CONFIG_FILE_ERROR),
70  errmsg("duplicate source strings, first one will be used")));
71  else
72  {
73  curnode->replacelen = replacelen;
74  curnode->replaceTo = (char *) palloc(replacelen);
75  memcpy(curnode->replaceTo, replaceTo, replacelen);
76  }
77  }
78  else
79  {
80  curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
81  replaceTo, replacelen);
82  }
83 
84  return node;
85 }
86 
87 /*
88  * initTrie - create trie from file.
89  *
90  * Function converts UTF8-encoded file into current encoding.
91  */
92 static TrieChar *
94 {
95  TrieChar *volatile rootTrie = NULL;
98  volatile bool skip;
99 
100  filename = get_tsearch_config_filename(filename, "rules");
101  if (!tsearch_readline_begin(&trst, filename))
102  ereport(ERROR,
103  (errcode(ERRCODE_CONFIG_FILE_ERROR),
104  errmsg("could not open unaccent file \"%s\": %m",
105  filename)));
106 
107  do
108  {
109  /*
110  * pg_do_encoding_conversion() (called by tsearch_readline()) will
111  * emit exception if it finds untranslatable characters in current
112  * locale. We just skip such lines, continuing with the next.
113  */
114  skip = true;
115 
116  PG_TRY();
117  {
118  char *line;
119 
120  while ((line = tsearch_readline(&trst)) != NULL)
121  {
122  /*----------
123  * The format of each line must be "src" or "src trg", where
124  * src and trg are sequences of one or more non-whitespace
125  * characters, separated by whitespace. Whitespace at start
126  * or end of line is ignored. If trg is omitted, an empty
127  * string is used as the replacement.
128  *
129  * We use a simple state machine, with states
130  * 0 initial (before src)
131  * 1 in src
132  * 2 in whitespace after src
133  * 3 in trg
134  * 4 in whitespace after trg
135  * -1 syntax error detected
136  *----------
137  */
138  int state;
139  char *ptr;
140  char *src = NULL;
141  char *trg = NULL;
142  int ptrlen;
143  int srclen = 0;
144  int trglen = 0;
145 
146  state = 0;
147  for (ptr = line; *ptr; ptr += ptrlen)
148  {
149  ptrlen = pg_mblen(ptr);
150  /* ignore whitespace, but end src or trg */
151  if (t_isspace(ptr))
152  {
153  if (state == 1)
154  state = 2;
155  else if (state == 3)
156  state = 4;
157  continue;
158  }
159  switch (state)
160  {
161  case 0:
162  /* start of src */
163  src = ptr;
164  srclen = ptrlen;
165  state = 1;
166  break;
167  case 1:
168  /* continue src */
169  srclen += ptrlen;
170  break;
171  case 2:
172  /* start of trg */
173  trg = ptr;
174  trglen = ptrlen;
175  state = 3;
176  break;
177  case 3:
178  /* continue trg */
179  trglen += ptrlen;
180  break;
181  default:
182  /* bogus line format */
183  state = -1;
184  break;
185  }
186  }
187 
188  if (state == 1 || state == 2)
189  {
190  /* trg was omitted, so use "" */
191  trg = "";
192  trglen = 0;
193  }
194 
195  if (state > 0)
196  rootTrie = placeChar(rootTrie,
197  (unsigned char *) src, srclen,
198  trg, trglen);
199  else if (state < 0)
201  (errcode(ERRCODE_CONFIG_FILE_ERROR),
202  errmsg("invalid syntax: more than two strings in unaccent rule")));
203 
204  pfree(line);
205  }
206  skip = false;
207  }
208  PG_CATCH();
209  {
210  ErrorData *errdata;
211  MemoryContext ecxt;
212 
213  ecxt = MemoryContextSwitchTo(ccxt);
214  errdata = CopyErrorData();
215  if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
216  {
217  FlushErrorState();
218  }
219  else
220  {
221  MemoryContextSwitchTo(ecxt);
222  PG_RE_THROW();
223  }
224  }
225  PG_END_TRY();
226  }
227  while (skip);
228 
229  tsearch_readline_end(&trst);
230 
231  return rootTrie;
232 }
233 
234 /*
235  * findReplaceTo - find longest possible match in trie
236  *
237  * On success, returns pointer to ending subnode, plus length of matched
238  * source string in *p_matchlen. On failure, returns NULL.
239  */
240 static TrieChar *
241 findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
242  int *p_matchlen)
243 {
244  TrieChar *result = NULL;
245  int matchlen = 0;
246 
247  *p_matchlen = 0; /* prevent uninitialized-variable warnings */
248 
249  while (node && matchlen < srclen)
250  {
251  node = node + src[matchlen];
252  matchlen++;
253 
254  if (node->replaceTo)
255  {
256  result = node;
257  *p_matchlen = matchlen;
258  }
259 
260  node = node->nextChar;
261  }
262 
263  return result;
264 }
265 
267 Datum
269 {
270  List *dictoptions = (List *) PG_GETARG_POINTER(0);
271  TrieChar *rootTrie = NULL;
272  bool fileloaded = false;
273  ListCell *l;
274 
275  foreach(l, dictoptions)
276  {
277  DefElem *defel = (DefElem *) lfirst(l);
278 
279  if (pg_strcasecmp("Rules", defel->defname) == 0)
280  {
281  if (fileloaded)
282  ereport(ERROR,
283  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
284  errmsg("multiple Rules parameters")));
285  rootTrie = initTrie(defGetString(defel));
286  fileloaded = true;
287  }
288  else
289  {
290  ereport(ERROR,
291  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
292  errmsg("unrecognized Unaccent parameter: \"%s\"",
293  defel->defname)));
294  }
295  }
296 
297  if (!fileloaded)
298  {
299  ereport(ERROR,
300  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
301  errmsg("missing Rules parameter")));
302  }
303 
304  PG_RETURN_POINTER(rootTrie);
305 }
306 
308 Datum
310 {
311  TrieChar *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
312  char *srcchar = (char *) PG_GETARG_POINTER(1);
313  int32 len = PG_GETARG_INT32(2);
314  char *srcstart = srcchar;
315  TSLexeme *res;
317 
318  /* we allocate storage for the buffer only if needed */
319  buf.data = NULL;
320 
321  while (len > 0)
322  {
323  TrieChar *node;
324  int matchlen;
325 
326  node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
327  &matchlen);
328  if (node && node->replaceTo)
329  {
330  if (buf.data == NULL)
331  {
332  /* initialize buffer */
333  initStringInfo(&buf);
334  /* insert any data we already skipped over */
335  if (srcchar != srcstart)
336  appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
337  }
338  appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen);
339  }
340  else
341  {
342  matchlen = pg_mblen(srcchar);
343  if (buf.data != NULL)
344  appendBinaryStringInfo(&buf, srcchar, matchlen);
345  }
346 
347  srcchar += matchlen;
348  len -= matchlen;
349  }
350 
351  /* return a result only if we made at least one substitution */
352  if (buf.data != NULL)
353  {
354  res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);
355  res->lexeme = buf.data;
356  res->flags = TSL_FILTER;
357  }
358  else
359  res = NULL;
360 
361  PG_RETURN_POINTER(res);
362 }
363 
364 /*
365  * Function-like wrapper for dictionary
366  */
368 Datum
370 {
371  text *str;
372  int strArg;
373  Oid dictOid;
375  TSLexeme *res;
376 
377  if (PG_NARGS() == 1)
378  {
379  dictOid = get_ts_dict_oid(stringToQualifiedNameList("unaccent"), false);
380  strArg = 0;
381  }
382  else
383  {
384  dictOid = PG_GETARG_OID(0);
385  strArg = 1;
386  }
387  str = PG_GETARG_TEXT_PP(strArg);
388 
389  dict = lookup_ts_dictionary_cache(dictOid);
390 
391  res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
392  PointerGetDatum(dict->dictData),
396 
397  PG_FREE_IF_COPY(str, strArg);
398 
399  if (res == NULL)
400  {
402  }
403  else if (res->lexeme == NULL)
404  {
405  pfree(res);
407  }
408  else
409  {
410  text *txt = cstring_to_text(res->lexeme);
411 
412  pfree(res->lexeme);
413  pfree(res);
414 
415  PG_RETURN_TEXT_P(txt);
416  }
417 }
#define t_isspace(x)
Definition: ts_locale.h:58
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:321
#define PG_GETARG_INT32(n)
Definition: fmgr.h:234
static void skip(struct vars *v)
Definition: regc_lex.c:1109
#define TSL_FILTER
Definition: ts_public.h:117
#define VARDATA_ANY(PTR)
Definition: postgres.h:347
int sqlerrcode
Definition: elog.h:342
ErrorData * CopyErrorData(void)
Definition: elog.c:1497
#define PointerGetDatum(X)
Definition: postgres.h:562
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:109
int errcode(int sqlerrcode)
Definition: elog.c:575
return result
Definition: formatting.c:1633
struct TrieChar * nextChar
Definition: unaccent.c:41
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:241
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
unsigned int Oid
Definition: postgres_ext.h:31
void FlushErrorState(void)
Definition: elog.c:1587
#define PG_GETARG_TEXT_P_COPY(n)
Definition: fmgr.h:279
signed int int32
Definition: c.h:256
#define PG_GETARG_TEXT_PP(n)
Definition: fmgr.h:273
TSDictionaryCacheEntry * lookup_ts_dictionary_cache(Oid dictId)
Definition: ts_cache.c:210
static TrieChar * initTrie(char *filename)
Definition: unaccent.c:93
Oid get_ts_dict_oid(List *names, bool missing_ok)
Definition: namespace.c:2343
void pfree(void *pointer)
Definition: mcxt.c:950
#define ERROR
Definition: elog.h:43
char * get_tsearch_config_filename(const char *basename, const char *extension)
Definition: ts_utils.c:33
char * defGetString(DefElem *def)
Definition: define.c:49
char * lexeme
Definition: ts_public.h:111
static char * buf
Definition: pg_test_fsync.c:66
#define PG_GETARG_OID(n)
Definition: fmgr.h:240
Datum unaccent_lexize(PG_FUNCTION_ARGS)
Definition: unaccent.c:309
MemoryContext CurrentMemoryContext
Definition: mcxt.c:37
#define FunctionCall4(flinfo, arg1, arg2, arg3, arg4)
Definition: fmgr.h:608
#define ereport(elevel, rest)
Definition: elog.h:122
uint16 flags
Definition: ts_public.h:109
Datum unaccent_init(PG_FUNCTION_ARGS)
Definition: unaccent.c:268
void initStringInfo(StringInfo str)
Definition: stringinfo.c:46
#define WARNING
Definition: elog.h:40
struct TrieChar TrieChar
void * palloc0(Size size)
Definition: mcxt.c:878
uintptr_t Datum
Definition: postgres.h:372
char * replaceTo
Definition: unaccent.c:42
PG_FUNCTION_INFO_V1(unaccent_init)
int replacelen
Definition: unaccent.c:43
#define PG_RETURN_TEXT_P(x)
Definition: fmgr.h:330
#define PG_CATCH()
Definition: elog.h:293
text * cstring_to_text(const char *s)
Definition: varlena.c:149
#define NULL
Definition: c.h:229
#define Assert(condition)
Definition: c.h:675
#define lfirst(lc)
Definition: pg_list.h:106
Definition: regguts.h:298
#define PG_FREE_IF_COPY(ptr, n)
Definition: fmgr.h:225
Datum unaccent_dict(PG_FUNCTION_ARGS)
Definition: unaccent.c:369
#define PG_NARGS()
Definition: fmgr.h:168
#define PG_RE_THROW()
Definition: elog.h:314
int pg_mblen(const char *mbstr)
Definition: mbutils.c:771
void tsearch_readline_end(tsearch_readline_state *stp)
Definition: ts_locale.c:153
char * tsearch_readline(tsearch_readline_state *stp)
Definition: ts_locale.c:138
static TrieChar * findReplaceTo(TrieChar *node, const unsigned char *src, int srclen, int *p_matchlen)
Definition: unaccent.c:241
#define DatumGetPointer(X)
Definition: postgres.h:555
bool tsearch_readline_begin(tsearch_readline_state *stp, const char *filename)
Definition: ts_locale.c:116
PG_MODULE_MAGIC
Definition: unaccent.c:25
#define Int32GetDatum(X)
Definition: postgres.h:485
List * stringToQualifiedNameList(const char *string)
Definition: regproc.c:1687
static char * filename
Definition: pg_dumpall.c:89
#define VARSIZE_ANY_EXHDR(PTR)
Definition: postgres.h:340
void * palloc(Size size)
Definition: mcxt.c:849
int errmsg(const char *fmt,...)
Definition: elog.c:797
static TrieChar * placeChar(TrieChar *node, const unsigned char *str, int lenstr, const char *replaceTo, int replacelen)
Definition: unaccent.c:53
Definition: c.h:439
#define PG_FUNCTION_ARGS
Definition: fmgr.h:158
char * defname
Definition: parsenodes.h:719
#define PG_TRY()
Definition: elog.h:284
Definition: pg_list.h:45
void appendBinaryStringInfo(StringInfo str, const char *data, int datalen)
Definition: stringinfo.c:208
#define PG_END_TRY()
Definition: elog.h:300