PostgreSQL Source Code  git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
unaccent.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * unaccent.c
4  * Text search unaccent dictionary
5  *
6  * Copyright (c) 2009-2024, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  * contrib/unaccent/unaccent.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 
14 #include "postgres.h"
15 
16 #include "catalog/pg_ts_dict.h"
17 #include "commands/defrem.h"
18 #include "lib/stringinfo.h"
19 #include "tsearch/ts_cache.h"
20 #include "tsearch/ts_locale.h"
21 #include "tsearch/ts_public.h"
22 #include "utils/builtins.h"
23 #include "utils/lsyscache.h"
24 #include "utils/syscache.h"
25 
27 
/*
 * An unaccent dictionary uses a trie to find a string to replace.  Each node
 * of the trie is an array of 256 TrieChar structs; the N-th element of the
 * array corresponds to next byte value N.  That element can contain both a
 * replacement string (to be used if the source string ends with this byte)
 * and a link to another trie node (to be followed if there are more bytes).
 *
 * Note that the trie search logic pays no attention to multibyte character
 * boundaries.  This is OK as long as both the data entered into the trie and
 * the data we're trying to look up are validly encoded; no partial-character
 * matches will occur.
 */
typedef struct TrieChar
{
	struct TrieChar *nextChar;	/* next trie node (array of 256), or NULL */
	char	   *replaceTo;		/* replacement bytes (not NUL-terminated), or
								 * NULL if no source string ends here */
	int			replacelen;		/* number of bytes stored in replaceTo */
} TrieChar;
46 
47 /*
48  * placeChar - put str into trie's structure, byte by byte.
49  *
50  * If node is NULL, we need to make a new node, which will be returned;
51  * otherwise the return value is the same as node.
52  */
53 static TrieChar *
54 placeChar(TrieChar *node, const unsigned char *str, int lenstr,
55  const char *replaceTo, int replacelen)
56 {
57  TrieChar *curnode;
58 
59  if (!node)
60  node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);
61 
62  Assert(lenstr > 0); /* else str[0] doesn't exist */
63 
64  curnode = node + *str;
65 
66  if (lenstr <= 1)
67  {
68  if (curnode->replaceTo)
70  (errcode(ERRCODE_CONFIG_FILE_ERROR),
71  errmsg("duplicate source strings, first one will be used")));
72  else
73  {
74  curnode->replacelen = replacelen;
75  curnode->replaceTo = (char *) palloc(replacelen);
76  memcpy(curnode->replaceTo, replaceTo, replacelen);
77  }
78  }
79  else
80  {
81  curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
83  }
84 
85  return node;
86 }
87 
88 /*
89  * initTrie - create trie from file.
90  *
91  * Function converts UTF8-encoded file into current encoding.
92  */
93 static TrieChar *
94 initTrie(const char *filename)
95 {
96  TrieChar *volatile rootTrie = NULL;
99  volatile bool skip;
100 
102  if (!tsearch_readline_begin(&trst, filename))
103  ereport(ERROR,
104  (errcode(ERRCODE_CONFIG_FILE_ERROR),
105  errmsg("could not open unaccent file \"%s\": %m",
106  filename)));
107 
108  do
109  {
110  /*
111  * pg_do_encoding_conversion() (called by tsearch_readline()) will
112  * emit exception if it finds untranslatable characters in current
113  * locale. We just skip such lines, continuing with the next.
114  */
115  skip = true;
116 
117  PG_TRY();
118  {
119  char *line;
120 
121  while ((line = tsearch_readline(&trst)) != NULL)
122  {
123  /*----------
124  * The format of each line must be "src" or "src trg", where
125  * src and trg are sequences of one or more non-whitespace
126  * characters, separated by whitespace. Whitespace at start
127  * or end of line is ignored. If trg is omitted, an empty
128  * string is used as the replacement. trg can be optionally
129  * quoted, in which case whitespaces are included in it.
130  *
131  * We use a simple state machine, with states
132  * 0 initial (before src)
133  * 1 in src
134  * 2 in whitespace after src
135  * 3 in trg (non-quoted)
136  * 4 in trg (quoted)
137  * 5 in whitespace after trg
138  * -1 syntax error detected (two strings)
139  * -2 syntax error detected (unfinished quoted string)
140  *----------
141  */
142  int state;
143  char *ptr;
144  char *src = NULL;
145  char *trg = NULL;
146  char *trgstore = NULL;
147  int ptrlen;
148  int srclen = 0;
149  int trglen = 0;
150  int trgstorelen = 0;
151  bool trgquoted = false;
152 
153  state = 0;
154  for (ptr = line; *ptr; ptr += ptrlen)
155  {
156  ptrlen = pg_mblen(ptr);
157  /* ignore whitespace, but end src or trg */
158  if (t_isspace(ptr))
159  {
160  if (state == 1)
161  state = 2;
162  else if (state == 3)
163  state = 5;
164  /* whitespaces are OK in quoted area */
165  if (state != 4)
166  continue;
167  }
168  switch (state)
169  {
170  case 0:
171  /* start of src */
172  src = ptr;
173  srclen = ptrlen;
174  state = 1;
175  break;
176  case 1:
177  /* continue src */
178  srclen += ptrlen;
179  break;
180  case 2:
181  /* start of trg */
182  if (*ptr == '"')
183  {
184  trgquoted = true;
185  state = 4;
186  }
187  else
188  state = 3;
189 
190  trg = ptr;
191  trglen = ptrlen;
192  break;
193  case 3:
194  /* continue non-quoted trg */
195  trglen += ptrlen;
196  break;
197  case 4:
198  /* continue quoted trg */
199  trglen += ptrlen;
200 
201  /*
202  * If this is a quote, consider it as the end of
203  * trg except if the follow-up character is itself
204  * a quote.
205  */
206  if (*ptr == '"')
207  {
208  if (*(ptr + 1) == '"')
209  {
210  ptr++;
211  trglen += 1;
212  }
213  else
214  state = 5;
215  }
216  break;
217  default:
218  /* bogus line format */
219  state = -1;
220  break;
221  }
222  }
223 
224  if (state == 1 || state == 2)
225  {
226  /* trg was omitted, so use "" */
227  trg = "";
228  trglen = 0;
229  }
230 
231  /* If still in a quoted area, fallback to an error */
232  if (state == 4)
233  state = -2;
234 
235  /* If trg was quoted, remove its quotes and unescape it */
236  if (trgquoted && state > 0)
237  {
238  /* Ignore first and end quotes */
239  trgstore = (char *) palloc(sizeof(char) * (trglen - 2));
240  trgstorelen = 0;
241  for (int i = 1; i < trglen - 1; i++)
242  {
243  trgstore[trgstorelen] = trg[i];
244  trgstorelen++;
245  /* skip second double quotes */
246  if (trg[i] == '"' && trg[i + 1] == '"')
247  i++;
248  }
249  }
250  else
251  {
252  trgstore = (char *) palloc(sizeof(char) * trglen);
253  trgstorelen = trglen;
254  memcpy(trgstore, trg, trgstorelen);
255  }
256 
257  if (state > 0)
258  rootTrie = placeChar(rootTrie,
259  (unsigned char *) src, srclen,
260  trgstore, trgstorelen);
261  else if (state == -1)
263  (errcode(ERRCODE_CONFIG_FILE_ERROR),
264  errmsg("invalid syntax: more than two strings in unaccent rule")));
265  else if (state == -2)
267  (errcode(ERRCODE_CONFIG_FILE_ERROR),
268  errmsg("invalid syntax: unfinished quoted string in unaccent rule")));
269 
270  pfree(trgstore);
271  pfree(line);
272  }
273  skip = false;
274  }
275  PG_CATCH();
276  {
277  ErrorData *errdata;
278  MemoryContext ecxt;
279 
280  ecxt = MemoryContextSwitchTo(ccxt);
281  errdata = CopyErrorData();
282  if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
283  {
284  FlushErrorState();
285  }
286  else
287  {
288  MemoryContextSwitchTo(ecxt);
289  PG_RE_THROW();
290  }
291  }
292  PG_END_TRY();
293  }
294  while (skip);
295 
296  tsearch_readline_end(&trst);
297 
298  return rootTrie;
299 }
300 
301 /*
302  * findReplaceTo - find longest possible match in trie
303  *
304  * On success, returns pointer to ending subnode, plus length of matched
305  * source string in *p_matchlen. On failure, returns NULL.
306  */
307 static TrieChar *
308 findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
309  int *p_matchlen)
310 {
311  TrieChar *result = NULL;
312  int matchlen = 0;
313 
314  *p_matchlen = 0; /* prevent uninitialized-variable warnings */
315 
316  while (node && matchlen < srclen)
317  {
318  node = node + src[matchlen];
319  matchlen++;
320 
321  if (node->replaceTo)
322  {
323  result = node;
324  *p_matchlen = matchlen;
325  }
326 
327  node = node->nextChar;
328  }
329 
330  return result;
331 }
332 
334 Datum
336 {
337  List *dictoptions = (List *) PG_GETARG_POINTER(0);
338  TrieChar *rootTrie = NULL;
339  bool fileloaded = false;
340  ListCell *l;
341 
342  foreach(l, dictoptions)
343  {
344  DefElem *defel = (DefElem *) lfirst(l);
345 
346  if (strcmp(defel->defname, "rules") == 0)
347  {
348  if (fileloaded)
349  ereport(ERROR,
350  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
351  errmsg("multiple Rules parameters")));
352  rootTrie = initTrie(defGetString(defel));
353  fileloaded = true;
354  }
355  else
356  {
357  ereport(ERROR,
358  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
359  errmsg("unrecognized Unaccent parameter: \"%s\"",
360  defel->defname)));
361  }
362  }
363 
364  if (!fileloaded)
365  {
366  ereport(ERROR,
367  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
368  errmsg("missing Rules parameter")));
369  }
370 
371  PG_RETURN_POINTER(rootTrie);
372 }
373 
375 Datum
377 {
378  TrieChar *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
379  char *srcchar = (char *) PG_GETARG_POINTER(1);
381  char *srcstart = srcchar;
382  TSLexeme *res;
384 
385  /* we allocate storage for the buffer only if needed */
386  buf.data = NULL;
387 
388  while (len > 0)
389  {
390  TrieChar *node;
391  int matchlen;
392 
393  node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
394  &matchlen);
395  if (node && node->replaceTo)
396  {
397  if (buf.data == NULL)
398  {
399  /* initialize buffer */
401  /* insert any data we already skipped over */
402  if (srcchar != srcstart)
403  appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
404  }
406  }
407  else
408  {
409  matchlen = pg_mblen(srcchar);
410  if (buf.data != NULL)
411  appendBinaryStringInfo(&buf, srcchar, matchlen);
412  }
413 
414  srcchar += matchlen;
415  len -= matchlen;
416  }
417 
418  /* return a result only if we made at least one substitution */
419  if (buf.data != NULL)
420  {
421  res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);
422  res->lexeme = buf.data;
423  res->flags = TSL_FILTER;
424  }
425  else
426  res = NULL;
427 
429 }
430 
431 /*
432  * Function-like wrapper for dictionary
433  */
435 Datum
437 {
438  text *str;
439  int strArg;
440  Oid dictOid;
442  TSLexeme *res;
443 
444  if (PG_NARGS() == 1)
445  {
446  /*
447  * Use the "unaccent" dictionary that is in the same schema that this
448  * function is in.
449  */
450  Oid procnspid = get_func_namespace(fcinfo->flinfo->fn_oid);
451  const char *dictname = "unaccent";
452 
453  dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,
454  PointerGetDatum(dictname),
455  ObjectIdGetDatum(procnspid));
456  if (!OidIsValid(dictOid))
457  ereport(ERROR,
458  (errcode(ERRCODE_UNDEFINED_OBJECT),
459  errmsg("text search dictionary \"%s.%s\" does not exist",
460  get_namespace_name(procnspid), dictname)));
461  strArg = 0;
462  }
463  else
464  {
465  dictOid = PG_GETARG_OID(0);
466  strArg = 1;
467  }
468  str = PG_GETARG_TEXT_PP(strArg);
469 
470  dict = lookup_ts_dictionary_cache(dictOid);
471 
473  PointerGetDatum(dict->dictData),
476  PointerGetDatum(NULL)));
477 
478  PG_FREE_IF_COPY(str, strArg);
479 
480  if (res == NULL)
481  {
483  }
484  else if (res->lexeme == NULL)
485  {
486  pfree(res);
488  }
489  else
490  {
491  text *txt = cstring_to_text(res->lexeme);
492 
493  pfree(res->lexeme);
494  pfree(res);
495 
496  PG_RETURN_TEXT_P(txt);
497  }
498 }
signed int int32
Definition: c.h:508
#define Assert(condition)
Definition: c.h:863
#define OidIsValid(objectId)
Definition: c.h:780
char * defGetString(DefElem *def)
Definition: define.c:48
void FlushErrorState(void)
Definition: elog.c:1867
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
ErrorData * CopyErrorData(void)
Definition: elog.c:1746
#define PG_RE_THROW()
Definition: elog.h:412
#define PG_TRY(...)
Definition: elog.h:371
#define WARNING
Definition: elog.h:36
#define PG_END_TRY(...)
Definition: elog.h:396
#define ERROR
Definition: elog.h:39
#define PG_CATCH(...)
Definition: elog.h:381
#define ereport(elevel,...)
Definition: elog.h:149
#define PG_FREE_IF_COPY(ptr, n)
Definition: fmgr.h:260
#define PG_GETARG_OID(n)
Definition: fmgr.h:275
#define PG_GETARG_TEXT_PP(n)
Definition: fmgr.h:309
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276
#define PG_NARGS()
Definition: fmgr.h:203
#define FunctionCall4(flinfo, arg1, arg2, arg3, arg4)
Definition: fmgr.h:665
#define PG_RETURN_TEXT_P(x)
Definition: fmgr.h:372
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361
#define PG_GETARG_TEXT_P_COPY(n)
Definition: fmgr.h:315
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
const char * str
int i
Definition: isn.c:72
char * get_namespace_name(Oid nspid)
Definition: lsyscache.c:3366
Oid get_func_namespace(Oid funcid)
Definition: lsyscache.c:1632
int pg_mblen(const char *mbstr)
Definition: mbutils.c:1023
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc0(Size size)
Definition: mcxt.c:1347
MemoryContext CurrentMemoryContext
Definition: mcxt.c:143
void * palloc(Size size)
Definition: mcxt.c:1317
static const struct exclude_list_item skip[]
Definition: pg_checksums.c:107
const void size_t len
static char * filename
Definition: pg_dumpall.c:119
#define lfirst(lc)
Definition: pg_list.h:172
static char * buf
Definition: pg_test_fsync.c:72
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:322
uintptr_t Datum
Definition: postgres.h:64
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:252
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:312
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:212
unsigned int Oid
Definition: postgres_ext.h:31
MemoryContextSwitchTo(old_ctx)
void appendBinaryStringInfo(StringInfo str, const void *data, int datalen)
Definition: stringinfo.c:230
void initStringInfo(StringInfo str)
Definition: stringinfo.c:56
char * defname
Definition: parsenodes.h:817
int sqlerrcode
Definition: elog.h:439
Definition: pg_list.h:54
struct TrieChar * nextChar
Definition: unaccent.c:42
int replacelen
Definition: unaccent.c:44
char * replaceTo
Definition: unaccent.c:43
Definition: regguts.h:323
Definition: c.h:692
#define GetSysCacheOid2(cacheId, oidcol, key1, key2)
Definition: syscache.h:111
TSDictionaryCacheEntry * lookup_ts_dictionary_cache(Oid dictId)
Definition: ts_cache.c:208
bool tsearch_readline_begin(tsearch_readline_state *stp, const char *filename)
Definition: ts_locale.c:134
char * tsearch_readline(tsearch_readline_state *stp)
Definition: ts_locale.c:157
int t_isspace(const char *ptr)
Definition: ts_locale.c:50
void tsearch_readline_end(tsearch_readline_state *stp)
Definition: ts_locale.c:202
#define TSL_FILTER
Definition: ts_public.h:144
char * get_tsearch_config_filename(const char *basename, const char *extension)
Definition: ts_utils.c:33
static TrieChar * initTrie(const char *filename)
Definition: unaccent.c:94
Datum unaccent_init(PG_FUNCTION_ARGS)
Definition: unaccent.c:335
PG_FUNCTION_INFO_V1(unaccent_init)
PG_MODULE_MAGIC
Definition: unaccent.c:26
struct TrieChar TrieChar
Datum unaccent_lexize(PG_FUNCTION_ARGS)
Definition: unaccent.c:376
Datum unaccent_dict(PG_FUNCTION_ARGS)
Definition: unaccent.c:436
static TrieChar * placeChar(TrieChar *node, const unsigned char *str, int lenstr, const char *replaceTo, int replacelen)
Definition: unaccent.c:54
static TrieChar * findReplaceTo(TrieChar *node, const unsigned char *src, int srclen, int *p_matchlen)
Definition: unaccent.c:308
#define VARDATA_ANY(PTR)
Definition: varatt.h:324
#define VARSIZE_ANY_EXHDR(PTR)
Definition: varatt.h:317
text * cstring_to_text(const char *s)
Definition: varlena.c:184