PostgreSQL Source Code  git master
unaccent.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * unaccent.c
4  * Text search unaccent dictionary
5  *
6  * Copyright (c) 2009-2024, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  * contrib/unaccent/unaccent.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 
14 #include "postgres.h"
15 
16 #include "catalog/namespace.h"
17 #include "catalog/pg_ts_dict.h"
18 #include "commands/defrem.h"
19 #include "lib/stringinfo.h"
20 #include "tsearch/ts_cache.h"
21 #include "tsearch/ts_locale.h"
22 #include "tsearch/ts_public.h"
23 #include "utils/builtins.h"
24 #include "utils/lsyscache.h"
25 #include "utils/regproc.h"
26 #include "utils/syscache.h"
27 
29 
/*
 * An unaccent dictionary uses a trie to find a string to replace.  Each node
 * of the trie is an array of 256 TrieChar structs; the N-th element of the
 * array corresponds to next byte value N.  That element can contain both a
 * replacement string (to be used if the source string ends with this byte)
 * and a link to another trie node (to be followed if there are more bytes).
 *
 * Note that the trie search logic pays no attention to multibyte character
 * boundaries.  This is OK as long as both the data entered into the trie and
 * the data we're trying to look up are validly encoded; no partial-character
 * matches will occur.
 */
typedef struct TrieChar
{
	struct TrieChar *nextChar;	/* child node (array of 256), or NULL */
	char	   *replaceTo;		/* replacement bytes, or NULL if no rule ends
								 * here; not NUL-terminated */
	int			replacelen;		/* number of bytes in replaceTo */
} TrieChar;
48 
49 /*
50  * placeChar - put str into trie's structure, byte by byte.
51  *
52  * If node is NULL, we need to make a new node, which will be returned;
53  * otherwise the return value is the same as node.
54  */
55 static TrieChar *
56 placeChar(TrieChar *node, const unsigned char *str, int lenstr,
57  const char *replaceTo, int replacelen)
58 {
59  TrieChar *curnode;
60 
61  if (!node)
62  node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);
63 
64  Assert(lenstr > 0); /* else str[0] doesn't exist */
65 
66  curnode = node + *str;
67 
68  if (lenstr <= 1)
69  {
70  if (curnode->replaceTo)
72  (errcode(ERRCODE_CONFIG_FILE_ERROR),
73  errmsg("duplicate source strings, first one will be used")));
74  else
75  {
76  curnode->replacelen = replacelen;
77  curnode->replaceTo = (char *) palloc(replacelen);
78  memcpy(curnode->replaceTo, replaceTo, replacelen);
79  }
80  }
81  else
82  {
83  curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
85  }
86 
87  return node;
88 }
89 
90 /*
91  * initTrie - create trie from file.
92  *
93  * Function converts UTF8-encoded file into current encoding.
94  */
95 static TrieChar *
96 initTrie(const char *filename)
97 {
98  TrieChar *volatile rootTrie = NULL;
101  volatile bool skip;
102 
104  if (!tsearch_readline_begin(&trst, filename))
105  ereport(ERROR,
106  (errcode(ERRCODE_CONFIG_FILE_ERROR),
107  errmsg("could not open unaccent file \"%s\": %m",
108  filename)));
109 
110  do
111  {
112  /*
113  * pg_do_encoding_conversion() (called by tsearch_readline()) will
114  * emit exception if it finds untranslatable characters in current
115  * locale. We just skip such lines, continuing with the next.
116  */
117  skip = true;
118 
119  PG_TRY();
120  {
121  char *line;
122 
123  while ((line = tsearch_readline(&trst)) != NULL)
124  {
125  /*----------
126  * The format of each line must be "src" or "src trg", where
127  * src and trg are sequences of one or more non-whitespace
128  * characters, separated by whitespace. Whitespace at start
129  * or end of line is ignored. If trg is omitted, an empty
130  * string is used as the replacement. trg can be optionally
131  * quoted, in which case whitespaces are included in it.
132  *
133  * We use a simple state machine, with states
134  * 0 initial (before src)
135  * 1 in src
136  * 2 in whitespace after src
137  * 3 in trg (non-quoted)
138  * 4 in trg (quoted)
139  * 5 in whitespace after trg
140  * -1 syntax error detected (two strings)
141  * -2 syntax error detected (unfinished quoted string)
142  *----------
143  */
144  int state;
145  char *ptr;
146  char *src = NULL;
147  char *trg = NULL;
148  char *trgstore = NULL;
149  int ptrlen;
150  int srclen = 0;
151  int trglen = 0;
152  int trgstorelen = 0;
153  bool trgquoted = false;
154 
155  state = 0;
156  for (ptr = line; *ptr; ptr += ptrlen)
157  {
158  ptrlen = pg_mblen(ptr);
159  /* ignore whitespace, but end src or trg */
160  if (t_isspace(ptr))
161  {
162  if (state == 1)
163  state = 2;
164  else if (state == 3)
165  state = 5;
166  /* whitespaces are OK in quoted area */
167  if (state != 4)
168  continue;
169  }
170  switch (state)
171  {
172  case 0:
173  /* start of src */
174  src = ptr;
175  srclen = ptrlen;
176  state = 1;
177  break;
178  case 1:
179  /* continue src */
180  srclen += ptrlen;
181  break;
182  case 2:
183  /* start of trg */
184  if (*ptr == '"')
185  {
186  trgquoted = true;
187  state = 4;
188  }
189  else
190  state = 3;
191 
192  trg = ptr;
193  trglen = ptrlen;
194  break;
195  case 3:
196  /* continue non-quoted trg */
197  trglen += ptrlen;
198  break;
199  case 4:
200  /* continue quoted trg */
201  trglen += ptrlen;
202 
203  /*
204  * If this is a quote, consider it as the end of
205  * trg except if the follow-up character is itself
206  * a quote.
207  */
208  if (*ptr == '"')
209  {
210  if (*(ptr + 1) == '"')
211  {
212  ptr++;
213  trglen += 1;
214  }
215  else
216  state = 5;
217  }
218  break;
219  default:
220  /* bogus line format */
221  state = -1;
222  break;
223  }
224  }
225 
226  if (state == 1 || state == 2)
227  {
228  /* trg was omitted, so use "" */
229  trg = "";
230  trglen = 0;
231  }
232 
233  /* If still in a quoted area, fallback to an error */
234  if (state == 4)
235  state = -2;
236 
237  /* If trg was quoted, remove its quotes and unescape it */
238  if (trgquoted && state > 0)
239  {
240  /* Ignore first and end quotes */
241  trgstore = (char *) palloc(sizeof(char) * (trglen - 2));
242  trgstorelen = 0;
243  for (int i = 1; i < trglen - 1; i++)
244  {
245  trgstore[trgstorelen] = trg[i];
246  trgstorelen++;
247  /* skip second double quotes */
248  if (trg[i] == '"' && trg[i + 1] == '"')
249  i++;
250  }
251  }
252  else
253  {
254  trgstore = (char *) palloc(sizeof(char) * trglen);
255  trgstorelen = trglen;
256  memcpy(trgstore, trg, trgstorelen);
257  }
258 
259  if (state > 0)
260  rootTrie = placeChar(rootTrie,
261  (unsigned char *) src, srclen,
262  trgstore, trgstorelen);
263  else if (state == -1)
265  (errcode(ERRCODE_CONFIG_FILE_ERROR),
266  errmsg("invalid syntax: more than two strings in unaccent rule")));
267  else if (state == -2)
269  (errcode(ERRCODE_CONFIG_FILE_ERROR),
270  errmsg("invalid syntax: unfinished quoted string in unaccent rule")));
271 
272  pfree(trgstore);
273  pfree(line);
274  }
275  skip = false;
276  }
277  PG_CATCH();
278  {
279  ErrorData *errdata;
280  MemoryContext ecxt;
281 
282  ecxt = MemoryContextSwitchTo(ccxt);
283  errdata = CopyErrorData();
284  if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
285  {
286  FlushErrorState();
287  }
288  else
289  {
290  MemoryContextSwitchTo(ecxt);
291  PG_RE_THROW();
292  }
293  }
294  PG_END_TRY();
295  }
296  while (skip);
297 
298  tsearch_readline_end(&trst);
299 
300  return rootTrie;
301 }
302 
303 /*
304  * findReplaceTo - find longest possible match in trie
305  *
306  * On success, returns pointer to ending subnode, plus length of matched
307  * source string in *p_matchlen. On failure, returns NULL.
308  */
309 static TrieChar *
310 findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
311  int *p_matchlen)
312 {
313  TrieChar *result = NULL;
314  int matchlen = 0;
315 
316  *p_matchlen = 0; /* prevent uninitialized-variable warnings */
317 
318  while (node && matchlen < srclen)
319  {
320  node = node + src[matchlen];
321  matchlen++;
322 
323  if (node->replaceTo)
324  {
325  result = node;
326  *p_matchlen = matchlen;
327  }
328 
329  node = node->nextChar;
330  }
331 
332  return result;
333 }
334 
336 Datum
338 {
339  List *dictoptions = (List *) PG_GETARG_POINTER(0);
340  TrieChar *rootTrie = NULL;
341  bool fileloaded = false;
342  ListCell *l;
343 
344  foreach(l, dictoptions)
345  {
346  DefElem *defel = (DefElem *) lfirst(l);
347 
348  if (strcmp(defel->defname, "rules") == 0)
349  {
350  if (fileloaded)
351  ereport(ERROR,
352  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
353  errmsg("multiple Rules parameters")));
354  rootTrie = initTrie(defGetString(defel));
355  fileloaded = true;
356  }
357  else
358  {
359  ereport(ERROR,
360  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
361  errmsg("unrecognized Unaccent parameter: \"%s\"",
362  defel->defname)));
363  }
364  }
365 
366  if (!fileloaded)
367  {
368  ereport(ERROR,
369  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
370  errmsg("missing Rules parameter")));
371  }
372 
373  PG_RETURN_POINTER(rootTrie);
374 }
375 
377 Datum
379 {
380  TrieChar *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
381  char *srcchar = (char *) PG_GETARG_POINTER(1);
383  char *srcstart = srcchar;
384  TSLexeme *res;
386 
387  /* we allocate storage for the buffer only if needed */
388  buf.data = NULL;
389 
390  while (len > 0)
391  {
392  TrieChar *node;
393  int matchlen;
394 
395  node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
396  &matchlen);
397  if (node && node->replaceTo)
398  {
399  if (buf.data == NULL)
400  {
401  /* initialize buffer */
403  /* insert any data we already skipped over */
404  if (srcchar != srcstart)
405  appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
406  }
408  }
409  else
410  {
411  matchlen = pg_mblen(srcchar);
412  if (buf.data != NULL)
413  appendBinaryStringInfo(&buf, srcchar, matchlen);
414  }
415 
416  srcchar += matchlen;
417  len -= matchlen;
418  }
419 
420  /* return a result only if we made at least one substitution */
421  if (buf.data != NULL)
422  {
423  res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);
424  res->lexeme = buf.data;
425  res->flags = TSL_FILTER;
426  }
427  else
428  res = NULL;
429 
431 }
432 
433 /*
434  * Function-like wrapper for dictionary
435  */
437 Datum
439 {
440  text *str;
441  int strArg;
442  Oid dictOid;
444  TSLexeme *res;
445 
446  if (PG_NARGS() == 1)
447  {
448  /*
449  * Use the "unaccent" dictionary that is in the same schema that this
450  * function is in.
451  */
452  Oid procnspid = get_func_namespace(fcinfo->flinfo->fn_oid);
453  const char *dictname = "unaccent";
454 
455  dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,
456  PointerGetDatum(dictname),
457  ObjectIdGetDatum(procnspid));
458  if (!OidIsValid(dictOid))
459  ereport(ERROR,
460  (errcode(ERRCODE_UNDEFINED_OBJECT),
461  errmsg("text search dictionary \"%s.%s\" does not exist",
462  get_namespace_name(procnspid), dictname)));
463  strArg = 0;
464  }
465  else
466  {
467  dictOid = PG_GETARG_OID(0);
468  strArg = 1;
469  }
470  str = PG_GETARG_TEXT_PP(strArg);
471 
472  dict = lookup_ts_dictionary_cache(dictOid);
473 
475  PointerGetDatum(dict->dictData),
478  PointerGetDatum(NULL)));
479 
480  PG_FREE_IF_COPY(str, strArg);
481 
482  if (res == NULL)
483  {
485  }
486  else if (res->lexeme == NULL)
487  {
488  pfree(res);
490  }
491  else
492  {
493  text *txt = cstring_to_text(res->lexeme);
494 
495  pfree(res->lexeme);
496  pfree(res);
497 
498  PG_RETURN_TEXT_P(txt);
499  }
500 }
signed int int32
Definition: c.h:481
#define OidIsValid(objectId)
Definition: c.h:762
char * defGetString(DefElem *def)
Definition: define.c:48
void FlushErrorState(void)
Definition: elog.c:1828
int errcode(int sqlerrcode)
Definition: elog.c:859
int errmsg(const char *fmt,...)
Definition: elog.c:1072
ErrorData * CopyErrorData(void)
Definition: elog.c:1723
#define PG_RE_THROW()
Definition: elog.h:411
#define PG_TRY(...)
Definition: elog.h:370
#define WARNING
Definition: elog.h:36
#define PG_END_TRY(...)
Definition: elog.h:395
#define ERROR
Definition: elog.h:39
#define PG_CATCH(...)
Definition: elog.h:380
#define ereport(elevel,...)
Definition: elog.h:149
#define PG_FREE_IF_COPY(ptr, n)
Definition: fmgr.h:260
#define PG_GETARG_OID(n)
Definition: fmgr.h:275
#define PG_GETARG_TEXT_PP(n)
Definition: fmgr.h:309
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276
#define PG_NARGS()
Definition: fmgr.h:203
#define FunctionCall4(flinfo, arg1, arg2, arg3, arg4)
Definition: fmgr.h:666
#define PG_RETURN_TEXT_P(x)
Definition: fmgr.h:372
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361
#define PG_GETARG_TEXT_P_COPY(n)
Definition: fmgr.h:315
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
int i
Definition: isn.c:73
Assert(fmt[strlen(fmt) - 1] !='\n')
char * get_namespace_name(Oid nspid)
Definition: lsyscache.c:3344
Oid get_func_namespace(Oid funcid)
Definition: lsyscache.c:1610
int pg_mblen(const char *mbstr)
Definition: mbutils.c:1023
void pfree(void *pointer)
Definition: mcxt.c:1508
void * palloc0(Size size)
Definition: mcxt.c:1334
MemoryContext CurrentMemoryContext
Definition: mcxt.c:131
void * palloc(Size size)
Definition: mcxt.c:1304
static const struct exclude_list_item skip[]
Definition: pg_checksums.c:108
const void size_t len
static char * filename
Definition: pg_dumpall.c:121
#define lfirst(lc)
Definition: pg_list.h:172
static char * buf
Definition: pg_test_fsync.c:73
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:322
uintptr_t Datum
Definition: postgres.h:64
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:252
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:312
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:212
unsigned int Oid
Definition: postgres_ext.h:31
MemoryContextSwitchTo(old_ctx)
void appendBinaryStringInfo(StringInfo str, const void *data, int datalen)
Definition: stringinfo.c:233
void initStringInfo(StringInfo str)
Definition: stringinfo.c:59
char * defname
Definition: parsenodes.h:811
int sqlerrcode
Definition: elog.h:438
Definition: pg_list.h:54
struct TrieChar * nextChar
Definition: unaccent.c:44
int replacelen
Definition: unaccent.c:46
char * replaceTo
Definition: unaccent.c:45
Definition: regguts.h:323
Definition: c.h:674
#define GetSysCacheOid2(cacheId, oidcol, key1, key2)
Definition: syscache.h:106
TSDictionaryCacheEntry * lookup_ts_dictionary_cache(Oid dictId)
Definition: ts_cache.c:208
bool tsearch_readline_begin(tsearch_readline_state *stp, const char *filename)
Definition: ts_locale.c:134
char * tsearch_readline(tsearch_readline_state *stp)
Definition: ts_locale.c:157
int t_isspace(const char *ptr)
Definition: ts_locale.c:50
void tsearch_readline_end(tsearch_readline_state *stp)
Definition: ts_locale.c:202
#define TSL_FILTER
Definition: ts_public.h:144
char * get_tsearch_config_filename(const char *basename, const char *extension)
Definition: ts_utils.c:33
static TrieChar * initTrie(const char *filename)
Definition: unaccent.c:96
Datum unaccent_init(PG_FUNCTION_ARGS)
Definition: unaccent.c:337
PG_FUNCTION_INFO_V1(unaccent_init)
PG_MODULE_MAGIC
Definition: unaccent.c:28
struct TrieChar TrieChar
Datum unaccent_lexize(PG_FUNCTION_ARGS)
Definition: unaccent.c:378
Datum unaccent_dict(PG_FUNCTION_ARGS)
Definition: unaccent.c:438
static TrieChar * placeChar(TrieChar *node, const unsigned char *str, int lenstr, const char *replaceTo, int replacelen)
Definition: unaccent.c:56
static TrieChar * findReplaceTo(TrieChar *node, const unsigned char *src, int srclen, int *p_matchlen)
Definition: unaccent.c:310
#define VARDATA_ANY(PTR)
Definition: varatt.h:324
#define VARSIZE_ANY_EXHDR(PTR)
Definition: varatt.h:317
text * cstring_to_text(const char *s)
Definition: varlena.c:184