PostgreSQL Source Code git master
unaccent.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * unaccent.c
4 * Text search unaccent dictionary
5 *
6 * Copyright (c) 2009-2025, PostgreSQL Global Development Group
7 *
8 * IDENTIFICATION
9 * contrib/unaccent/unaccent.c
10 *
11 *-------------------------------------------------------------------------
12 */
13
14#include "postgres.h"
15
16#include "catalog/pg_ts_dict.h"
17#include "commands/defrem.h"
18#include "lib/stringinfo.h"
19#include "tsearch/ts_cache.h"
20#include "tsearch/ts_locale.h"
21#include "tsearch/ts_public.h"
22#include "utils/builtins.h"
23#include "utils/lsyscache.h"
24#include "utils/syscache.h"
25
27
28/*
29 * An unaccent dictionary uses a trie to find a string to replace. Each node
30 * of the trie is an array of 256 TrieChar structs; the N-th element of the
31 * array corresponds to next byte value N. That element can contain both a
32 * replacement string (to be used if the source string ends with this byte)
33 * and a link to another trie node (to be followed if there are more bytes).
34 *
35 * Note that the trie search logic pays no attention to multibyte character
36 * boundaries. This is OK as long as both the data entered into the trie and
37 * the data we're trying to look up are validly encoded; no partial-character
38 * matches will occur.
39 */
/*
 * One slot of a trie node.  A trie node is an array of 256 of these, indexed
 * by the next byte of the source string.
 */
typedef struct TrieChar
{
	struct TrieChar *nextChar;	/* node to follow for further bytes, or NULL */
	char	   *replaceTo;		/* replacement bytes if a source string ends
								 * at this byte, else NULL; not
								 * NUL-terminated */
	int			replacelen;		/* number of bytes in replaceTo */
} TrieChar;
46
47/*
48 * placeChar - put str into trie's structure, byte by byte.
49 *
50 * If node is NULL, we need to make a new node, which will be returned;
51 * otherwise the return value is the same as node.
52 */
53static TrieChar *
54placeChar(TrieChar *node, const unsigned char *str, int lenstr,
55 const char *replaceTo, int replacelen)
56{
57 TrieChar *curnode;
58
59 if (!node)
60 node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);
61
62 Assert(lenstr > 0); /* else str[0] doesn't exist */
63
64 curnode = node + *str;
65
66 if (lenstr <= 1)
67 {
68 if (curnode->replaceTo)
70 (errcode(ERRCODE_CONFIG_FILE_ERROR),
71 errmsg("duplicate source strings, first one will be used")));
72 else
73 {
74 curnode->replacelen = replacelen;
75 curnode->replaceTo = (char *) palloc(replacelen);
76 memcpy(curnode->replaceTo, replaceTo, replacelen);
77 }
78 }
79 else
80 {
81 curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
83 }
84
85 return node;
86}
87
88/*
89 * initTrie - create trie from file.
90 *
91 * Function converts UTF8-encoded file into current encoding.
92 */
93static TrieChar *
94initTrie(const char *filename)
95{
96 TrieChar *volatile rootTrie = NULL;
99 volatile bool skip;
100
102 if (!tsearch_readline_begin(&trst, filename))
104 (errcode(ERRCODE_CONFIG_FILE_ERROR),
105 errmsg("could not open unaccent file \"%s\": %m",
106 filename)));
107
108 do
109 {
110 /*
111 * pg_do_encoding_conversion() (called by tsearch_readline()) will
112 * emit exception if it finds untranslatable characters in current
113 * locale. We just skip such lines, continuing with the next.
114 */
115 skip = true;
116
117 PG_TRY();
118 {
119 char *line;
120
121 while ((line = tsearch_readline(&trst)) != NULL)
122 {
123 /*----------
124 * The format of each line must be "src" or "src trg", where
125 * src and trg are sequences of one or more non-whitespace
126 * characters, separated by whitespace. Whitespace at start
127 * or end of line is ignored. If trg is omitted, an empty
128 * string is used as the replacement. trg can be optionally
129 * quoted, in which case whitespaces are included in it.
130 *
131 * We use a simple state machine, with states
132 * 0 initial (before src)
133 * 1 in src
134 * 2 in whitespace after src
135 * 3 in trg (non-quoted)
136 * 4 in trg (quoted)
137 * 5 in whitespace after trg
138 * -1 syntax error detected (two strings)
139 * -2 syntax error detected (unfinished quoted string)
140 *----------
141 */
142 int state;
143 char *ptr;
144 char *src = NULL;
145 char *trg = NULL;
146 char *trgstore = NULL;
147 int ptrlen;
148 int srclen = 0;
149 int trglen = 0;
150 int trgstorelen = 0;
151 bool trgquoted = false;
152
153 state = 0;
154 for (ptr = line; *ptr; ptr += ptrlen)
155 {
156 ptrlen = pg_mblen(ptr);
157 /* ignore whitespace, but end src or trg */
158 if (isspace((unsigned char) *ptr))
159 {
160 if (state == 1)
161 state = 2;
162 else if (state == 3)
163 state = 5;
164 /* whitespaces are OK in quoted area */
165 if (state != 4)
166 continue;
167 }
168 switch (state)
169 {
170 case 0:
171 /* start of src */
172 src = ptr;
173 srclen = ptrlen;
174 state = 1;
175 break;
176 case 1:
177 /* continue src */
178 srclen += ptrlen;
179 break;
180 case 2:
181 /* start of trg */
182 if (*ptr == '"')
183 {
184 trgquoted = true;
185 state = 4;
186 }
187 else
188 state = 3;
189
190 trg = ptr;
191 trglen = ptrlen;
192 break;
193 case 3:
194 /* continue non-quoted trg */
195 trglen += ptrlen;
196 break;
197 case 4:
198 /* continue quoted trg */
199 trglen += ptrlen;
200
201 /*
202 * If this is a quote, consider it as the end of
203 * trg except if the follow-up character is itself
204 * a quote.
205 */
206 if (*ptr == '"')
207 {
208 if (*(ptr + 1) == '"')
209 {
210 ptr++;
211 trglen += 1;
212 }
213 else
214 state = 5;
215 }
216 break;
217 default:
218 /* bogus line format */
219 state = -1;
220 break;
221 }
222 }
223
224 if (state == 1 || state == 2)
225 {
226 /* trg was omitted, so use "" */
227 trg = "";
228 trglen = 0;
229 }
230
231 /* If still in a quoted area, fallback to an error */
232 if (state == 4)
233 state = -2;
234
235 /* If trg was quoted, remove its quotes and unescape it */
236 if (trgquoted && state > 0)
237 {
238 /* Ignore first and end quotes */
239 trgstore = (char *) palloc(sizeof(char) * (trglen - 2));
240 trgstorelen = 0;
241 for (int i = 1; i < trglen - 1; i++)
242 {
243 trgstore[trgstorelen] = trg[i];
244 trgstorelen++;
245 /* skip second double quotes */
246 if (trg[i] == '"' && trg[i + 1] == '"')
247 i++;
248 }
249 }
250 else
251 {
252 trgstore = (char *) palloc(sizeof(char) * trglen);
253 trgstorelen = trglen;
254 memcpy(trgstore, trg, trgstorelen);
255 }
256
257 if (state > 0)
258 rootTrie = placeChar(rootTrie,
259 (unsigned char *) src, srclen,
260 trgstore, trgstorelen);
261 else if (state == -1)
263 (errcode(ERRCODE_CONFIG_FILE_ERROR),
264 errmsg("invalid syntax: more than two strings in unaccent rule")));
265 else if (state == -2)
267 (errcode(ERRCODE_CONFIG_FILE_ERROR),
268 errmsg("invalid syntax: unfinished quoted string in unaccent rule")));
269
270 pfree(trgstore);
271 pfree(line);
272 }
273 skip = false;
274 }
275 PG_CATCH();
276 {
277 ErrorData *errdata;
278 MemoryContext ecxt;
279
280 ecxt = MemoryContextSwitchTo(ccxt);
281 errdata = CopyErrorData();
282 if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
283 {
285 }
286 else
287 {
289 PG_RE_THROW();
290 }
291 }
292 PG_END_TRY();
293 }
294 while (skip);
295
297
298 return rootTrie;
299}
300
301/*
302 * findReplaceTo - find longest possible match in trie
303 *
304 * On success, returns pointer to ending subnode, plus length of matched
305 * source string in *p_matchlen. On failure, returns NULL.
306 */
307static TrieChar *
308findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
309 int *p_matchlen)
310{
311 TrieChar *result = NULL;
312 int matchlen = 0;
313
314 *p_matchlen = 0; /* prevent uninitialized-variable warnings */
315
316 while (node && matchlen < srclen)
317 {
318 node = node + src[matchlen];
319 matchlen++;
320
321 if (node->replaceTo)
322 {
323 result = node;
324 *p_matchlen = matchlen;
325 }
326
327 node = node->nextChar;
328 }
329
330 return result;
331}
332
334Datum
336{
337 List *dictoptions = (List *) PG_GETARG_POINTER(0);
338 TrieChar *rootTrie = NULL;
339 bool fileloaded = false;
340 ListCell *l;
341
342 foreach(l, dictoptions)
343 {
344 DefElem *defel = (DefElem *) lfirst(l);
345
346 if (strcmp(defel->defname, "rules") == 0)
347 {
348 if (fileloaded)
350 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
351 errmsg("multiple Rules parameters")));
352 rootTrie = initTrie(defGetString(defel));
353 fileloaded = true;
354 }
355 else
356 {
358 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
359 errmsg("unrecognized Unaccent parameter: \"%s\"",
360 defel->defname)));
361 }
362 }
363
364 if (!fileloaded)
365 {
367 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
368 errmsg("missing Rules parameter")));
369 }
370
371 PG_RETURN_POINTER(rootTrie);
372}
373
375Datum
377{
378 TrieChar *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
379 char *srcchar = (char *) PG_GETARG_POINTER(1);
381 char *srcstart = srcchar;
382 TSLexeme *res;
384
385 /* we allocate storage for the buffer only if needed */
386 buf.data = NULL;
387
388 while (len > 0)
389 {
390 TrieChar *node;
391 int matchlen;
392
393 node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
394 &matchlen);
395 if (node && node->replaceTo)
396 {
397 if (buf.data == NULL)
398 {
399 /* initialize buffer */
401 /* insert any data we already skipped over */
402 if (srcchar != srcstart)
403 appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
404 }
406 }
407 else
408 {
409 matchlen = pg_mblen(srcchar);
410 if (buf.data != NULL)
411 appendBinaryStringInfo(&buf, srcchar, matchlen);
412 }
413
414 srcchar += matchlen;
415 len -= matchlen;
416 }
417
418 /* return a result only if we made at least one substitution */
419 if (buf.data != NULL)
420 {
421 res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);
422 res->lexeme = buf.data;
423 res->flags = TSL_FILTER;
424 }
425 else
426 res = NULL;
427
429}
430
431/*
432 * Function-like wrapper for dictionary
433 */
435Datum
437{
438 text *str;
439 int strArg;
440 Oid dictOid;
442 TSLexeme *res;
443
444 if (PG_NARGS() == 1)
445 {
446 /*
447 * Use the "unaccent" dictionary that is in the same schema that this
448 * function is in.
449 */
450 Oid procnspid = get_func_namespace(fcinfo->flinfo->fn_oid);
451 const char *dictname = "unaccent";
452
453 dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,
454 PointerGetDatum(dictname),
455 ObjectIdGetDatum(procnspid));
456 if (!OidIsValid(dictOid))
458 (errcode(ERRCODE_UNDEFINED_OBJECT),
459 errmsg("text search dictionary \"%s.%s\" does not exist",
460 get_namespace_name(procnspid), dictname)));
461 strArg = 0;
462 }
463 else
464 {
465 dictOid = PG_GETARG_OID(0);
466 strArg = 1;
467 }
468 str = PG_GETARG_TEXT_PP(strArg);
469
470 dict = lookup_ts_dictionary_cache(dictOid);
471
476 PointerGetDatum(NULL)));
477
478 PG_FREE_IF_COPY(str, strArg);
479
480 if (res == NULL)
481 {
483 }
484 else if (res->lexeme == NULL)
485 {
486 pfree(res);
488 }
489 else
490 {
491 text *txt = cstring_to_text(res->lexeme);
492
493 pfree(res->lexeme);
494 pfree(res);
495
496 PG_RETURN_TEXT_P(txt);
497 }
498}
#define Assert(condition)
Definition: c.h:815
int32_t int32
Definition: c.h:484
#define OidIsValid(objectId)
Definition: c.h:732
char * defGetString(DefElem *def)
Definition: define.c:35
ErrorData * CopyErrorData(void)
Definition: elog.c:1746
void FlushErrorState(void)
Definition: elog.c:1867
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define PG_RE_THROW()
Definition: elog.h:412
#define PG_TRY(...)
Definition: elog.h:371
#define WARNING
Definition: elog.h:36
#define PG_END_TRY(...)
Definition: elog.h:396
#define ERROR
Definition: elog.h:39
#define PG_CATCH(...)
Definition: elog.h:381
#define ereport(elevel,...)
Definition: elog.h:149
#define PG_FREE_IF_COPY(ptr, n)
Definition: fmgr.h:260
#define PG_GETARG_OID(n)
Definition: fmgr.h:275
#define PG_GETARG_TEXT_PP(n)
Definition: fmgr.h:309
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276
#define PG_NARGS()
Definition: fmgr.h:203
#define FunctionCall4(flinfo, arg1, arg2, arg3, arg4)
Definition: fmgr.h:665
#define PG_RETURN_TEXT_P(x)
Definition: fmgr.h:372
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361
#define PG_GETARG_TEXT_P_COPY(n)
Definition: fmgr.h:315
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
const char * str
int i
Definition: isn.c:72
Oid get_func_namespace(Oid funcid)
Definition: lsyscache.c:1632
char * get_namespace_name(Oid nspid)
Definition: lsyscache.c:3366
int pg_mblen(const char *mbstr)
Definition: mbutils.c:1023
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc0(Size size)
Definition: mcxt.c:1347
void * palloc(Size size)
Definition: mcxt.c:1317
MemoryContext CurrentMemoryContext
Definition: mcxt.c:143
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:124
static const struct exclude_list_item skip[]
Definition: pg_checksums.c:107
const void size_t len
static char * filename
Definition: pg_dumpall.c:119
#define lfirst(lc)
Definition: pg_list.h:172
static char * buf
Definition: pg_test_fsync.c:72
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:327
uintptr_t Datum
Definition: postgres.h:69
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:257
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:317
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:217
unsigned int Oid
Definition: postgres_ext.h:32
void appendBinaryStringInfo(StringInfo str, const void *data, int datalen)
Definition: stringinfo.c:281
void initStringInfo(StringInfo str)
Definition: stringinfo.c:97
char * defname
Definition: parsenodes.h:826
int sqlerrcode
Definition: elog.h:439
Definition: pg_list.h:54
struct TrieChar * nextChar
Definition: unaccent.c:42
int replacelen
Definition: unaccent.c:44
char * replaceTo
Definition: unaccent.c:43
Definition: regguts.h:323
Definition: c.h:644
#define GetSysCacheOid2(cacheId, oidcol, key1, key2)
Definition: syscache.h:111
TSDictionaryCacheEntry * lookup_ts_dictionary_cache(Oid dictId)
Definition: ts_cache.c:208
bool tsearch_readline_begin(tsearch_readline_state *stp, const char *filename)
Definition: ts_locale.c:89
char * tsearch_readline(tsearch_readline_state *stp)
Definition: ts_locale.c:112
void tsearch_readline_end(tsearch_readline_state *stp)
Definition: ts_locale.c:157
#define TSL_FILTER
Definition: ts_public.h:144
char * get_tsearch_config_filename(const char *basename, const char *extension)
Definition: ts_utils.c:34
static TrieChar * initTrie(const char *filename)
Definition: unaccent.c:94
Datum unaccent_init(PG_FUNCTION_ARGS)
Definition: unaccent.c:335
PG_FUNCTION_INFO_V1(unaccent_init)
PG_MODULE_MAGIC
Definition: unaccent.c:26
struct TrieChar TrieChar
Datum unaccent_lexize(PG_FUNCTION_ARGS)
Definition: unaccent.c:376
static TrieChar * placeChar(TrieChar *node, const unsigned char *str, int lenstr, const char *replaceTo, int replacelen)
Definition: unaccent.c:54
Datum unaccent_dict(PG_FUNCTION_ARGS)
Definition: unaccent.c:436
static TrieChar * findReplaceTo(TrieChar *node, const unsigned char *src, int srclen, int *p_matchlen)
Definition: unaccent.c:308
#define VARDATA_ANY(PTR)
Definition: varatt.h:324
#define VARSIZE_ANY_EXHDR(PTR)
Definition: varatt.h:317
text * cstring_to_text(const char *s)
Definition: varlena.c:184