PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
unaccent.c File Reference
#include "postgres.h"
#include "catalog/pg_ts_dict.h"
#include "commands/defrem.h"
#include "lib/stringinfo.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
#include "utils/syscache.h"
Include dependency graph for unaccent.c:

Go to the source code of this file.

Data Structures

struct  TrieChar
 

Typedefs

typedef struct TrieChar TrieChar
 

Functions

 PG_MODULE_MAGIC_EXT (.name="unaccent",.version=PG_VERSION)
 
static TrieChar * placeChar (TrieChar *node, const unsigned char *str, int lenstr, const char *replaceTo, int replacelen)
 
static TrieChar * initTrie (const char *filename)
 
static TrieChar * findReplaceTo (TrieChar *node, const unsigned char *src, int srclen, int *p_matchlen)
 
 PG_FUNCTION_INFO_V1 (unaccent_init)
 
Datum unaccent_init (PG_FUNCTION_ARGS)
 
 PG_FUNCTION_INFO_V1 (unaccent_lexize)
 
Datum unaccent_lexize (PG_FUNCTION_ARGS)
 
 PG_FUNCTION_INFO_V1 (unaccent_dict)
 
Datum unaccent_dict (PG_FUNCTION_ARGS)
 

Typedef Documentation

◆ TrieChar

typedef struct TrieChar TrieChar

Function Documentation

◆ findReplaceTo()

static TrieChar * findReplaceTo ( TrieChar *  node,
const unsigned char *  src,
int  srclen,
int *  p_matchlen 
)
static

Definition at line 311 of file unaccent.c.

/*
 * findReplaceTo --- find the longest prefix of src that has a replacement.
 *
 * Starting at "node", step through src one byte at a time for at most
 * srclen bytes; at each level the entry is selected by the byte value
 * (node + src[matchlen]).  Every time an entry with a non-NULL replaceTo
 * is passed, it is remembered together with the number of bytes consumed
 * so far, so the value finally returned is the deepest (longest) match.
 *
 * Returns the matching entry, or NULL if no prefix of src matched.
 * *p_matchlen receives the matched length in bytes; it is 0 whenever NULL
 * is returned.
 */
313{
314 TrieChar *result = NULL;
315 int matchlen = 0;
316
317 *p_matchlen = 0; /* prevent uninitialized-variable warnings */
318
319 while (node && matchlen < srclen)
320 {
/* select this level's entry for the current input byte */
321 node = node + src[matchlen];
322 matchlen++;
323
/* a later (longer) match overwrites an earlier one */
324 if (node->replaceTo)
325 {
326 result = node;
327 *p_matchlen = matchlen;
328 }
329
/* descend to the next level; a NULL nextChar terminates the loop */
330 node = node->nextChar;
331 }
332
333 return result;
334}
struct TrieChar * nextChar
Definition: unaccent.c:45
char * replaceTo
Definition: unaccent.c:46

References TrieChar::nextChar, and TrieChar::replaceTo.

Referenced by unaccent_lexize().

◆ initTrie()

static TrieChar * initTrie ( const char *  filename)
static

Definition at line 97 of file unaccent.c.

/*
 * initTrie --- build the replacement trie from a rules file.
 *
 * Lines are fetched with tsearch_readline(); each one is split into a
 * source string (src) and an optional target string (trg) by the state
 * machine documented inside the loop, and accepted rules are inserted with
 * placeChar().  The function returns the trie root, which is still NULL
 * if no rule was ever inserted.
 *
 * Outcome of parsing one line:
 *  - final state 1 or 2: trg was omitted, so the empty string is used;
 *  - final state 4: the closing quote never arrived, reported as state -2;
 *  - final state > 0: the rule is inserted;
 *  - final state -1 / -2: a message is reported and the rule is not
 *    inserted;
 *  - final state 0 (nothing but whitespace): neither inserted nor reported.
 *
 * Quoted targets: while scanning, trg/trglen span the opening and closing
 * quote characters.  A quote immediately followed by another quote is
 * consumed as a pair (the extra ptr++ / trglen += 1) so it does not end the
 * string.  Afterwards the copy loop skips the first and last byte of that
 * span and stores each doubled quote as a single quote.  Unquoted targets
 * are copied verbatim.  The per-line trgstore buffer and the line itself
 * are pfree'd at the end of every iteration.
 *
 * Error skipping: "skip" is set to true before entering PG_TRY and only
 * becomes false once tsearch_readline() has returned NULL.  If control
 * reaches the end of the PG_CATCH block without re-throwing, skip is still
 * true and the outer do/while runs the read loop again.  The catch block
 * re-throws every error whose sqlerrcode is not
 * ERRCODE_UNTRANSLATABLE_CHARACTER.
 *
 * NOTE(review): this listing is incomplete.  Source lines 100, 101, 104,
 * 106, 265, 269, 287, 291 and 299 are absent from the extract.  "ccxt" and
 * "trst" are used below but their declarations are not shown; the missing
 * lines presumably also hold the ereport(...) openers for the three
 * errcode()/errmsg() groups and the FlushErrorState() /
 * tsearch_readline_end() / get_tsearch_config_filename() calls named in the
 * page's reference list -- confirm against unaccent.c before relying on
 * error levels or cleanup order.
 */
98{
99 TrieChar *volatile rootTrie = NULL;
102 volatile bool skip;
103
105 if (!tsearch_readline_begin(&trst, filename))
/* NOTE(review): the call that wraps the next three lines (source line 106) is not in this extract */
107 (errcode(ERRCODE_CONFIG_FILE_ERROR),
108 errmsg("could not open unaccent file \"%s\": %m",
109 filename)));
110
111 do
112 {
113 /*
114 * pg_do_encoding_conversion() (called by tsearch_readline()) will
115 * emit exception if it finds untranslatable characters in current
116 * locale. We just skip such lines, continuing with the next.
117 */
118 skip = true;
119
120 PG_TRY();
121 {
122 char *line;
123
124 while ((line = tsearch_readline(&trst)) != NULL)
125 {
126 /*----------
127 * The format of each line must be "src" or "src trg", where
128 * src and trg are sequences of one or more non-whitespace
129 * characters, separated by whitespace. Whitespace at start
130 * or end of line is ignored. If trg is omitted, an empty
131 * string is used as the replacement. trg can be optionally
132 * quoted, in which case whitespaces are included in it.
133 *
134 * We use a simple state machine, with states
135 * 0 initial (before src)
136 * 1 in src
137 * 2 in whitespace after src
138 * 3 in trg (non-quoted)
139 * 4 in trg (quoted)
140 * 5 in whitespace after trg
141 * -1 syntax error detected (two strings)
142 * -2 syntax error detected (unfinished quoted string)
143 *----------
144 */
145 int state;
146 char *ptr;
147 char *src = NULL;
148 char *trg = NULL;
149 char *trgstore = NULL;
150 int ptrlen;
151 int srclen = 0;
152 int trglen = 0;
153 int trgstorelen = 0;
154 bool trgquoted = false;
155
156 state = 0;
/* advance one (possibly multibyte) character per iteration */
157 for (ptr = line; *ptr; ptr += ptrlen)
158 {
159 ptrlen = pg_mblen(ptr);
160 /* ignore whitespace, but end src or trg */
161 if (isspace((unsigned char) *ptr))
162 {
163 if (state == 1)
164 state = 2;
165 else if (state == 3)
166 state = 5;
167 /* whitespaces are OK in quoted area */
168 if (state != 4)
169 continue;
170 }
171 switch (state)
172 {
173 case 0:
174 /* start of src */
175 src = ptr;
176 srclen = ptrlen;
177 state = 1;
178 break;
179 case 1:
180 /* continue src */
181 srclen += ptrlen;
182 break;
183 case 2:
184 /* start of trg */
185 if (*ptr == '"')
186 {
187 trgquoted = true;
188 state = 4;
189 }
190 else
191 state = 3;
192
193 trg = ptr;
194 trglen = ptrlen;
195 break;
196 case 3:
197 /* continue non-quoted trg */
198 trglen += ptrlen;
199 break;
200 case 4:
201 /* continue quoted trg */
202 trglen += ptrlen;
203
204 /*
205 * If this is a quote, consider it as the end of
206 * trg except if the follow-up character is itself
207 * a quote.
208 */
209 if (*ptr == '"')
210 {
211 if (*(ptr + 1) == '"')
212 {
213 ptr++;
214 trglen += 1;
215 }
216 else
217 state = 5;
218 }
219 break;
220 default:
221 /* bogus line format */
222 state = -1;
223 break;
224 }
225 }
226
227 if (state == 1 || state == 2)
228 {
229 /* trg was omitted, so use "" */
230 trg = "";
231 trglen = 0;
232 }
233
234 /* If still in a quoted area, fallback to an error */
235 if (state == 4)
236 state = -2;
237
238 /* If trg was quoted, remove its quotes and unescape it */
239 if (trgquoted && state > 0)
240 {
241 /* Ignore first and end quotes */
242 trgstore = (char *) palloc(sizeof(char) * (trglen - 2));
243 trgstorelen = 0;
244 for (int i = 1; i < trglen - 1; i++)
245 {
246 trgstore[trgstorelen] = trg[i];
247 trgstorelen++;
248 /* skip second double quotes */
249 if (trg[i] == '"' && trg[i + 1] == '"')
250 i++;
251 }
252 }
253 else
254 {
255 trgstore = (char *) palloc(sizeof(char) * trglen);
256 trgstorelen = trglen;
257 memcpy(trgstore, trg, trgstorelen);
258 }
259
260 if (state > 0)
261 rootTrie = placeChar(rootTrie,
262 (unsigned char *) src, srclen,
263 trgstore, trgstorelen);
264 else if (state == -1)
266 (errcode(ERRCODE_CONFIG_FILE_ERROR),
267 errmsg("invalid syntax: more than two strings in unaccent rule")));
268 else if (state == -2)
270 (errcode(ERRCODE_CONFIG_FILE_ERROR),
271 errmsg("invalid syntax: unfinished quoted string in unaccent rule")));
272
273 pfree(trgstore);
274 pfree(line);
275 }
276 skip = false;
277 }
278 PG_CATCH();
279 {
280 ErrorData *errdata;
281 MemoryContext ecxt;
282
283 ecxt = MemoryContextSwitchTo(ccxt);
284 errdata = CopyErrorData();
285 if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
286 {
288 }
289 else
290 {
292 PG_RE_THROW();
293 }
294 }
295 PG_END_TRY();
296 }
297 while (skip);
298
300
301 return rootTrie;
302}
ErrorData * CopyErrorData(void)
Definition: elog.c:1751
void FlushErrorState(void)
Definition: elog.c:1872
int errcode(int sqlerrcode)
Definition: elog.c:854
int errmsg(const char *fmt,...)
Definition: elog.c:1071
#define PG_RE_THROW()
Definition: elog.h:404
#define PG_TRY(...)
Definition: elog.h:371
#define WARNING
Definition: elog.h:36
#define PG_END_TRY(...)
Definition: elog.h:396
#define ERROR
Definition: elog.h:39
#define PG_CATCH(...)
Definition: elog.h:381
#define ereport(elevel,...)
Definition: elog.h:149
int i
Definition: isn.c:77
int pg_mblen(const char *mbstr)
Definition: mbutils.c:1023
void pfree(void *pointer)
Definition: mcxt.c:2150
void * palloc(Size size)
Definition: mcxt.c:1943
MemoryContext CurrentMemoryContext
Definition: mcxt.c:159
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:124
static const struct exclude_list_item skip[]
Definition: pg_checksums.c:107
static char * filename
Definition: pg_dumpall.c:123
int sqlerrcode
Definition: elog.h:430
Definition: regguts.h:323
bool tsearch_readline_begin(tsearch_readline_state *stp, const char *filename)
Definition: ts_locale.c:89
char * tsearch_readline(tsearch_readline_state *stp)
Definition: ts_locale.c:112
void tsearch_readline_end(tsearch_readline_state *stp)
Definition: ts_locale.c:157
char * get_tsearch_config_filename(const char *basename, const char *extension)
Definition: ts_utils.c:34
static TrieChar * placeChar(TrieChar *node, const unsigned char *str, int lenstr, const char *replaceTo, int replacelen)
Definition: unaccent.c:57

References CopyErrorData(), CurrentMemoryContext, ereport, errcode(), errmsg(), ERROR, filename, FlushErrorState(), get_tsearch_config_filename(), i, MemoryContextSwitchTo(), palloc(), pfree(), PG_CATCH, PG_END_TRY, pg_mblen(), PG_RE_THROW, PG_TRY, placeChar(), skip, ErrorData::sqlerrcode, tsearch_readline(), tsearch_readline_begin(), tsearch_readline_end(), and WARNING.

Referenced by unaccent_init().

◆ PG_FUNCTION_INFO_V1() [1/3]

PG_FUNCTION_INFO_V1 ( unaccent_dict  )

◆ PG_FUNCTION_INFO_V1() [2/3]

PG_FUNCTION_INFO_V1 ( unaccent_init  )

◆ PG_FUNCTION_INFO_V1() [3/3]

PG_FUNCTION_INFO_V1 ( unaccent_lexize  )

◆ PG_MODULE_MAGIC_EXT()

PG_MODULE_MAGIC_EXT ( name = "unaccent",
version = PG_VERSION 
)

◆ placeChar()

static TrieChar * placeChar ( TrieChar *  node,
const unsigned char *  str,
int  lenstr,
const char *  replaceTo,
int  replacelen 
)
static

Definition at line 57 of file unaccent.c.

/*
 * placeChar --- insert one rule (str -> replaceTo) into the trie.
 *
 * node        current trie level, or NULL to have a new level allocated
 * str/lenstr  remaining bytes of the source string; lenstr must be > 0
 * replaceTo   replacement bytes, replacelen of them
 *
 * A level is an array of 256 zero-initialized TrieChar entries, and the
 * entry for this call is selected by the first byte of str.  When that is
 * the last byte, the replacement is stored in the entry; otherwise the
 * function recurses into the entry's nextChar level with the rest of str.
 *
 * Returns "node", which is the newly allocated level when NULL was passed.
 *
 * NOTE(review): source line 72 is absent from this extract; it presumably
 * opens the ereport(...) call around the "duplicate source strings" message
 * (the page's reference list names ereport and WARNING) -- confirm against
 * unaccent.c.
 */
59{
60 TrieChar *curnode;
61
62 if (!node)
63 node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);
64
65 Assert(lenstr > 0); /* else str[0] doesn't exist */
66
/* str is unsigned char, so *str indexes the 256-entry level directly */
67 curnode = node + *str;
68
69 if (lenstr <= 1)
70 {
/* an existing replacement is kept; the new one is not stored */
71 if (curnode->replaceTo)
73 (errcode(ERRCODE_CONFIG_FILE_ERROR),
74 errmsg("duplicate source strings, first one will be used")));
75 else
76 {
/* private copy of exactly replacelen bytes; no terminator is added here */
77 curnode->replacelen = replacelen;
78 curnode->replaceTo = (char *) palloc(replacelen);
79 memcpy(curnode->replaceTo, replaceTo, replacelen);
80 }
81 }
82 else
83 {
/* nextChar may be NULL here; the recursive call then allocates the level */
84 curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
85 replaceTo, replacelen);
86 }
87
88 return node;
89}
Assert(PointerIsAligned(start, uint64))
const char * str
void * palloc0(Size size)
Definition: mcxt.c:1973
int replacelen
Definition: unaccent.c:47

References Assert(), ereport, errcode(), errmsg(), TrieChar::nextChar, palloc(), palloc0(), placeChar(), TrieChar::replacelen, TrieChar::replaceTo, str, and WARNING.

Referenced by initTrie(), and placeChar().

◆ unaccent_dict()

Datum unaccent_dict ( PG_FUNCTION_ARGS  )

Definition at line 439 of file unaccent.c.

/*
 * unaccent_dict --- SQL-callable entry point that runs a string through a
 * text search dictionary's lexize function.
 *
 * Called with one argument, the string is argument 0 and the dictionary is
 * the one named "unaccent" in the schema that contains this function.
 * Otherwise argument 0 is the dictionary OID and argument 1 is the string.
 *
 * The dictionary is looked up with lookup_ts_dictionary_cache() and its
 * lexize function is invoked through FunctionCall4().  When the result has
 * a non-NULL lexeme, that lexeme is converted to text, the lexeme and the
 * result are pfree'd, and the text is returned.
 *
 * NOTE(review): source lines 444, 460, 476-478, 485 and 490 are absent from
 * this extract.  They presumably hold the declaration of "dict", the
 * ereport(ERROR, ...) opener, the first three FunctionCall4 arguments, and
 * the return statements for the two cases where there is no replacement
 * lexeme (the page's reference list names PG_GETARG_TEXT_P_COPY) -- confirm
 * against unaccent.c.
 */
440{
441 text *str;
442 int strArg;
443 Oid dictOid;
445 TSLexeme *res;
446
447 if (PG_NARGS() == 1)
448 {
449 /*
450 * Use the "unaccent" dictionary that is in the same schema that this
451 * function is in.
452 */
453 Oid procnspid = get_func_namespace(fcinfo->flinfo->fn_oid);
454 const char *dictname = "unaccent";
455
456 dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,
457 PointerGetDatum(dictname),
458 ObjectIdGetDatum(procnspid));
459 if (!OidIsValid(dictOid))
461 (errcode(ERRCODE_UNDEFINED_OBJECT),
462 errmsg("text search dictionary \"%s.%s\" does not exist",
463 get_namespace_name(procnspid), dictname)));
/* the string is the only argument */
464 strArg = 0;
465 }
466 else
467 {
/* explicit dictionary OID first, string second */
468 dictOid = PG_GETARG_OID(0);
469 strArg = 1;
470 }
471 str = PG_GETARG_TEXT_PP(strArg);
472
473 dict = lookup_ts_dictionary_cache(dictOid);
474
/* NOTE(review): the first three FunctionCall4 arguments (source lines 476-478) are not in this extract */
475 res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
479 PointerGetDatum(NULL)));
480
481 PG_FREE_IF_COPY(str, strArg);
482
/* three outcomes: no result array, a result with no lexeme, or a replacement lexeme */
483 if (res == NULL)
484 {
486 }
487 else if (res->lexeme == NULL)
488 {
489 pfree(res);
491 }
492 else
493 {
494 text *txt = cstring_to_text(res->lexeme);
495
496 pfree(res->lexeme);
497 pfree(res);
498
499 PG_RETURN_TEXT_P(txt);
500 }
501}
#define OidIsValid(objectId)
Definition: c.h:746
#define PG_FREE_IF_COPY(ptr, n)
Definition: fmgr.h:260
#define PG_GETARG_OID(n)
Definition: fmgr.h:275
#define PG_GETARG_TEXT_PP(n)
Definition: fmgr.h:309
#define PG_NARGS()
Definition: fmgr.h:203
#define FunctionCall4(flinfo, arg1, arg2, arg3, arg4)
Definition: fmgr.h:706
#define PG_RETURN_TEXT_P(x)
Definition: fmgr.h:372
#define PG_GETARG_TEXT_P_COPY(n)
Definition: fmgr.h:315
Oid get_func_namespace(Oid funcid)
Definition: lsyscache.c:1772
char * get_namespace_name(Oid nspid)
Definition: lsyscache.c:3506
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:327
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:257
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:317
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:217
unsigned int Oid
Definition: postgres_ext.h:30
char * lexeme
Definition: ts_public.h:138
Definition: c.h:658
#define GetSysCacheOid2(cacheId, oidcol, key1, key2)
Definition: syscache.h:111
TSDictionaryCacheEntry * lookup_ts_dictionary_cache(Oid dictId)
Definition: ts_cache.c:208
#define VARDATA_ANY(PTR)
Definition: varatt.h:324
#define VARSIZE_ANY_EXHDR(PTR)
Definition: varatt.h:317
text * cstring_to_text(const char *s)
Definition: varlena.c:192

References cstring_to_text(), DatumGetPointer(), TSDictionaryCacheEntry::dictData, ereport, errcode(), errmsg(), ERROR, FunctionCall4, get_func_namespace(), get_namespace_name(), GetSysCacheOid2, Int32GetDatum(), TSLexeme::lexeme, TSDictionaryCacheEntry::lexize, lookup_ts_dictionary_cache(), ObjectIdGetDatum(), OidIsValid, pfree(), PG_FREE_IF_COPY, PG_GETARG_OID, PG_GETARG_TEXT_P_COPY, PG_GETARG_TEXT_PP, PG_NARGS, PG_RETURN_TEXT_P, PointerGetDatum(), str, VARDATA_ANY, and VARSIZE_ANY_EXHDR.

◆ unaccent_init()

Datum unaccent_init ( PG_FUNCTION_ARGS  )

Definition at line 338 of file unaccent.c.

/*
 * unaccent_init --- dictionary init function: process the option list and
 * load the rules file.
 *
 * Argument 0 is a List of DefElem options.  The only option accepted is
 * "rules" (compared with strcmp, so the match is exact); its string value
 * is handed to initTrie().  A second "rules" option, any other option name,
 * or the absence of "rules" each produce the corresponding message below.
 *
 * Returns the trie root produced by initTrie() as a pointer Datum.
 *
 * NOTE(review): source lines 352, 360 and 369 are absent from this extract;
 * they presumably open the ereport(ERROR, ...) calls around the three
 * errcode()/errmsg() groups (the page's reference list names ereport and
 * ERROR) -- confirm against unaccent.c.
 */
339{
340 List *dictoptions = (List *) PG_GETARG_POINTER(0);
341 TrieChar *rootTrie = NULL;
342 bool fileloaded = false;
343 ListCell *l;
344
345 foreach(l, dictoptions)
346 {
347 DefElem *defel = (DefElem *) lfirst(l);
348
349 if (strcmp(defel->defname, "rules") == 0)
350 {
/* reject a repeated "rules" option before loading anything again */
351 if (fileloaded)
353 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
354 errmsg("multiple Rules parameters")));
355 rootTrie = initTrie(defGetString(defel));
356 fileloaded = true;
357 }
358 else
359 {
361 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
362 errmsg("unrecognized Unaccent parameter: \"%s\"",
363 defel->defname)));
364 }
365 }
366
/* "rules" is mandatory */
367 if (!fileloaded)
368 {
370 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
371 errmsg("missing Rules parameter")));
372 }
373
374 PG_RETURN_POINTER(rootTrie);
375}
char * defGetString(DefElem *def)
Definition: define.c:35
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361
#define lfirst(lc)
Definition: pg_list.h:172
char * defname
Definition: parsenodes.h:826
Definition: pg_list.h:54
static TrieChar * initTrie(const char *filename)
Definition: unaccent.c:97

References defGetString(), DefElem::defname, ereport, errcode(), errmsg(), ERROR, initTrie(), lfirst, PG_GETARG_POINTER, and PG_RETURN_POINTER.

◆ unaccent_lexize()

Datum unaccent_lexize ( PG_FUNCTION_ARGS  )

Definition at line 379 of file unaccent.c.

/*
 * unaccent_lexize --- dictionary lexize function: apply the trie's
 * replacements to the input string.
 *
 * Argument 0 is the trie root and argument 1 points at the input bytes.
 * The input is scanned left to right.  At each position findReplaceTo()
 * looks for the longest rule matching there; on a match the scan advances
 * by the matched length, otherwise by one character as measured by
 * pg_mblen().
 *
 * The output buffer is created lazily on the first match, at which point
 * everything skipped so far is copied into it; from then on unmatched
 * characters are appended as they are passed.  If no rule ever matched,
 * buf.data stays NULL and res is NULL.  Otherwise res is a zero-filled
 * array of two TSLexeme whose first entry carries the buffer as lexeme and
 * the TSL_FILTER flag (presumably the zeroed second entry terminates the
 * array -- confirm against ts_public.h).
 *
 * NOTE(review): source lines 383, 386, 403, 408 and 431 are absent from
 * this extract.  "len" and "buf" are used but their declarations are not
 * shown; the missing lines presumably also hold the initStringInfo() call,
 * the append of node->replaceTo, and the final return of res (the page's
 * reference list names PG_GETARG_INT32, initStringInfo,
 * TrieChar::replacelen and PG_RETURN_POINTER) -- confirm against
 * unaccent.c.
 */
380{
381 TrieChar *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
382 char *srcchar = (char *) PG_GETARG_POINTER(1);
/* remembers where the input begins, for the lazy prefix copy below */
384 char *srcstart = srcchar;
385 TSLexeme *res;
387
388 /* we allocate storage for the buffer only if needed */
389 buf.data = NULL;
390
391 while (len > 0)
392 {
393 TrieChar *node;
394 int matchlen;
395
396 node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
397 &matchlen);
398 if (node && node->replaceTo)
399 {
400 if (buf.data == NULL)
401 {
402 /* initialize buffer */
404 /* insert any data we already skipped over */
405 if (srcchar != srcstart)
406 appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
407 }
409 }
410 else
411 {
/* no rule here: step over one character, copying it only if output has started */
412 matchlen = pg_mblen(srcchar);
413 if (buf.data != NULL)
414 appendBinaryStringInfo(&buf, srcchar, matchlen);
415 }
416
417 srcchar += matchlen;
418 len -= matchlen;
419 }
420
421 /* return a result only if we made at least one substitution */
422 if (buf.data != NULL)
423 {
424 res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);
425 res->lexeme = buf.data;
426 res->flags = TSL_FILTER;
427 }
428 else
429 res = NULL;
430
432}
int32_t int32
Definition: c.h:498
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
const void size_t len
static char * buf
Definition: pg_test_fsync.c:72
void appendBinaryStringInfo(StringInfo str, const void *data, int datalen)
Definition: stringinfo.c:281
void initStringInfo(StringInfo str)
Definition: stringinfo.c:97
uint16 flags
Definition: ts_public.h:136
#define TSL_FILTER
Definition: ts_public.h:144
static TrieChar * findReplaceTo(TrieChar *node, const unsigned char *src, int srclen, int *p_matchlen)
Definition: unaccent.c:311

References appendBinaryStringInfo(), buf, findReplaceTo(), TSLexeme::flags, initStringInfo(), len, TSLexeme::lexeme, palloc0(), PG_GETARG_INT32, PG_GETARG_POINTER, pg_mblen(), PG_RETURN_POINTER, TrieChar::replacelen, TrieChar::replaceTo, and TSL_FILTER.