PostgreSQL Source Code  git master
unaccent.c File Reference
#include "postgres.h"
#include "catalog/namespace.h"
#include "catalog/pg_ts_dict.h"
#include "commands/defrem.h"
#include "lib/stringinfo.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
#include "utils/regproc.h"
#include "utils/syscache.h"
Include dependency graph for unaccent.c:

Go to the source code of this file.

Data Structures

struct  TrieChar
 

Typedefs

typedef struct TrieChar TrieChar
 

Functions

static TrieChar * placeChar (TrieChar *node, const unsigned char *str, int lenstr, const char *replaceTo, int replacelen)
 
static TrieChar * initTrie (const char *filename)
 
static TrieChar * findReplaceTo (TrieChar *node, const unsigned char *src, int srclen, int *p_matchlen)
 
 PG_FUNCTION_INFO_V1 (unaccent_init)
 
Datum unaccent_init (PG_FUNCTION_ARGS)
 
 PG_FUNCTION_INFO_V1 (unaccent_lexize)
 
Datum unaccent_lexize (PG_FUNCTION_ARGS)
 
 PG_FUNCTION_INFO_V1 (unaccent_dict)
 
Datum unaccent_dict (PG_FUNCTION_ARGS)
 

Variables

 PG_MODULE_MAGIC
 

Typedef Documentation

◆ TrieChar

typedef struct TrieChar TrieChar

Function Documentation

◆ findReplaceTo()

static TrieChar* findReplaceTo ( TrieChar *  node,
const unsigned char *  src,
int  srclen,
int *  p_matchlen 
)
static

Definition at line 244 of file unaccent.c.

246 {
247  TrieChar *result = NULL;
248  int matchlen = 0;
249 
250  *p_matchlen = 0; /* prevent uninitialized-variable warnings */
251 
252  while (node && matchlen < srclen)
253  {
254  node = node + src[matchlen];
255  matchlen++;
256 
257  if (node->replaceTo)
258  {
259  result = node;
260  *p_matchlen = matchlen;
261  }
262 
263  node = node->nextChar;
264  }
265 
266  return result;
267 }
struct TrieChar * nextChar
Definition: unaccent.c:44
char * replaceTo
Definition: unaccent.c:45

References TrieChar::nextChar, and TrieChar::replaceTo.

Referenced by unaccent_lexize().

◆ initTrie()

static TrieChar* initTrie ( const char *  filename)
static

Definition at line 96 of file unaccent.c.

97 {
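98  TrieChar *volatile rootTrie = NULL;
99  MemoryContext ccxt = CurrentMemoryContext;
100  tsearch_readline_state trst;
101  volatile bool skip;
102 
103  filename = get_tsearch_config_filename(filename, "rules");
104  if (!tsearch_readline_begin(&trst, filename))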
105  ereport(ERROR,
106  (errcode(ERRCODE_CONFIG_FILE_ERROR),
107  errmsg("could not open unaccent file \"%s\": %m",
108  filename)));
109 
110  do
111  {
112  /*
113  * pg_do_encoding_conversion() (called by tsearch_readline()) will
114  * emit exception if it finds untranslatable characters in current
115  * locale. We just skip such lines, continuing with the next.
116  */
117  skip = true;
118 
119  PG_TRY();
120  {
121  char *line;
122 
123  while ((line = tsearch_readline(&trst)) != NULL)
124  {
125  /*----------
126  * The format of each line must be "src" or "src trg", where
127  * src and trg are sequences of one or more non-whitespace
128  * characters, separated by whitespace. Whitespace at start
129  * or end of line is ignored. If trg is omitted, an empty
130  * string is used as the replacement.
131  *
132  * We use a simple state machine, with states
133  * 0 initial (before src)
134  * 1 in src
135  * 2 in whitespace after src
136  * 3 in trg
137  * 4 in whitespace after trg
138  * -1 syntax error detected
139  *----------
140  */
141  int state;
142  char *ptr;
143  char *src = NULL;
144  char *trg = NULL;
145  int ptrlen;
146  int srclen = 0;
147  int trglen = 0;
148 
149  state = 0;
150  for (ptr = line; *ptr; ptr += ptrlen)
151  {
152  ptrlen = pg_mblen(ptr);
153  /* ignore whitespace, but end src or trg */
154  if (t_isspace(ptr))
155  {
156  if (state == 1)
157  state = 2;
158  else if (state == 3)
159  state = 4;
160  continue;
161  }
162  switch (state)
163  {
164  case 0:
165  /* start of src */
166  src = ptr;
167  srclen = ptrlen;
168  state = 1;
169  break;
170  case 1:
171  /* continue src */
172  srclen += ptrlen;
173  break;
174  case 2:
175  /* start of trg */
176  trg = ptr;
177  trglen = ptrlen;
178  state = 3;
179  break;
180  case 3:
181  /* continue trg */
182  trglen += ptrlen;
183  break;
184  default:
185  /* bogus line format */
186  state = -1;
187  break;
188  }
189  }
190 
191  if (state == 1 || state == 2)
192  {
193  /* trg was omitted, so use "" */
194  trg = "";
195  trglen = 0;
196  }
197 
198  if (state > 0)
199  rootTrie = placeChar(rootTrie,
200  (unsigned char *) src, srclen,
201  trg, trglen);
202  else if (state < 0)
203  ereport(WARNING,
204  (errcode(ERRCODE_CONFIG_FILE_ERROR),
205  errmsg("invalid syntax: more than two strings in unaccent rule")));
206 
207  pfree(line);
208  }
209  skip = false;
210  }
211  PG_CATCH();
212  {
213  ErrorData *errdata;
214  MemoryContext ecxt;
215 
216  ecxt = MemoryContextSwitchTo(ccxt);
217  errdata = CopyErrorData();
218  if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
219  {
220  FlushErrorState();
221  }
222  else
223  {
224  MemoryContextSwitchTo(ecxt);
225  PG_RE_THROW();
226  }
227  }
228  PG_END_TRY();
229  }
230  while (skip);
231 
232  tsearch_readline_end(&trst);
233 
234  return rootTrie;
235 }
void FlushErrorState(void)
Definition: elog.c:1825
int errcode(int sqlerrcode)
Definition: elog.c:858
int errmsg(const char *fmt,...)
Definition: elog.c:1069
ErrorData * CopyErrorData(void)
Definition: elog.c:1720
#define PG_RE_THROW()
Definition: elog.h:411
#define PG_TRY(...)
Definition: elog.h:370
#define WARNING
Definition: elog.h:36
#define PG_END_TRY(...)
Definition: elog.h:395
#define ERROR
Definition: elog.h:39
#define PG_CATCH(...)
Definition: elog.h:380
#define ereport(elevel,...)
Definition: elog.h:149
int pg_mblen(const char *mbstr)
Definition: mbutils.c:1024
void pfree(void *pointer)
Definition: mcxt.c:1456
MemoryContext CurrentMemoryContext
Definition: mcxt.c:135
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:138
static const struct exclude_list_item skip[]
Definition: pg_checksums.c:116
static char * filename
Definition: pg_dumpall.c:119
int sqlerrcode
Definition: elog.h:438
Definition: regguts.h:323
bool tsearch_readline_begin(tsearch_readline_state *stp, const char *filename)
Definition: ts_locale.c:136
char * tsearch_readline(tsearch_readline_state *stp)
Definition: ts_locale.c:159
int t_isspace(const char *ptr)
Definition: ts_locale.c:52
void tsearch_readline_end(tsearch_readline_state *stp)
Definition: ts_locale.c:204
char * get_tsearch_config_filename(const char *basename, const char *extension)
Definition: ts_utils.c:33
static TrieChar * placeChar(TrieChar *node, const unsigned char *str, int lenstr, const char *replaceTo, int replacelen)
Definition: unaccent.c:56

References CopyErrorData(), CurrentMemoryContext, ereport, errcode(), errmsg(), ERROR, filename, FlushErrorState(), get_tsearch_config_filename(), MemoryContextSwitchTo(), pfree(), PG_CATCH, PG_END_TRY, pg_mblen(), PG_RE_THROW, PG_TRY, placeChar(), skip, ErrorData::sqlerrcode, t_isspace(), tsearch_readline(), tsearch_readline_begin(), tsearch_readline_end(), and WARNING.

Referenced by unaccent_init().

◆ PG_FUNCTION_INFO_V1() [1/3]

PG_FUNCTION_INFO_V1 ( unaccent_dict  )

◆ PG_FUNCTION_INFO_V1() [2/3]

PG_FUNCTION_INFO_V1 ( unaccent_init  )

◆ PG_FUNCTION_INFO_V1() [3/3]

PG_FUNCTION_INFO_V1 ( unaccent_lexize  )

◆ placeChar()

static TrieChar* placeChar ( TrieChar *  node,
const unsigned char *  str,
int  lenstr,
const char *  replaceTo,
int  replacelen 
)
static

Definition at line 56 of file unaccent.c.

58 {
59  TrieChar *curnode;
60 
61  if (!node)
62  node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);
63 
64  Assert(lenstr > 0); /* else str[0] doesn't exist */
65 
66  curnode = node + *str;
67 
68  if (lenstr <= 1)
69  {
70  if (curnode->replaceTo)
71  ereport(WARNING,
72  (errcode(ERRCODE_CONFIG_FILE_ERROR),
73  errmsg("duplicate source strings, first one will be used")));
74  else
75  {
76  curnode->replacelen = replacelen;
77  curnode->replaceTo = (char *) palloc(replacelen);
78  memcpy(curnode->replaceTo, replaceTo, replacelen);
79  }
80  }
81  else
82  {
83  curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
84  replaceTo, replacelen);
85  }
86 
87  return node;
88 }
Assert(fmt[strlen(fmt) - 1] !='\n')
void * palloc0(Size size)
Definition: mcxt.c:1257
void * palloc(Size size)
Definition: mcxt.c:1226
int replacelen
Definition: unaccent.c:46

References Assert(), ereport, errcode(), errmsg(), TrieChar::nextChar, palloc(), palloc0(), TrieChar::replacelen, TrieChar::replaceTo, generate_unaccent_rules::str, and WARNING.

Referenced by initTrie().

◆ unaccent_dict()

Datum unaccent_dict ( PG_FUNCTION_ARGS  )

Definition at line 372 of file unaccent.c.

373 {
374  text *str;
375  int strArg;
376  Oid dictOid;
377  TSDictionaryCacheEntry *dict;
378  TSLexeme *res;
379 
380  if (PG_NARGS() == 1)
381  {
382  /*
383  * Use the "unaccent" dictionary that is in the same schema that this
384  * function is in.
385  */
386  Oid procnspid = get_func_namespace(fcinfo->flinfo->fn_oid);
387  const char *dictname = "unaccent";
388 
389  dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,
390  PointerGetDatum(dictname),
391  ObjectIdGetDatum(procnspid));
392  if (!OidIsValid(dictOid))
393  ereport(ERROR,
394  (errcode(ERRCODE_UNDEFINED_OBJECT),
395  errmsg("text search dictionary \"%s.%s\" does not exist",
396  get_namespace_name(procnspid), dictname)));
397  strArg = 0;
398  }
399  else
400  {
401  dictOid = PG_GETARG_OID(0);
402  strArg = 1;
403  }
404  str = PG_GETARG_TEXT_PP(strArg);
405 
406  dict = lookup_ts_dictionary_cache(dictOid);
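407 
408  res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
409  PointerGetDatum(dict->dictData),
410  PointerGetDatum(VARDATA_ANY(str)),
411  Int32GetDatum(VARSIZE_ANY_EXHDR(str)),
412  PointerGetDatum(NULL)));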
413 
414  PG_FREE_IF_COPY(str, strArg);
415 
416  if (res == NULL)
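417  {
418  PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
419  }
420  else if (res->lexeme == NULL)
421  {
422  pfree(res);
423  PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
424  }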
425  else
426  {
427  text *txt = cstring_to_text(res->lexeme);
428 
429  pfree(res->lexeme);
430  pfree(res);
431 
432  PG_RETURN_TEXT_P(txt);
433  }
434 }
#define OidIsValid(objectId)
Definition: c.h:759
#define PG_FREE_IF_COPY(ptr, n)
Definition: fmgr.h:260
#define PG_GETARG_OID(n)
Definition: fmgr.h:275
#define PG_GETARG_TEXT_PP(n)
Definition: fmgr.h:309
#define PG_NARGS()
Definition: fmgr.h:203
#define FunctionCall4(flinfo, arg1, arg2, arg3, arg4)
Definition: fmgr.h:666
#define PG_RETURN_TEXT_P(x)
Definition: fmgr.h:372
#define PG_GETARG_TEXT_P_COPY(n)
Definition: fmgr.h:315
char * get_namespace_name(Oid nspid)
Definition: lsyscache.c:3324
Oid get_func_namespace(Oid funcid)
Definition: lsyscache.c:1614
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:322
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:252
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:312
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:212
unsigned int Oid
Definition: postgres_ext.h:31
Definition: c.h:671
@ TSDICTNAMENSP
Definition: syscache.h:107
#define GetSysCacheOid2(cacheId, oidcol, key1, key2)
Definition: syscache.h:202
TSDictionaryCacheEntry * lookup_ts_dictionary_cache(Oid dictId)
Definition: ts_cache.c:208
#define VARDATA_ANY(PTR)
Definition: varatt.h:324
#define VARSIZE_ANY_EXHDR(PTR)
Definition: varatt.h:317
text * cstring_to_text(const char *s)
Definition: varlena.c:182

References cstring_to_text(), DatumGetPointer(), TSDictionaryCacheEntry::dictData, ereport, errcode(), errmsg(), ERROR, FunctionCall4, get_func_namespace(), get_namespace_name(), GetSysCacheOid2, Int32GetDatum(), TSDictionaryCacheEntry::lexize, lookup_ts_dictionary_cache(), ObjectIdGetDatum(), OidIsValid, pfree(), PG_FREE_IF_COPY, PG_GETARG_OID, PG_GETARG_TEXT_P_COPY, PG_GETARG_TEXT_PP, PG_NARGS, PG_RETURN_TEXT_P, PointerGetDatum(), res, generate_unaccent_rules::str, TSDICTNAMENSP, VARDATA_ANY, and VARSIZE_ANY_EXHDR.

◆ unaccent_init()

Datum unaccent_init ( PG_FUNCTION_ARGS  )

Definition at line 271 of file unaccent.c.

272 {
273  List *dictoptions = (List *) PG_GETARG_POINTER(0);
274  TrieChar *rootTrie = NULL;
275  bool fileloaded = false;
276  ListCell *l;
277 
278  foreach(l, dictoptions)
279  {
280  DefElem *defel = (DefElem *) lfirst(l);
281 
282  if (strcmp(defel->defname, "rules") == 0)
283  {
284  if (fileloaded)
285  ereport(ERROR,
286  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
287  errmsg("multiple Rules parameters")));
288  rootTrie = initTrie(defGetString(defel));
289  fileloaded = true;
290  }
291  else
292  {
293  ereport(ERROR,
294  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
295  errmsg("unrecognized Unaccent parameter: \"%s\"",
296  defel->defname)));
297  }
298  }
299 
300  if (!fileloaded)
301  {
302  ereport(ERROR,
303  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
304  errmsg("missing Rules parameter")));
305  }
306 
307  PG_RETURN_POINTER(rootTrie);
308 }
char * defGetString(DefElem *def)
Definition: define.c:49
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361
#define lfirst(lc)
Definition: pg_list.h:172
char * defname
Definition: parsenodes.h:810
Definition: pg_list.h:54
static TrieChar * initTrie(const char *filename)
Definition: unaccent.c:96

References defGetString(), DefElem::defname, ereport, errcode(), errmsg(), ERROR, initTrie(), lfirst, PG_GETARG_POINTER, and PG_RETURN_POINTER.

◆ unaccent_lexize()

Datum unaccent_lexize ( PG_FUNCTION_ARGS  )

Definition at line 312 of file unaccent.c.

313 {
314  TrieChar *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
315  char *srcchar = (char *) PG_GETARG_POINTER(1);
316  int32 len = PG_GETARG_INT32(2);
317  char *srcstart = srcchar;
318  TSLexeme *res;
319  StringInfoData buf;
320 
321  /* we allocate storage for the buffer only if needed */
322  buf.data = NULL;
323 
324  while (len > 0)
325  {
326  TrieChar *node;
327  int matchlen;
328 
329  node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
330  &matchlen);
331  if (node && node->replaceTo)
332  {
333  if (buf.data == NULL)
334  {
335  /* initialize buffer */
336  initStringInfo(&buf);
337  /* insert any data we already skipped over */
338  if (srcchar != srcstart)
339  appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
340  }
341  appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen);
342  }
343  else
344  {
345  matchlen = pg_mblen(srcchar);
346  if (buf.data != NULL)
347  appendBinaryStringInfo(&buf, srcchar, matchlen);
348  }
349 
350  srcchar += matchlen;
351  len -= matchlen;
352  }
353 
354  /* return a result only if we made at least one substitution */
355  if (buf.data != NULL)
356  {
357  res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);
358  res->lexeme = buf.data;
359  res->flags = TSL_FILTER;
360  }
361  else
362  res = NULL;
363 
364  PG_RETURN_POINTER(res);
365 }
signed int int32
Definition: c.h:478
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
const void size_t len
static char * buf
Definition: pg_test_fsync.c:67
void appendBinaryStringInfo(StringInfo str, const void *data, int datalen)
Definition: stringinfo.c:227
void initStringInfo(StringInfo str)
Definition: stringinfo.c:59
#define TSL_FILTER
Definition: ts_public.h:144
static TrieChar * findReplaceTo(TrieChar *node, const unsigned char *src, int srclen, int *p_matchlen)
Definition: unaccent.c:244

References appendBinaryStringInfo(), buf, findReplaceTo(), initStringInfo(), len, palloc0(), PG_GETARG_INT32, PG_GETARG_POINTER, pg_mblen(), PG_RETURN_POINTER, TrieChar::replacelen, TrieChar::replaceTo, res, and TSL_FILTER.

Variable Documentation

◆ PG_MODULE_MAGIC

PG_MODULE_MAGIC

Definition at line 28 of file unaccent.c.