PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
unaccent.c File Reference
#include "postgres.h"
#include "catalog/namespace.h"
#include "commands/defrem.h"
#include "lib/stringinfo.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "utils/builtins.h"
#include "utils/regproc.h"
Include dependency graph for unaccent.c:

Go to the source code of this file.

Data Structures

struct  TrieChar
 

Typedefs

typedef struct TrieChar TrieChar
 

Functions

static TrieChar * placeChar (TrieChar *node, const unsigned char *str, int lenstr, const char *replaceTo, int replacelen)
 
static TrieChar * initTrie (char *filename)
 
static TrieChar * findReplaceTo (TrieChar *node, const unsigned char *src, int srclen, int *p_matchlen)
 
 PG_FUNCTION_INFO_V1 (unaccent_init)
 
Datum unaccent_init (PG_FUNCTION_ARGS)
 
 PG_FUNCTION_INFO_V1 (unaccent_lexize)
 
Datum unaccent_lexize (PG_FUNCTION_ARGS)
 
 PG_FUNCTION_INFO_V1 (unaccent_dict)
 
Datum unaccent_dict (PG_FUNCTION_ARGS)
 

Variables

 PG_MODULE_MAGIC
 

Typedef Documentation

Function Documentation

static TrieChar* findReplaceTo ( TrieChar *  node,
const unsigned char *  src,
int  srclen,
int *  p_matchlen 
)
static

Definition at line 241 of file unaccent.c.

References TrieChar::nextChar, and TrieChar::replaceTo.

Referenced by unaccent_lexize().

243 {
244  TrieChar *result = NULL;
245  int matchlen = 0;
246 
247  *p_matchlen = 0; /* prevent uninitialized-variable warnings */
248 
249  while (node && matchlen < srclen)
250  {
251  node = node + src[matchlen];
252  matchlen++;
253 
254  if (node->replaceTo)
255  {
256  result = node;
257  *p_matchlen = matchlen;
258  }
259 
260  node = node->nextChar;
261  }
262 
263  return result;
264 }
struct TrieChar * nextChar
Definition: unaccent.c:41
char * replaceTo
Definition: unaccent.c:42
static TrieChar* initTrie ( char *  filename)
static

Definition at line 93 of file unaccent.c.

References CopyErrorData(), CurrentMemoryContext, ereport, errcode(), errmsg(), ERROR, FlushErrorState(), get_tsearch_config_filename(), MemoryContextSwitchTo(), pfree(), PG_CATCH, PG_END_TRY, pg_mblen(), PG_RE_THROW, PG_TRY, placeChar(), skip(), ErrorData::sqlerrcode, t_isspace(), tsearch_readline(), tsearch_readline_begin(), tsearch_readline_end(), and WARNING.

Referenced by unaccent_init().

94 {
95  TrieChar *volatile rootTrie = NULL;
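96  MemoryContext ccxt = CurrentMemoryContext;
97  tsearch_readline_state trst;
98  volatile bool skip;
99 
100  filename = get_tsearch_config_filename(filename, "rules");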
101  if (!tsearch_readline_begin(&trst, filename))
102  ereport(ERROR,
103  (errcode(ERRCODE_CONFIG_FILE_ERROR),
104  errmsg("could not open unaccent file \"%s\": %m",
105  filename)));
106 
107  do
108  {
109  /*
110  * pg_do_encoding_conversion() (called by tsearch_readline()) will
111  * emit exception if it finds untranslatable characters in current
112  * locale. We just skip such lines, continuing with the next.
113  */
114  skip = true;
115 
116  PG_TRY();
117  {
118  char *line;
119 
120  while ((line = tsearch_readline(&trst)) != NULL)
121  {
122  /*----------
123  * The format of each line must be "src" or "src trg", where
124  * src and trg are sequences of one or more non-whitespace
125  * characters, separated by whitespace. Whitespace at start
126  * or end of line is ignored. If trg is omitted, an empty
127  * string is used as the replacement.
128  *
129  * We use a simple state machine, with states
130  * 0 initial (before src)
131  * 1 in src
132  * 2 in whitespace after src
133  * 3 in trg
134  * 4 in whitespace after trg
135  * -1 syntax error detected
136  *----------
137  */
138  int state;
139  char *ptr;
140  char *src = NULL;
141  char *trg = NULL;
142  int ptrlen;
143  int srclen = 0;
144  int trglen = 0;
145 
146  state = 0;
147  for (ptr = line; *ptr; ptr += ptrlen)
148  {
149  ptrlen = pg_mblen(ptr);
150  /* ignore whitespace, but end src or trg */
151  if (t_isspace(ptr))
152  {
153  if (state == 1)
154  state = 2;
155  else if (state == 3)
156  state = 4;
157  continue;
158  }
159  switch (state)
160  {
161  case 0:
162  /* start of src */
163  src = ptr;
164  srclen = ptrlen;
165  state = 1;
166  break;
167  case 1:
168  /* continue src */
169  srclen += ptrlen;
170  break;
171  case 2:
172  /* start of trg */
173  trg = ptr;
174  trglen = ptrlen;
175  state = 3;
176  break;
177  case 3:
178  /* continue trg */
179  trglen += ptrlen;
180  break;
181  default:
182  /* bogus line format */
183  state = -1;
184  break;
185  }
186  }
187 
188  if (state == 1 || state == 2)
189  {
190  /* trg was omitted, so use "" */
191  trg = "";
192  trglen = 0;
193  }
194 
195  if (state > 0)
196  rootTrie = placeChar(rootTrie,
197  (unsigned char *) src, srclen,
198  trg, trglen);
199  else if (state < 0)
200  ereport(WARNING,
201  (errcode(ERRCODE_CONFIG_FILE_ERROR),
202  errmsg("invalid syntax: more than two strings in unaccent rule")));
203 
204  pfree(line);
205  }
206  skip = false;
207  }
208  PG_CATCH();
209  {
210  ErrorData *errdata;
211  MemoryContext ecxt;
212 
213  ecxt = MemoryContextSwitchTo(ccxt);
214  errdata = CopyErrorData();
215  if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
216  {
217  FlushErrorState();
218  }
219  else
220  {
221  MemoryContextSwitchTo(ecxt);
222  PG_RE_THROW();
223  }
224  }
225  PG_END_TRY();
226  }
227  while (skip);
228 
229  tsearch_readline_end(&trst);
230 
231  return rootTrie;
232 }
static void skip(struct vars *v)
Definition: regc_lex.c:1109
int sqlerrcode
Definition: elog.h:342
ErrorData * CopyErrorData(void)
Definition: elog.c:1497
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:109
int errcode(int sqlerrcode)
Definition: elog.c:575
void FlushErrorState(void)
Definition: elog.c:1587
void pfree(void *pointer)
Definition: mcxt.c:949
#define ERROR
Definition: elog.h:43
char * get_tsearch_config_filename(const char *basename, const char *extension)
Definition: ts_utils.c:33
int t_isspace(const char *ptr)
Definition: ts_locale.c:41
MemoryContext CurrentMemoryContext
Definition: mcxt.c:37
#define ereport(elevel, rest)
Definition: elog.h:122
#define WARNING
Definition: elog.h:40
#define PG_CATCH()
Definition: elog.h:293
Definition: regguts.h:298
#define PG_RE_THROW()
Definition: elog.h:314
int pg_mblen(const char *mbstr)
Definition: mbutils.c:760
void tsearch_readline_end(tsearch_readline_state *stp)
Definition: ts_locale.c:150
char * tsearch_readline(tsearch_readline_state *stp)
Definition: ts_locale.c:135
bool tsearch_readline_begin(tsearch_readline_state *stp, const char *filename)
Definition: ts_locale.c:113
static char * filename
Definition: pg_dumpall.c:90
int errmsg(const char *fmt,...)
Definition: elog.c:797
static TrieChar * placeChar(TrieChar *node, const unsigned char *str, int lenstr, const char *replaceTo, int replacelen)
Definition: unaccent.c:53
#define PG_TRY()
Definition: elog.h:284
#define PG_END_TRY()
Definition: elog.h:300
PG_FUNCTION_INFO_V1 ( unaccent_init  )
PG_FUNCTION_INFO_V1 ( unaccent_lexize  )
PG_FUNCTION_INFO_V1 ( unaccent_dict  )
static TrieChar* placeChar ( TrieChar *  node,
const unsigned char *  str,
int  lenstr,
const char *  replaceTo,
int  replacelen 
)
static

Definition at line 53 of file unaccent.c.

References Assert, ereport, errcode(), errmsg(), TrieChar::nextChar, palloc(), palloc0(), TrieChar::replacelen, TrieChar::replaceTo, and WARNING.

Referenced by initTrie().

55 {
56  TrieChar *curnode;
57 
58  if (!node)
59  node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);
60 
61  Assert(lenstr > 0); /* else str[0] doesn't exist */
62 
63  curnode = node + *str;
64 
65  if (lenstr <= 1)
66  {
67  if (curnode->replaceTo)
68  ereport(WARNING,
69  (errcode(ERRCODE_CONFIG_FILE_ERROR),
70  errmsg("duplicate source strings, first one will be used")));
71  else
72  {
73  curnode->replacelen = replacelen;
74  curnode->replaceTo = (char *) palloc(replacelen);
75  memcpy(curnode->replaceTo, replaceTo, replacelen);
76  }
77  }
78  else
79  {
80  curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
81  replaceTo, replacelen);
82  }
83 
84  return node;
85 }
int errcode(int sqlerrcode)
Definition: elog.c:575
struct TrieChar * nextChar
Definition: unaccent.c:41
#define ereport(elevel, rest)
Definition: elog.h:122
#define WARNING
Definition: elog.h:40
void * palloc0(Size size)
Definition: mcxt.c:877
char * replaceTo
Definition: unaccent.c:42
int replacelen
Definition: unaccent.c:43
#define Assert(condition)
Definition: c.h:681
void * palloc(Size size)
Definition: mcxt.c:848
int errmsg(const char *fmt,...)
Definition: elog.c:797
static TrieChar * placeChar(TrieChar *node, const unsigned char *str, int lenstr, const char *replaceTo, int replacelen)
Definition: unaccent.c:53
Datum unaccent_dict ( PG_FUNCTION_ARGS  )

Definition at line 369 of file unaccent.c.

References cstring_to_text(), DatumGetPointer, TSDictionaryCacheEntry::dictData, FunctionCall4, get_ts_dict_oid(), Int32GetDatum, TSLexeme::lexeme, TSDictionaryCacheEntry::lexize, lookup_ts_dictionary_cache(), pfree(), PG_FREE_IF_COPY, PG_GETARG_OID, PG_GETARG_TEXT_P_COPY, PG_GETARG_TEXT_PP, PG_NARGS, PG_RETURN_TEXT_P, PointerGetDatum, stringToQualifiedNameList(), VARDATA_ANY, and VARSIZE_ANY_EXHDR.

370 {
371  text *str;
372  int strArg;
373  Oid dictOid;
374  TSDictionaryCacheEntry *dict;
375  TSLexeme *res;
376 
377  if (PG_NARGS() == 1)
378  {
379  dictOid = get_ts_dict_oid(stringToQualifiedNameList("unaccent"), false);
380  strArg = 0;
381  }
382  else
383  {
384  dictOid = PG_GETARG_OID(0);
385  strArg = 1;
386  }
387  str = PG_GETARG_TEXT_PP(strArg);
388 
389  dict = lookup_ts_dictionary_cache(dictOid);
390 
391  res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
392  PointerGetDatum(dict->dictData),
393  PointerGetDatum(VARDATA_ANY(str)),
394  Int32GetDatum(VARSIZE_ANY_EXHDR(str)),
395  PointerGetDatum(NULL)));
396 
397  PG_FREE_IF_COPY(str, strArg);
398 
399  if (res == NULL)
400  {
401  PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
402  }
403  else if (res->lexeme == NULL)
404  {
405  pfree(res);
406  PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
407  }
408  else
409  {
410  text *txt = cstring_to_text(res->lexeme);
411 
412  pfree(res->lexeme);
413  pfree(res);
414 
415  PG_RETURN_TEXT_P(txt);
416  }
417 }
#define VARDATA_ANY(PTR)
Definition: postgres.h:347
#define PointerGetDatum(X)
Definition: postgres.h:562
unsigned int Oid
Definition: postgres_ext.h:31
#define PG_GETARG_TEXT_P_COPY(n)
Definition: fmgr.h:279
#define PG_GETARG_TEXT_PP(n)
Definition: fmgr.h:273
TSDictionaryCacheEntry * lookup_ts_dictionary_cache(Oid dictId)
Definition: ts_cache.c:210
Oid get_ts_dict_oid(List *names, bool missing_ok)
Definition: namespace.c:2386
void pfree(void *pointer)
Definition: mcxt.c:949
char * lexeme
Definition: ts_public.h:111
#define PG_GETARG_OID(n)
Definition: fmgr.h:240
#define FunctionCall4(flinfo, arg1, arg2, arg3, arg4)
Definition: fmgr.h:609
#define PG_RETURN_TEXT_P(x)
Definition: fmgr.h:331
text * cstring_to_text(const char *s)
Definition: varlena.c:149
#define PG_FREE_IF_COPY(ptr, n)
Definition: fmgr.h:225
#define PG_NARGS()
Definition: fmgr.h:168
#define DatumGetPointer(X)
Definition: postgres.h:555
#define Int32GetDatum(X)
Definition: postgres.h:485
List * stringToQualifiedNameList(const char *string)
Definition: regproc.c:1687
#define VARSIZE_ANY_EXHDR(PTR)
Definition: postgres.h:340
Definition: c.h:433
Datum unaccent_init ( PG_FUNCTION_ARGS  )

Definition at line 268 of file unaccent.c.

References defGetString(), DefElem::defname, ereport, errcode(), errmsg(), ERROR, initTrie(), lfirst, PG_GETARG_POINTER, PG_RETURN_POINTER, and pg_strcasecmp().

269 {
270  List *dictoptions = (List *) PG_GETARG_POINTER(0);
271  TrieChar *rootTrie = NULL;
272  bool fileloaded = false;
273  ListCell *l;
274 
275  foreach(l, dictoptions)
276  {
277  DefElem *defel = (DefElem *) lfirst(l);
278 
279  if (pg_strcasecmp("Rules", defel->defname) == 0)
280  {
281  if (fileloaded)
282  ereport(ERROR,
283  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
284  errmsg("multiple Rules parameters")));
285  rootTrie = initTrie(defGetString(defel));
286  fileloaded = true;
287  }
288  else
289  {
290  ereport(ERROR,
291  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
292  errmsg("unrecognized Unaccent parameter: \"%s\"",
293  defel->defname)));
294  }
295  }
296 
297  if (!fileloaded)
298  {
299  ereport(ERROR,
300  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
301  errmsg("missing Rules parameter")));
302  }
303 
304  PG_RETURN_POINTER(rootTrie);
305 }
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:321
int errcode(int sqlerrcode)
Definition: elog.c:575
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:241
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
static TrieChar * initTrie(char *filename)
Definition: unaccent.c:93
#define ERROR
Definition: elog.h:43
char * defGetString(DefElem *def)
Definition: define.c:49
#define ereport(elevel, rest)
Definition: elog.h:122
#define lfirst(lc)
Definition: pg_list.h:106
int errmsg(const char *fmt,...)
Definition: elog.c:797
char * defname
Definition: parsenodes.h:719
Definition: pg_list.h:45
Datum unaccent_lexize ( PG_FUNCTION_ARGS  )

Definition at line 309 of file unaccent.c.

References appendBinaryStringInfo(), buf, StringInfoData::data, findReplaceTo(), TSLexeme::flags, initStringInfo(), TSLexeme::lexeme, palloc0(), PG_GETARG_INT32, PG_GETARG_POINTER, pg_mblen(), PG_RETURN_POINTER, TrieChar::replacelen, TrieChar::replaceTo, and TSL_FILTER.

310 {
311  TrieChar *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
312  char *srcchar = (char *) PG_GETARG_POINTER(1);
313  int32 len = PG_GETARG_INT32(2);
314  char *srcstart = srcchar;
315  TSLexeme *res;
316  StringInfoData buf;
317 
318  /* we allocate storage for the buffer only if needed */
319  buf.data = NULL;
320 
321  while (len > 0)
322  {
323  TrieChar *node;
324  int matchlen;
325 
326  node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
327  &matchlen);
328  if (node && node->replaceTo)
329  {
330  if (buf.data == NULL)
331  {
332  /* initialize buffer */
333  initStringInfo(&buf);
334  /* insert any data we already skipped over */
335  if (srcchar != srcstart)
336  appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
337  }
338  appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen);
339  }
340  else
341  {
342  matchlen = pg_mblen(srcchar);
343  if (buf.data != NULL)
344  appendBinaryStringInfo(&buf, srcchar, matchlen);
345  }
346 
347  srcchar += matchlen;
348  len -= matchlen;
349  }
350 
351  /* return a result only if we made at least one substitution */
352  if (buf.data != NULL)
353  {
354  res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);
355  res->lexeme = buf.data;
356  res->flags = TSL_FILTER;
357  }
358  else
359  res = NULL;
360 
361  PG_RETURN_POINTER(res);
362 }
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:321
#define PG_GETARG_INT32(n)
Definition: fmgr.h:234
#define TSL_FILTER
Definition: ts_public.h:117
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:241
signed int int32
Definition: c.h:246
char * lexeme
Definition: ts_public.h:111
static char * buf
Definition: pg_test_fsync.c:67
uint16 flags
Definition: ts_public.h:109
void initStringInfo(StringInfo str)
Definition: stringinfo.c:46
void * palloc0(Size size)
Definition: mcxt.c:877
char * replaceTo
Definition: unaccent.c:42
int replacelen
Definition: unaccent.c:43
int pg_mblen(const char *mbstr)
Definition: mbutils.c:760
static TrieChar * findReplaceTo(TrieChar *node, const unsigned char *src, int srclen, int *p_matchlen)
Definition: unaccent.c:241
void appendBinaryStringInfo(StringInfo str, const char *data, int datalen)
Definition: stringinfo.c:208

Variable Documentation

PG_MODULE_MAGIC

Definition at line 25 of file unaccent.c.