PostgreSQL Source Code  git master
dict_snowball.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * dict_snowball.c
4  * Snowball dictionary
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  * src/backend/snowball/dict_snowball.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres.h"
14 
15 #include "commands/defrem.h"
16 #include "tsearch/ts_locale.h"
17 #include "tsearch/ts_utils.h"
18 
19 /* Some platforms define MAXINT and/or MININT, causing conflicts */
20 #ifdef MAXINT
21 #undef MAXINT
22 #endif
23 #ifdef MININT
24 #undef MININT
25 #endif
26 
27 /* Now we can include the original Snowball header.h */
60 
62 
64 
66 
67 /* List of supported modules */
/*
 * One entry of the stemmer table: a language name plus the three entry
 * points of a Snowball stemmer.
 *
 * NOTE(review): this listing lacks the line between "name" and "create"
 * (original line 71) and the closing line of the typedef (original line 75).
 * locate_stem_module() reads m->enc, so an encoding member presumably sits
 * in the first gap -- confirm against the real source.
 */
68 typedef struct stemmer_module
69 {
70  const char *name; /* language name; matched with pg_strcasecmp() in locate_stem_module() */
72  struct SN_env *(*create) (void); /* returns the struct SN_env the stemmer operates on */
73  void (*close) (struct SN_env *); /* takes a struct SN_env; not called in the code visible here */
74  int (*stem) (struct SN_env *); /* stemming entry point; copied into DictSnowball.stem */
76 
/*
 * Initializer of the stemmer table that locate_stem_module() scans.
 *
 * NOTE(review): the declaration line (original line 77) and the entries on
 * original lines 82-112 are absent from this listing; only the PG_SQL_ASCII
 * "english" entry and the terminator remain visible.
 */
78 {
79  /*
80  * Stemmers list from Snowball distribution
81  */
113 
114  /*
115  * Stemmer with PG_SQL_ASCII encoding should be valid for any server
116  * encoding
117  */
118  {"english", PG_SQL_ASCII, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
119 
 /* The NULL name below is what ends the "m->name" loops in locate_stem_module(). */
120  {NULL, 0, NULL, NULL, NULL} /* list end marker */
121 };
122 
123 
/*
 * State of one Snowball dictionary.  The init function in this file
 * allocates it zeroed with palloc0() and fills in the stemmer members via
 * locate_stem_module(); the lexize function reads it through "d".
 *
 * NOTE(review): two member lines (original lines 127 and 136) are absent
 * from this listing.  Code in this file reads d->stoplist and d->dictCtx, so
 * a stop-word list member and a MemoryContext member presumably belong in
 * those gaps -- confirm against the real source.
 */
124 typedef struct DictSnowball
125 {
126  struct SN_env *z; /* stemmer environment returned by the module's create() */
128  bool needrecode; /* needs recoding before/after call stem */
129  int (*stem) (struct SN_env *z); /* stemming entry point copied from the module table */
130 
131  /*
132  * snowball saves allocated memory between calls, so we should run it in our
133  * private memory context. Note, init function is executed in long lived
134  * context, so we just remember CurrentMemoryContext
135  */
137 } DictSnowball;
138 
139 
140 static void
141 locate_stem_module(DictSnowball *d, const char *lang)
142 {
143  const stemmer_module *m;
144 
145  /*
146  * First, try to find exact match of stemmer module. Stemmer with
147  * PG_SQL_ASCII encoding is treated as working with any server encoding
148  */
149  for (m = stemmer_modules; m->name; m++)
150  {
151  if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
152  pg_strcasecmp(m->name, lang) == 0)
153  {
154  d->stem = m->stem;
155  d->z = m->create();
156  d->needrecode = false;
157  return;
158  }
159  }
160 
161  /*
162  * Second, try to find stemmer for needed language for UTF8 encoding.
163  */
164  for (m = stemmer_modules; m->name; m++)
165  {
166  if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
167  {
168  d->stem = m->stem;
169  d->z = m->create();
170  d->needrecode = true;
171  return;
172  }
173  }
174 
175  ereport(ERROR,
176  (errcode(ERRCODE_UNDEFINED_OBJECT),
177  errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
178  lang, GetDatabaseEncodingName())));
179 }
180 
/*
 * Dictionary init function: builds a DictSnowball from the option list
 * passed as argument 0.
 *
 * Recognized option names (compared case-insensitively) are "StopWords" and
 * "Language"; each may appear at most once, and "Language" is mandatory.
 * Any other option name, a duplicate, or a missing "Language" raises an
 * ERROR with ERRCODE_INVALID_PARAMETER_VALUE.
 *
 * NOTE(review): several lines are absent from this listing -- the line
 * carrying the function name and parameter list (original 182), one
 * statement in the StopWords branch (original 201), and two statements
 * before the closing brace (original 226 and 228).  Presumably these load
 * the stop list, remember the memory context and return "d"; confirm
 * against the real source.
 */
181 Datum
183 {
184  List *dictoptions = (List *) PG_GETARG_POINTER(0);
185  DictSnowball *d;
186  bool stoploaded = false;
187  ListCell *l;
188 
 /* Zero-filled, so d->stem starts out NULL; the checks below rely on that. */
189  d = (DictSnowball *) palloc0(sizeof(DictSnowball));
190 
191  foreach(l, dictoptions)
192  {
193  DefElem *defel = (DefElem *) lfirst(l);
194 
195  if (pg_strcasecmp("StopWords", defel->defname) == 0)
196  {
197  if (stoploaded)
198  ereport(ERROR,
199  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
200  errmsg("multiple StopWords parameters")));
 /* NOTE(review): original line 201 is missing here (see header comment). */
202  stoploaded = true;
203  }
204  else if (pg_strcasecmp("Language", defel->defname) == 0)
205  {
 /* A non-NULL d->stem means an earlier Language option already picked a stemmer. */
206  if (d->stem)
207  ereport(ERROR,
208  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
209  errmsg("multiple Language parameters")));
 /* Fills d->stem, d->z and d->needrecode, or raises an ERROR. */
210  locate_stem_module(d, defGetString(defel));
211  }
212  else
213  {
214  ereport(ERROR,
215  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
216  errmsg("unrecognized Snowball parameter: \"%s\"",
217  defel->defname)));
218  }
219  }
220 
 /* No Language option was seen at all. */
221  if (!d->stem)
222  ereport(ERROR,
223  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
224  errmsg("missing Language parameter")));
225 
227 
229 }
230 
/*
 * Dictionary lexize function: stems one input word.
 *
 * Argument 1 is the input text and argument 2 its length.  The result is a
 * zero-filled array of two TSLexeme entries.  If the word is empty after
 * lowerstr_with_len() or is found in the stop list, the array is returned
 * with no lexeme set.  Otherwise the first entry's lexeme is the stemmed
 * word.
 *
 * NOTE(review): the line carrying the function name and parameter list
 * (original 232) and the declaration of "d" (original 234) are absent from
 * this listing; "d" is used below as a DictSnowball pointer.  Presumably it
 * comes from argument 0 -- confirm against the real source.
 */
231 Datum
233 {
235  char *in = (char *) PG_GETARG_POINTER(1);
236  int32 len = PG_GETARG_INT32(2);
237  char *txt = lowerstr_with_len(in, len);
 /* Two zeroed entries; presumably the untouched second one ends the array -- TODO confirm. */
238  TSLexeme *res = palloc0(sizeof(TSLexeme) * 2);
239 
240  if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
241  {
 /* Empty word or stop word: drop the working copy and leave res all zeroes. */
242  pfree(txt);
243  }
244  else
245  {
246  MemoryContext saveCtx;
247 
248  /*
249  * recode to utf8 if stemmer is utf8 and doesn't match server encoding
250  */
251  if (d->needrecode)
252  {
253  char *recoded;
254 
255  recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
 /* Free the old buffer only when the conversion handed back a different one. */
256  if (recoded != txt)
257  {
258  pfree(txt);
259  txt = recoded;
260  }
261  }
262 
263  /* see comment about d->dictCtx */
264  saveCtx = MemoryContextSwitchTo(d->dictCtx);
 /* NOTE(review): the return values of SN_set_current() and d->stem() are not checked. */
265  SN_set_current(d->z, strlen(txt), (symbol *) txt);
266  d->stem(d->z);
267  MemoryContextSwitchTo(saveCtx);
268 
 /* Copy the stemmer's result (l bytes at p) over txt and NUL-terminate it. */
269  if (d->z->p && d->z->l)
270  {
271  txt = repalloc(txt, d->z->l + 1);
272  memcpy(txt, d->z->p, d->z->l);
273  txt[d->z->l] = '\0';
274  }
275 
276  /* back recode if needed */
277  if (d->needrecode)
278  {
279  char *recoded;
280 
281  recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
282  if (recoded != txt)
283  {
284  pfree(txt);
285  txt = recoded;
286  }
287  }
288 
 /* txt is handed over to the result here; it is not freed on this path. */
289  res->lexeme = txt;
290  }
291 
292  PG_RETURN_POINTER(res);
293 }
struct SN_env * turkish_UTF_8_create_env(void)
struct SN_env * spanish_UTF_8_create_env(void)
int dutch_UTF_8_stem(struct SN_env *z)
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:321
struct SN_env * danish_ISO_8859_1_create_env(void)
struct SN_env * porter_UTF_8_create_env(void)
#define PG_GETARG_INT32(n)
Definition: fmgr.h:234
StopList stoplist
void porter_UTF_8_close_env(struct SN_env *z)
symbol * p
Definition: api.h:15
int portuguese_ISO_8859_1_stem(struct SN_env *z)
struct SN_env * english_ISO_8859_1_create_env(void)
void porter_ISO_8859_1_close_env(struct SN_env *z)
void norwegian_UTF_8_close_env(struct SN_env *z)
void russian_UTF_8_close_env(struct SN_env *z)
struct SN_env * italian_UTF_8_create_env(void)
struct SN_env * russian_UTF_8_create_env(void)
void portuguese_ISO_8859_1_close_env(struct SN_env *z)
void german_UTF_8_close_env(struct SN_env *z)
struct SN_env * portuguese_ISO_8859_1_create_env(void)
int SN_set_current(struct SN_env *z, int size, const symbol *s)
Definition: api.c:58
struct SN_env * german_ISO_8859_1_create_env(void)
char * lowerstr_with_len(const char *str, int len)
Definition: ts_locale.c:241
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:109
int porter_ISO_8859_1_stem(struct SN_env *z)
int errcode(int sqlerrcode)
Definition: elog.c:575
int german_ISO_8859_1_stem(struct SN_env *z)
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:241
void portuguese_UTF_8_close_env(struct SN_env *z)
char * lowerstr(const char *str)
Definition: ts_locale.c:228
struct DictSnowball DictSnowball
int(* stem)(struct SN_env *z)
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
void french_ISO_8859_1_close_env(struct SN_env *z)
int finnish_UTF_8_stem(struct SN_env *z)
void romanian_ISO_8859_2_close_env(struct SN_env *z)
char * pg_server_to_any(const char *s, int len, int encoding)
Definition: mbutils.c:634
signed int int32
Definition: c.h:284
void turkish_UTF_8_close_env(struct SN_env *z)
struct SN_env * norwegian_ISO_8859_1_create_env(void)
void dutch_ISO_8859_1_close_env(struct SN_env *z)
int italian_ISO_8859_1_stem(struct SN_env *z)
int danish_UTF_8_stem(struct SN_env *z)
struct SN_env * dutch_ISO_8859_1_create_env(void)
void pfree(void *pointer)
Definition: mcxt.c:949
int spanish_ISO_8859_1_stem(struct SN_env *z)
int dutch_ISO_8859_1_stem(struct SN_env *z)
struct SN_env * portuguese_UTF_8_create_env(void)
#define ERROR
Definition: elog.h:43
void spanish_ISO_8859_1_close_env(struct SN_env *z)
struct SN_env * dutch_UTF_8_create_env(void)
int finnish_ISO_8859_1_stem(struct SN_env *z)
int norwegian_UTF_8_stem(struct SN_env *z)
char * defGetString(DefElem *def)
Definition: define.c:49
struct SN_env * spanish_ISO_8859_1_create_env(void)
int german_UTF_8_stem(struct SN_env *z)
char * lexeme
Definition: ts_public.h:111
int italian_UTF_8_stem(struct SN_env *z)
struct SN_env *(* create)(void)
Definition: dict_snowball.c:72
int russian_UTF_8_stem(struct SN_env *z)
int porter_UTF_8_stem(struct SN_env *z)
struct SN_env * french_ISO_8859_1_create_env(void)
int russian_KOI8_R_stem(struct SN_env *z)
struct SN_env * russian_KOI8_R_create_env(void)
struct SN_env * danish_UTF_8_create_env(void)
struct SN_env * finnish_UTF_8_create_env(void)
struct SN_env * romanian_UTF_8_create_env(void)
MemoryContext CurrentMemoryContext
Definition: mcxt.c:37
void spanish_UTF_8_close_env(struct SN_env *z)
struct SN_env * hungarian_ISO_8859_1_create_env(void)
void english_ISO_8859_1_close_env(struct SN_env *z)
int hungarian_UTF_8_stem(struct SN_env *z)
void dutch_UTF_8_close_env(struct SN_env *z)
PG_FUNCTION_INFO_V1(dsnowball_init)
#define ereport(elevel, rest)
Definition: elog.h:122
int english_UTF_8_stem(struct SN_env *z)
int spanish_UTF_8_stem(struct SN_env *z)
Datum dsnowball_lexize(PG_FUNCTION_ARGS)
int portuguese_UTF_8_stem(struct SN_env *z)
int danish_ISO_8859_1_stem(struct SN_env *z)
void readstoplist(const char *fname, StopList *s, char *(*wordop)(const char *))
Definition: ts_utils.c:68
void * palloc0(Size size)
Definition: mcxt.c:877
uintptr_t Datum
Definition: postgres.h:372
void romanian_UTF_8_close_env(struct SN_env *z)
struct SN_env * english_UTF_8_create_env(void)
int GetDatabaseEncoding(void)
Definition: mbutils.c:1004
void russian_KOI8_R_close_env(struct SN_env *z)
int romanian_ISO_8859_2_stem(struct SN_env *z)
int swedish_ISO_8859_1_stem(struct SN_env *z)
void(* close)(struct SN_env *)
Definition: dict_snowball.c:73
struct SN_env * swedish_UTF_8_create_env(void)
void swedish_UTF_8_close_env(struct SN_env *z)
static void locate_stem_module(DictSnowball *d, const char *lang)
#define lfirst(lc)
Definition: pg_list.h:106
const char * GetDatabaseEncodingName(void)
Definition: mbutils.c:1010
void swedish_ISO_8859_1_close_env(struct SN_env *z)
static const stemmer_module stemmer_modules[]
Definition: dict_snowball.c:77
struct stemmer_module stemmer_module
struct SN_env * norwegian_UTF_8_create_env(void)
void danish_UTF_8_close_env(struct SN_env *z)
struct SN_env * romanian_ISO_8859_2_create_env(void)
pg_enc
Definition: pg_wchar.h:238
int romanian_UTF_8_stem(struct SN_env *z)
struct SN_env * french_UTF_8_create_env(void)
struct SN_env * finnish_ISO_8859_1_create_env(void)
int(* stem)(struct SN_env *)
Definition: dict_snowball.c:74
void hungarian_ISO_8859_1_close_env(struct SN_env *z)
void german_ISO_8859_1_close_env(struct SN_env *z)
void finnish_UTF_8_close_env(struct SN_env *z)
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:962
struct SN_env * z
int english_ISO_8859_1_stem(struct SN_env *z)
bool searchstoplist(StopList *s, char *key)
Definition: ts_utils.c:141
Datum dsnowball_init(PG_FUNCTION_ARGS)
struct SN_env * italian_ISO_8859_1_create_env(void)
int errmsg(const char *fmt,...)
Definition: elog.c:797
int hungarian_ISO_8859_1_stem(struct SN_env *z)
void english_UTF_8_close_env(struct SN_env *z)
Definition: api.h:14
int norwegian_ISO_8859_1_stem(struct SN_env *z)
PG_MODULE_MAGIC
Definition: dict_snowball.c:61
void italian_ISO_8859_1_close_env(struct SN_env *z)
MemoryContext dictCtx
struct SN_env * swedish_ISO_8859_1_create_env(void)
#define PG_FUNCTION_ARGS
Definition: fmgr.h:158
char * defname
Definition: parsenodes.h:719
struct SN_env * hungarian_UTF_8_create_env(void)
int french_UTF_8_stem(struct SN_env *z)
int l
Definition: api.h:16
void italian_UTF_8_close_env(struct SN_env *z)
void danish_ISO_8859_1_close_env(struct SN_env *z)
char * pg_any_to_server(const char *s, int len, int encoding)
Definition: mbutils.c:561
struct SN_env * porter_ISO_8859_1_create_env(void)
const char * name
Definition: dict_snowball.c:70
Definition: pg_list.h:45
int turkish_UTF_8_stem(struct SN_env *z)
int french_ISO_8859_1_stem(struct SN_env *z)
void finnish_ISO_8859_1_close_env(struct SN_env *z)
void hungarian_UTF_8_close_env(struct SN_env *z)
void french_UTF_8_close_env(struct SN_env *z)
struct SN_env * german_UTF_8_create_env(void)
int swedish_UTF_8_stem(struct SN_env *z)
unsigned char symbol
Definition: api.h:2
void norwegian_ISO_8859_1_close_env(struct SN_env *z)