PostgreSQL Source Code  git master
dict_snowball.c
Go to the documentation of this file.
/*-------------------------------------------------------------------------
 *
 * dict_snowball.c
 *		Snowball dictionary
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/backend/snowball/dict_snowball.c
 *
 *-------------------------------------------------------------------------
 */
13 #include "postgres.h"
14 
15 #include "commands/defrem.h"
16 #include "tsearch/ts_locale.h"
17 #include "tsearch/ts_utils.h"
18 
19 /* Some platforms define MAXINT and/or MININT, causing conflicts */
20 #ifdef MAXINT
21 #undef MAXINT
22 #endif
23 #ifdef MININT
24 #undef MININT
25 #endif
26 
27 /* Now we can include the original Snowball header.h */
69 
71 
73 
75 
76 /* List of supported modules */
77 typedef struct stemmer_module
78 {
79  const char *name;
81  struct SN_env *(*create) (void);
82  void (*close) (struct SN_env *);
83  int (*stem) (struct SN_env *);
85 
/*
 * Build one stemmer_module initializer.
 *
 * Args: stemmer name, PG code for encoding, Snowball's name for encoding
 *
 * The stemmer name is stringified for the "name" member, and the three
 * function pointers are formed by token-pasting name and senc into
 * <name>_<senc>_create_env, <name>_<senc>_close_env and <name>_<senc>_stem.
 */
#define STEMMER_MODULE(name,enc,senc) \
	{#name, enc, name##_##senc##_create_env, name##_##senc##_close_env, name##_##senc##_stem}
89 
91 {
92  /*
93  * Stemmers list from Snowball distribution
94  */
95  STEMMER_MODULE(danish, PG_LATIN1, ISO_8859_1),
96  STEMMER_MODULE(dutch, PG_LATIN1, ISO_8859_1),
97  STEMMER_MODULE(english, PG_LATIN1, ISO_8859_1),
98  STEMMER_MODULE(finnish, PG_LATIN1, ISO_8859_1),
99  STEMMER_MODULE(french, PG_LATIN1, ISO_8859_1),
100  STEMMER_MODULE(german, PG_LATIN1, ISO_8859_1),
101  STEMMER_MODULE(indonesian, PG_LATIN1, ISO_8859_1),
102  STEMMER_MODULE(irish, PG_LATIN1, ISO_8859_1),
103  STEMMER_MODULE(italian, PG_LATIN1, ISO_8859_1),
104  STEMMER_MODULE(norwegian, PG_LATIN1, ISO_8859_1),
105  STEMMER_MODULE(porter, PG_LATIN1, ISO_8859_1),
106  STEMMER_MODULE(portuguese, PG_LATIN1, ISO_8859_1),
107  STEMMER_MODULE(spanish, PG_LATIN1, ISO_8859_1),
108  STEMMER_MODULE(swedish, PG_LATIN1, ISO_8859_1),
109  STEMMER_MODULE(hungarian, PG_LATIN2, ISO_8859_2),
110  STEMMER_MODULE(romanian, PG_LATIN2, ISO_8859_2),
111  STEMMER_MODULE(russian, PG_KOI8R, KOI8_R),
112  STEMMER_MODULE(arabic, PG_UTF8, UTF_8),
113  STEMMER_MODULE(danish, PG_UTF8, UTF_8),
114  STEMMER_MODULE(dutch, PG_UTF8, UTF_8),
115  STEMMER_MODULE(english, PG_UTF8, UTF_8),
116  STEMMER_MODULE(finnish, PG_UTF8, UTF_8),
117  STEMMER_MODULE(french, PG_UTF8, UTF_8),
118  STEMMER_MODULE(german, PG_UTF8, UTF_8),
119  STEMMER_MODULE(greek, PG_UTF8, UTF_8),
120  STEMMER_MODULE(hungarian, PG_UTF8, UTF_8),
121  STEMMER_MODULE(indonesian, PG_UTF8, UTF_8),
122  STEMMER_MODULE(irish, PG_UTF8, UTF_8),
123  STEMMER_MODULE(italian, PG_UTF8, UTF_8),
124  STEMMER_MODULE(lithuanian, PG_UTF8, UTF_8),
125  STEMMER_MODULE(nepali, PG_UTF8, UTF_8),
126  STEMMER_MODULE(norwegian, PG_UTF8, UTF_8),
127  STEMMER_MODULE(porter, PG_UTF8, UTF_8),
128  STEMMER_MODULE(portuguese, PG_UTF8, UTF_8),
129  STEMMER_MODULE(romanian, PG_UTF8, UTF_8),
130  STEMMER_MODULE(russian, PG_UTF8, UTF_8),
131  STEMMER_MODULE(spanish, PG_UTF8, UTF_8),
132  STEMMER_MODULE(swedish, PG_UTF8, UTF_8),
133  STEMMER_MODULE(tamil, PG_UTF8, UTF_8),
134  STEMMER_MODULE(turkish, PG_UTF8, UTF_8),
135 
136  /*
137  * Stemmer with PG_SQL_ASCII encoding should be valid for any server
138  * encoding
139  */
140  STEMMER_MODULE(english, PG_SQL_ASCII, ISO_8859_1),
141 
142  {NULL, 0, NULL, NULL, NULL} /* list end marker */
143 };
144 
145 
146 typedef struct DictSnowball
147 {
148  struct SN_env *z;
150  bool needrecode; /* needs recoding before/after call stem */
151  int (*stem) (struct SN_env *z);
152 
153  /*
154  * snowball saves alloced memory between calls, so we should run it in our
155  * private memory context. Note, init function is executed in long lived
156  * context, so we just remember CurrentMemoryContext
157  */
159 } DictSnowball;
160 
161 
162 static void
163 locate_stem_module(DictSnowball *d, const char *lang)
164 {
165  const stemmer_module *m;
166 
167  /*
168  * First, try to find exact match of stemmer module. Stemmer with
169  * PG_SQL_ASCII encoding is treated as working with any server encoding
170  */
171  for (m = stemmer_modules; m->name; m++)
172  {
173  if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
174  pg_strcasecmp(m->name, lang) == 0)
175  {
176  d->stem = m->stem;
177  d->z = m->create();
178  d->needrecode = false;
179  return;
180  }
181  }
182 
183  /*
184  * Second, try to find stemmer for needed language for UTF8 encoding.
185  */
186  for (m = stemmer_modules; m->name; m++)
187  {
188  if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
189  {
190  d->stem = m->stem;
191  d->z = m->create();
192  d->needrecode = true;
193  return;
194  }
195  }
196 
197  ereport(ERROR,
198  (errcode(ERRCODE_UNDEFINED_OBJECT),
199  errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
200  lang, GetDatabaseEncodingName())));
201 }
202 
203 Datum
205 {
206  List *dictoptions = (List *) PG_GETARG_POINTER(0);
207  DictSnowball *d;
208  bool stoploaded = false;
209  ListCell *l;
210 
211  d = (DictSnowball *) palloc0(sizeof(DictSnowball));
212 
213  foreach(l, dictoptions)
214  {
215  DefElem *defel = (DefElem *) lfirst(l);
216 
217  if (strcmp(defel->defname, "stopwords") == 0)
218  {
219  if (stoploaded)
220  ereport(ERROR,
221  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
222  errmsg("multiple StopWords parameters")));
224  stoploaded = true;
225  }
226  else if (strcmp(defel->defname, "language") == 0)
227  {
228  if (d->stem)
229  ereport(ERROR,
230  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
231  errmsg("multiple Language parameters")));
232  locate_stem_module(d, defGetString(defel));
233  }
234  else
235  {
236  ereport(ERROR,
237  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
238  errmsg("unrecognized Snowball parameter: \"%s\"",
239  defel->defname)));
240  }
241  }
242 
243  if (!d->stem)
244  ereport(ERROR,
245  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
246  errmsg("missing Language parameter")));
247 
249 
251 }
252 
253 Datum
255 {
257  char *in = (char *) PG_GETARG_POINTER(1);
258  int32 len = PG_GETARG_INT32(2);
259  char *txt = lowerstr_with_len(in, len);
260  TSLexeme *res = palloc0(sizeof(TSLexeme) * 2);
261 
262  if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
263  {
264  pfree(txt);
265  }
266  else
267  {
268  MemoryContext saveCtx;
269 
270  /*
271  * recode to utf8 if stemmer is utf8 and doesn't match server encoding
272  */
273  if (d->needrecode)
274  {
275  char *recoded;
276 
277  recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
278  if (recoded != txt)
279  {
280  pfree(txt);
281  txt = recoded;
282  }
283  }
284 
285  /* see comment about d->dictCtx */
286  saveCtx = MemoryContextSwitchTo(d->dictCtx);
287  SN_set_current(d->z, strlen(txt), (symbol *) txt);
288  d->stem(d->z);
289  MemoryContextSwitchTo(saveCtx);
290 
291  if (d->z->p && d->z->l)
292  {
293  txt = repalloc(txt, d->z->l + 1);
294  memcpy(txt, d->z->p, d->z->l);
295  txt[d->z->l] = '\0';
296  }
297 
298  /* back recode if needed */
299  if (d->needrecode)
300  {
301  char *recoded;
302 
303  recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
304  if (recoded != txt)
305  {
306  pfree(txt);
307  txt = recoded;
308  }
309  }
310 
311  res->lexeme = txt;
312  }
313 
314  PG_RETURN_POINTER(res);
315 }
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:351
#define PG_GETARG_INT32(n)
Definition: fmgr.h:264
StopList stoplist
symbol * p
Definition: api.h:15
#define STEMMER_MODULE(name, enc, senc)
Definition: dict_snowball.c:87
int SN_set_current(struct SN_env *z, int size, const symbol *s)
Definition: api.c:58
char * lowerstr_with_len(const char *str, int len)
Definition: ts_locale.c:252
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:109
int errcode(int sqlerrcode)
Definition: elog.c:608
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:271
char * lowerstr(const char *str)
Definition: ts_locale.c:239
struct DictSnowball DictSnowball
int(* stem)(struct SN_env *z)
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
char * pg_server_to_any(const char *s, int len, int encoding)
Definition: mbutils.c:654
signed int int32
Definition: c.h:347
void pfree(void *pointer)
Definition: mcxt.c:1056
#define ERROR
Definition: elog.h:43
char * defGetString(DefElem *def)
Definition: define.c:49
char * lexeme
Definition: ts_public.h:111
struct SN_env *(* create)(void)
Definition: dict_snowball.c:81
MemoryContext CurrentMemoryContext
Definition: mcxt.c:38
PG_FUNCTION_INFO_V1(dsnowball_init)
#define ereport(elevel, rest)
Definition: elog.h:141
Datum dsnowball_lexize(PG_FUNCTION_ARGS)
void readstoplist(const char *fname, StopList *s, char *(*wordop)(const char *))
Definition: ts_utils.c:68
void * palloc0(Size size)
Definition: mcxt.c:980
uintptr_t Datum
Definition: postgres.h:367
int GetDatabaseEncoding(void)
Definition: mbutils.c:1046
void(* close)(struct SN_env *)
Definition: dict_snowball.c:82
static void locate_stem_module(DictSnowball *d, const char *lang)
#define lfirst(lc)
Definition: pg_list.h:190
const char * GetDatabaseEncodingName(void)
Definition: mbutils.c:1052
static const stemmer_module stemmer_modules[]
Definition: dict_snowball.c:90
struct stemmer_module stemmer_module
pg_enc
Definition: pg_wchar.h:238
int(* stem)(struct SN_env *)
Definition: dict_snowball.c:83
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1069
struct SN_env * z
bool searchstoplist(StopList *s, char *key)
Definition: ts_utils.c:141
Datum dsnowball_init(PG_FUNCTION_ARGS)
int errmsg(const char *fmt,...)
Definition: elog.c:822
Definition: api.h:14
PG_MODULE_MAGIC
Definition: dict_snowball.c:70
MemoryContext dictCtx
#define PG_FUNCTION_ARGS
Definition: fmgr.h:188
char * defname
Definition: parsenodes.h:730
int l
Definition: api.h:16
char * pg_any_to_server(const char *s, int len, int encoding)
Definition: mbutils.c:581
const char * name
Definition: dict_snowball.c:79
Definition: pg_list.h:50
unsigned char symbol
Definition: api.h:2