PostgreSQL Source Code  git master
dict_snowball.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * dict_snowball.c
4  * Snowball dictionary
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  * src/backend/snowball/dict_snowball.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres.h"
14 
15 #include "commands/defrem.h"
16 #include "tsearch/ts_locale.h"
17 #include "tsearch/ts_public.h"
18 
19 /* Some platforms define MAXINT and/or MININT, causing conflicts */
20 #ifdef MAXINT
21 #undef MAXINT
22 #endif
23 #ifdef MININT
24 #undef MININT
25 #endif
26 
27 /* Now we can include the original Snowball header.h */
77 
79 
81 
83 
84 /*
 * List of supported modules
 *
 * Each entry describes one Snowball stemmer: the language name it is looked
 * up by, plus the function pointers used to drive it.
 *
 * NOTE(review): this listing skips embedded source lines 88 and 92, so the
 * encoding member that locate_stem_module() reads as m->enc, and the line
 * closing this typedef, are not visible here -- confirm against the real
 * file.
 */
85 typedef struct stemmer_module
86 {
87  const char *name; /* language name; compared with pg_strcasecmp() */
89  struct SN_env *(*create) (void); /* returns the SN_env kept in DictSnowball.z */
90  void (*close) (struct SN_env *); /* presumably frees that SN_env; never called in the visible code */
91  int (*stem) (struct SN_env *); /* stemming entry point, copied into DictSnowball.stem */
93 
94 /*
 * Build one initializer for the stemmer module table.
 *
 * Args: stemmer name, PG code for encoding, Snowball's name for encoding
 *
 * The stemmer name is stringified to form the lookup key, and is pasted
 * together with Snowball's encoding name to form the three function
 * identifiers <name>_<senc>_create_env, <name>_<senc>_close_env and
 * <name>_<senc>_stem.
 */
95 #define STEMMER_MODULE(name,enc,senc) \
96  {#name, enc, name##_##senc##_create_env, name##_##senc##_close_env, name##_##senc##_stem}
97 
99 {
 /*
  * NOTE(review): the declarator for this table (embedded source line 98)
  * is missing from this listing; locate_stem_module() refers to the table
  * as stemmer_modules.
  *
  * locate_stem_module() scans the table from the top and uses the first
  * entry that satisfies its test, so the order of entries is significant.
  */
100  /*
101  * Stemmers list from Snowball distribution
102  */
103  STEMMER_MODULE(basque, PG_LATIN1, ISO_8859_1),
104  STEMMER_MODULE(catalan, PG_LATIN1, ISO_8859_1),
105  STEMMER_MODULE(danish, PG_LATIN1, ISO_8859_1),
106  STEMMER_MODULE(dutch, PG_LATIN1, ISO_8859_1),
107  STEMMER_MODULE(english, PG_LATIN1, ISO_8859_1),
108  STEMMER_MODULE(finnish, PG_LATIN1, ISO_8859_1),
109  STEMMER_MODULE(french, PG_LATIN1, ISO_8859_1),
110  STEMMER_MODULE(german, PG_LATIN1, ISO_8859_1),
111  STEMMER_MODULE(indonesian, PG_LATIN1, ISO_8859_1),
112  STEMMER_MODULE(irish, PG_LATIN1, ISO_8859_1),
113  STEMMER_MODULE(italian, PG_LATIN1, ISO_8859_1),
114  STEMMER_MODULE(norwegian, PG_LATIN1, ISO_8859_1),
115  STEMMER_MODULE(porter, PG_LATIN1, ISO_8859_1),
116  STEMMER_MODULE(portuguese, PG_LATIN1, ISO_8859_1),
117  STEMMER_MODULE(spanish, PG_LATIN1, ISO_8859_1),
118  STEMMER_MODULE(swedish, PG_LATIN1, ISO_8859_1),
119  STEMMER_MODULE(hungarian, PG_LATIN2, ISO_8859_2),
120  STEMMER_MODULE(romanian, PG_LATIN2, ISO_8859_2),
121  STEMMER_MODULE(russian, PG_KOI8R, KOI8_R),
 /*
  * UTF-8 stemmers.  Besides serving UTF-8 databases directly, these are
  * what locate_stem_module() falls back to (with recoding enabled) when no
  * entry matches the database encoding.
  */
122  STEMMER_MODULE(arabic, PG_UTF8, UTF_8),
123  STEMMER_MODULE(armenian, PG_UTF8, UTF_8),
124  STEMMER_MODULE(basque, PG_UTF8, UTF_8),
125  STEMMER_MODULE(catalan, PG_UTF8, UTF_8),
126  STEMMER_MODULE(danish, PG_UTF8, UTF_8),
127  STEMMER_MODULE(dutch, PG_UTF8, UTF_8),
128  STEMMER_MODULE(english, PG_UTF8, UTF_8),
129  STEMMER_MODULE(finnish, PG_UTF8, UTF_8),
130  STEMMER_MODULE(french, PG_UTF8, UTF_8),
131  STEMMER_MODULE(german, PG_UTF8, UTF_8),
132  STEMMER_MODULE(greek, PG_UTF8, UTF_8),
133  STEMMER_MODULE(hindi, PG_UTF8, UTF_8),
134  STEMMER_MODULE(hungarian, PG_UTF8, UTF_8),
135  STEMMER_MODULE(indonesian, PG_UTF8, UTF_8),
136  STEMMER_MODULE(irish, PG_UTF8, UTF_8),
137  STEMMER_MODULE(italian, PG_UTF8, UTF_8),
138  STEMMER_MODULE(lithuanian, PG_UTF8, UTF_8),
139  STEMMER_MODULE(nepali, PG_UTF8, UTF_8),
140  STEMMER_MODULE(norwegian, PG_UTF8, UTF_8),
141  STEMMER_MODULE(porter, PG_UTF8, UTF_8),
142  STEMMER_MODULE(portuguese, PG_UTF8, UTF_8),
143  STEMMER_MODULE(romanian, PG_UTF8, UTF_8),
144  STEMMER_MODULE(russian, PG_UTF8, UTF_8),
145  STEMMER_MODULE(serbian, PG_UTF8, UTF_8),
146  STEMMER_MODULE(spanish, PG_UTF8, UTF_8),
147  STEMMER_MODULE(swedish, PG_UTF8, UTF_8),
148  STEMMER_MODULE(tamil, PG_UTF8, UTF_8),
149  STEMMER_MODULE(turkish, PG_UTF8, UTF_8),
150  STEMMER_MODULE(yiddish, PG_UTF8, UTF_8),
151 
152  /*
153  * Stemmer with PG_SQL_ASCII encoding should be valid for any server
154  * encoding
155  */
156  STEMMER_MODULE(english, PG_SQL_ASCII, ISO_8859_1),
157 
 /* A NULL name is what stops the table scans in locate_stem_module(). */
158  {NULL, 0, NULL, NULL, NULL} /* list end marker */
159 };
160 
161 
/*
 * Per-dictionary state for a Snowball dictionary.
 *
 * The z, stem and needrecode members are filled in by locate_stem_module().
 *
 * NOTE(review): embedded source lines 165, 174 and 175 are missing from this
 * listing.  Other code in this file uses d->stoplist and d->dictCtx, so
 * those members and the closing of this typedef presumably sit on the
 * missing lines -- confirm against the real file.
 */
162 typedef struct DictSnowball
163 {
164  struct SN_env *z; /* stemmer state returned by the module's create() */
166  bool needrecode; /* needs recoding before/after call stem */
167  int (*stem) (struct SN_env *z); /* stemming function of the selected module */
168 
169  /*
170  * snowball saves alloced memory between calls, so we should run it in our
171  * private memory context. Note, init function is executed in long lived
172  * context, so we just remember CurrentMemoryContext
173  */
176 
177 
178 static void
179 locate_stem_module(DictSnowball *d, const char *lang)
180 {
181  const stemmer_module *m;
182 
183  /*
184  * First, try to find exact match of stemmer module. Stemmer with
185  * PG_SQL_ASCII encoding is treated as working with any server encoding
186  */
187  for (m = stemmer_modules; m->name; m++)
188  {
189  if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
190  pg_strcasecmp(m->name, lang) == 0)
191  {
192  d->stem = m->stem;
193  d->z = m->create();
194  d->needrecode = false;
195  return;
196  }
197  }
198 
199  /*
200  * Second, try to find stemmer for needed language for UTF8 encoding.
201  */
202  for (m = stemmer_modules; m->name; m++)
203  {
204  if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
205  {
206  d->stem = m->stem;
207  d->z = m->create();
208  d->needrecode = true;
209  return;
210  }
211  }
212 
213  ereport(ERROR,
214  (errcode(ERRCODE_UNDEFINED_OBJECT),
215  errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
216  lang, GetDatabaseEncodingName())));
217 }
218 
/*
 * Dictionary init function: parse the dictionary options and set up a
 * zero-filled DictSnowball.
 *
 * Argument 0 is a List of DefElem options.  The option names are compared
 * with strcmp(): "stopwords" and "language" may each appear at most once,
 * any other name is rejected, and "language" must be present.
 *
 * NOTE(review): embedded source lines 220, 239, 264 and 266 are missing from
 * this listing -- presumably the function name/argument line, the loading of
 * the stop-word list, the assignment of d->dictCtx and the return statement.
 * Confirm against the real file.
 */
219 Datum
221 {
222  List *dictoptions = (List *) PG_GETARG_POINTER(0);
223  DictSnowball *d;
224  bool stoploaded = false;
225  ListCell *l;
226 
 /* zero-filled, so d->stem starts out NULL; the checks below rely on that */
227  d = (DictSnowball *) palloc0(sizeof(DictSnowball));
228 
229  foreach(l, dictoptions)
230  {
231  DefElem *defel = (DefElem *) lfirst(l);
232 
233  if (strcmp(defel->defname, "stopwords") == 0)
234  {
235  if (stoploaded)
236  ereport(ERROR,
237  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
238  errmsg("multiple StopWords parameters")));
 /* NOTE(review): the statement on embedded line 239 is not visible here */
240  stoploaded = true;
241  }
242  else if (strcmp(defel->defname, "language") == 0)
243  {
 /* a stemmer has already been selected by an earlier "language" option */
244  if (d->stem)
245  ereport(ERROR,
246  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
247  errmsg("multiple Language parameters")));
 /* sets d->stem, d->z and d->needrecode, or raises an ERROR */
248  locate_stem_module(d, defGetString(defel));
249  }
250  else
251  {
252  ereport(ERROR,
253  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
254  errmsg("unrecognized Snowball parameter: \"%s\"",
255  defel->defname)));
256  }
257  }
258 
 /* no "language" option was supplied */
259  if (!d->stem)
260  ereport(ERROR,
261  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
262  errmsg("missing Language parameter")));
263 
265 
267 }
268 
/*
 * Dictionary lexize function: lower-case the input word and report it in a
 * zero-filled, two-element TSLexeme array.
 *
 * Only the first element's lexeme is ever set here: to the lowercased input
 * when it is longer than 1000 bytes, to nothing when the word is empty or a
 * stopword, and otherwise to the Snowball-stemmed word.  The second element
 * is left zero-filled (presumably it serves as the array terminator).
 *
 * NOTE(review): embedded source lines 270, 272, 274 and 346 are missing from
 * this listing, so the function name/argument line, the declarations of "d"
 * and "len", and the return statement are not visible -- confirm against the
 * real file.
 */
269 Datum
271 {
273  char *in = (char *) PG_GETARG_POINTER(1);
275  char *txt = lowerstr_with_len(in, len);
276  TSLexeme *res = palloc0(sizeof(TSLexeme) * 2);
277 
278  /*
279  * Do not pass strings exceeding 1000 bytes to the stemmer, as they're
280  * surely not words in any human language. This restriction avoids
281  * wasting cycles on stuff like base64-encoded data, and it protects us
282  * against possible inefficiency or misbehavior in the stemmer. (For
283  * example, the Turkish stemmer has an indefinite recursion, so it can
284  * crash on long-enough strings.) However, Snowball dictionaries are
285  * defined to recognize all strings, so we can't reject the string as an
286  * unknown word.
287  */
288  if (len > 1000)
289  {
290  /* return the lexeme lowercased, but otherwise unmodified */
291  res->lexeme = txt;
292  }
293  else if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
294  {
295  /* empty or stopword, so report as stopword */
 /* res was zero-filled, so its lexeme is simply left unset */
296  pfree(txt);
297  }
298  else
299  {
300  MemoryContext saveCtx;
301 
302  /*
303  * recode to utf8 if stemmer is utf8 and doesn't match server encoding
304  */
305  if (d->needrecode)
306  {
307  char *recoded;
308 
309  recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
 /* a different pointer means a new string was returned; drop the old one */
310  if (recoded != txt)
311  {
312  pfree(txt);
313  txt = recoded;
314  }
315  }
316 
317  /* see comment about d->dictCtx */
318  saveCtx = MemoryContextSwitchTo(d->dictCtx);
319  SN_set_current(d->z, strlen(txt), (symbol *) txt);
320  d->stem(d->z);
321  MemoryContextSwitchTo(saveCtx);
322 
 /*
  * Copy the l bytes at d->z->p back into txt and add the terminating
  * NUL ourselves.  If p is NULL or l is zero, txt keeps the
  * lowercased (and possibly recoded) input.
  */
323  if (d->z->p && d->z->l)
324  {
325  txt = repalloc(txt, d->z->l + 1);
326  memcpy(txt, d->z->p, d->z->l);
327  txt[d->z->l] = '\0';
328  }
329 
330  /* back recode if needed */
331  if (d->needrecode)
332  {
333  char *recoded;
334 
335  recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
336  if (recoded != txt)
337  {
338  pfree(txt);
339  txt = recoded;
340  }
341  }
342 
343  res->lexeme = txt;
344  }
345 
347 }
int SN_set_current(struct SN_env *z, int size, const symbol *s)
Definition: api.c:51
unsigned char symbol
Definition: api.h:2
signed int int32
Definition: c.h:494
char * defGetString(DefElem *def)
Definition: define.c:48
struct stemmer_module stemmer_module
Datum dsnowball_lexize(PG_FUNCTION_ARGS)
static const stemmer_module stemmer_modules[]
Definition: dict_snowball.c:98
PG_MODULE_MAGIC
Definition: dict_snowball.c:78
static void locate_stem_module(DictSnowball *d, const char *lang)
#define STEMMER_MODULE(name, enc, senc)
Definition: dict_snowball.c:95
PG_FUNCTION_INFO_V1(dsnowball_init)
Datum dsnowball_init(PG_FUNCTION_ARGS)
struct DictSnowball DictSnowball
int errcode(int sqlerrcode)
Definition: elog.c:859
int errmsg(const char *fmt,...)
Definition: elog.c:1072
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
char * pg_any_to_server(const char *s, int len, int encoding)
Definition: mbutils.c:676
int GetDatabaseEncoding(void)
Definition: mbutils.c:1261
const char * GetDatabaseEncodingName(void)
Definition: mbutils.c:1267
char * pg_server_to_any(const char *s, int len, int encoding)
Definition: mbutils.c:749
void pfree(void *pointer)
Definition: mcxt.c:1520
void * palloc0(Size size)
Definition: mcxt.c:1346
MemoryContext CurrentMemoryContext
Definition: mcxt.c:143
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1540
const void size_t len
#define lfirst(lc)
Definition: pg_list.h:172
pg_enc
Definition: pg_wchar.h:225
@ PG_SQL_ASCII
Definition: pg_wchar.h:226
@ PG_KOI8R
Definition: pg_wchar.h:248
@ PG_LATIN2
Definition: pg_wchar.h:235
@ PG_LATIN1
Definition: pg_wchar.h:234
@ PG_UTF8
Definition: pg_wchar.h:232
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
uintptr_t Datum
Definition: postgres.h:64
MemoryContextSwitchTo(old_ctx)
char * defname
Definition: parsenodes.h:815
MemoryContext dictCtx
StopList stoplist
struct SN_env * z
int(* stem)(struct SN_env *z)
Definition: pg_list.h:54
Definition: api.h:14
symbol * p
Definition: api.h:15
int l
Definition: api.h:16
const char * name
Definition: dict_snowball.c:87
void(* close)(struct SN_env *)
Definition: dict_snowball.c:90
int(* stem)(struct SN_env *)
Definition: dict_snowball.c:91
struct SN_env *(* create)(void)
Definition: dict_snowball.c:89
char * lowerstr_with_len(const char *str, int len)
Definition: ts_locale.c:266
char * lowerstr(const char *str)
Definition: ts_locale.c:253
void readstoplist(const char *fname, StopList *s, char *(*wordop)(const char *))
Definition: ts_utils.c:68
bool searchstoplist(StopList *s, char *key)
Definition: ts_utils.c:140