PostgreSQL Source Code git master
dict_snowball.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * dict_snowball.c
4 * Snowball dictionary
5 *
6 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 *
8 * IDENTIFICATION
9 * src/backend/snowball/dict_snowball.c
10 *
11 *-------------------------------------------------------------------------
12 */
13#include "postgres.h"
14
15#include "catalog/pg_collation_d.h"
16#include "commands/defrem.h"
17#include "mb/pg_wchar.h"
18#include "tsearch/ts_public.h"
19#include "utils/formatting.h"
20
/*
 * Some platforms define MAXINT and/or MININT, causing conflicts.
 * Undefining a name that is not currently a macro is a no-op, so no
 * #ifdef guard is needed around either directive.
 */
#undef MAXINT
#undef MININT
28
29/* Now we can include the original Snowball snowball_runtime.h */
84
86 .name = "dict_snowball",
87 .version = PG_VERSION
88);
89
91
93
/*
 * List of supported modules
 *
 * One entry per (language, encoding) pair.  The field order must match the
 * initializer produced by STEMMER_MODULE() and the all-NULL end marker of
 * stemmer_modules[].
 */
typedef struct stemmer_module
{
	const char *name;			/* stemmer name, compared case-insensitively */
	int			enc;			/* PG code of the encoding the stemmer uses */
	struct SN_env *(*create) (void);	/* builds a stemmer environment */
	void		(*close) (struct SN_env *); /* counterpart of create */
	int			(*stem) (struct SN_env *);	/* stems the current word */
} stemmer_module;

/*
 * Build one stemmer_modules[] entry.
 *
 * Args: stemmer name, PG code for encoding, Snowball's name for encoding.
 * The stemmer name is stringified to give the entry's name, and is pasted
 * together with Snowball's encoding name to form the identifiers
 * <name>_<senc>_create_env, <name>_<senc>_close_env and <name>_<senc>_stem.
 *
 * The macro parameters are deliberately not called "name" or "enc", since
 * those spellings are used as field designators in the expansion.
 */
#define STEMMER_MODULE(lang, pgenc, senc) \
	{ \
		.name = #lang, \
		.enc = pgenc, \
		.create = lang##_##senc##_create_env, \
		.close = lang##_##senc##_close_env, \
		.stem = lang##_##senc##_stem \
	}
107
109{
110 /*
111 * Stemmers list from Snowball distribution
112 */
113 STEMMER_MODULE(basque, PG_LATIN1, ISO_8859_1),
114 STEMMER_MODULE(catalan, PG_LATIN1, ISO_8859_1),
115 STEMMER_MODULE(danish, PG_LATIN1, ISO_8859_1),
116 STEMMER_MODULE(dutch, PG_LATIN1, ISO_8859_1),
117 STEMMER_MODULE(dutch_porter, PG_LATIN1, ISO_8859_1),
118 STEMMER_MODULE(english, PG_LATIN1, ISO_8859_1),
119 STEMMER_MODULE(finnish, PG_LATIN1, ISO_8859_1),
120 STEMMER_MODULE(french, PG_LATIN1, ISO_8859_1),
121 STEMMER_MODULE(german, PG_LATIN1, ISO_8859_1),
122 STEMMER_MODULE(indonesian, PG_LATIN1, ISO_8859_1),
123 STEMMER_MODULE(irish, PG_LATIN1, ISO_8859_1),
124 STEMMER_MODULE(italian, PG_LATIN1, ISO_8859_1),
125 STEMMER_MODULE(norwegian, PG_LATIN1, ISO_8859_1),
126 STEMMER_MODULE(porter, PG_LATIN1, ISO_8859_1),
127 STEMMER_MODULE(portuguese, PG_LATIN1, ISO_8859_1),
128 STEMMER_MODULE(spanish, PG_LATIN1, ISO_8859_1),
129 STEMMER_MODULE(swedish, PG_LATIN1, ISO_8859_1),
130 STEMMER_MODULE(hungarian, PG_LATIN2, ISO_8859_2),
131 STEMMER_MODULE(polish, PG_LATIN2, ISO_8859_2),
132 STEMMER_MODULE(russian, PG_KOI8R, KOI8_R),
133 STEMMER_MODULE(arabic, PG_UTF8, UTF_8),
134 STEMMER_MODULE(armenian, PG_UTF8, UTF_8),
135 STEMMER_MODULE(basque, PG_UTF8, UTF_8),
136 STEMMER_MODULE(catalan, PG_UTF8, UTF_8),
137 STEMMER_MODULE(danish, PG_UTF8, UTF_8),
138 STEMMER_MODULE(dutch, PG_UTF8, UTF_8),
139 STEMMER_MODULE(dutch_porter, PG_UTF8, UTF_8),
140 STEMMER_MODULE(english, PG_UTF8, UTF_8),
141 STEMMER_MODULE(esperanto, PG_UTF8, UTF_8),
142 STEMMER_MODULE(estonian, PG_UTF8, UTF_8),
143 STEMMER_MODULE(finnish, PG_UTF8, UTF_8),
144 STEMMER_MODULE(french, PG_UTF8, UTF_8),
145 STEMMER_MODULE(german, PG_UTF8, UTF_8),
146 STEMMER_MODULE(greek, PG_UTF8, UTF_8),
147 STEMMER_MODULE(hindi, PG_UTF8, UTF_8),
148 STEMMER_MODULE(hungarian, PG_UTF8, UTF_8),
149 STEMMER_MODULE(indonesian, PG_UTF8, UTF_8),
150 STEMMER_MODULE(irish, PG_UTF8, UTF_8),
151 STEMMER_MODULE(italian, PG_UTF8, UTF_8),
152 STEMMER_MODULE(lithuanian, PG_UTF8, UTF_8),
153 STEMMER_MODULE(nepali, PG_UTF8, UTF_8),
154 STEMMER_MODULE(norwegian, PG_UTF8, UTF_8),
155 STEMMER_MODULE(porter, PG_UTF8, UTF_8),
156 STEMMER_MODULE(polish, PG_UTF8, UTF_8),
157 STEMMER_MODULE(portuguese, PG_UTF8, UTF_8),
158 STEMMER_MODULE(romanian, PG_UTF8, UTF_8),
159 STEMMER_MODULE(russian, PG_UTF8, UTF_8),
160 STEMMER_MODULE(serbian, PG_UTF8, UTF_8),
161 STEMMER_MODULE(spanish, PG_UTF8, UTF_8),
162 STEMMER_MODULE(swedish, PG_UTF8, UTF_8),
163 STEMMER_MODULE(tamil, PG_UTF8, UTF_8),
164 STEMMER_MODULE(turkish, PG_UTF8, UTF_8),
165 STEMMER_MODULE(yiddish, PG_UTF8, UTF_8),
166
167 /*
168 * Stemmer with PG_SQL_ASCII encoding should be valid for any server
169 * encoding
170 */
171 STEMMER_MODULE(english, PG_SQL_ASCII, ISO_8859_1),
172
173 {NULL, 0, NULL, NULL, NULL} /* list end marker */
174};
175
176
177typedef struct DictSnowball
178{
179 struct SN_env *z;
181 bool needrecode; /* needs recoding before/after call stem */
182 int (*stem) (struct SN_env *z);
183
184 /*
185 * snowball saves alloced memory between calls, so we should run it in our
186 * private memory context. Note, init function is executed in long lived
187 * context, so we just remember CurrentMemoryContext
188 */
191
192
193static void
194locate_stem_module(DictSnowball *d, const char *lang)
195{
196 const stemmer_module *m;
197
198 /*
199 * First, try to find exact match of stemmer module. Stemmer with
200 * PG_SQL_ASCII encoding is treated as working with any server encoding
201 */
202 for (m = stemmer_modules; m->name; m++)
203 {
204 if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
205 pg_strcasecmp(m->name, lang) == 0)
206 {
207 d->stem = m->stem;
208 d->z = m->create();
209 d->needrecode = false;
210 return;
211 }
212 }
213
214 /*
215 * Second, try to find stemmer for needed language for UTF8 encoding.
216 */
217 for (m = stemmer_modules; m->name; m++)
218 {
219 if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
220 {
221 d->stem = m->stem;
222 d->z = m->create();
223 d->needrecode = true;
224 return;
225 }
226 }
227
229 (errcode(ERRCODE_UNDEFINED_OBJECT),
230 errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
231 lang, GetDatabaseEncodingName())));
232}
233
234Datum
236{
237 List *dictoptions = (List *) PG_GETARG_POINTER(0);
238 DictSnowball *d;
239 bool stoploaded = false;
240 ListCell *l;
241
243
244 foreach(l, dictoptions)
245 {
246 DefElem *defel = (DefElem *) lfirst(l);
247
248 if (strcmp(defel->defname, "stopwords") == 0)
249 {
250 if (stoploaded)
252 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
253 errmsg("multiple StopWords parameters")));
255 stoploaded = true;
256 }
257 else if (strcmp(defel->defname, "language") == 0)
258 {
259 if (d->stem)
261 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
262 errmsg("multiple Language parameters")));
264 }
265 else
266 {
268 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
269 errmsg("unrecognized Snowball parameter: \"%s\"",
270 defel->defname)));
271 }
272 }
273
274 if (!d->stem)
276 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
277 errmsg("missing Language parameter")));
278
280
282}
283
284Datum
286{
288 char *in = (char *) PG_GETARG_POINTER(1);
290 char *txt = str_tolower(in, len, DEFAULT_COLLATION_OID);
292
293 /*
294 * Do not pass strings exceeding 1000 bytes to the stemmer, as they're
295 * surely not words in any human language. This restriction avoids
296 * wasting cycles on stuff like base64-encoded data, and it protects us
297 * against possible inefficiency or misbehavior in the stemmer. (For
298 * example, the Turkish stemmer has an indefinite recursion, so it can
299 * crash on long-enough strings.) However, Snowball dictionaries are
300 * defined to recognize all strings, so we can't reject the string as an
301 * unknown word.
302 */
303 if (len > 1000)
304 {
305 /* return the lexeme lowercased, but otherwise unmodified */
306 res->lexeme = txt;
307 }
308 else if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
309 {
310 /* empty or stopword, so report as stopword */
311 pfree(txt);
312 }
313 else
314 {
315 MemoryContext saveCtx;
316
317 /*
318 * recode to utf8 if stemmer is utf8 and doesn't match server encoding
319 */
320 if (d->needrecode)
321 {
322 char *recoded;
323
324 recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
325 if (recoded != txt)
326 {
327 pfree(txt);
328 txt = recoded;
329 }
330 }
331
332 /* see comment about d->dictCtx */
333 saveCtx = MemoryContextSwitchTo(d->dictCtx);
334 SN_set_current(d->z, strlen(txt), (symbol *) txt);
335 d->stem(d->z);
336 MemoryContextSwitchTo(saveCtx);
337
338 if (d->z->p && d->z->l)
339 {
340 txt = repalloc(txt, d->z->l + 1);
341 memcpy(txt, d->z->p, d->z->l);
342 txt[d->z->l] = '\0';
343 }
344
345 /* back recode if needed */
346 if (d->needrecode)
347 {
348 char *recoded;
349
350 recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
351 if (recoded != txt)
352 {
353 pfree(txt);
354 txt = recoded;
355 }
356 }
357
358 res->lexeme = txt;
359 }
360
362}
int SN_set_current(struct SN_env *z, int size, const symbol *s)
Definition: api.c:25
unsigned char symbol
Definition: api.h:4
int32_t int32
Definition: c.h:548
char * defGetString(DefElem *def)
Definition: define.c:35
struct stemmer_module stemmer_module
Datum dsnowball_lexize(PG_FUNCTION_ARGS)
static const stemmer_module stemmer_modules[]
static void locate_stem_module(DictSnowball *d, const char *lang)
#define STEMMER_MODULE(name, enc, senc)
PG_MODULE_MAGIC_EXT(.name="dict_snowball",.version=PG_VERSION)
PG_FUNCTION_INFO_V1(dsnowball_init)
Datum dsnowball_init(PG_FUNCTION_ARGS)
struct DictSnowball DictSnowball
int errcode(int sqlerrcode)
Definition: elog.c:863
int errmsg(const char *fmt,...)
Definition: elog.c:1080
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:150
#define palloc0_array(type, count)
Definition: fe_memutils.h:77
#define palloc0_object(type)
Definition: fe_memutils.h:75
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:277
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:363
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
char * str_tolower(const char *buff, size_t nbytes, Oid collid)
Definition: formatting.c:1619
int GetDatabaseEncoding(void)
Definition: mbutils.c:1264
char * pg_any_to_server(const char *s, int len, int encoding)
Definition: mbutils.c:679
const char * GetDatabaseEncodingName(void)
Definition: mbutils.c:1270
char * pg_server_to_any(const char *s, int len, int encoding)
Definition: mbutils.c:752
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1632
void pfree(void *pointer)
Definition: mcxt.c:1616
MemoryContext CurrentMemoryContext
Definition: mcxt.c:160
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:124
const void size_t len
#define lfirst(lc)
Definition: pg_list.h:172
pg_enc
Definition: pg_wchar.h:225
@ PG_SQL_ASCII
Definition: pg_wchar.h:226
@ PG_KOI8R
Definition: pg_wchar.h:248
@ PG_LATIN2
Definition: pg_wchar.h:235
@ PG_LATIN1
Definition: pg_wchar.h:234
@ PG_UTF8
Definition: pg_wchar.h:232
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:32
uint64_t Datum
Definition: postgres.h:70
char * defname
Definition: parsenodes.h:844
MemoryContext dictCtx
StopList stoplist
struct SN_env * z
int(* stem)(struct SN_env *z)
Definition: pg_list.h:54
Definition: api.h:15
symbol * p
Definition: api.h:16
int l
Definition: api.h:17
char * lexeme
Definition: ts_public.h:138
struct SN_env *(* create)(void)
Definition: dict_snowball.c:99
const char * name
Definition: dict_snowball.c:97
void(* close)(struct SN_env *)
int(* stem)(struct SN_env *)
void readstoplist(const char *fname, StopList *s, char *(*wordop)(const char *, size_t, Oid))
Definition: ts_utils.c:69
bool searchstoplist(StopList *s, char *key)
Definition: ts_utils.c:141
const char * name