PostgreSQL Source Code git master
Loading...
Searching...
No Matches
dict_snowball.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * dict_snowball.c
4 * Snowball dictionary
5 *
6 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 *
8 * IDENTIFICATION
9 * src/backend/snowball/dict_snowball.c
10 *
11 *-------------------------------------------------------------------------
12 */
13#include "postgres.h"
14
15#include "catalog/pg_collation_d.h"
16#include "commands/defrem.h"
17#include "mb/pg_wchar.h"
18#include "tsearch/ts_public.h"
19#include "utils/formatting.h"
20
21/* Some platforms define MAXINT and/or MININT, causing conflicts */
22#ifdef MAXINT
23#undef MAXINT
24#endif
25#ifdef MININT
26#undef MININT
27#endif
28
29/* Now we can include the original Snowball snowball_runtime.h */
84
86 .name = "dict_snowball",
87 .version = PG_VERSION
88);
89
91
93
94/* List of supported modules */
95typedef struct stemmer_module
96{
97 const char *name;
99 struct SN_env *(*create) (void);
100 void (*close) (struct SN_env *);
101 int (*stem) (struct SN_env *);
103
/*
 * Build one stemmer_module initializer.
 *
 * Args: stemmer name, PG code for encoding, Snowball's name for encoding.
 *
 * The stemmer name is stringified to produce the table's "name" field, and
 * is pasted together with Snowball's encoding name to form the identifiers
 * of the generated entry points: <name>_<senc>_create_env,
 * <name>_<senc>_close_env and <name>_<senc>_stem.
 */
#define STEMMER_MODULE(name,enc,senc) \
	{#name, enc, name##_##senc##_create_env, name##_##senc##_close_env, name##_##senc##_stem}
107
109{
110 /*
111 * Stemmers list from Snowball distribution
112 */
166
167 /*
168 * Stemmer with PG_SQL_ASCII encoding should be valid for any server
169 * encoding
170 */
172
173 {NULL, 0, NULL, NULL, NULL} /* list end marker */
174};
175
176
177typedef struct DictSnowball
178{
179 struct SN_env *z;
181 bool needrecode; /* needs recoding before/after call stem */
182 int (*stem) (struct SN_env *z);
183
184 /*
185 * snowball saves alloced memory between calls, so we should run it in our
186 * private memory context. Note, init function is executed in long lived
187 * context, so we just remember CurrentMemoryContext
188 */
191
192
193static void
194locate_stem_module(DictSnowball *d, const char *lang)
195{
196 const stemmer_module *m;
197
198 /*
199 * First, try to find exact match of stemmer module. Stemmer with
200 * PG_SQL_ASCII encoding is treated as working with any server encoding
201 */
202 for (m = stemmer_modules; m->name; m++)
203 {
204 if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
205 pg_strcasecmp(m->name, lang) == 0)
206 {
207 d->stem = m->stem;
208 d->z = m->create();
209 d->needrecode = false;
210 return;
211 }
212 }
213
214 /*
215 * Second, try to find stemmer for needed language for UTF8 encoding.
216 */
217 for (m = stemmer_modules; m->name; m++)
218 {
219 if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
220 {
221 d->stem = m->stem;
222 d->z = m->create();
223 d->needrecode = true;
224 return;
225 }
226 }
227
230 errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
231 lang, GetDatabaseEncodingName())));
232}
233
234Datum
236{
238 DictSnowball *d;
239 bool stoploaded = false;
240 ListCell *l;
241
243
244 foreach(l, dictoptions)
245 {
246 DefElem *defel = (DefElem *) lfirst(l);
247
248 if (strcmp(defel->defname, "stopwords") == 0)
249 {
250 if (stoploaded)
253 errmsg("multiple StopWords parameters")));
255 stoploaded = true;
256 }
257 else if (strcmp(defel->defname, "language") == 0)
258 {
259 if (d->stem)
262 errmsg("multiple Language parameters")));
264 }
265 else
266 {
269 errmsg("unrecognized Snowball parameter: \"%s\"",
270 defel->defname)));
271 }
272 }
273
274 if (!d->stem)
277 errmsg("missing Language parameter")));
278
280
282}
283
284Datum
286{
288 char *in = (char *) PG_GETARG_POINTER(1);
292
293 /*
294 * Do not pass strings exceeding 1000 bytes to the stemmer, as they're
295 * surely not words in any human language. This restriction avoids
296 * wasting cycles on stuff like base64-encoded data, and it protects us
297 * against possible inefficiency or misbehavior in the stemmer. (For
298 * example, the Turkish stemmer has an indefinite recursion, so it can
299 * crash on long-enough strings.) However, Snowball dictionaries are
300 * defined to recognize all strings, so we can't reject the string as an
301 * unknown word.
302 */
303 if (len > 1000)
304 {
305 /* return the lexeme lowercased, but otherwise unmodified */
306 res->lexeme = txt;
307 }
308 else if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
309 {
310 /* empty or stopword, so report as stopword */
311 pfree(txt);
312 }
313 else
314 {
316
317 /*
318 * recode to utf8 if stemmer is utf8 and doesn't match server encoding
319 */
320 if (d->needrecode)
321 {
322 char *recoded;
323
325 if (recoded != txt)
326 {
327 pfree(txt);
328 txt = recoded;
329 }
330 }
331
332 /* see comment about d->dictCtx */
334 SN_set_current(d->z, strlen(txt), (symbol *) txt);
335 d->stem(d->z);
337
338 if (d->z->p && d->z->l)
339 {
340 txt = repalloc(txt, d->z->l + 1);
341 memcpy(txt, d->z->p, d->z->l);
342 txt[d->z->l] = '\0';
343 }
344
345 /* back recode if needed */
346 if (d->needrecode)
347 {
348 char *recoded;
349
351 if (recoded != txt)
352 {
353 pfree(txt);
354 txt = recoded;
355 }
356 }
357
358 res->lexeme = txt;
359 }
360
362}
int SN_set_current(struct SN_env *z, int size, const symbol *s)
Definition api.c:25
unsigned char symbol
Definition api.h:4
int32_t int32
Definition c.h:542
char * defGetString(DefElem *def)
Definition define.c:34
Datum dsnowball_lexize(PG_FUNCTION_ARGS)
static const stemmer_module stemmer_modules[]
static void locate_stem_module(DictSnowball *d, const char *lang)
#define STEMMER_MODULE(name, enc, senc)
Datum dsnowball_init(PG_FUNCTION_ARGS)
int errcode(int sqlerrcode)
Definition elog.c:863
int errmsg(const char *fmt,...)
Definition elog.c:1080
#define ERROR
Definition elog.h:39
#define ereport(elevel,...)
Definition elog.h:150
#define palloc0_array(type, count)
Definition fe_memutils.h:77
#define palloc0_object(type)
Definition fe_memutils.h:75
#define PG_GETARG_POINTER(n)
Definition fmgr.h:277
#define PG_MODULE_MAGIC_EXT(...)
Definition fmgr.h:540
#define PG_FUNCTION_INFO_V1(funcname)
Definition fmgr.h:417
#define PG_GETARG_INT32(n)
Definition fmgr.h:269
#define PG_RETURN_POINTER(x)
Definition fmgr.h:363
#define PG_FUNCTION_ARGS
Definition fmgr.h:193
char * str_tolower(const char *buff, size_t nbytes, Oid collid)
#define PG_UTF8
Definition mbprint.c:43
int GetDatabaseEncoding(void)
Definition mbutils.c:1264
char * pg_any_to_server(const char *s, int len, int encoding)
Definition mbutils.c:679
const char * GetDatabaseEncodingName(void)
Definition mbutils.c:1270
char * pg_server_to_any(const char *s, int len, int encoding)
Definition mbutils.c:752
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
void pfree(void *pointer)
Definition mcxt.c:1616
MemoryContext CurrentMemoryContext
Definition mcxt.c:160
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition palloc.h:124
const void size_t len
#define lfirst(lc)
Definition pg_list.h:172
pg_enc
Definition pg_wchar.h:225
@ PG_SQL_ASCII
Definition pg_wchar.h:226
@ PG_KOI8R
Definition pg_wchar.h:248
@ PG_LATIN2
Definition pg_wchar.h:235
@ PG_LATIN1
Definition pg_wchar.h:234
int pg_strcasecmp(const char *s1, const char *s2)
uint64_t Datum
Definition postgres.h:70
static int fb(int x)
MemoryContext dictCtx
StopList stoplist
struct SN_env * z
int(* stem)(struct SN_env *z)
Definition pg_list.h:54
Definition api.h:15
symbol * p
Definition api.h:16
int l
Definition api.h:17
char * lexeme
Definition ts_public.h:138
struct SN_env *(* create)(void)
const char * name
void(* close)(struct SN_env *)
int(* stem)(struct SN_env *)
void readstoplist(const char *fname, StopList *s, char *(*wordop)(const char *, size_t, Oid))
Definition ts_utils.c:69
bool searchstoplist(StopList *s, char *key)
Definition ts_utils.c:141
const char * name