PostgreSQL Source Code git master
dict_snowball.c File Reference
#include "postgres.h"
#include "catalog/pg_collation_d.h"
#include "commands/defrem.h"
#include "mb/pg_wchar.h"
#include "tsearch/ts_public.h"
#include "utils/formatting.h"
#include "snowball/libstemmer/snowball_runtime.h"
#include "snowball/libstemmer/stem_ISO_8859_1_basque.h"
#include "snowball/libstemmer/stem_ISO_8859_1_catalan.h"
#include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
#include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
#include "snowball/libstemmer/stem_ISO_8859_1_dutch_porter.h"
#include "snowball/libstemmer/stem_ISO_8859_1_english.h"
#include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
#include "snowball/libstemmer/stem_ISO_8859_1_french.h"
#include "snowball/libstemmer/stem_ISO_8859_1_german.h"
#include "snowball/libstemmer/stem_ISO_8859_1_indonesian.h"
#include "snowball/libstemmer/stem_ISO_8859_1_irish.h"
#include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
#include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
#include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
#include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
#include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
#include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
#include "snowball/libstemmer/stem_ISO_8859_2_hungarian.h"
#include "snowball/libstemmer/stem_ISO_8859_2_polish.h"
#include "snowball/libstemmer/stem_KOI8_R_russian.h"
#include "snowball/libstemmer/stem_UTF_8_arabic.h"
#include "snowball/libstemmer/stem_UTF_8_armenian.h"
#include "snowball/libstemmer/stem_UTF_8_basque.h"
#include "snowball/libstemmer/stem_UTF_8_catalan.h"
#include "snowball/libstemmer/stem_UTF_8_danish.h"
#include "snowball/libstemmer/stem_UTF_8_dutch.h"
#include "snowball/libstemmer/stem_UTF_8_dutch_porter.h"
#include "snowball/libstemmer/stem_UTF_8_english.h"
#include "snowball/libstemmer/stem_UTF_8_esperanto.h"
#include "snowball/libstemmer/stem_UTF_8_estonian.h"
#include "snowball/libstemmer/stem_UTF_8_finnish.h"
#include "snowball/libstemmer/stem_UTF_8_french.h"
#include "snowball/libstemmer/stem_UTF_8_german.h"
#include "snowball/libstemmer/stem_UTF_8_greek.h"
#include "snowball/libstemmer/stem_UTF_8_hindi.h"
#include "snowball/libstemmer/stem_UTF_8_hungarian.h"
#include "snowball/libstemmer/stem_UTF_8_indonesian.h"
#include "snowball/libstemmer/stem_UTF_8_irish.h"
#include "snowball/libstemmer/stem_UTF_8_italian.h"
#include "snowball/libstemmer/stem_UTF_8_lithuanian.h"
#include "snowball/libstemmer/stem_UTF_8_nepali.h"
#include "snowball/libstemmer/stem_UTF_8_norwegian.h"
#include "snowball/libstemmer/stem_UTF_8_polish.h"
#include "snowball/libstemmer/stem_UTF_8_porter.h"
#include "snowball/libstemmer/stem_UTF_8_portuguese.h"
#include "snowball/libstemmer/stem_UTF_8_romanian.h"
#include "snowball/libstemmer/stem_UTF_8_russian.h"
#include "snowball/libstemmer/stem_UTF_8_serbian.h"
#include "snowball/libstemmer/stem_UTF_8_spanish.h"
#include "snowball/libstemmer/stem_UTF_8_swedish.h"
#include "snowball/libstemmer/stem_UTF_8_tamil.h"
#include "snowball/libstemmer/stem_UTF_8_turkish.h"
#include "snowball/libstemmer/stem_UTF_8_yiddish.h"
Include dependency graph for dict_snowball.c:

Go to the source code of this file.

Data Structures

struct  stemmer_module
 
struct  DictSnowball
 

Macros

#define STEMMER_MODULE(name, enc, senc)    {#name, enc, name##_##senc##_create_env, name##_##senc##_close_env, name##_##senc##_stem}
 

Typedefs

typedef struct stemmer_module stemmer_module
 
typedef struct DictSnowball DictSnowball
 

Functions

 PG_MODULE_MAGIC_EXT (.name="dict_snowball",.version=PG_VERSION)
 
 PG_FUNCTION_INFO_V1 (dsnowball_init)
 
 PG_FUNCTION_INFO_V1 (dsnowball_lexize)
 
static void locate_stem_module (DictSnowball *d, const char *lang)
 
Datum dsnowball_init (PG_FUNCTION_ARGS)
 
Datum dsnowball_lexize (PG_FUNCTION_ARGS)
 

Variables

static const stemmer_module stemmer_modules []
 

Macro Definition Documentation

◆ STEMMER_MODULE

#define STEMMER_MODULE (   name,
  enc,
  senc 
)     {#name, enc, name##_##senc##_create_env, name##_##senc##_close_env, name##_##senc##_stem}

Definition at line 105 of file dict_snowball.c.

Typedef Documentation

◆ DictSnowball

typedef struct DictSnowball DictSnowball

◆ stemmer_module

typedef struct stemmer_module stemmer_module

Function Documentation

◆ dsnowball_init()

Datum dsnowball_init ( PG_FUNCTION_ARGS  )

Definition at line 235 of file dict_snowball.c.

236{
237 List *dictoptions = (List *) PG_GETARG_POINTER(0);
238 DictSnowball *d;
239 bool stoploaded = false;
240 ListCell *l;
241
243
244 foreach(l, dictoptions)
245 {
246 DefElem *defel = (DefElem *) lfirst(l);
247
248 if (strcmp(defel->defname, "stopwords") == 0)
249 {
250 if (stoploaded)
252 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
253 errmsg("multiple StopWords parameters")));
255 stoploaded = true;
256 }
257 else if (strcmp(defel->defname, "language") == 0)
258 {
259 if (d->stem)
261 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
262 errmsg("multiple Language parameters")));
264 }
265 else
266 {
268 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
269 errmsg("unrecognized Snowball parameter: \"%s\"",
270 defel->defname)));
271 }
272 }
273
274 if (!d->stem)
276 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
277 errmsg("missing Language parameter")));
278
280
282}
char * defGetString(DefElem *def)
Definition: define.c:35
static void locate_stem_module(DictSnowball *d, const char *lang)
int errcode(int sqlerrcode)
Definition: elog.c:863
int errmsg(const char *fmt,...)
Definition: elog.c:1080
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:150
#define palloc0_object(type)
Definition: fe_memutils.h:75
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:277
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:363
char * str_tolower(const char *buff, size_t nbytes, Oid collid)
Definition: formatting.c:1619
MemoryContext CurrentMemoryContext
Definition: mcxt.c:160
#define lfirst(lc)
Definition: pg_list.h:172
char * defname
Definition: parsenodes.h:844
MemoryContext dictCtx
StopList stoplist
int(* stem)(struct SN_env *z)
Definition: pg_list.h:54
void readstoplist(const char *fname, StopList *s, char *(*wordop)(const char *, size_t, Oid))
Definition: ts_utils.c:69

References CurrentMemoryContext, defGetString(), DefElem::defname, DictSnowball::dictCtx, ereport, errcode(), errmsg(), ERROR, SN_env::l, lfirst, locate_stem_module(), palloc0_object, PG_GETARG_POINTER, PG_RETURN_POINTER, readstoplist(), DictSnowball::stem, DictSnowball::stoplist, and str_tolower().

◆ dsnowball_lexize()

Datum dsnowball_lexize ( PG_FUNCTION_ARGS  )

Definition at line 285 of file dict_snowball.c.

286{
288 char *in = (char *) PG_GETARG_POINTER(1);
290 char *txt = str_tolower(in, len, DEFAULT_COLLATION_OID);
292
293 /*
294 * Do not pass strings exceeding 1000 bytes to the stemmer, as they're
295 * surely not words in any human language. This restriction avoids
296 * wasting cycles on stuff like base64-encoded data, and it protects us
297 * against possible inefficiency or misbehavior in the stemmer. (For
298 * example, the Turkish stemmer has an indefinite recursion, so it can
299 * crash on long-enough strings.) However, Snowball dictionaries are
300 * defined to recognize all strings, so we can't reject the string as an
301 * unknown word.
302 */
303 if (len > 1000)
304 {
305 /* return the lexeme lowercased, but otherwise unmodified */
306 res->lexeme = txt;
307 }
308 else if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
309 {
310 /* empty or stopword, so report as stopword */
311 pfree(txt);
312 }
313 else
314 {
315 MemoryContext saveCtx;
316
317 /*
318 * recode to utf8 if stemmer is utf8 and doesn't match server encoding
319 */
320 if (d->needrecode)
321 {
322 char *recoded;
323
324 recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
325 if (recoded != txt)
326 {
327 pfree(txt);
328 txt = recoded;
329 }
330 }
331
332 /* see comment about d->dictCtx */
333 saveCtx = MemoryContextSwitchTo(d->dictCtx);
334 SN_set_current(d->z, strlen(txt), (symbol *) txt);
335 d->stem(d->z);
336 MemoryContextSwitchTo(saveCtx);
337
338 if (d->z->p && d->z->l)
339 {
340 txt = repalloc(txt, d->z->l + 1);
341 memcpy(txt, d->z->p, d->z->l);
342 txt[d->z->l] = '\0';
343 }
344
345 /* back recode if needed */
346 if (d->needrecode)
347 {
348 char *recoded;
349
350 recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
351 if (recoded != txt)
352 {
353 pfree(txt);
354 txt = recoded;
355 }
356 }
357
358 res->lexeme = txt;
359 }
360
362}
int SN_set_current(struct SN_env *z, int size, const symbol *s)
Definition: api.c:25
unsigned char symbol
Definition: api.h:4
int32_t int32
Definition: c.h:548
#define palloc0_array(type, count)
Definition: fe_memutils.h:77
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
char * pg_any_to_server(const char *s, int len, int encoding)
Definition: mbutils.c:679
char * pg_server_to_any(const char *s, int len, int encoding)
Definition: mbutils.c:752
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1632
void pfree(void *pointer)
Definition: mcxt.c:1616
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:124
const void size_t len
@ PG_UTF8
Definition: pg_wchar.h:232
struct SN_env * z
symbol * p
Definition: api.h:16
int l
Definition: api.h:17
char * lexeme
Definition: ts_public.h:138
bool searchstoplist(StopList *s, char *key)
Definition: ts_utils.c:141

References DictSnowball::dictCtx, SN_env::l, len, TSLexeme::lexeme, MemoryContextSwitchTo(), DictSnowball::needrecode, SN_env::p, palloc0_array, pfree(), pg_any_to_server(), PG_GETARG_INT32, PG_GETARG_POINTER, PG_RETURN_POINTER, pg_server_to_any(), PG_UTF8, repalloc(), searchstoplist(), SN_set_current(), DictSnowball::stem, DictSnowball::stoplist, str_tolower(), and DictSnowball::z.

◆ locate_stem_module()

static void locate_stem_module ( DictSnowball * d,
const char *  lang 
)
static

Definition at line 194 of file dict_snowball.c.

195{
196 const stemmer_module *m;
197
198 /*
199 * First, try to find exact match of stemmer module. Stemmer with
200 * PG_SQL_ASCII encoding is treated as working with any server encoding
201 */
202 for (m = stemmer_modules; m->name; m++)
203 {
204 if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
205 pg_strcasecmp(m->name, lang) == 0)
206 {
207 d->stem = m->stem;
208 d->z = m->create();
209 d->needrecode = false;
210 return;
211 }
212 }
213
214 /*
215 * Second, try to find stemmer for needed language for UTF8 encoding.
216 */
217 for (m = stemmer_modules; m->name; m++)
218 {
219 if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
220 {
221 d->stem = m->stem;
222 d->z = m->create();
223 d->needrecode = true;
224 return;
225 }
226 }
227
229 (errcode(ERRCODE_UNDEFINED_OBJECT),
230 errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
231 lang, GetDatabaseEncodingName())));
232}
static const stemmer_module stemmer_modules[]
int GetDatabaseEncoding(void)
Definition: mbutils.c:1264
const char * GetDatabaseEncodingName(void)
Definition: mbutils.c:1270
@ PG_SQL_ASCII
Definition: pg_wchar.h:226
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:32
struct SN_env *(* create)(void)
Definition: dict_snowball.c:99
const char * name
Definition: dict_snowball.c:97
int(* stem)(struct SN_env *)

References stemmer_module::create, stemmer_module::enc, ereport, errcode(), errmsg(), ERROR, GetDatabaseEncoding(), GetDatabaseEncodingName(), stemmer_module::name, DictSnowball::needrecode, PG_SQL_ASCII, pg_strcasecmp(), PG_UTF8, stemmer_module::stem, DictSnowball::stem, stemmer_modules, and DictSnowball::z.

Referenced by dsnowball_init().

◆ PG_FUNCTION_INFO_V1() [1/2]

PG_FUNCTION_INFO_V1 ( dsnowball_init  )

◆ PG_FUNCTION_INFO_V1() [2/2]

PG_FUNCTION_INFO_V1 ( dsnowball_lexize  )

◆ PG_MODULE_MAGIC_EXT()

PG_MODULE_MAGIC_EXT ( .name = "dict_snowball",
.version = PG_VERSION 
)

Variable Documentation

◆ stemmer_modules

const stemmer_module stemmer_modules[]
static

Definition at line 108 of file dict_snowball.c.

Referenced by locate_stem_module().