PostgreSQL Source Code  git master
dict_snowball.c File Reference
#include "postgres.h"
#include "commands/defrem.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_utils.h"
#include "snowball/libstemmer/header.h"
#include "snowball/libstemmer/stem_ISO_8859_1_basque.h"
#include "snowball/libstemmer/stem_ISO_8859_1_catalan.h"
#include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
#include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
#include "snowball/libstemmer/stem_ISO_8859_1_english.h"
#include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
#include "snowball/libstemmer/stem_ISO_8859_1_french.h"
#include "snowball/libstemmer/stem_ISO_8859_1_german.h"
#include "snowball/libstemmer/stem_ISO_8859_1_indonesian.h"
#include "snowball/libstemmer/stem_ISO_8859_1_irish.h"
#include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
#include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
#include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
#include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
#include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
#include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
#include "snowball/libstemmer/stem_ISO_8859_2_hungarian.h"
#include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
#include "snowball/libstemmer/stem_KOI8_R_russian.h"
#include "snowball/libstemmer/stem_UTF_8_arabic.h"
#include "snowball/libstemmer/stem_UTF_8_armenian.h"
#include "snowball/libstemmer/stem_UTF_8_basque.h"
#include "snowball/libstemmer/stem_UTF_8_catalan.h"
#include "snowball/libstemmer/stem_UTF_8_danish.h"
#include "snowball/libstemmer/stem_UTF_8_dutch.h"
#include "snowball/libstemmer/stem_UTF_8_english.h"
#include "snowball/libstemmer/stem_UTF_8_finnish.h"
#include "snowball/libstemmer/stem_UTF_8_french.h"
#include "snowball/libstemmer/stem_UTF_8_german.h"
#include "snowball/libstemmer/stem_UTF_8_greek.h"
#include "snowball/libstemmer/stem_UTF_8_hindi.h"
#include "snowball/libstemmer/stem_UTF_8_hungarian.h"
#include "snowball/libstemmer/stem_UTF_8_indonesian.h"
#include "snowball/libstemmer/stem_UTF_8_irish.h"
#include "snowball/libstemmer/stem_UTF_8_italian.h"
#include "snowball/libstemmer/stem_UTF_8_lithuanian.h"
#include "snowball/libstemmer/stem_UTF_8_nepali.h"
#include "snowball/libstemmer/stem_UTF_8_norwegian.h"
#include "snowball/libstemmer/stem_UTF_8_porter.h"
#include "snowball/libstemmer/stem_UTF_8_portuguese.h"
#include "snowball/libstemmer/stem_UTF_8_romanian.h"
#include "snowball/libstemmer/stem_UTF_8_russian.h"
#include "snowball/libstemmer/stem_UTF_8_serbian.h"
#include "snowball/libstemmer/stem_UTF_8_spanish.h"
#include "snowball/libstemmer/stem_UTF_8_swedish.h"
#include "snowball/libstemmer/stem_UTF_8_tamil.h"
#include "snowball/libstemmer/stem_UTF_8_turkish.h"
#include "snowball/libstemmer/stem_UTF_8_yiddish.h"
Include dependency graph for dict_snowball.c:

Go to the source code of this file.

Data Structures

struct  stemmer_module
 
struct  DictSnowball
 

Macros

#define STEMMER_MODULE(name, enc, senc)   {#name, enc, name##_##senc##_create_env, name##_##senc##_close_env, name##_##senc##_stem}
 

Typedefs

typedef struct stemmer_module stemmer_module
 
typedef struct DictSnowball DictSnowball
 

Functions

 PG_FUNCTION_INFO_V1 (dsnowball_init)
 
 PG_FUNCTION_INFO_V1 (dsnowball_lexize)
 
static void locate_stem_module (DictSnowball *d, const char *lang)
 
Datum dsnowball_init (PG_FUNCTION_ARGS)
 
Datum dsnowball_lexize (PG_FUNCTION_ARGS)
 

Variables

 PG_MODULE_MAGIC
 
static const stemmer_module stemmer_modules []
 

Macro Definition Documentation

◆ STEMMER_MODULE

#define STEMMER_MODULE (   name,
  enc,
  senc 
)    {#name, enc, name##_##senc##_create_env, name##_##senc##_close_env, name##_##senc##_stem}

Definition at line 95 of file dict_snowball.c.

Typedef Documentation

◆ DictSnowball

typedef struct DictSnowball DictSnowball

◆ stemmer_module

typedef struct stemmer_module stemmer_module

Function Documentation

◆ dsnowball_init()

Datum dsnowball_init ( PG_FUNCTION_ARGS  )

Definition at line 220 of file dict_snowball.c.

References CurrentMemoryContext, defGetString(), DefElem::defname, DictSnowball::dictCtx, ereport, errcode(), errmsg(), ERROR, SN_env::l, lfirst, locate_stem_module(), lowerstr(), palloc0(), PG_GETARG_POINTER, PG_RETURN_POINTER, readstoplist(), DictSnowball::stem, and DictSnowball::stoplist.

221 {
222  List *dictoptions = (List *) PG_GETARG_POINTER(0);
223  DictSnowball *d;
224  bool stoploaded = false;
225  ListCell *l;
226 
227  d = (DictSnowball *) palloc0(sizeof(DictSnowball));
228 
229  foreach(l, dictoptions)
230  {
231  DefElem *defel = (DefElem *) lfirst(l);
232 
233  if (strcmp(defel->defname, "stopwords") == 0)
234  {
235  if (stoploaded)
236  ereport(ERROR,
237  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
238  errmsg("multiple StopWords parameters")));
239  readstoplist(defGetString(defel), &d->stoplist, lowerstr);
240  stoploaded = true;
241  }
242  else if (strcmp(defel->defname, "language") == 0)
243  {
244  if (d->stem)
245  ereport(ERROR,
246  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
247  errmsg("multiple Language parameters")));
248  locate_stem_module(d, defGetString(defel));
249  }
250  else
251  {
252  ereport(ERROR,
253  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
254  errmsg("unrecognized Snowball parameter: \"%s\"",
255  defel->defname)));
256  }
257  }
258 
259  if (!d->stem)
260  ereport(ERROR,
261  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
262  errmsg("missing Language parameter")));
263 
264  d->dictCtx = CurrentMemoryContext;
265 
266  PG_RETURN_POINTER(d);
267 }
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361
StopList stoplist
int errcode(int sqlerrcode)
Definition: elog.c:698
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276
char * lowerstr(const char *str)
Definition: ts_locale.c:244
int(* stem)(struct SN_env *z)
#define ERROR
Definition: elog.h:46
char * defGetString(DefElem *def)
Definition: define.c:49
MemoryContext CurrentMemoryContext
Definition: mcxt.c:42
void readstoplist(const char *fname, StopList *s, char *(*wordop)(const char *))
Definition: ts_utils.c:68
void * palloc0(Size size)
Definition: mcxt.c:1093
#define ereport(elevel,...)
Definition: elog.h:157
static void locate_stem_module(DictSnowball *d, const char *lang)
#define lfirst(lc)
Definition: pg_list.h:169
int errmsg(const char *fmt,...)
Definition: elog.c:909
MemoryContext dictCtx
char * defname
Definition: parsenodes.h:746
Definition: pg_list.h:50

◆ dsnowball_lexize()

Datum dsnowball_lexize ( PG_FUNCTION_ARGS  )

Definition at line 270 of file dict_snowball.c.

References DictSnowball::dictCtx, SN_env::l, TSLexeme::lexeme, lowerstr_with_len(), MemoryContextSwitchTo(), DictSnowball::needrecode, SN_env::p, palloc0(), pfree(), pg_any_to_server(), PG_GETARG_INT32, PG_GETARG_POINTER, PG_RETURN_POINTER, pg_server_to_any(), PG_UTF8, repalloc(), searchstoplist(), SN_set_current(), DictSnowball::stem, DictSnowball::stoplist, and DictSnowball::z.

271 {
272  DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
273  char *in = (char *) PG_GETARG_POINTER(1);
274  int32 len = PG_GETARG_INT32(2);
275  char *txt = lowerstr_with_len(in, len);
276  TSLexeme *res = palloc0(sizeof(TSLexeme) * 2);
277 
278  if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
279  {
280  pfree(txt);
281  }
282  else
283  {
284  MemoryContext saveCtx;
285 
286  /*
287  * recode to utf8 if stemmer is utf8 and doesn't match server encoding
288  */
289  if (d->needrecode)
290  {
291  char *recoded;
292 
293  recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
294  if (recoded != txt)
295  {
296  pfree(txt);
297  txt = recoded;
298  }
299  }
300 
301  /* see comment about d->dictCtx */
302  saveCtx = MemoryContextSwitchTo(d->dictCtx);
303  SN_set_current(d->z, strlen(txt), (symbol *) txt);
304  d->stem(d->z);
305  MemoryContextSwitchTo(saveCtx);
306 
307  if (d->z->p && d->z->l)
308  {
309  txt = repalloc(txt, d->z->l + 1);
310  memcpy(txt, d->z->p, d->z->l);
311  txt[d->z->l] = '\0';
312  }
313 
314  /* back recode if needed */
315  if (d->needrecode)
316  {
317  char *recoded;
318 
319  recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
320  if (recoded != txt)
321  {
322  pfree(txt);
323  txt = recoded;
324  }
325  }
326 
327  res->lexeme = txt;
328  }
329 
330  PG_RETURN_POINTER(res);
331 }
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
StopList stoplist
symbol * p
Definition: api.h:15
int SN_set_current(struct SN_env *z, int size, const symbol *s)
Definition: api.c:51
char * lowerstr_with_len(const char *str, int len)
Definition: ts_locale.c:257
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:109
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276
int(* stem)(struct SN_env *z)
char * pg_server_to_any(const char *s, int len, int encoding)
Definition: mbutils.c:749
signed int int32
Definition: c.h:429
void pfree(void *pointer)
Definition: mcxt.c:1169
char * lexeme
Definition: ts_public.h:111
void * palloc0(Size size)
Definition: mcxt.c:1093
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1182
struct SN_env * z
bool searchstoplist(StopList *s, char *key)
Definition: ts_utils.c:141
MemoryContext dictCtx
int l
Definition: api.h:16
char * pg_any_to_server(const char *s, int len, int encoding)
Definition: mbutils.c:676
unsigned char symbol
Definition: api.h:2

◆ locate_stem_module()

static void locate_stem_module ( DictSnowball *  d,
const char *  lang 
)
static

Definition at line 179 of file dict_snowball.c.

References stemmer_module::create, stemmer_module::enc, ereport, errcode(), errmsg(), ERROR, GetDatabaseEncoding(), GetDatabaseEncodingName(), stemmer_module::name, DictSnowball::needrecode, PG_SQL_ASCII, pg_strcasecmp(), PG_UTF8, stemmer_module::stem, DictSnowball::stem, and DictSnowball::z.

Referenced by dsnowball_init().

180 {
181  const stemmer_module *m;
182 
183  /*
184  * First, try to find exact match of stemmer module. Stemmer with
185  * PG_SQL_ASCII encoding is treated as working with any server encoding
186  */
187  for (m = stemmer_modules; m->name; m++)
188  {
189  if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
190  pg_strcasecmp(m->name, lang) == 0)
191  {
192  d->stem = m->stem;
193  d->z = m->create();
194  d->needrecode = false;
195  return;
196  }
197  }
198 
199  /*
200  * Second, try to find stemmer for needed language for UTF8 encoding.
201  */
202  for (m = stemmer_modules; m->name; m++)
203  {
204  if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
205  {
206  d->stem = m->stem;
207  d->z = m->create();
208  d->needrecode = true;
209  return;
210  }
211  }
212 
213  ereport(ERROR,
214  (errcode(ERRCODE_UNDEFINED_OBJECT),
215  errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
216  lang, GetDatabaseEncodingName())));
217 }
int errcode(int sqlerrcode)
Definition: elog.c:698
int(* stem)(struct SN_env *z)
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
#define ERROR
Definition: elog.h:46
struct SN_env *(* create)(void)
Definition: dict_snowball.c:89
int GetDatabaseEncoding(void)
Definition: mbutils.c:1210
#define ereport(elevel,...)
Definition: elog.h:157
const char * GetDatabaseEncodingName(void)
Definition: mbutils.c:1216
static const stemmer_module stemmer_modules[]
Definition: dict_snowball.c:98
int(* stem)(struct SN_env *)
Definition: dict_snowball.c:91
struct SN_env * z
int errmsg(const char *fmt,...)
Definition: elog.c:909
const char * name
Definition: dict_snowball.c:87

◆ PG_FUNCTION_INFO_V1() [1/2]

PG_FUNCTION_INFO_V1 ( dsnowball_init  )

◆ PG_FUNCTION_INFO_V1() [2/2]

PG_FUNCTION_INFO_V1 ( dsnowball_lexize  )

Variable Documentation

◆ PG_MODULE_MAGIC

PG_MODULE_MAGIC

Definition at line 78 of file dict_snowball.c.

◆ stemmer_modules

const stemmer_module stemmer_modules[]
static

Definition at line 98 of file dict_snowball.c.