PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
spell.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * spell.c
4  * Normalizing word with ISpell
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  *
8  * Ispell dictionary
9  * -----------------
10  *
11  * Rules of dictionaries are defined in two files with .affix and .dict
12  * extensions. They are used by spell checker programs Ispell and Hunspell.
13  *
14  * An .affix file declares morphological rules to get a basic form of words.
15  * The format of an .affix file has different structure for Ispell and Hunspell
16  * dictionaries. The Hunspell format is more complicated. But when an .affix
17  * file is imported and compiled, it is stored in the same structure AffixNode.
18  *
19  * A .dict file stores a list of basic forms of words with references to
20  * affix rules. The format of a .dict file has the same structure for Ispell
21  * and Hunspell dictionaries.
22  *
23  * Compilation of a dictionary
24  * ---------------------------
25  *
26  * A compiled dictionary is stored in the IspellDict structure. Compilation of
27  * a dictionary is divided into several steps:
28  * - NIImportDictionary() - stores each word of a .dict file in the
29  * temporary Spell field.
30  * - NIImportAffixes() - stores affix rules of an .affix file in the
31  * Affix field (not temporary) if an .affix file has the Ispell format.
32  * -> NIImportOOAffixes() - stores affix rules if an .affix file has the
33  * Hunspell format. The AffixData field is initialized if AF parameter
34  * is defined.
35  * - NISortDictionary() - builds a prefix tree (Trie) from the words list
36  * and stores it in the Dictionary field. The words list is got from the
37  * Spell field. The AffixData field is initialized if AF parameter is not
38  * defined.
39  * - NISortAffixes():
40  * - builds a list of compound affixes from the affix list and stores it
41  * in the CompoundAffix.
42  * - builds prefix trees (Trie) from the affix list for prefixes and suffixes
43  * and stores them in Suffix and Prefix fields.
44  * The affix list is got from the Affix field.
45  *
46  * Memory management
47  * -----------------
48  *
49  * The IspellDict structure has the Spell field which is used only in compile
50  * time. The Spell field stores a words list. It can take a lot of memory.
51  * Therefore when a dictionary is compiled this field is cleared by
52  * NIFinishBuild().
53  *
54  * All resources which should be cleared by NIFinishBuild() are allocated using
55  * tmpalloc() and tmpalloc0().
56  *
57  * IDENTIFICATION
58  * src/backend/tsearch/spell.c
59  *
60  *-------------------------------------------------------------------------
61  */
62 
63 #include "postgres.h"
64 
65 #include "catalog/pg_collation.h"
66 #include "tsearch/dicts/spell.h"
67 #include "tsearch/ts_locale.h"
68 #include "utils/memutils.h"
69 
70 
71 /*
72  * Initialization requires a lot of memory that's not needed
73  * after the initialization is done. During initialization,
74  * CurrentMemoryContext is the long-lived memory context associated
75  * with the dictionary cache entry. We keep the short-lived stuff
76  * in the Conf->buildCxt context.
77  */
78 #define tmpalloc(sz) MemoryContextAlloc(Conf->buildCxt, (sz))
79 #define tmpalloc0(sz) MemoryContextAllocZero(Conf->buildCxt, (sz))
80 
81 /*
82  * Prepare for constructing an ISpell dictionary.
83  *
84  * The IspellDict struct is assumed to be zeroed when allocated.
85  */
86 void
88 {
89  /*
90  * The temp context is a child of CurTransactionContext, so that it will
91  * go away automatically on error.
92  */
94  "Ispell dictionary init context",
96 }
97 
98 /*
99  * Clean up when dictionary construction is complete.
100  */
101 void
103 {
104  /* Release no-longer-needed temp memory */
106  /* Just for cleanliness, zero the now-dangling pointers */
107  Conf->buildCxt = NULL;
108  Conf->Spell = NULL;
109  Conf->firstfree = NULL;
110  Conf->CompoundAffixFlags = NULL;
111 }
112 
113 
114 /*
115  * "Compact" palloc: allocate without extra palloc overhead.
116  *
117  * Since we have no need to free the ispell data items individually, there's
118  * not much value in the per-chunk overhead normally consumed by palloc.
119  * Getting rid of it is helpful since ispell can allocate a lot of small nodes.
120  *
121  * We currently pre-zero all data allocated this way, even though some of it
122  * doesn't need that. The cpalloc and cpalloc0 macros are just documentation
123  * to indicate which allocations actually require zeroing.
124  */
125 #define COMPACT_ALLOC_CHUNK 8192 /* amount to get from palloc at once */
126 #define COMPACT_MAX_REQ 1024 /* must be < COMPACT_ALLOC_CHUNK */
127 
128 static void *
129 compact_palloc0(IspellDict *Conf, size_t size)
130 {
131  void *result;
132 
133  /* Should only be called during init */
134  Assert(Conf->buildCxt != NULL);
135 
136  /* No point in this for large chunks */
137  if (size > COMPACT_MAX_REQ)
138  return palloc0(size);
139 
140  /* Keep everything maxaligned */
141  size = MAXALIGN(size);
142 
143  /* Need more space? */
144  if (size > Conf->avail)
145  {
147  Conf->avail = COMPACT_ALLOC_CHUNK;
148  }
149 
150  result = (void *) Conf->firstfree;
151  Conf->firstfree += size;
152  Conf->avail -= size;
153 
154  return result;
155 }
156 
157 #define cpalloc(size) compact_palloc0(Conf, size)
158 #define cpalloc0(size) compact_palloc0(Conf, size)
159 
160 static char *
161 cpstrdup(IspellDict *Conf, const char *str)
162 {
163  char *res = cpalloc(strlen(str) + 1);
164 
165  strcpy(res, str);
166  return res;
167 }
168 
169 
170 /*
171  * Apply lowerstr(), producing a temporary result (in the buildCxt).
172  */
173 static char *
174 lowerstr_ctx(IspellDict *Conf, const char *src)
175 {
176  MemoryContext saveCtx;
177  char *dst;
178 
179  saveCtx = MemoryContextSwitchTo(Conf->buildCxt);
180  dst = lowerstr(src);
181  MemoryContextSwitchTo(saveCtx);
182 
183  return dst;
184 }
185 
186 #define MAX_NORM 1024
187 #define MAXNORMLEN 256
188 
189 #define STRNCMP(s,p) strncmp( (s), (p), strlen(p) )
190 #define GETWCHAR(W,L,N,T) ( ((const uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] )
191 #define GETCHAR(A,N,T) GETWCHAR( (A)->repl, (A)->replen, N, T )
192 
193 static char *VoidString = "";
194 
195 static int
196 cmpspell(const void *s1, const void *s2)
197 {
198  return strcmp((*(SPELL *const *) s1)->word, (*(SPELL *const *) s2)->word);
199 }
200 
201 static int
202 cmpspellaffix(const void *s1, const void *s2)
203 {
204  return strcmp((*(SPELL *const *) s1)->p.flag,
205  (*(SPELL *const *) s2)->p.flag);
206 }
207 
208 static int
209 cmpcmdflag(const void *f1, const void *f2)
210 {
211  CompoundAffixFlag *fv1 = (CompoundAffixFlag *) f1,
212  *fv2 = (CompoundAffixFlag *) f2;
213 
214  Assert(fv1->flagMode == fv2->flagMode);
215 
216  if (fv1->flagMode == FM_NUM)
217  {
218  if (fv1->flag.i == fv2->flag.i)
219  return 0;
220 
221  return (fv1->flag.i > fv2->flag.i) ? 1 : -1;
222  }
223 
224  return strcmp(fv1->flag.s, fv2->flag.s);
225 }
226 
/*
 * Scan str one (possibly multibyte) character at a time and return a
 * pointer to the first character equal to c, or NULL if there is none.
 */
static char *
findchar(char *str, int c)
{
	char	   *cur;

	for (cur = str; *cur != '\0'; cur += pg_mblen(cur))
	{
		if (t_iseq(cur, c))
			return cur;
	}

	return NULL;
}
239 
/*
 * Like findchar(), but stops at the first character equal to either c1
 * or c2.  Returns NULL if neither occurs in str.
 */
static char *
findchar2(char *str, int c1, int c2)
{
	char	   *cur;

	for (cur = str; *cur != '\0'; cur += pg_mblen(cur))
	{
		if (t_iseq(cur, c1) || t_iseq(cur, c2))
			return cur;
	}

	return NULL;
}
252 
253 
/*
 * Backward string compare for suffix tree operations.
 *
 * Compares s1 and s2 byte by byte starting from their last bytes.  Returns
 * -1, 0 or 1.  If one string is a proper suffix of the other, the shorter
 * string sorts first.
 */
static int
strbcmp(const unsigned char *s1, const unsigned char *s2)
{
	int			i1 = strlen((const char *) s1) - 1;
	int			i2 = strlen((const char *) s2) - 1;

	for (; i1 >= 0 && i2 >= 0; i1--, i2--)
	{
		if (s1[i1] != s2[i2])
			return (s1[i1] < s2[i2]) ? -1 : 1;
	}

	/* At least one string is exhausted; whichever ran out first is smaller */
	if (i1 == i2)
		return 0;

	return (i1 < i2) ? -1 : 1;
}
277 
/*
 * Backward string compare limited to at most "count" trailing bytes.
 *
 * Works like strbcmp(), but reports equality (0) as soon as "count" bytes
 * have been compared without finding a difference.
 */
static int
strbncmp(const unsigned char *s1, const unsigned char *s2, size_t count)
{
	int			i1 = strlen((const char *) s1) - 1;
	int			i2 = strlen((const char *) s2) - 1;
	int			remaining = count;

	while (i1 >= 0 && i2 >= 0 && remaining > 0)
	{
		if (s1[i1] != s2[i2])
			return (s1[i1] < s2[i2]) ? -1 : 1;

		i1--;
		i2--;
		remaining--;
	}

	/* Requested number of bytes matched, or both strings ran out together */
	if (remaining == 0 || i1 == i2)
		return 0;

	return (i1 < i2) ? -1 : 1;
}
303 
304 /*
305  * Compares affixes.
306  * First compares the type of an affix. Prefixes should go before affixes.
307  * If types are equal then compares replaceable string.
308  */
309 static int
310 cmpaffix(const void *s1, const void *s2)
311 {
312  const AFFIX *a1 = (const AFFIX *) s1;
313  const AFFIX *a2 = (const AFFIX *) s2;
314 
315  if (a1->type < a2->type)
316  return -1;
317  if (a1->type > a2->type)
318  return 1;
319  if (a1->type == FF_PREFIX)
320  return strcmp(a1->repl, a2->repl);
321  else
322  return strbcmp((const unsigned char *) a1->repl,
323  (const unsigned char *) a2->repl);
324 }
325 
326 /*
327  * Gets an affix flag from the set of affix flags (sflagset).
328  *
329  * Several flags can be stored in a single string. Flags can be represented by:
330  * - 1 character (FM_CHAR). A character may be Unicode.
331  * - 2 characters (FM_LONG). A character may be Unicode.
332  * - numbers from 1 to 65000 (FM_NUM).
333  *
334  * Depending on the flagMode an affix string can have the following format:
335  * - FM_CHAR: ABCD
336  * Here we have 4 flags: A, B, C and D
337  * - FM_LONG: ABCDE*
338  * Here we have 3 flags: AB, CD and E*
339  * - FM_NUM: 200,205,50
340  * Here we have 3 flags: 200, 205 and 50
341  *
342  * Conf: current dictionary.
343  * sflagset: the set of affix flags. Returns a reference to the start of a next
344  * affix flag.
345  * sflag: returns an affix flag from sflagset.
346  */
347 static void
348 getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag)
349 {
350  int32 s;
351  char *next,
352  *sbuf = *sflagset;
353  int maxstep;
354  bool stop = false;
355  bool met_comma = false;
356 
357  maxstep = (Conf->flagMode == FM_LONG) ? 2 : 1;
358 
359  while (**sflagset)
360  {
361  switch (Conf->flagMode)
362  {
363  case FM_LONG:
364  case FM_CHAR:
365  COPYCHAR(sflag, *sflagset);
366  sflag += pg_mblen(*sflagset);
367 
368  /* Go to start of the next flag */
369  *sflagset += pg_mblen(*sflagset);
370 
371  /* Check if we get all characters of flag */
372  maxstep--;
373  stop = (maxstep == 0);
374  break;
375  case FM_NUM:
376  s = strtol(*sflagset, &next, 10);
377  if (*sflagset == next || errno == ERANGE)
378  ereport(ERROR,
379  (errcode(ERRCODE_CONFIG_FILE_ERROR),
380  errmsg("invalid affix flag \"%s\"", *sflagset)));
381  if (s < 0 || s > FLAGNUM_MAXSIZE)
382  ereport(ERROR,
383  (errcode(ERRCODE_CONFIG_FILE_ERROR),
384  errmsg("affix flag \"%s\" is out of range",
385  *sflagset)));
386  sflag += sprintf(sflag, "%0d", s);
387 
388  /* Go to start of the next flag */
389  *sflagset = next;
390  while (**sflagset)
391  {
392  if (t_isdigit(*sflagset))
393  {
394  if (!met_comma)
395  ereport(ERROR,
396  (errcode(ERRCODE_CONFIG_FILE_ERROR),
397  errmsg("invalid affix flag \"%s\"",
398  *sflagset)));
399  break;
400  }
401  else if (t_iseq(*sflagset, ','))
402  {
403  if (met_comma)
404  ereport(ERROR,
405  (errcode(ERRCODE_CONFIG_FILE_ERROR),
406  errmsg("invalid affix flag \"%s\"",
407  *sflagset)));
408  met_comma = true;
409  }
410  else if (!t_isspace(*sflagset))
411  {
412  ereport(ERROR,
413  (errcode(ERRCODE_CONFIG_FILE_ERROR),
414  errmsg("invalid character in affix flag \"%s\"",
415  *sflagset)));
416  }
417 
418  *sflagset += pg_mblen(*sflagset);
419  }
420  stop = true;
421  break;
422  default:
423  elog(ERROR, "unrecognized type of Conf->flagMode: %d",
424  Conf->flagMode);
425  }
426 
427  if (stop)
428  break;
429  }
430 
431  if (Conf->flagMode == FM_LONG && maxstep > 0)
432  ereport(ERROR,
433  (errcode(ERRCODE_CONFIG_FILE_ERROR),
434  errmsg("invalid affix flag \"%s\" with \"long\" flag value",
435  sbuf)));
436 
437  *sflag = '\0';
438 }
439 
440 /*
441  * Checks if the affix set Conf->AffixData[affix] contains affixflag.
442  * Conf->AffixData[affix] does not contain affixflag if this flag is not used
443  * actually by the .dict file.
444  *
445  * Conf: current dictionary.
446  * affix: index of the Conf->AffixData array.
447  * affixflag: the affix flag.
448  *
449  * Returns true if the string Conf->AffixData[affix] contains affixflag,
450  * otherwise returns false.
451  */
452 static bool
453 IsAffixFlagInUse(IspellDict *Conf, int affix, char *affixflag)
454 {
455  char *flagcur;
456  char flag[BUFSIZ];
457 
458  if (*affixflag == 0)
459  return true;
460 
461  flagcur = Conf->AffixData[affix];
462 
463  while (*flagcur)
464  {
465  getNextFlagFromString(Conf, &flagcur, flag);
466  /* Compare first affix flag in flagcur with affixflag */
467  if (strcmp(flag, affixflag) == 0)
468  return true;
469  }
470 
471  /* Could not find affixflag */
472  return false;
473 }
474 
475 /*
476  * Adds the new word into the temporary array Spell.
477  *
478  * Conf: current dictionary.
479  * word: new word.
480  * flag: set of affix flags. Single flag can be get by getNextFlagFromString().
481  */
482 static void
483 NIAddSpell(IspellDict *Conf, const char *word, const char *flag)
484 {
485  if (Conf->nspell >= Conf->mspell)
486  {
487  if (Conf->mspell)
488  {
489  Conf->mspell *= 2;
490  Conf->Spell = (SPELL **) repalloc(Conf->Spell, Conf->mspell * sizeof(SPELL *));
491  }
492  else
493  {
494  Conf->mspell = 1024 * 20;
495  Conf->Spell = (SPELL **) tmpalloc(Conf->mspell * sizeof(SPELL *));
496  }
497  }
498  Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
499  strcpy(Conf->Spell[Conf->nspell]->word, word);
500  Conf->Spell[Conf->nspell]->p.flag = (*flag != '\0')
501  ? cpstrdup(Conf, flag) : VoidString;
502  Conf->nspell++;
503 }
504 
505 /*
506  * Imports dictionary into the temporary array Spell.
507  *
508  * Note caller must already have applied get_tsearch_config_filename.
509  *
510  * Conf: current dictionary.
511  * filename: path to the .dict file.
512  */
513 void
515 {
517  char *line;
518 
519  if (!tsearch_readline_begin(&trst, filename))
520  ereport(ERROR,
521  (errcode(ERRCODE_CONFIG_FILE_ERROR),
522  errmsg("could not open dictionary file \"%s\": %m",
523  filename)));
524 
525  while ((line = tsearch_readline(&trst)) != NULL)
526  {
527  char *s,
528  *pstr;
529 
530  /* Set of affix flags */
531  const char *flag;
532 
533  /* Extract flag from the line */
534  flag = NULL;
535  if ((s = findchar(line, '/')))
536  {
537  *s++ = '\0';
538  flag = s;
539  while (*s)
540  {
541  /* we allow only single encoded flags for faster works */
542  if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s))
543  s++;
544  else
545  {
546  *s = '\0';
547  break;
548  }
549  }
550  }
551  else
552  flag = "";
553 
554  /* Remove trailing spaces */
555  s = line;
556  while (*s)
557  {
558  if (t_isspace(s))
559  {
560  *s = '\0';
561  break;
562  }
563  s += pg_mblen(s);
564  }
565  pstr = lowerstr_ctx(Conf, line);
566 
567  NIAddSpell(Conf, pstr, flag);
568  pfree(pstr);
569 
570  pfree(line);
571  }
572  tsearch_readline_end(&trst);
573 }
574 
575 /*
576  * Searches a basic form of word in the prefix tree. This word was generated
577  * using an affix rule. This rule may not be presented in an affix set of
578  * a basic form of word.
579  *
580  * For example, we have the entry in the .dict file:
581  * meter/GMD
582  *
583  * The affix rule with the flag S:
584  * SFX S y ies [^aeiou]y
585  * is not presented here.
586  *
587  * The affix rule with the flag M:
588  * SFX M 0 's .
589  * is presented here.
590  *
591  * Conf: current dictionary.
592  * word: basic form of word.
593  * affixflag: affix flag, by which a basic form of word was generated.
594  * flag: compound flag used to compare with StopMiddle->compoundflag.
595  *
596  * Returns 1 if the word was found in the prefix tree, else returns 0.
597  */
598 static int
599 FindWord(IspellDict *Conf, const char *word, char *affixflag, int flag)
600 {
601  SPNode *node = Conf->Dictionary;
602  SPNodeData *StopLow,
603  *StopHigh,
604  *StopMiddle;
605  const uint8 *ptr = (const uint8 *) word;
606 
607  flag &= FF_COMPOUNDFLAGMASK;
608 
609  while (node && *ptr)
610  {
611  StopLow = node->data;
612  StopHigh = node->data + node->length;
613  while (StopLow < StopHigh)
614  {
615  StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
616  if (StopMiddle->val == *ptr)
617  {
618  if (*(ptr + 1) == '\0' && StopMiddle->isword)
619  {
620  if (flag == 0)
621  {
622  /*
623  * The word can be formed only with another word. And
624  * in the flag parameter there is not a sign that we
625  * search compound words.
626  */
627  if (StopMiddle->compoundflag & FF_COMPOUNDONLY)
628  return 0;
629  }
630  else if ((flag & StopMiddle->compoundflag) == 0)
631  return 0;
632 
633  /*
634  * Check if this affix rule is presented in the affix set
635  * with index StopMiddle->affix.
636  */
637  if (IsAffixFlagInUse(Conf, StopMiddle->affix, affixflag))
638  return 1;
639  }
640  node = StopMiddle->node;
641  ptr++;
642  break;
643  }
644  else if (StopMiddle->val < *ptr)
645  StopLow = StopMiddle + 1;
646  else
647  StopHigh = StopMiddle;
648  }
649  if (StopLow >= StopHigh)
650  break;
651  }
652  return 0;
653 }
654 
655 /*
656  * Adds a new affix rule to the Affix field.
657  *
658  * Conf: current dictionary.
659  * flag: affix flag ('\' in the below example).
660  * flagflags: set of flags from the flagval field for this affix rule. This set
661  * is listed after '/' character in the added string (repl).
662  *
663  * For example L flag in the hunspell_sample.affix:
664  * SFX \ 0 Y/L [^Y]
665  *
666  * mask: condition for search ('[^Y]' in the above example).
667  * find: stripping characters from beginning (at prefix) or end (at suffix)
668  * of the word ('0' in the above example, 0 means that there is not
669  * stripping character).
670  * repl: adding string after stripping ('Y' in the above example).
671  * type: FF_SUFFIX or FF_PREFIX.
672  */
673 static void
674 NIAddAffix(IspellDict *Conf, const char *flag, char flagflags, const char *mask,
675  const char *find, const char *repl, int type)
676 {
677  AFFIX *Affix;
678 
679  if (Conf->naffixes >= Conf->maffixes)
680  {
681  if (Conf->maffixes)
682  {
683  Conf->maffixes *= 2;
684  Conf->Affix = (AFFIX *) repalloc((void *) Conf->Affix, Conf->maffixes * sizeof(AFFIX));
685  }
686  else
687  {
688  Conf->maffixes = 16;
689  Conf->Affix = (AFFIX *) palloc(Conf->maffixes * sizeof(AFFIX));
690  }
691  }
692 
693  Affix = Conf->Affix + Conf->naffixes;
694 
695  /* This affix rule can be applied for words with any ending */
696  if (strcmp(mask, ".") == 0 || *mask == '\0')
697  {
698  Affix->issimple = 1;
699  Affix->isregis = 0;
700  }
701  /* This affix rule will use regis to search word ending */
702  else if (RS_isRegis(mask))
703  {
704  Affix->issimple = 0;
705  Affix->isregis = 1;
706  RS_compile(&(Affix->reg.regis), (type == FF_SUFFIX),
707  *mask ? mask : VoidString);
708  }
709  /* This affix rule will use regex_t to search word ending */
710  else
711  {
712  int masklen;
713  int wmasklen;
714  int err;
715  pg_wchar *wmask;
716  char *tmask;
717 
718  Affix->issimple = 0;
719  Affix->isregis = 0;
720  tmask = (char *) tmpalloc(strlen(mask) + 3);
721  if (type == FF_SUFFIX)
722  sprintf(tmask, "%s$", mask);
723  else
724  sprintf(tmask, "^%s", mask);
725 
726  masklen = strlen(tmask);
727  wmask = (pg_wchar *) tmpalloc((masklen + 1) * sizeof(pg_wchar));
728  wmasklen = pg_mb2wchar_with_len(tmask, wmask, masklen);
729 
730  err = pg_regcomp(&(Affix->reg.regex), wmask, wmasklen,
733  if (err)
734  {
735  char errstr[100];
736 
737  pg_regerror(err, &(Affix->reg.regex), errstr, sizeof(errstr));
738  ereport(ERROR,
739  (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
740  errmsg("invalid regular expression: %s", errstr)));
741  }
742  }
743 
744  Affix->flagflags = flagflags;
745  if ((Affix->flagflags & FF_COMPOUNDONLY) || (Affix->flagflags & FF_COMPOUNDPERMITFLAG))
746  {
747  if ((Affix->flagflags & FF_COMPOUNDFLAG) == 0)
748  Affix->flagflags |= FF_COMPOUNDFLAG;
749  }
750  Affix->flag = cpstrdup(Conf, flag);
751  Affix->type = type;
752 
753  Affix->find = (find && *find) ? cpstrdup(Conf, find) : VoidString;
754  if ((Affix->replen = strlen(repl)) > 0)
755  Affix->repl = cpstrdup(Conf, repl);
756  else
757  Affix->repl = VoidString;
758  Conf->naffixes++;
759 }
760 
761 /* Parsing states for parse_affentry() and friends */
762 #define PAE_WAIT_MASK 0
763 #define PAE_INMASK 1
764 #define PAE_WAIT_FIND 2
765 #define PAE_INFIND 3
766 #define PAE_WAIT_REPL 4
767 #define PAE_INREPL 5
768 #define PAE_WAIT_TYPE 6
769 #define PAE_WAIT_FLAG 7
770 
/*
 * Parse next space-separated field of an .affix file line.
 *
 * *str is the input pointer; it is advanced over leading whitespace and the
 * field itself, and is left at the whitespace character that ended the field
 * (or at the end of the string).
 * next is where to copy the field value to, with null termination.
 *
 * The buffer at "next" must be of size BUFSIZ; characters that do not fit
 * are dropped.
 *
 * Returns true if we found a field.  Returns false if the input ends before
 * any field character is seen, or if a '#' is met before the field starts;
 * in the '#' case "next" is left unmodified.
 */
static bool
get_nextfield(char **str, char *next)
{
	int			room = BUFSIZ;
	bool		in_field = false;

	for (; **str != '\0'; *str += pg_mblen(*str))
	{
		int			clen;

		if (!in_field)
		{
			/* comment starts before any field: nothing more on this line */
			if (t_iseq(*str, '#'))
				return false;
			/* skip leading whitespace */
			if (t_isspace(*str))
				continue;
			in_field = true;
		}
		else if (t_isspace(*str))
		{
			/* whitespace terminates the field */
			*next = '\0';
			return true;
		}

		/* copy the character only if it, plus a terminator, still fits */
		clen = pg_mblen(*str);
		if (clen < room)
		{
			COPYCHAR(next, *str);
			next += clen;
			room -= clen;
		}
	}

	*next = '\0';

	return in_field;			/* OK if we got a nonempty field */
}
832 
/*
 * Parses entry of an .affix file of MySpell or Hunspell format.
 *
 * An .affix file entry has the following format:
 * - header
 *	 <type>  <flag>  <cross_flag>  <flag_count>
 * - fields after header:
 *	 <type>  <flag>  <find>  <replace>	<mask>
 *
 * str is the input line
 * field values are returned to type etc, which must be buffers of size BUFSIZ.
 *
 * Returns number of fields found; any omitted fields are set to empty strings.
 */
static int
parse_ooaffentry(char *str, char *type, char *flag, char *find,
				 char *repl, char *mask)
{
	/* Output buffers in the order the fields appear on the line */
	char	   *dest[] = {type, flag, find, repl, mask};
	const int	nfields = (int) (sizeof(dest) / sizeof(dest[0]));
	int			fields_read = 0;

	*type = *flag = *find = *repl = *mask = '\0';

	while (*str && fields_read < nfields)
	{
		/* stop at the first missing field (early end of line or comment) */
		if (!get_nextfield(&str, dest[fields_read]))
			break;
		fields_read++;
	}

	return fields_read;
}
896 
897 /*
898  * Parses entry of an .affix file of Ispell format
899  *
900  * An .affix file entry has the following format:
901  * <mask> > [-<find>,]<replace>
902  */
903 static bool
904 parse_affentry(char *str, char *mask, char *find, char *repl)
905 {
906  int state = PAE_WAIT_MASK;
907  char *pmask = mask,
908  *pfind = find,
909  *prepl = repl;
910 
911  *mask = *find = *repl = '\0';
912 
913  while (*str)
914  {
915  if (state == PAE_WAIT_MASK)
916  {
917  if (t_iseq(str, '#'))
918  return false;
919  else if (!t_isspace(str))
920  {
921  COPYCHAR(pmask, str);
922  pmask += pg_mblen(str);
923  state = PAE_INMASK;
924  }
925  }
926  else if (state == PAE_INMASK)
927  {
928  if (t_iseq(str, '>'))
929  {
930  *pmask = '\0';
931  state = PAE_WAIT_FIND;
932  }
933  else if (!t_isspace(str))
934  {
935  COPYCHAR(pmask, str);
936  pmask += pg_mblen(str);
937  }
938  }
939  else if (state == PAE_WAIT_FIND)
940  {
941  if (t_iseq(str, '-'))
942  {
943  state = PAE_INFIND;
944  }
945  else if (t_isalpha(str) || t_iseq(str, '\'') /* english 's */ )
946  {
947  COPYCHAR(prepl, str);
948  prepl += pg_mblen(str);
949  state = PAE_INREPL;
950  }
951  else if (!t_isspace(str))
952  ereport(ERROR,
953  (errcode(ERRCODE_CONFIG_FILE_ERROR),
954  errmsg("syntax error")));
955  }
956  else if (state == PAE_INFIND)
957  {
958  if (t_iseq(str, ','))
959  {
960  *pfind = '\0';
961  state = PAE_WAIT_REPL;
962  }
963  else if (t_isalpha(str))
964  {
965  COPYCHAR(pfind, str);
966  pfind += pg_mblen(str);
967  }
968  else if (!t_isspace(str))
969  ereport(ERROR,
970  (errcode(ERRCODE_CONFIG_FILE_ERROR),
971  errmsg("syntax error")));
972  }
973  else if (state == PAE_WAIT_REPL)
974  {
975  if (t_iseq(str, '-'))
976  {
977  break; /* void repl */
978  }
979  else if (t_isalpha(str))
980  {
981  COPYCHAR(prepl, str);
982  prepl += pg_mblen(str);
983  state = PAE_INREPL;
984  }
985  else if (!t_isspace(str))
986  ereport(ERROR,
987  (errcode(ERRCODE_CONFIG_FILE_ERROR),
988  errmsg("syntax error")));
989  }
990  else if (state == PAE_INREPL)
991  {
992  if (t_iseq(str, '#'))
993  {
994  *prepl = '\0';
995  break;
996  }
997  else if (t_isalpha(str))
998  {
999  COPYCHAR(prepl, str);
1000  prepl += pg_mblen(str);
1001  }
1002  else if (!t_isspace(str))
1003  ereport(ERROR,
1004  (errcode(ERRCODE_CONFIG_FILE_ERROR),
1005  errmsg("syntax error")));
1006  }
1007  else
1008  elog(ERROR, "unrecognized state in parse_affentry: %d", state);
1009 
1010  str += pg_mblen(str);
1011  }
1012 
1013  *pmask = *pfind = *prepl = '\0';
1014 
1015  return (*mask && (*find || *repl));
1016 }
1017 
1018 /*
1019  * Sets a Hunspell options depending on flag type.
1020  */
1021 static void
1023  char *s, uint32 val)
1024 {
1025  if (Conf->flagMode == FM_NUM)
1026  {
1027  char *next;
1028  int i;
1029 
1030  i = strtol(s, &next, 10);
1031  if (s == next || errno == ERANGE)
1032  ereport(ERROR,
1033  (errcode(ERRCODE_CONFIG_FILE_ERROR),
1034  errmsg("invalid affix flag \"%s\"", s)));
1035  if (i < 0 || i > FLAGNUM_MAXSIZE)
1036  ereport(ERROR,
1037  (errcode(ERRCODE_CONFIG_FILE_ERROR),
1038  errmsg("affix flag \"%s\" is out of range", s)));
1039 
1040  entry->flag.i = i;
1041  }
1042  else
1043  entry->flag.s = cpstrdup(Conf, s);
1044 
1045  entry->flagMode = Conf->flagMode;
1046  entry->value = val;
1047 }
1048 
1049 /*
1050  * Sets up a correspondence for the affix parameter with the affix flag.
1051  *
1052  * Conf: current dictionary.
1053  * s: affix flag in string.
1054  * val: affix parameter.
1055  */
1056 static void
1058 {
1059  CompoundAffixFlag *newValue;
1060  char sbuf[BUFSIZ];
1061  char *sflag;
1062  int clen;
1063 
1064  while (*s && t_isspace(s))
1065  s += pg_mblen(s);
1066 
1067  if (!*s)
1068  ereport(ERROR,
1069  (errcode(ERRCODE_CONFIG_FILE_ERROR),
1070  errmsg("syntax error")));
1071 
1072  /* Get flag without \n */
1073  sflag = sbuf;
1074  while (*s && !t_isspace(s) && *s != '\n')
1075  {
1076  clen = pg_mblen(s);
1077  COPYCHAR(sflag, s);
1078  sflag += clen;
1079  s += clen;
1080  }
1081  *sflag = '\0';
1082 
1083  /* Resize array or allocate memory for array CompoundAffixFlag */
1084  if (Conf->nCompoundAffixFlag >= Conf->mCompoundAffixFlag)
1085  {
1086  if (Conf->mCompoundAffixFlag)
1087  {
1088  Conf->mCompoundAffixFlag *= 2;
1090  repalloc((void *) Conf->CompoundAffixFlags,
1091  Conf->mCompoundAffixFlag * sizeof(CompoundAffixFlag));
1092  }
1093  else
1094  {
1095  Conf->mCompoundAffixFlag = 10;
1098  }
1099  }
1100 
1101  newValue = Conf->CompoundAffixFlags + Conf->nCompoundAffixFlag;
1102 
1103  setCompoundAffixFlagValue(Conf, newValue, sbuf, val);
1104 
1105  Conf->usecompound = true;
1106  Conf->nCompoundAffixFlag++;
1107 }
1108 
1109 /*
1110  * Returns a set of affix parameters which correspondence to the set of affix
1111  * flags s.
1112  */
1113 static int
1115 {
1116  uint32 flag = 0;
1117  CompoundAffixFlag *found,
1118  key;
1119  char sflag[BUFSIZ];
1120  char *flagcur;
1121 
1122  if (Conf->nCompoundAffixFlag == 0)
1123  return 0;
1124 
1125  flagcur = s;
1126  while (*flagcur)
1127  {
1128  getNextFlagFromString(Conf, &flagcur, sflag);
1129  setCompoundAffixFlagValue(Conf, &key, sflag, 0);
1130 
1131  found = (CompoundAffixFlag *)
1132  bsearch(&key, (void *) Conf->CompoundAffixFlags,
1133  Conf->nCompoundAffixFlag, sizeof(CompoundAffixFlag),
1134  cmpcmdflag);
1135  if (found != NULL)
1136  flag |= found->value;
1137  }
1138 
1139  return flag;
1140 }
1141 
1142 /*
1143  * Returns a flag set using the s parameter.
1144  *
1145  * If Conf->useFlagAliases is true then the s parameter is index of the
1146  * Conf->AffixData array and function returns its entry.
1147  * Else function returns the s parameter.
1148  */
1149 static char *
1151 {
1152  if (Conf->useFlagAliases && *s != '\0')
1153  {
1154  int curaffix;
1155  char *end;
1156 
1157  curaffix = strtol(s, &end, 10);
1158  if (s == end || errno == ERANGE)
1159  ereport(ERROR,
1160  (errcode(ERRCODE_CONFIG_FILE_ERROR),
1161  errmsg("invalid affix alias \"%s\"", s)));
1162 
1163  if (curaffix > 0 && curaffix <= Conf->nAffixData)
1164 
1165  /*
1166  * Do not subtract 1 from curaffix because empty string was added
1167  * in NIImportOOAffixes
1168  */
1169  return Conf->AffixData[curaffix];
1170  else
1171  return VoidString;
1172  }
1173  else
1174  return s;
1175 }
1176 
1177 /*
1178  * Import an affix file that follows MySpell or Hunspell format.
1179  *
1180  * Conf: current dictionary.
1181  * filename: path to the .affix file.
1182  */
1183 static void
1185 {
1186  char type[BUFSIZ],
1187  *ptype = NULL;
1188  char sflag[BUFSIZ];
1189  char mask[BUFSIZ],
1190  *pmask;
1191  char find[BUFSIZ],
1192  *pfind;
1193  char repl[BUFSIZ],
1194  *prepl;
1195  bool isSuffix = false;
1196  int naffix = 0,
1197  curaffix = 0;
1198  int sflaglen = 0;
1199  char flagflags = 0;
1201  char *recoded;
1202 
1203  /* read file to find any flag */
1204  Conf->usecompound = false;
1205  Conf->useFlagAliases = false;
1206  Conf->flagMode = FM_CHAR;
1207 
1208  if (!tsearch_readline_begin(&trst, filename))
1209  ereport(ERROR,
1210  (errcode(ERRCODE_CONFIG_FILE_ERROR),
1211  errmsg("could not open affix file \"%s\": %m",
1212  filename)));
1213 
1214  while ((recoded = tsearch_readline(&trst)) != NULL)
1215  {
1216  if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
1217  {
1218  pfree(recoded);
1219  continue;
1220  }
1221 
1222  if (STRNCMP(recoded, "COMPOUNDFLAG") == 0)
1223  addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"),
1224  FF_COMPOUNDFLAG);
1225  else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0)
1226  addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"),
1228  else if (STRNCMP(recoded, "COMPOUNDLAST") == 0)
1229  addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDLAST"),
1230  FF_COMPOUNDLAST);
1231  /* COMPOUNDLAST and COMPOUNDEND are synonyms */
1232  else if (STRNCMP(recoded, "COMPOUNDEND") == 0)
1233  addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDEND"),
1234  FF_COMPOUNDLAST);
1235  else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0)
1236  addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"),
1238  else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0)
1239  addCompoundAffixFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"),
1240  FF_COMPOUNDONLY);
1241  else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0)
1243  recoded + strlen("COMPOUNDPERMITFLAG"),
1245  else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0)
1247  recoded + strlen("COMPOUNDFORBIDFLAG"),
1249  else if (STRNCMP(recoded, "FLAG") == 0)
1250  {
1251  char *s = recoded + strlen("FLAG");
1252 
1253  while (*s && t_isspace(s))
1254  s += pg_mblen(s);
1255 
1256  if (*s)
1257  {
1258  if (STRNCMP(s, "long") == 0)
1259  Conf->flagMode = FM_LONG;
1260  else if (STRNCMP(s, "num") == 0)
1261  Conf->flagMode = FM_NUM;
1262  else if (STRNCMP(s, "default") != 0)
1263  ereport(ERROR,
1264  (errcode(ERRCODE_CONFIG_FILE_ERROR),
1265  errmsg("Ispell dictionary supports only "
1266  "\"default\", \"long\", "
1267  "and \"num\" flag values")));
1268  }
1269  }
1270 
1271  pfree(recoded);
1272  }
1273  tsearch_readline_end(&trst);
1274 
1275  if (Conf->nCompoundAffixFlag > 1)
1276  qsort((void *) Conf->CompoundAffixFlags, Conf->nCompoundAffixFlag,
1277  sizeof(CompoundAffixFlag), cmpcmdflag);
1278 
1279  if (!tsearch_readline_begin(&trst, filename))
1280  ereport(ERROR,
1281  (errcode(ERRCODE_CONFIG_FILE_ERROR),
1282  errmsg("could not open affix file \"%s\": %m",
1283  filename)));
1284 
1285  while ((recoded = tsearch_readline(&trst)) != NULL)
1286  {
1287  int fields_read;
1288 
1289  if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
1290  goto nextline;
1291 
1292  fields_read = parse_ooaffentry(recoded, type, sflag, find, repl, mask);
1293 
1294  if (ptype)
1295  pfree(ptype);
1296  ptype = lowerstr_ctx(Conf, type);
1297 
1298  /* First try to parse AF parameter (alias compression) */
1299  if (STRNCMP(ptype, "af") == 0)
1300  {
1301  /* First line is the number of aliases */
1302  if (!Conf->useFlagAliases)
1303  {
1304  Conf->useFlagAliases = true;
1305  naffix = atoi(sflag);
1306  if (naffix == 0)
1307  ereport(ERROR,
1308  (errcode(ERRCODE_CONFIG_FILE_ERROR),
1309  errmsg("invalid number of flag vector aliases")));
1310 
1311  /* Also reserve place for empty flag set */
1312  naffix++;
1313 
1314  Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
1315  Conf->lenAffixData = Conf->nAffixData = naffix;
1316 
1317  /* Add empty flag set into AffixData */
1318  Conf->AffixData[curaffix] = VoidString;
1319  curaffix++;
1320  }
1321  /* Other lines is aliases */
1322  else
1323  {
1324  if (curaffix < naffix)
1325  {
1326  Conf->AffixData[curaffix] = cpstrdup(Conf, sflag);
1327  curaffix++;
1328  }
1329  }
1330  goto nextline;
1331  }
1332  /* Else try to parse prefixes and suffixes */
1333  if (fields_read < 4 ||
1334  (STRNCMP(ptype, "sfx") != 0 && STRNCMP(ptype, "pfx") != 0))
1335  goto nextline;
1336 
1337  sflaglen = strlen(sflag);
1338  if (sflaglen == 0
1339  || (sflaglen > 1 && Conf->flagMode == FM_CHAR)
1340  || (sflaglen > 2 && Conf->flagMode == FM_LONG))
1341  goto nextline;
1342 
1343  /*--------
1344  * Affix header. For example:
1345  * SFX \ N 1
1346  *--------
1347  */
1348  if (fields_read == 4)
1349  {
1350  isSuffix = (STRNCMP(ptype, "sfx") == 0);
1351  if (t_iseq(find, 'y') || t_iseq(find, 'Y'))
1352  flagflags = FF_CROSSPRODUCT;
1353  else
1354  flagflags = 0;
1355  }
1356  /*--------
1357  * Affix fields. For example:
1358  * SFX \ 0 Y/L [^Y]
1359  *--------
1360  */
1361  else
1362  {
1363  char *ptr;
1364  int aflg = 0;
1365 
1366  /* Get flags after '/' (flags are case sensitive) */
1367  if ((ptr = strchr(repl, '/')) != NULL)
1368  aflg |= getCompoundAffixFlagValue(Conf,
1369  getAffixFlagSet(Conf,
1370  ptr + 1));
1371  /* Get lowercased version of string before '/' */
1372  prepl = lowerstr_ctx(Conf, repl);
1373  if ((ptr = strchr(prepl, '/')) != NULL)
1374  *ptr = '\0';
1375  pfind = lowerstr_ctx(Conf, find);
1376  pmask = lowerstr_ctx(Conf, mask);
1377  if (t_iseq(find, '0'))
1378  *pfind = '\0';
1379  if (t_iseq(repl, '0'))
1380  *prepl = '\0';
1381 
1382  NIAddAffix(Conf, sflag, flagflags | aflg, pmask, pfind, prepl,
1383  isSuffix ? FF_SUFFIX : FF_PREFIX);
1384  pfree(prepl);
1385  pfree(pfind);
1386  pfree(pmask);
1387  }
1388 
1389 nextline:
1390  pfree(recoded);
1391  }
1392 
1393  tsearch_readline_end(&trst);
1394  if (ptype)
1395  pfree(ptype);
1396 }
1397 
1398 /*
1399  * import affixes
1400  *
1401  * Note caller must already have applied get_tsearch_config_filename
1402  *
1403  * This function is responsible for parsing ispell ("old format") affix files.
1404  * If we realize that the file contains new-format commands, we pass off the
1405  * work to NIImportOOAffixes(), which will re-read the whole file.
1406  */
1407 void
1409 {
1410  char *pstr = NULL;
1411  char flag[BUFSIZ];
1412  char mask[BUFSIZ];
1413  char find[BUFSIZ];
1414  char repl[BUFSIZ];
1415  char *s;
1416  bool suffixes = false;
1417  bool prefixes = false;
1418  char flagflags = 0;
1420  bool oldformat = false;
1421  char *recoded = NULL;
1422 
1423  if (!tsearch_readline_begin(&trst, filename))
1424  ereport(ERROR,
1425  (errcode(ERRCODE_CONFIG_FILE_ERROR),
1426  errmsg("could not open affix file \"%s\": %m",
1427  filename)));
1428 
1429  Conf->usecompound = false;
1430  Conf->useFlagAliases = false;
1431  Conf->flagMode = FM_CHAR;
1432 
1433  while ((recoded = tsearch_readline(&trst)) != NULL)
1434  {
1435  pstr = lowerstr(recoded);
1436 
1437  /* Skip comments and empty lines */
1438  if (*pstr == '#' || *pstr == '\n')
1439  goto nextline;
1440 
1441  if (STRNCMP(pstr, "compoundwords") == 0)
1442  {
1443  /* Find case-insensitive L flag in non-lowercased string */
1444  s = findchar2(recoded, 'l', 'L');
1445  if (s)
1446  {
1447  while (*s && !t_isspace(s))
1448  s += pg_mblen(s);
1449  while (*s && t_isspace(s))
1450  s += pg_mblen(s);
1451 
1452  if (*s && pg_mblen(s) == 1)
1453  {
1455  Conf->usecompound = true;
1456  }
1457  oldformat = true;
1458  goto nextline;
1459  }
1460  }
1461  if (STRNCMP(pstr, "suffixes") == 0)
1462  {
1463  suffixes = true;
1464  prefixes = false;
1465  oldformat = true;
1466  goto nextline;
1467  }
1468  if (STRNCMP(pstr, "prefixes") == 0)
1469  {
1470  suffixes = false;
1471  prefixes = true;
1472  oldformat = true;
1473  goto nextline;
1474  }
1475  if (STRNCMP(pstr, "flag") == 0)
1476  {
1477  s = recoded + 4; /* we need non-lowercased string */
1478  flagflags = 0;
1479 
1480  while (*s && t_isspace(s))
1481  s += pg_mblen(s);
1482 
1483  if (*s == '*')
1484  {
1485  flagflags |= FF_CROSSPRODUCT;
1486  s++;
1487  }
1488  else if (*s == '~')
1489  {
1490  flagflags |= FF_COMPOUNDONLY;
1491  s++;
1492  }
1493 
1494  if (*s == '\\')
1495  s++;
1496 
1497  /*
1498  * An old-format flag is a single ASCII character; we expect it to
1499  * be followed by EOL, whitespace, or ':'. Otherwise this is a
1500  * new-format flag command.
1501  */
1502  if (*s && pg_mblen(s) == 1)
1503  {
1504  COPYCHAR(flag, s);
1505  flag[1] = '\0';
1506 
1507  s++;
1508  if (*s == '\0' || *s == '#' || *s == '\n' || *s == ':' ||
1509  t_isspace(s))
1510  {
1511  oldformat = true;
1512  goto nextline;
1513  }
1514  }
1515  goto isnewformat;
1516  }
1517  if (STRNCMP(recoded, "COMPOUNDFLAG") == 0 ||
1518  STRNCMP(recoded, "COMPOUNDMIN") == 0 ||
1519  STRNCMP(recoded, "PFX") == 0 ||
1520  STRNCMP(recoded, "SFX") == 0)
1521  goto isnewformat;
1522 
1523  if ((!suffixes) && (!prefixes))
1524  goto nextline;
1525 
1526  if (!parse_affentry(pstr, mask, find, repl))
1527  goto nextline;
1528 
1529  NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
1530 
1531 nextline:
1532  pfree(recoded);
1533  pfree(pstr);
1534  }
1535  tsearch_readline_end(&trst);
1536  return;
1537 
1538 isnewformat:
1539  if (oldformat)
1540  ereport(ERROR,
1541  (errcode(ERRCODE_CONFIG_FILE_ERROR),
1542  errmsg("affix file contains both old-style and new-style commands")));
1543  tsearch_readline_end(&trst);
1544 
1545  NIImportOOAffixes(Conf, filename);
1546 }
1547 
1548 /*
1549  * Merges two affix flag sets and stores a new affix flag set into
1550  * Conf->AffixData.
1551  *
1552  * Returns index of a new affix flag set.
1553  */
1554 static int
1555 MergeAffix(IspellDict *Conf, int a1, int a2)
1556 {
1557  char **ptr;
1558 
1559  /* Do not merge affix flags if one of affix flags is empty */
1560  if (*Conf->AffixData[a1] == '\0')
1561  return a2;
1562  else if (*Conf->AffixData[a2] == '\0')
1563  return a1;
1564 
1565  while (Conf->nAffixData + 1 >= Conf->lenAffixData)
1566  {
1567  Conf->lenAffixData *= 2;
1568  Conf->AffixData = (char **) repalloc(Conf->AffixData,
1569  sizeof(char *) * Conf->lenAffixData);
1570  }
1571 
1572  ptr = Conf->AffixData + Conf->nAffixData;
1573  if (Conf->flagMode == FM_NUM)
1574  {
1575  *ptr = cpalloc(strlen(Conf->AffixData[a1]) +
1576  strlen(Conf->AffixData[a2]) +
1577  1 /* comma */ + 1 /* \0 */ );
1578  sprintf(*ptr, "%s,%s", Conf->AffixData[a1], Conf->AffixData[a2]);
1579  }
1580  else
1581  {
1582  *ptr = cpalloc(strlen(Conf->AffixData[a1]) +
1583  strlen(Conf->AffixData[a2]) +
1584  1 /* \0 */ );
1585  sprintf(*ptr, "%s%s", Conf->AffixData[a1], Conf->AffixData[a2]);
1586  }
1587  ptr++;
1588  *ptr = NULL;
1589  Conf->nAffixData++;
1590 
1591  return Conf->nAffixData - 1;
1592 }
1593 
1594 /*
1595  * Returns a set of affix parameters which correspondence to the set of affix
1596  * flags with the given index.
1597  */
1598 static uint32
1600 {
1601  char *str = Conf->AffixData[affix];
1602 
1603  return (getCompoundAffixFlagValue(Conf, str) & FF_COMPOUNDFLAGMASK);
1604 }
1605 
1606 /*
1607  * Makes a prefix tree for the given level.
1608  *
1609  * Conf: current dictionary.
1610  * low: lower index of the Conf->Spell array.
1611  * high: upper index of the Conf->Spell array.
1612  * level: current prefix tree level.
1613  */
1614 static SPNode *
1615 mkSPNode(IspellDict *Conf, int low, int high, int level)
1616 {
1617  int i;
1618  int nchar = 0;
1619  char lastchar = '\0';
1620  SPNode *rs;
1621  SPNodeData *data;
1622  int lownew = low;
1623 
1624  for (i = low; i < high; i++)
1625  if (Conf->Spell[i]->p.d.len > level && lastchar != Conf->Spell[i]->word[level])
1626  {
1627  nchar++;
1628  lastchar = Conf->Spell[i]->word[level];
1629  }
1630 
1631  if (!nchar)
1632  return NULL;
1633 
1634  rs = (SPNode *) cpalloc0(SPNHDRSZ + nchar * sizeof(SPNodeData));
1635  rs->length = nchar;
1636  data = rs->data;
1637 
1638  lastchar = '\0';
1639  for (i = low; i < high; i++)
1640  if (Conf->Spell[i]->p.d.len > level)
1641  {
1642  if (lastchar != Conf->Spell[i]->word[level])
1643  {
1644  if (lastchar)
1645  {
1646  /* Next level of the prefix tree */
1647  data->node = mkSPNode(Conf, lownew, i, level + 1);
1648  lownew = i;
1649  data++;
1650  }
1651  lastchar = Conf->Spell[i]->word[level];
1652  }
1653  data->val = ((uint8 *) (Conf->Spell[i]->word))[level];
1654  if (Conf->Spell[i]->p.d.len == level + 1)
1655  {
1656  bool clearCompoundOnly = false;
1657 
1658  if (data->isword && data->affix != Conf->Spell[i]->p.d.affix)
1659  {
1660  /*
1661  * MergeAffix called a few times. If one of word is
1662  * allowed to be in compound word and another isn't, then
1663  * clear FF_COMPOUNDONLY flag.
1664  */
1665 
1666  clearCompoundOnly = (FF_COMPOUNDONLY & data->compoundflag
1667  & makeCompoundFlags(Conf, Conf->Spell[i]->p.d.affix))
1668  ? false : true;
1669  data->affix = MergeAffix(Conf, data->affix, Conf->Spell[i]->p.d.affix);
1670  }
1671  else
1672  data->affix = Conf->Spell[i]->p.d.affix;
1673  data->isword = 1;
1674 
1675  data->compoundflag = makeCompoundFlags(Conf, data->affix);
1676 
1677  if ((data->compoundflag & FF_COMPOUNDONLY) &&
1678  (data->compoundflag & FF_COMPOUNDFLAG) == 0)
1679  data->compoundflag |= FF_COMPOUNDFLAG;
1680 
1681  if (clearCompoundOnly)
1682  data->compoundflag &= ~FF_COMPOUNDONLY;
1683  }
1684  }
1685 
1686  /* Next level of the prefix tree */
1687  data->node = mkSPNode(Conf, lownew, high, level + 1);
1688 
1689  return rs;
1690 }
1691 
1692 /*
1693  * Builds the Conf->Dictionary tree and AffixData from the imported dictionary
1694  * and affixes.
1695  */
1696 void
1698 {
1699  int i;
1700  int naffix = 0;
1701  int curaffix;
1702 
1703  /* compress affixes */
1704 
1705  /*
1706  * If we use flag aliases then we need to use Conf->AffixData filled in
1707  * the NIImportOOAffixes().
1708  */
1709  if (Conf->useFlagAliases)
1710  {
1711  for (i = 0; i < Conf->nspell; i++)
1712  {
1713  char *end;
1714 
1715  if (*Conf->Spell[i]->p.flag != '\0')
1716  {
1717  curaffix = strtol(Conf->Spell[i]->p.flag, &end, 10);
1718  if (Conf->Spell[i]->p.flag == end || errno == ERANGE)
1719  ereport(ERROR,
1720  (errcode(ERRCODE_CONFIG_FILE_ERROR),
1721  errmsg("invalid affix alias \"%s\"",
1722  Conf->Spell[i]->p.flag)));
1723  }
1724  else
1725  {
1726  /*
1727  * If Conf->Spell[i]->p.flag is empty, then get empty value of
1728  * Conf->AffixData (0 index).
1729  */
1730  curaffix = 0;
1731  }
1732 
1733  Conf->Spell[i]->p.d.affix = curaffix;
1734  Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
1735  }
1736  }
1737  /* Otherwise fill Conf->AffixData here */
1738  else
1739  {
1740  /* Count the number of different flags used in the dictionary */
1741  qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *),
1742  cmpspellaffix);
1743 
1744  naffix = 0;
1745  for (i = 0; i < Conf->nspell; i++)
1746  {
1747  if (i == 0
1748  || strcmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag))
1749  naffix++;
1750  }
1751 
1752  /*
1753  * Fill in Conf->AffixData with the affixes that were used in the
1754  * dictionary. Replace textual flag-field of Conf->Spell entries with
1755  * indexes into Conf->AffixData array.
1756  */
1757  Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
1758 
1759  curaffix = -1;
1760  for (i = 0; i < Conf->nspell; i++)
1761  {
1762  if (i == 0
1763  || strcmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix]))
1764  {
1765  curaffix++;
1766  Assert(curaffix < naffix);
1767  Conf->AffixData[curaffix] = cpstrdup(Conf,
1768  Conf->Spell[i]->p.flag);
1769  }
1770 
1771  Conf->Spell[i]->p.d.affix = curaffix;
1772  Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
1773  }
1774 
1775  Conf->lenAffixData = Conf->nAffixData = naffix;
1776  }
1777 
1778  /* Start build a prefix tree */
1779  qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
1780  Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
1781 }
1782 
1783 /*
1784  * Makes a prefix tree for the given level using the repl string of an affix
1785  * rule. Affixes with empty replace string do not include in the prefix tree.
1786  * This affixes are included by mkVoidAffix().
1787  *
1788  * Conf: current dictionary.
1789  * low: lower index of the Conf->Affix array.
1790  * high: upper index of the Conf->Affix array.
1791  * level: current prefix tree level.
1792  * type: FF_SUFFIX or FF_PREFIX.
1793  */
1794 static AffixNode *
1795 mkANode(IspellDict *Conf, int low, int high, int level, int type)
1796 {
1797  int i;
1798  int nchar = 0;
1799  uint8 lastchar = '\0';
1800  AffixNode *rs;
1801  AffixNodeData *data;
1802  int lownew = low;
1803  int naff;
1804  AFFIX **aff;
1805 
1806  for (i = low; i < high; i++)
1807  if (Conf->Affix[i].replen > level && lastchar != GETCHAR(Conf->Affix + i, level, type))
1808  {
1809  nchar++;
1810  lastchar = GETCHAR(Conf->Affix + i, level, type);
1811  }
1812 
1813  if (!nchar)
1814  return NULL;
1815 
1816  aff = (AFFIX **) tmpalloc(sizeof(AFFIX *) * (high - low + 1));
1817  naff = 0;
1818 
1819  rs = (AffixNode *) cpalloc0(ANHRDSZ + nchar * sizeof(AffixNodeData));
1820  rs->length = nchar;
1821  data = rs->data;
1822 
1823  lastchar = '\0';
1824  for (i = low; i < high; i++)
1825  if (Conf->Affix[i].replen > level)
1826  {
1827  if (lastchar != GETCHAR(Conf->Affix + i, level, type))
1828  {
1829  if (lastchar)
1830  {
1831  /* Next level of the prefix tree */
1832  data->node = mkANode(Conf, lownew, i, level + 1, type);
1833  if (naff)
1834  {
1835  data->naff = naff;
1836  data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
1837  memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
1838  naff = 0;
1839  }
1840  data++;
1841  lownew = i;
1842  }
1843  lastchar = GETCHAR(Conf->Affix + i, level, type);
1844  }
1845  data->val = GETCHAR(Conf->Affix + i, level, type);
1846  if (Conf->Affix[i].replen == level + 1)
1847  { /* affix stopped */
1848  aff[naff++] = Conf->Affix + i;
1849  }
1850  }
1851 
1852  /* Next level of the prefix tree */
1853  data->node = mkANode(Conf, lownew, high, level + 1, type);
1854  if (naff)
1855  {
1856  data->naff = naff;
1857  data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
1858  memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
1859  naff = 0;
1860  }
1861 
1862  pfree(aff);
1863 
1864  return rs;
1865 }
1866 
1867 /*
1868  * Makes the root void node in the prefix tree. The root void node is created
1869  * for affixes which have empty replace string ("repl" field).
1870  */
1871 static void
1872 mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix)
1873 {
1874  int i,
1875  cnt = 0;
1876  int start = (issuffix) ? startsuffix : 0;
1877  int end = (issuffix) ? Conf->naffixes : startsuffix;
1878  AffixNode *Affix = (AffixNode *) palloc0(ANHRDSZ + sizeof(AffixNodeData));
1879 
1880  Affix->length = 1;
1881  Affix->isvoid = 1;
1882 
1883  if (issuffix)
1884  {
1885  Affix->data->node = Conf->Suffix;
1886  Conf->Suffix = Affix;
1887  }
1888  else
1889  {
1890  Affix->data->node = Conf->Prefix;
1891  Conf->Prefix = Affix;
1892  }
1893 
1894  /* Count affixes with empty replace string */
1895  for (i = start; i < end; i++)
1896  if (Conf->Affix[i].replen == 0)
1897  cnt++;
1898 
1899  /* There is not affixes with empty replace string */
1900  if (cnt == 0)
1901  return;
1902 
1903  Affix->data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * cnt);
1904  Affix->data->naff = (uint32) cnt;
1905 
1906  cnt = 0;
1907  for (i = start; i < end; i++)
1908  if (Conf->Affix[i].replen == 0)
1909  {
1910  Affix->data->aff[cnt] = Conf->Affix + i;
1911  cnt++;
1912  }
1913 }
1914 
1915 /*
1916  * Checks if the affixflag is used by dictionary. Conf->AffixData does not
1917  * contain affixflag if this flag is not used actually by the .dict file.
1918  *
1919  * Conf: current dictionary.
1920  * affixflag: affix flag.
1921  *
1922  * Returns true if the Conf->AffixData array contains affixflag, otherwise
1923  * returns false.
1924  */
1925 static bool
1926 isAffixInUse(IspellDict *Conf, char *affixflag)
1927 {
1928  int i;
1929 
1930  for (i = 0; i < Conf->nAffixData; i++)
1931  if (IsAffixFlagInUse(Conf, i, affixflag))
1932  return true;
1933 
1934  return false;
1935 }
1936 
1937 /*
1938  * Builds Conf->Prefix and Conf->Suffix trees from the imported affixes.
1939  */
1940 void
1942 {
1943  AFFIX *Affix;
1944  size_t i;
1945  CMPDAffix *ptr;
1946  int firstsuffix = Conf->naffixes;
1947 
1948  if (Conf->naffixes == 0)
1949  return;
1950 
1951  /* Store compound affixes in the Conf->CompoundAffix array */
1952  if (Conf->naffixes > 1)
1953  qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix);
1954  Conf->CompoundAffix = ptr = (CMPDAffix *) palloc(sizeof(CMPDAffix) * Conf->naffixes);
1955  ptr->affix = NULL;
1956 
1957  for (i = 0; i < Conf->naffixes; i++)
1958  {
1959  Affix = &(((AFFIX *) Conf->Affix)[i]);
1960  if (Affix->type == FF_SUFFIX && i < firstsuffix)
1961  firstsuffix = i;
1962 
1963  if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
1964  isAffixInUse(Conf, Affix->flag))
1965  {
1966  if (ptr == Conf->CompoundAffix ||
1967  ptr->issuffix != (ptr - 1)->issuffix ||
1968  strbncmp((const unsigned char *) (ptr - 1)->affix,
1969  (const unsigned char *) Affix->repl,
1970  (ptr - 1)->len))
1971  {
1972  /* leave only unique and minimals suffixes */
1973  ptr->affix = Affix->repl;
1974  ptr->len = Affix->replen;
1975  ptr->issuffix = (Affix->type == FF_SUFFIX);
1976  ptr++;
1977  }
1978  }
1979  }
1980  ptr->affix = NULL;
1981  Conf->CompoundAffix = (CMPDAffix *) repalloc(Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr - Conf->CompoundAffix + 1));
1982 
1983  /* Start build a prefix tree */
1984  Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
1985  Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
1986  mkVoidAffix(Conf, true, firstsuffix);
1987  mkVoidAffix(Conf, false, firstsuffix);
1988 }
1989 
1990 static AffixNodeData *
1991 FindAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
1992 {
1993  AffixNodeData *StopLow,
1994  *StopHigh,
1995  *StopMiddle;
1996  uint8 symbol;
1997 
1998  if (node->isvoid)
1999  { /* search void affixes */
2000  if (node->data->naff)
2001  return node->data;
2002  node = node->data->node;
2003  }
2004 
2005  while (node && *level < wrdlen)
2006  {
2007  StopLow = node->data;
2008  StopHigh = node->data + node->length;
2009  while (StopLow < StopHigh)
2010  {
2011  StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
2012  symbol = GETWCHAR(word, wrdlen, *level, type);
2013 
2014  if (StopMiddle->val == symbol)
2015  {
2016  (*level)++;
2017  if (StopMiddle->naff)
2018  return StopMiddle;
2019  node = StopMiddle->node;
2020  break;
2021  }
2022  else if (StopMiddle->val < symbol)
2023  StopLow = StopMiddle + 1;
2024  else
2025  StopHigh = StopMiddle;
2026  }
2027  if (StopLow >= StopHigh)
2028  break;
2029  }
2030  return NULL;
2031 }
2032 
2033 static char *
2034 CheckAffix(const char *word, size_t len, AFFIX *Affix, int flagflags, char *newword, int *baselen)
2035 {
2036  /*
2037  * Check compound allow flags
2038  */
2039 
2040  if (flagflags == 0)
2041  {
2042  if (Affix->flagflags & FF_COMPOUNDONLY)
2043  return NULL;
2044  }
2045  else if (flagflags & FF_COMPOUNDBEGIN)
2046  {
2047  if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
2048  return NULL;
2049  if ((Affix->flagflags & FF_COMPOUNDBEGIN) == 0)
2050  if (Affix->type == FF_SUFFIX)
2051  return NULL;
2052  }
2053  else if (flagflags & FF_COMPOUNDMIDDLE)
2054  {
2055  if ((Affix->flagflags & FF_COMPOUNDMIDDLE) == 0 ||
2056  (Affix->flagflags & FF_COMPOUNDFORBIDFLAG))
2057  return NULL;
2058  }
2059  else if (flagflags & FF_COMPOUNDLAST)
2060  {
2061  if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
2062  return NULL;
2063  if ((Affix->flagflags & FF_COMPOUNDLAST) == 0)
2064  if (Affix->type == FF_PREFIX)
2065  return NULL;
2066  }
2067 
2068  /*
2069  * make replace pattern of affix
2070  */
2071  if (Affix->type == FF_SUFFIX)
2072  {
2073  strcpy(newword, word);
2074  strcpy(newword + len - Affix->replen, Affix->find);
2075  if (baselen) /* store length of non-changed part of word */
2076  *baselen = len - Affix->replen;
2077  }
2078  else
2079  {
2080  /*
2081  * if prefix is an all non-changed part's length then all word
2082  * contains only prefix and suffix, so out
2083  */
2084  if (baselen && *baselen + strlen(Affix->find) <= Affix->replen)
2085  return NULL;
2086  strcpy(newword, Affix->find);
2087  strcat(newword, word + Affix->replen);
2088  }
2089 
2090  /*
2091  * check resulting word
2092  */
2093  if (Affix->issimple)
2094  return newword;
2095  else if (Affix->isregis)
2096  {
2097  if (RS_execute(&(Affix->reg.regis), newword))
2098  return newword;
2099  }
2100  else
2101  {
2102  int err;
2103  pg_wchar *data;
2104  size_t data_len;
2105  int newword_len;
2106 
2107  /* Convert data string to wide characters */
2108  newword_len = strlen(newword);
2109  data = (pg_wchar *) palloc((newword_len + 1) * sizeof(pg_wchar));
2110  data_len = pg_mb2wchar_with_len(newword, data, newword_len);
2111 
2112  if (!(err = pg_regexec(&(Affix->reg.regex), data, data_len, 0, NULL, 0, NULL, 0)))
2113  {
2114  pfree(data);
2115  return newword;
2116  }
2117  pfree(data);
2118  }
2119 
2120  return NULL;
2121 }
2122 
2123 static int
2124 addToResult(char **forms, char **cur, char *word)
2125 {
2126  if (cur - forms >= MAX_NORM - 1)
2127  return 0;
2128  if (forms == cur || strcmp(word, *(cur - 1)) != 0)
2129  {
2130  *cur = pstrdup(word);
2131  *(cur + 1) = NULL;
2132  return 1;
2133  }
2134 
2135  return 0;
2136 }
2137 
2138 static char **
2140 {
2141  AffixNodeData *suffix = NULL,
2142  *prefix = NULL;
2143  int slevel = 0,
2144  plevel = 0;
2145  int wrdlen = strlen(word),
2146  swrdlen;
2147  char **forms;
2148  char **cur;
2149  char newword[2 * MAXNORMLEN] = "";
2150  char pnewword[2 * MAXNORMLEN] = "";
2151  AffixNode *snode = Conf->Suffix,
2152  *pnode;
2153  int i,
2154  j;
2155 
2156  if (wrdlen > MAXNORMLEN)
2157  return NULL;
2158  cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
2159  *cur = NULL;
2160 
2161 
2162  /* Check that the word itself is normal form */
2163  if (FindWord(Conf, word, VoidString, flag))
2164  {
2165  *cur = pstrdup(word);
2166  cur++;
2167  *cur = NULL;
2168  }
2169 
2170  /* Find all other NORMAL forms of the 'word' (check only prefix) */
2171  pnode = Conf->Prefix;
2172  plevel = 0;
2173  while (pnode)
2174  {
2175  prefix = FindAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
2176  if (!prefix)
2177  break;
2178  for (j = 0; j < prefix->naff; j++)
2179  {
2180  if (CheckAffix(word, wrdlen, prefix->aff[j], flag, newword, NULL))
2181  {
2182  /* prefix success */
2183  if (FindWord(Conf, newword, prefix->aff[j]->flag, flag))
2184  cur += addToResult(forms, cur, newword);
2185  }
2186  }
2187  pnode = prefix->node;
2188  }
2189 
2190  /*
2191  * Find all other NORMAL forms of the 'word' (check suffix and then
2192  * prefix)
2193  */
2194  while (snode)
2195  {
2196  int baselen = 0;
2197 
2198  /* find possible suffix */
2199  suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
2200  if (!suffix)
2201  break;
2202  /* foreach suffix check affix */
2203  for (i = 0; i < suffix->naff; i++)
2204  {
2205  if (CheckAffix(word, wrdlen, suffix->aff[i], flag, newword, &baselen))
2206  {
2207  /* suffix success */
2208  if (FindWord(Conf, newword, suffix->aff[i]->flag, flag))
2209  cur += addToResult(forms, cur, newword);
2210 
2211  /* now we will look changed word with prefixes */
2212  pnode = Conf->Prefix;
2213  plevel = 0;
2214  swrdlen = strlen(newword);
2215  while (pnode)
2216  {
2217  prefix = FindAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
2218  if (!prefix)
2219  break;
2220  for (j = 0; j < prefix->naff; j++)
2221  {
2222  if (CheckAffix(newword, swrdlen, prefix->aff[j], flag, pnewword, &baselen))
2223  {
2224  /* prefix success */
2225  char *ff = (prefix->aff[j]->flagflags & suffix->aff[i]->flagflags & FF_CROSSPRODUCT) ?
2226  VoidString : prefix->aff[j]->flag;
2227 
2228  if (FindWord(Conf, pnewword, ff, flag))
2229  cur += addToResult(forms, cur, pnewword);
2230  }
2231  }
2232  pnode = prefix->node;
2233  }
2234  }
2235  }
2236 
2237  snode = suffix->node;
2238  }
2239 
2240  if (cur == forms)
2241  {
2242  pfree(forms);
2243  return NULL;
2244  }
2245  return forms;
2246 }
2247 
/*
 * One way of splitting a compound word into stems.  Alternative splits are
 * chained through "next".
 */
typedef struct SplitVar
{
	int			nstem;			/* number of entries used in stem[] */
	int			lenstem;		/* number of entries allocated for stem[] */
	char	  **stem;			/* the stems of this split */
	struct SplitVar *next;		/* next alternative split, or NULL */
} SplitVar;
2255 
2256 static int
2257 CheckCompoundAffixes(CMPDAffix **ptr, char *word, int len, bool CheckInPlace)
2258 {
2259  bool issuffix;
2260 
2261  /* in case CompoundAffix is null: */
2262  if (*ptr == NULL)
2263  return -1;
2264 
2265  if (CheckInPlace)
2266  {
2267  while ((*ptr)->affix)
2268  {
2269  if (len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len) == 0)
2270  {
2271  len = (*ptr)->len;
2272  issuffix = (*ptr)->issuffix;
2273  (*ptr)++;
2274  return (issuffix) ? len : 0;
2275  }
2276  (*ptr)++;
2277  }
2278  }
2279  else
2280  {
2281  char *affbegin;
2282 
2283  while ((*ptr)->affix)
2284  {
2285  if (len > (*ptr)->len && (affbegin = strstr(word, (*ptr)->affix)) != NULL)
2286  {
2287  len = (*ptr)->len + (affbegin - word);
2288  issuffix = (*ptr)->issuffix;
2289  (*ptr)++;
2290  return (issuffix) ? len : 0;
2291  }
2292  (*ptr)++;
2293  }
2294  }
2295  return -1;
2296 }
2297 
2298 static SplitVar *
2299 CopyVar(SplitVar *s, int makedup)
2300 {
2301  SplitVar *v = (SplitVar *) palloc(sizeof(SplitVar));
2302 
2303  v->next = NULL;
2304  if (s)
2305  {
2306  int i;
2307 
2308  v->lenstem = s->lenstem;
2309  v->stem = (char **) palloc(sizeof(char *) * v->lenstem);
2310  v->nstem = s->nstem;
2311  for (i = 0; i < s->nstem; i++)
2312  v->stem[i] = (makedup) ? pstrdup(s->stem[i]) : s->stem[i];
2313  }
2314  else
2315  {
2316  v->lenstem = 16;
2317  v->stem = (char **) palloc(sizeof(char *) * v->lenstem);
2318  v->nstem = 0;
2319  }
2320  return v;
2321 }
2322 
2323 static void
2325 {
2326  if (v->nstem >= v->lenstem)
2327  {
2328  v->lenstem *= 2;
2329  v->stem = (char **) repalloc(v->stem, sizeof(char *) * v->lenstem);
2330  }
2331 
2332  v->stem[v->nstem] = word;
2333  v->nstem++;
2334 }
2335 
2336 static SplitVar *
2337 SplitToVariants(IspellDict *Conf, SPNode *snode, SplitVar *orig, char *word, int wordlen, int startpos, int minpos)
2338 {
2339  SplitVar *var = NULL;
2340  SPNodeData *StopLow,
2341  *StopHigh,
2342  *StopMiddle = NULL;
2343  SPNode *node = (snode) ? snode : Conf->Dictionary;
2344  int level = (snode) ? minpos : startpos; /* recursive
2345  * minpos==level */
2346  int lenaff;
2347  CMPDAffix *caff;
2348  char *notprobed;
2349  int compoundflag = 0;
2350 
2351  notprobed = (char *) palloc(wordlen);
2352  memset(notprobed, 1, wordlen);
2353  var = CopyVar(orig, 1);
2354 
2355  while (level < wordlen)
2356  {
2357  /* find word with epenthetic or/and compound affix */
2358  caff = Conf->CompoundAffix;
2359  while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level, (node) ? true : false)) >= 0)
2360  {
2361  /*
2362  * there is one of compound affixes, so check word for existings
2363  */
2364  char buf[MAXNORMLEN];
2365  char **subres;
2366 
2367  lenaff = level - startpos + lenaff;
2368 
2369  if (!notprobed[startpos + lenaff - 1])
2370  continue;
2371 
2372  if (level + lenaff - 1 <= minpos)
2373  continue;
2374 
2375  if (lenaff >= MAXNORMLEN)
2376  continue; /* skip too big value */
2377  if (lenaff > 0)
2378  memcpy(buf, word + startpos, lenaff);
2379  buf[lenaff] = '\0';
2380 
2381  if (level == 0)
2382  compoundflag = FF_COMPOUNDBEGIN;
2383  else if (level == wordlen - 1)
2384  compoundflag = FF_COMPOUNDLAST;
2385  else
2386  compoundflag = FF_COMPOUNDMIDDLE;
2387  subres = NormalizeSubWord(Conf, buf, compoundflag);
2388  if (subres)
2389  {
2390  /* Yes, it was a word from dictionary */
2391  SplitVar *new = CopyVar(var, 0);
2392  SplitVar *ptr = var;
2393  char **sptr = subres;
2394 
2395  notprobed[startpos + lenaff - 1] = 0;
2396 
2397  while (*sptr)
2398  {
2399  AddStem(new, *sptr);
2400  sptr++;
2401  }
2402  pfree(subres);
2403 
2404  while (ptr->next)
2405  ptr = ptr->next;
2406  ptr->next = SplitToVariants(Conf, NULL, new, word, wordlen, startpos + lenaff, startpos + lenaff);
2407 
2408  pfree(new->stem);
2409  pfree(new);
2410  }
2411  }
2412 
2413  if (!node)
2414  break;
2415 
2416  StopLow = node->data;
2417  StopHigh = node->data + node->length;
2418  while (StopLow < StopHigh)
2419  {
2420  StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
2421  if (StopMiddle->val == ((uint8 *) (word))[level])
2422  break;
2423  else if (StopMiddle->val < ((uint8 *) (word))[level])
2424  StopLow = StopMiddle + 1;
2425  else
2426  StopHigh = StopMiddle;
2427  }
2428 
2429  if (StopLow < StopHigh)
2430  {
2431  if (startpos == 0)
2432  compoundflag = FF_COMPOUNDBEGIN;
2433  else if (level == wordlen - 1)
2434  compoundflag = FF_COMPOUNDLAST;
2435  else
2436  compoundflag = FF_COMPOUNDMIDDLE;
2437 
2438  /* find infinitive */
2439  if (StopMiddle->isword &&
2440  (StopMiddle->compoundflag & compoundflag) &&
2441  notprobed[level])
2442  {
2443  /* ok, we found full compoundallowed word */
2444  if (level > minpos)
2445  {
2446  /* and its length more than minimal */
2447  if (wordlen == level + 1)
2448  {
2449  /* well, it was last word */
2450  AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
2451  pfree(notprobed);
2452  return var;
2453  }
2454  else
2455  {
2456  /* then we will search more big word at the same point */
2457  SplitVar *ptr = var;
2458 
2459  while (ptr->next)
2460  ptr = ptr->next;
2461  ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
2462  /* we can find next word */
2463  level++;
2464  AddStem(var, pnstrdup(word + startpos, level - startpos));
2465  node = Conf->Dictionary;
2466  startpos = level;
2467  continue;
2468  }
2469  }
2470  }
2471  node = StopMiddle->node;
2472  }
2473  else
2474  node = NULL;
2475  level++;
2476  }
2477 
2478  AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
2479  pfree(notprobed);
2480  return var;
2481 }
2482 
2483 static void
2484 addNorm(TSLexeme **lres, TSLexeme **lcur, char *word, int flags, uint16 NVariant)
2485 {
2486  if (*lres == NULL)
2487  *lcur = *lres = (TSLexeme *) palloc(MAX_NORM * sizeof(TSLexeme));
2488 
2489  if (*lcur - *lres < MAX_NORM - 1)
2490  {
2491  (*lcur)->lexeme = word;
2492  (*lcur)->flags = flags;
2493  (*lcur)->nvariant = NVariant;
2494  (*lcur)++;
2495  (*lcur)->lexeme = NULL;
2496  }
2497 }
2498 
2499 TSLexeme *
2501 {
2502  char **res;
2503  TSLexeme *lcur = NULL,
2504  *lres = NULL;
2505  uint16 NVariant = 1;
2506 
2507  res = NormalizeSubWord(Conf, word, 0);
2508 
2509  if (res)
2510  {
2511  char **ptr = res;
2512 
2513  while (*ptr && (lcur - lres) < MAX_NORM)
2514  {
2515  addNorm(&lres, &lcur, *ptr, 0, NVariant++);
2516  ptr++;
2517  }
2518  pfree(res);
2519  }
2520 
2521  if (Conf->usecompound)
2522  {
2523  int wordlen = strlen(word);
2524  SplitVar *ptr,
2525  *var = SplitToVariants(Conf, NULL, NULL, word, wordlen, 0, -1);
2526  int i;
2527 
2528  while (var)
2529  {
2530  if (var->nstem > 1)
2531  {
2532  char **subres = NormalizeSubWord(Conf, var->stem[var->nstem - 1], FF_COMPOUNDLAST);
2533 
2534  if (subres)
2535  {
2536  char **subptr = subres;
2537 
2538  while (*subptr)
2539  {
2540  for (i = 0; i < var->nstem - 1; i++)
2541  {
2542  addNorm(&lres, &lcur, (subptr == subres) ? var->stem[i] : pstrdup(var->stem[i]), 0, NVariant);
2543  }
2544 
2545  addNorm(&lres, &lcur, *subptr, 0, NVariant);
2546  subptr++;
2547  NVariant++;
2548  }
2549 
2550  pfree(subres);
2551  var->stem[0] = NULL;
2552  pfree(var->stem[var->nstem - 1]);
2553  }
2554  }
2555 
2556  for (i = 0; i < var->nstem && var->stem[i]; i++)
2557  pfree(var->stem[i]);
2558  ptr = var->next;
2559  pfree(var->stem);
2560  pfree(var);
2561  var = ptr;
2562  }
2563  }
2564 
2565  return lres;
2566 }
#define COPYCHAR(d, s)
Definition: ts_locale.h:47
char * flag
Definition: spell.h:89
MemoryContext buildCxt
Definition: spell.h:215
int naffixes
Definition: spell.h:182
#define FF_COMPOUNDONLY
Definition: spell.h:42
int nAffixData
Definition: spell.h:192
struct spell_struct::@110::@111 d
static int cmpspell(const void *s1, const void *s2)
Definition: spell.c:196
int t_isprint(const char *ptr)
Definition: ts_locale.c:73
#define PAE_INREPL
Definition: spell.c:767
static void AddStem(SplitVar *v, char *word)
Definition: spell.c:2324
static char * findchar(char *str, int c)
Definition: spell.c:228
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:200
static char ** NormalizeSubWord(IspellDict *Conf, char *word, int flag)
Definition: spell.c:2139
int lenAffixData
Definition: spell.h:191
int mspell
Definition: spell.h:220
char * pnstrdup(const char *in, Size len)
Definition: mcxt.c:1087
static int find(struct vars *, struct cnfa *, struct colormap *)
Definition: regexec.c:375
static int32 next
Definition: blutils.c:210
bool useFlagAliases
Definition: spell.h:193
uint32 length
Definition: spell.h:135
uint32 flagflags
Definition: spell.h:91
bool usecompound
Definition: spell.h:197
struct AffixNode * node
Definition: spell.h:130
AffixNodeData data[FLEXIBLE_ARRAY_MEMBER]
Definition: spell.h:137
AffixNode * Prefix
Definition: spell.h:186
size_t avail
Definition: spell.h:224
AffixNode * Suffix
Definition: spell.h:185
static int getCompoundAffixFlagValue(IspellDict *Conf, char *s)
Definition: spell.c:1114
uint32 compoundflag
Definition: spell.h:29
SPNodeData data[FLEXIBLE_ARRAY_MEMBER]
Definition: spell.h:53
static bool IsAffixFlagInUse(IspellDict *Conf, int affix, char *affixflag)
Definition: spell.c:453
AFFIX ** aff
Definition: spell.h:129
uint32 type
Definition: spell.h:91
#define tmpalloc(sz)
Definition: spell.c:78
Definition: spell.h:154
char * pstrdup(const char *in)
Definition: mcxt.c:1076
struct SplitVar SplitVar
static void getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag)
Definition: spell.c:348
uint32 val
Definition: spell.h:29
static int MergeAffix(IspellDict *Conf, int a1, int a2)
Definition: spell.c:1555
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:109
unsigned char uint8
Definition: c.h:256
MemoryContext CurTransactionContext
Definition: mcxt.c:49
#define PAE_INFIND
Definition: spell.c:765
static SPNode * mkSPNode(IspellDict *Conf, int low, int high, int level)
Definition: spell.c:1615
struct cursor * cur
Definition: ecpg.c:28
#define FF_CROSSPRODUCT
Definition: spell.h:110
int errcode(int sqlerrcode)
Definition: elog.c:575
static void NIImportOOAffixes(IspellDict *Conf, const char *filename)
Definition: spell.c:1184
#define FF_COMPOUNDLAST
Definition: spell.h:45
#define FF_COMPOUNDMIDDLE
Definition: spell.h:44
char * lowerstr(const char *str)
Definition: ts_locale.c:228
uint32 isregis
Definition: spell.h:91
static void addNorm(TSLexeme **lres, TSLexeme **lcur, char *word, int flags, uint16 NVariant)
Definition: spell.c:2484
#define ANHRDSZ
Definition: spell.h:140
static void NIAddSpell(IspellDict *Conf, const char *word, const char *flag)
Definition: spell.c:483
#define PAE_WAIT_TYPE
Definition: spell.c:768
uint32 length
Definition: spell.h:52
uint32 isword
Definition: spell.h:29
int pg_regcomp(regex_t *re, const chr *string, size_t len, int flags, Oid collation)
Definition: regcomp.c:314
#define SPNHDRSZ
Definition: spell.h:56
#define PAE_WAIT_FLAG
Definition: spell.c:769
#define MAX_NORM
Definition: spell.c:186
#define FLAGNUM_MAXSIZE
Definition: spell.h:177
struct SPNode * node
Definition: spell.h:35
void NIImportAffixes(IspellDict *Conf, const char *filename)
Definition: spell.c:1408
#define FF_COMPOUNDBEGIN
Definition: spell.h:43
signed int int32
Definition: c.h:246
static int FindWord(IspellDict *Conf, const char *word, char *affixflag, int flag)
Definition: spell.c:599
static int strbncmp(const unsigned char *s1, const unsigned char *s2, size_t count)
Definition: spell.c:279
static bool isAffixInUse(IspellDict *Conf, char *affixflag)
Definition: spell.c:1926
Regis regis
Definition: spell.h:101
SPNode * Dictionary
Definition: spell.h:188
int t_isdigit(const char *ptr)
Definition: ts_locale.c:25
uint32 value
Definition: spell.h:174
#define MAXNORMLEN
Definition: spell.c:187
unsigned short uint16
Definition: c.h:257
void pfree(void *pointer)
Definition: mcxt.c:949
#define GETCHAR(A, N, T)
Definition: spell.c:191
#define cpalloc0(size)
Definition: spell.c:158
#define GETWCHAR(W, L, N, T)
Definition: spell.c:190
int nspell
Definition: spell.h:219
#define ERROR
Definition: elog.h:43
char * s1
int len
Definition: spell.h:145
int nstem
Definition: spell.c:2250
static int CheckCompoundAffixes(CMPDAffix **ptr, char *word, int len, bool CheckInPlace)
Definition: spell.c:2257
static bool parse_affentry(char *str, char *mask, char *find, char *repl)
Definition: spell.c:904
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:165
#define FF_SUFFIX
Definition: spell.h:116
union aff_struct::@112 reg
char ** AffixData
Definition: spell.h:190
int t_isspace(const char *ptr)
Definition: ts_locale.c:41
char * c
static char * buf
Definition: pg_test_fsync.c:67
static void mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix)
Definition: spell.c:1872
bool RS_execute(Regis *r, char *str)
Definition: regis.c:213
static void prefixes(struct vars *v)
Definition: regc_lex.c:99
uint32 val
Definition: spell.h:127
Definition: spell.h:156
#define DEFAULT_COLLATION_OID
Definition: pg_collation.h:75
#define t_iseq(x, c)
Definition: ts_locale.h:45
static void NIAddAffix(IspellDict *Conf, const char *flag, char flagflags, const char *mask, const char *find, const char *repl, int type)
Definition: spell.c:674
char * flag(int b)
Definition: test-ctype.c:33
uint32 issimple
Definition: spell.h:91
static SplitVar * SplitToVariants(IspellDict *Conf, SPNode *snode, SplitVar *orig, char *word, int wordlen, int startpos, int minpos)
Definition: spell.c:2337
AFFIX * Affix
Definition: spell.h:183
struct aff_struct AFFIX
Definition: spell.h:50
unsigned int uint32
Definition: c.h:258
union CompoundAffixFlag::@113 flag
size_t pg_regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
Definition: regerror.c:60
static uint32 makeCompoundFlags(IspellDict *Conf, int affix)
Definition: spell.c:1599
#define REG_ADVANCED
Definition: regex.h:103
void NIStartBuild(IspellDict *Conf)
Definition: spell.c:87
#define ereport(elevel, rest)
Definition: elog.h:122
unsigned int pg_wchar
Definition: mbprint.c:31
char word[FLEXIBLE_ARRAY_MEMBER]
Definition: spell.h:79
char * find
Definition: spell.h:96
static int cmpcmdflag(const void *f1, const void *f2)
Definition: spell.c:209
#define FF_COMPOUNDFLAGMASK
Definition: spell.h:48
static void setCompoundAffixFlagValue(IspellDict *Conf, CompoundAffixFlag *entry, char *s, uint32 val)
Definition: spell.c:1022
MemoryContext AllocSetContextCreate(MemoryContext parent, const char *name, Size minContextSize, Size initBlockSize, Size maxBlockSize)
Definition: aset.c:322
void * palloc0(Size size)
Definition: mcxt.c:877
char * s2
char * flag
Definition: spell.h:69
void NIImportDictionary(IspellDict *Conf, const char *filename)
Definition: spell.c:514
void RS_compile(Regis *r, bool issuffix, const char *str)
Definition: regis.c:85
#define COMPACT_ALLOC_CHUNK
Definition: spell.c:125
void NIFinishBuild(IspellDict *Conf)
Definition: spell.c:102
static char * lowerstr_ctx(IspellDict *Conf, const char *src)
Definition: spell.c:174
#define PAE_INMASK
Definition: spell.c:763
static AffixNode * mkANode(IspellDict *Conf, int low, int high, int level, int type)
Definition: spell.c:1795
static bool get_nextfield(char **str, char *next)
Definition: spell.c:782
char * affix
Definition: spell.h:144
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
Definition: mbutils.c:723
FlagMode flagMode
Definition: spell.h:173
struct CompoundAffixFlag CompoundAffixFlag
static FormData_pg_attribute a1
Definition: heap.c:144
void NISortAffixes(IspellDict *Conf)
Definition: spell.c:1941
static char * CheckAffix(const char *word, size_t len, AFFIX *Affix, int flagflags, char *newword, int *baselen)
Definition: spell.c:2034
#define STRNCMP(s, p)
Definition: spell.c:189
#define Assert(condition)
Definition: c.h:681
regex_t regex
Definition: spell.h:100
char ** stem
Definition: spell.c:2252
Definition: regguts.h:298
#define SPELLHDRSZ
Definition: spell.h:82
static char * findchar2(char *str, int c1, int c2)
Definition: spell.c:241
CompoundAffixFlag * CompoundAffixFlags
Definition: spell.h:205
bool issuffix
Definition: spell.h:146
#define FF_PREFIX
Definition: spell.h:117
static void addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val)
Definition: spell.c:1057
#define MAXALIGN(LEN)
Definition: c.h:576
static XLogRecPtr startpos
char * firstfree
Definition: spell.h:223
#define REG_NOSUB
Definition: regex.h:107
int pg_mblen(const char *mbstr)
Definition: mbutils.c:760
#define cpalloc(size)
Definition: spell.c:157
#define PAE_WAIT_FIND
Definition: spell.c:764
#define COMPACT_MAX_REQ
Definition: spell.c:126
int mCompoundAffixFlag
Definition: spell.h:209
void tsearch_readline_end(tsearch_readline_state *stp)
Definition: ts_locale.c:150
SPELL ** Spell
Definition: spell.h:218
char * tsearch_readline(tsearch_readline_state *stp)
Definition: ts_locale.c:135
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:962
CMPDAffix * CompoundAffix
Definition: spell.h:195
bool tsearch_readline_begin(tsearch_readline_state *stp, const char *filename)
Definition: ts_locale.c:113
static void word(struct vars *, int, struct state *, struct state *)
Definition: regcomp.c:1243
Definition: spell.h:155
int lenstem
Definition: spell.c:2251
#define FF_COMPOUNDFORBIDFLAG
Definition: spell.h:109
static char * filename
Definition: pg_dumpall.c:90
int pg_regexec(regex_t *re, const chr *string, size_t len, size_t search_start, rm_detail_t *details, size_t nmatch, regmatch_t pmatch[], int flags)
Definition: regexec.c:172
void * palloc(Size size)
Definition: mcxt.c:848
int errmsg(const char *fmt,...)
Definition: elog.c:797
TSLexeme * NINormalizeWord(IspellDict *Conf, char *word)
Definition: spell.c:2500
bool RS_isRegis(const char *str)
Definition: regis.c:31
static int addToResult(char **forms, char **cur, char *word)
Definition: spell.c:2124
static char * VoidString
Definition: spell.c:193
static char * cpstrdup(IspellDict *Conf, const char *str)
Definition: spell.c:161
uint32 naff
Definition: spell.h:127
#define PAE_WAIT_MASK
Definition: spell.c:762
static int cmpspellaffix(const void *s1, const void *s2)
Definition: spell.c:202
int i
void NISortDictionary(IspellDict *Conf)
Definition: spell.c:1697
char * repl
Definition: spell.h:97
static int strbcmp(const unsigned char *s1, const unsigned char *s2)
Definition: spell.c:256
#define elog
Definition: elog.h:219
int t_isalpha(const char *ptr)
Definition: ts_locale.c:57
static void * compact_palloc0(IspellDict *Conf, size_t size)
Definition: spell.c:129
#define FF_COMPOUNDFLAG
Definition: spell.h:46
uint32 isvoid
Definition: spell.h:135
int maffixes
Definition: spell.h:181
#define qsort(a, b, c, d)
Definition: port.h:447
static int parse_ooaffentry(char *str, char *type, char *flag, char *find, char *repl, char *mask)
Definition: spell.c:848
#define PAE_WAIT_REPL
Definition: spell.c:766
static char * getAffixFlagSet(IspellDict *Conf, char *s)
Definition: spell.c:1150
uint32 replen
Definition: spell.h:91
int nCompoundAffixFlag
Definition: spell.h:207
#define FF_COMPOUNDPERMITFLAG
Definition: spell.h:108
static SplitVar * CopyVar(SplitVar *s, int makedup)
Definition: spell.c:2299
static AffixNodeData * FindAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
Definition: spell.c:1991
long val
Definition: informix.c:689
union spell_struct::@110 p
struct SplitVar * next
Definition: spell.c:2253
uint32 affix
Definition: spell.h:29
FlagMode flagMode
Definition: spell.h:198
static int cmpaffix(const void *s1, const void *s2)
Definition: spell.c:310
unsigned char symbol
Definition: api.h:2
static FormData_pg_attribute a2
Definition: heap.c:150