PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
spell.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * spell.c
4 * Normalizing word with ISpell
5 *
6 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7 *
8 * Ispell dictionary
9 * -----------------
10 *
11 * Rules of dictionaries are defined in two files with .affix and .dict
12 * extensions. They are used by spell checker programs Ispell and Hunspell.
13 *
14 * An .affix file declares morphological rules to get a basic form of words.
15 * The format of an .affix file has different structure for Ispell and Hunspell
16 * dictionaries. The Hunspell format is more complicated. But when an .affix
17 * file is imported and compiled, it is stored in the same structure AffixNode.
18 *
19 * A .dict file stores a list of basic forms of words with references to
20 * affix rules. The format of a .dict file has the same structure for Ispell
21 * and Hunspell dictionaries.
22 *
23 * Compilation of a dictionary
24 * ---------------------------
25 *
26 * A compiled dictionary is stored in the IspellDict structure. Compilation of
27 * a dictionary is divided into several steps:
28 * - NIImportDictionary() - stores each word of a .dict file in the
29 * temporary Spell field.
30 * - NIImportAffixes() - stores affix rules of an .affix file in the
31 * Affix field (not temporary) if an .affix file has the Ispell format.
32 * -> NIImportOOAffixes() - stores affix rules if an .affix file has the
33 * Hunspell format. The AffixData field is initialized if AF parameter
34 * is defined.
35 * - NISortDictionary() - builds a prefix tree (Trie) from the words list
36 * and stores it in the Dictionary field. The words list is got from the
37 * Spell field. The AffixData field is initialized if AF parameter is not
38 * defined.
39 * - NISortAffixes():
40 * - builds a list of compound affixes from the affix list and stores it
41 * in the CompoundAffix.
42 * - builds prefix trees (Trie) from the affix list for prefixes and suffixes
43 * and stores them in Suffix and Prefix fields.
44 * The affix list is got from the Affix field.
45 *
46 * Memory management
47 * -----------------
48 *
49 * The IspellDict structure has the Spell field which is used only in compile
50 * time. The Spell field stores a words list. It can take a lot of memory.
51 * Therefore when a dictionary is compiled this field is cleared by
52 * NIFinishBuild().
53 *
54 * All resources that should be cleared by NIFinishBuild() are initialized using
55 * tmpalloc() and tmpalloc0().
56 *
57 * IDENTIFICATION
58 * src/backend/tsearch/spell.c
59 *
60 *-------------------------------------------------------------------------
61 */
62
63#include "postgres.h"
64
66#include "miscadmin.h"
67#include "tsearch/dicts/spell.h"
68#include "tsearch/ts_locale.h"
69#include "utils/formatting.h"
70#include "utils/memutils.h"
71
72
73/*
74 * Initialization requires a lot of memory that's not needed
75 * after the initialization is done. During initialization,
76 * CurrentMemoryContext is the long-lived memory context associated
77 * with the dictionary cache entry. We keep the short-lived stuff
78 * in the Conf->buildCxt context.
79 */
80#define tmpalloc(sz) MemoryContextAlloc(Conf->buildCxt, (sz))
81#define tmpalloc0(sz) MemoryContextAllocZero(Conf->buildCxt, (sz))
82
83/*
84 * Prepare for constructing an ISpell dictionary.
85 *
86 * The IspellDict struct is assumed to be zeroed when allocated.
87 */
88void
90{
91 /*
92 * The temp context is a child of CurTransactionContext, so that it will
93 * go away automatically on error.
94 */
96 "Ispell dictionary init context",
98}
99
100/*
101 * Clean up when dictionary construction is complete.
102 */
103void
105{
106 /* Release no-longer-needed temp memory */
108 /* Just for cleanliness, zero the now-dangling pointers */
109 Conf->buildCxt = NULL;
110 Conf->Spell = NULL;
111 Conf->firstfree = NULL;
112 Conf->CompoundAffixFlags = NULL;
113}
114
115
116/*
117 * "Compact" palloc: allocate without extra palloc overhead.
118 *
119 * Since we have no need to free the ispell data items individually, there's
120 * not much value in the per-chunk overhead normally consumed by palloc.
121 * Getting rid of it is helpful since ispell can allocate a lot of small nodes.
122 *
123 * We currently pre-zero all data allocated this way, even though some of it
124 * doesn't need that. The cpalloc and cpalloc0 macros are just documentation
125 * to indicate which allocations actually require zeroing.
126 */
127#define COMPACT_ALLOC_CHUNK 8192 /* amount to get from palloc at once */
128#define COMPACT_MAX_REQ 1024 /* must be < COMPACT_ALLOC_CHUNK */
129
130static void *
132{
133 void *result;
134
135 /* Should only be called during init */
136 Assert(Conf->buildCxt != NULL);
137
138 /* No point in this for large chunks */
139 if (size > COMPACT_MAX_REQ)
140 return palloc0(size);
141
142 /* Keep everything maxaligned */
143 size = MAXALIGN(size);
144
145 /* Need more space? */
146 if (size > Conf->avail)
147 {
150 }
151
152 result = Conf->firstfree;
153 Conf->firstfree += size;
154 Conf->avail -= size;
155
156 return result;
157}
158
159#define cpalloc(size) compact_palloc0(Conf, size)
160#define cpalloc0(size) compact_palloc0(Conf, size)
161
162static char *
163cpstrdup(IspellDict *Conf, const char *str)
164{
165 char *res = cpalloc(strlen(str) + 1);
166
167 strcpy(res, str);
168 return res;
169}
170
171
172/*
173 * Apply str_tolower(), producing a temporary result (in the buildCxt).
174 */
175static char *
176lowerstr_ctx(IspellDict *Conf, const char *src)
177{
178 MemoryContext saveCtx;
179 char *dst;
180
181 saveCtx = MemoryContextSwitchTo(Conf->buildCxt);
182 dst = str_tolower(src, strlen(src), DEFAULT_COLLATION_OID);
183 MemoryContextSwitchTo(saveCtx);
184
185 return dst;
186}
187
188#define MAX_NORM 1024
189#define MAXNORMLEN 256
190
191#define STRNCMP(s,p) strncmp( (s), (p), strlen(p) )
192#define GETWCHAR(W,L,N,T) ( ((const uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] )
193#define GETCHAR(A,N,T) GETWCHAR( (A)->repl, (A)->replen, N, T )
194
195static const char *VoidString = "";
196
197static int
198cmpspell(const void *s1, const void *s2)
199{
200 return strcmp((*(SPELL *const *) s1)->word, (*(SPELL *const *) s2)->word);
201}
202
203static int
204cmpspellaffix(const void *s1, const void *s2)
205{
206 return strcmp((*(SPELL *const *) s1)->p.flag,
207 (*(SPELL *const *) s2)->p.flag);
208}
209
210static int
211cmpcmdflag(const void *f1, const void *f2)
212{
214 *fv2 = (CompoundAffixFlag *) f2;
215
216 Assert(fv1->flagMode == fv2->flagMode);
217
218 if (fv1->flagMode == FM_NUM)
219 {
220 if (fv1->flag.i == fv2->flag.i)
221 return 0;
222
223 return (fv1->flag.i > fv2->flag.i) ? 1 : -1;
224 }
225
226 return strcmp(fv1->flag.s, fv2->flag.s);
227}
228
229static char *
230findchar(char *str, int c)
231{
232 while (*str)
233 {
234 if (t_iseq(str, c))
235 return str;
236 str += pg_mblen(str);
237 }
238
239 return NULL;
240}
241
242static char *
243findchar2(char *str, int c1, int c2)
244{
245 while (*str)
246 {
247 if (t_iseq(str, c1) || t_iseq(str, c2))
248 return str;
249 str += pg_mblen(str);
250 }
251
252 return NULL;
253}
254
255
/*
 * Backward string compare for suffix tree operations.
 *
 * The two strings are compared byte by byte starting from their last
 * characters.  If one string is a proper tail of the other, the shorter one
 * sorts first.  Returns -1, 0 or 1.
 */
static int
strbcmp(const unsigned char *s1, const unsigned char *s2)
{
	int			i1 = strlen((const char *) s1) - 1;
	int			i2 = strlen((const char *) s2) - 1;

	for (; i1 >= 0 && i2 >= 0; i1--, i2--)
	{
		if (s1[i1] != s2[i2])
			return (s1[i1] < s2[i2]) ? -1 : 1;
	}

	/* Common tail exhausted: whichever string has bytes left is greater */
	if (i1 != i2)
		return (i1 < i2) ? -1 : 1;

	return 0;
}
279
/*
 * Backward string compare limited to at most count trailing bytes.
 *
 * Works like strbcmp(), but reports equality as soon as count bytes
 * (counted from the end of both strings) have matched.  Returns -1, 0 or 1.
 */
static int
strbncmp(const unsigned char *s1, const unsigned char *s2, size_t count)
{
	int			i1 = strlen((const char *) s1) - 1;
	int			i2 = strlen((const char *) s2) - 1;
	int			remaining = count;

	for (; i1 >= 0 && i2 >= 0 && remaining > 0; i1--, i2--, remaining--)
	{
		if (s1[i1] != s2[i2])
			return (s1[i1] < s2[i2]) ? -1 : 1;
	}

	/* All requested bytes matched */
	if (remaining == 0)
		return 0;

	/* One string ran out first: the shorter one sorts first */
	if (i1 != i2)
		return (i1 < i2) ? -1 : 1;

	return 0;
}
305
306/*
307 * Compares affixes.
308 * First compares the type of an affix. Prefixes should go before affixes.
309 * If types are equal then compares replaceable string.
310 */
311static int
312cmpaffix(const void *s1, const void *s2)
313{
314 const AFFIX *a1 = (const AFFIX *) s1;
315 const AFFIX *a2 = (const AFFIX *) s2;
316
317 if (a1->type < a2->type)
318 return -1;
319 if (a1->type > a2->type)
320 return 1;
321 if (a1->type == FF_PREFIX)
322 return strcmp(a1->repl, a2->repl);
323 else
324 return strbcmp((const unsigned char *) a1->repl,
325 (const unsigned char *) a2->repl);
326}
327
328/*
329 * Gets an affix flag from the set of affix flags (sflagset).
330 *
331 * Several flags can be stored in a single string. Flags can be represented by:
332 * - 1 character (FM_CHAR). A character may be Unicode.
333 * - 2 characters (FM_LONG). A character may be Unicode.
334 * - numbers from 1 to 65000 (FM_NUM).
335 *
336 * Depending on the flagMode an affix string can have the following format:
337 * - FM_CHAR: ABCD
338 * Here we have 4 flags: A, B, C and D
339 * - FM_LONG: ABCDE*
340 * Here we have 3 flags: AB, CD and E*
341 * - FM_NUM: 200,205,50
342 * Here we have 3 flags: 200, 205 and 50
343 *
344 * Conf: current dictionary.
345 * sflagset: the set of affix flags. Returns a reference to the start of a next
346 * affix flag.
347 * sflag: returns an affix flag from sflagset.
348 */
349static void
350getNextFlagFromString(IspellDict *Conf, const char **sflagset, char *sflag)
351{
352 int32 s;
353 char *next;
354 const char *sbuf = *sflagset;
355 int maxstep;
356 bool stop = false;
357 bool met_comma = false;
358
359 maxstep = (Conf->flagMode == FM_LONG) ? 2 : 1;
360
361 while (**sflagset)
362 {
363 switch (Conf->flagMode)
364 {
365 case FM_LONG:
366 case FM_CHAR:
367 COPYCHAR(sflag, *sflagset);
368 sflag += pg_mblen(*sflagset);
369
370 /* Go to start of the next flag */
371 *sflagset += pg_mblen(*sflagset);
372
373 /* Check if we get all characters of flag */
374 maxstep--;
375 stop = (maxstep == 0);
376 break;
377 case FM_NUM:
378 s = strtol(*sflagset, &next, 10);
379 if (*sflagset == next || errno == ERANGE)
381 (errcode(ERRCODE_CONFIG_FILE_ERROR),
382 errmsg("invalid affix flag \"%s\"", *sflagset)));
383 if (s < 0 || s > FLAGNUM_MAXSIZE)
385 (errcode(ERRCODE_CONFIG_FILE_ERROR),
386 errmsg("affix flag \"%s\" is out of range",
387 *sflagset)));
388 sflag += sprintf(sflag, "%0d", s);
389
390 /* Go to start of the next flag */
391 *sflagset = next;
392 while (**sflagset)
393 {
394 if (isdigit((unsigned char) **sflagset))
395 {
396 if (!met_comma)
398 (errcode(ERRCODE_CONFIG_FILE_ERROR),
399 errmsg("invalid affix flag \"%s\"",
400 *sflagset)));
401 break;
402 }
403 else if (t_iseq(*sflagset, ','))
404 {
405 if (met_comma)
407 (errcode(ERRCODE_CONFIG_FILE_ERROR),
408 errmsg("invalid affix flag \"%s\"",
409 *sflagset)));
410 met_comma = true;
411 }
412 else if (!isspace((unsigned char) **sflagset))
413 {
415 (errcode(ERRCODE_CONFIG_FILE_ERROR),
416 errmsg("invalid character in affix flag \"%s\"",
417 *sflagset)));
418 }
419
420 *sflagset += pg_mblen(*sflagset);
421 }
422 stop = true;
423 break;
424 default:
425 elog(ERROR, "unrecognized type of Conf->flagMode: %d",
426 Conf->flagMode);
427 }
428
429 if (stop)
430 break;
431 }
432
433 if (Conf->flagMode == FM_LONG && maxstep > 0)
435 (errcode(ERRCODE_CONFIG_FILE_ERROR),
436 errmsg("invalid affix flag \"%s\" with \"long\" flag value",
437 sbuf)));
438
439 *sflag = '\0';
440}
441
442/*
443 * Checks if the affix set Conf->AffixData[affix] contains affixflag.
444 * Conf->AffixData[affix] does not contain affixflag if this flag is not used
445 * actually by the .dict file.
446 *
447 * Conf: current dictionary.
448 * affix: index of the Conf->AffixData array.
449 * affixflag: the affix flag.
450 *
451 * Returns true if the string Conf->AffixData[affix] contains affixflag,
452 * otherwise returns false.
453 */
454static bool
455IsAffixFlagInUse(IspellDict *Conf, int affix, const char *affixflag)
456{
457 const char *flagcur;
458 char flag[BUFSIZ];
459
460 if (*affixflag == 0)
461 return true;
462
463 Assert(affix < Conf->nAffixData);
464
465 flagcur = Conf->AffixData[affix];
466
467 while (*flagcur)
468 {
469 getNextFlagFromString(Conf, &flagcur, flag);
470 /* Compare first affix flag in flagcur with affixflag */
471 if (strcmp(flag, affixflag) == 0)
472 return true;
473 }
474
475 /* Could not find affixflag */
476 return false;
477}
478
479/*
480 * Adds the new word into the temporary array Spell.
481 *
482 * Conf: current dictionary.
483 * word: new word.
484 * flag: set of affix flags. Single flag can be get by getNextFlagFromString().
485 */
486static void
487NIAddSpell(IspellDict *Conf, const char *word, const char *flag)
488{
489 if (Conf->nspell >= Conf->mspell)
490 {
491 if (Conf->mspell)
492 {
493 Conf->mspell *= 2;
494 Conf->Spell = (SPELL **) repalloc(Conf->Spell, Conf->mspell * sizeof(SPELL *));
495 }
496 else
497 {
498 Conf->mspell = 1024 * 20;
499 Conf->Spell = (SPELL **) tmpalloc(Conf->mspell * sizeof(SPELL *));
500 }
501 }
502 Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
503 strcpy(Conf->Spell[Conf->nspell]->word, word);
504 Conf->Spell[Conf->nspell]->p.flag = (*flag != '\0')
505 ? cpstrdup(Conf, flag) : VoidString;
506 Conf->nspell++;
507}
508
509/*
510 * Imports dictionary into the temporary array Spell.
511 *
512 * Note caller must already have applied get_tsearch_config_filename.
513 *
514 * Conf: current dictionary.
515 * filename: path to the .dict file.
516 */
517void
519{
521 char *line;
522
523 if (!tsearch_readline_begin(&trst, filename))
525 (errcode(ERRCODE_CONFIG_FILE_ERROR),
526 errmsg("could not open dictionary file \"%s\": %m",
527 filename)));
528
529 while ((line = tsearch_readline(&trst)) != NULL)
530 {
531 char *s,
532 *pstr;
533
534 /* Set of affix flags */
535 const char *flag;
536
537 /* Extract flag from the line */
538 flag = NULL;
539 if ((s = findchar(line, '/')))
540 {
541 *s++ = '\0';
542 flag = s;
543 while (*s)
544 {
545 /* we allow only single encoded flags for faster works */
546 if (pg_mblen(s) == 1 && isprint((unsigned char) *s) && !isspace((unsigned char) *s))
547 s++;
548 else
549 {
550 *s = '\0';
551 break;
552 }
553 }
554 }
555 else
556 flag = "";
557
558 /* Remove trailing spaces */
559 s = line;
560 while (*s)
561 {
562 if (isspace((unsigned char) *s))
563 {
564 *s = '\0';
565 break;
566 }
567 s += pg_mblen(s);
568 }
569 pstr = lowerstr_ctx(Conf, line);
570
571 NIAddSpell(Conf, pstr, flag);
572 pfree(pstr);
573
574 pfree(line);
575 }
577}
578
579/*
580 * Searches a basic form of word in the prefix tree. This word was generated
581 * using an affix rule. This rule may not be presented in an affix set of
582 * a basic form of word.
583 *
584 * For example, we have the entry in the .dict file:
585 * meter/GMD
586 *
587 * The affix rule with the flag S:
588 * SFX S y ies [^aeiou]y
589 * is not presented here.
590 *
591 * The affix rule with the flag M:
592 * SFX M 0 's .
593 * is presented here.
594 *
595 * Conf: current dictionary.
596 * word: basic form of word.
597 * affixflag: affix flag, by which a basic form of word was generated.
598 * flag: compound flag used to compare with StopMiddle->compoundflag.
599 *
600 * Returns 1 if the word was found in the prefix tree, else returns 0.
601 */
602static int
603FindWord(IspellDict *Conf, const char *word, const char *affixflag, int flag)
604{
605 SPNode *node = Conf->Dictionary;
606 SPNodeData *StopLow,
607 *StopHigh,
608 *StopMiddle;
609 const uint8 *ptr = (const uint8 *) word;
610
612
613 while (node && *ptr)
614 {
615 StopLow = node->data;
616 StopHigh = node->data + node->length;
617 while (StopLow < StopHigh)
618 {
619 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
620 if (StopMiddle->val == *ptr)
621 {
622 if (*(ptr + 1) == '\0' && StopMiddle->isword)
623 {
624 if (flag == 0)
625 {
626 /*
627 * The word can be formed only with another word. And
628 * in the flag parameter there is not a sign that we
629 * search compound words.
630 */
631 if (StopMiddle->compoundflag & FF_COMPOUNDONLY)
632 return 0;
633 }
634 else if ((flag & StopMiddle->compoundflag) == 0)
635 return 0;
636
637 /*
638 * Check if this affix rule is presented in the affix set
639 * with index StopMiddle->affix.
640 */
641 if (IsAffixFlagInUse(Conf, StopMiddle->affix, affixflag))
642 return 1;
643 }
644 node = StopMiddle->node;
645 ptr++;
646 break;
647 }
648 else if (StopMiddle->val < *ptr)
649 StopLow = StopMiddle + 1;
650 else
651 StopHigh = StopMiddle;
652 }
653 if (StopLow >= StopHigh)
654 break;
655 }
656 return 0;
657}
658
659/*
660 * Adds a new affix rule to the Affix field.
661 *
662 * Conf: current dictionary.
663 * flag: affix flag ('\' in the below example).
664 * flagflags: set of flags from the flagval field for this affix rule. This set
665 * is listed after '/' character in the added string (repl).
666 *
667 * For example L flag in the hunspell_sample.affix:
668 * SFX \ 0 Y/L [^Y]
669 *
670 * mask: condition for search ('[^Y]' in the above example).
671 * find: stripping characters from beginning (at prefix) or end (at suffix)
672 * of the word ('0' in the above example, 0 means that there is not
673 * stripping character).
674 * repl: adding string after stripping ('Y' in the above example).
675 * type: FF_SUFFIX or FF_PREFIX.
676 */
677static void
678NIAddAffix(IspellDict *Conf, const char *flag, char flagflags, const char *mask,
679 const char *find, const char *repl, int type)
680{
681 AFFIX *Affix;
682
683 if (Conf->naffixes >= Conf->maffixes)
684 {
685 if (Conf->maffixes)
686 {
687 Conf->maffixes *= 2;
688 Conf->Affix = (AFFIX *) repalloc(Conf->Affix, Conf->maffixes * sizeof(AFFIX));
689 }
690 else
691 {
692 Conf->maffixes = 16;
693 Conf->Affix = (AFFIX *) palloc(Conf->maffixes * sizeof(AFFIX));
694 }
695 }
696
697 Affix = Conf->Affix + Conf->naffixes;
698
699 /* This affix rule can be applied for words with any ending */
700 if (strcmp(mask, ".") == 0 || *mask == '\0')
701 {
702 Affix->issimple = 1;
703 Affix->isregis = 0;
704 }
705 /* This affix rule will use regis to search word ending */
706 else if (RS_isRegis(mask))
707 {
708 Affix->issimple = 0;
709 Affix->isregis = 1;
710 RS_compile(&(Affix->reg.regis), (type == FF_SUFFIX),
711 *mask ? mask : VoidString);
712 }
713 /* This affix rule will use regex_t to search word ending */
714 else
715 {
716 int masklen;
717 int wmasklen;
718 int err;
719 pg_wchar *wmask;
720 char *tmask;
721
722 Affix->issimple = 0;
723 Affix->isregis = 0;
724 tmask = (char *) tmpalloc(strlen(mask) + 3);
725 if (type == FF_SUFFIX)
726 sprintf(tmask, "%s$", mask);
727 else
728 sprintf(tmask, "^%s", mask);
729
730 masklen = strlen(tmask);
731 wmask = (pg_wchar *) tmpalloc((masklen + 1) * sizeof(pg_wchar));
732 wmasklen = pg_mb2wchar_with_len(tmask, wmask, masklen);
733
734 /*
735 * The regex and all internal state created by pg_regcomp are
736 * allocated in the dictionary's memory context, and will be freed
737 * automatically when it is destroyed.
738 */
739 Affix->reg.pregex = palloc(sizeof(regex_t));
740 err = pg_regcomp(Affix->reg.pregex, wmask, wmasklen,
742 DEFAULT_COLLATION_OID);
743 if (err)
744 {
745 char errstr[100];
746
747 pg_regerror(err, Affix->reg.pregex, errstr, sizeof(errstr));
749 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
750 errmsg("invalid regular expression: %s", errstr)));
751 }
752 }
753
754 Affix->flagflags = flagflags;
755 if ((Affix->flagflags & FF_COMPOUNDONLY) || (Affix->flagflags & FF_COMPOUNDPERMITFLAG))
756 {
757 if ((Affix->flagflags & FF_COMPOUNDFLAG) == 0)
758 Affix->flagflags |= FF_COMPOUNDFLAG;
759 }
760 Affix->flag = cpstrdup(Conf, flag);
761 Affix->type = type;
762
763 Affix->find = (find && *find) ? cpstrdup(Conf, find) : VoidString;
764 if ((Affix->replen = strlen(repl)) > 0)
765 Affix->repl = cpstrdup(Conf, repl);
766 else
767 Affix->repl = VoidString;
768 Conf->naffixes++;
769}
770
771/* Parsing states for parse_affentry() and friends */
772#define PAE_WAIT_MASK 0
773#define PAE_INMASK 1
774#define PAE_WAIT_FIND 2
775#define PAE_INFIND 3
776#define PAE_WAIT_REPL 4
777#define PAE_INREPL 5
778#define PAE_WAIT_TYPE 6
779#define PAE_WAIT_FLAG 7
780
/*
 * Parse the next whitespace-separated field of an .affix file line.
 *
 * *str is the input cursor; it is advanced as characters are consumed and
 * is left on the whitespace that ended the field (or on the terminator).
 * next is where the field value is copied to, with null termination.
 *
 * The buffer at "next" must be of size BUFSIZ; input that does not fit is
 * dropped (truncated) rather than copied.
 *
 * Returns true if a field was found.  Returns false if the line ended, or a
 * '#' was met, before any field character was seen.
 */
static bool
get_nextfield(char **str, char *next)
{
	bool		infield = false;	/* false: still skipping leading space */
	int			room = BUFSIZ;

	for (; **str != '\0'; *str += pg_mblen(*str))
	{
		bool		is_space = (isspace((unsigned char) **str) != 0);
		int			clen;

		if (!infield)
		{
			/* A comment starts before any field: nothing to return */
			if (t_iseq(*str, '#'))
				return false;
			if (is_space)
				continue;
			infield = true;
		}
		else if (is_space)
		{
			/* Whitespace terminates the field */
			*next = '\0';
			return true;
		}

		/* Copy the character, unless doing so would overrun the buffer */
		clen = pg_mblen(*str);
		if (clen < room)
		{
			COPYCHAR(next, *str);
			next += clen;
			room -= clen;
		}
	}

	*next = '\0';

	return infield;				/* OK if we got a nonempty field */
}
842
/*
 * Parses an entry of an .affix file in MySpell or Hunspell format.
 *
 * An .affix file entry has the following format:
 * - header
 *	 <type> <flag> <cross_flag> <flag_count>
 * - fields after header:
 *	 <type> <flag> <find> <replace> <mask>
 *
 * str is the input line.
 * Field values are returned in type, flag, find, repl and mask, which must
 * each be buffers of size BUFSIZ.
 *
 * Returns the number of fields found; any omitted fields are set to empty
 * strings.
 */
static int
parse_ooaffentry(char *str, char *type, char *flag, char *find,
				 char *repl, char *mask)
{
	/* Destination buffers, in the order the fields appear on the line */
	char	   *targets[5];
	const int	ntargets = 5;
	int			fields_read = 0;

	targets[0] = type;
	targets[1] = flag;
	targets[2] = find;
	targets[3] = repl;
	targets[4] = mask;

	*type = *flag = *find = *repl = *mask = '\0';

	/* Stop at end of line, after the last field, or at a missing field */
	while (*str && fields_read < ntargets)
	{
		if (!get_nextfield(&str, targets[fields_read]))
			break;				/* early EOL */
		fields_read++;
	}

	return fields_read;
}
906
907/*
908 * Parses entry of an .affix file of Ispell format
909 *
910 * An .affix file entry has the following format:
911 * <mask> > [-<find>,]<replace>
912 */
913static bool
914parse_affentry(char *str, char *mask, char *find, char *repl)
915{
916 int state = PAE_WAIT_MASK;
917 char *pmask = mask,
918 *pfind = find,
919 *prepl = repl;
920
921 *mask = *find = *repl = '\0';
922
923 while (*str)
924 {
925 if (state == PAE_WAIT_MASK)
926 {
927 if (t_iseq(str, '#'))
928 return false;
929 else if (!isspace((unsigned char) *str))
930 {
931 COPYCHAR(pmask, str);
932 pmask += pg_mblen(str);
934 }
935 }
936 else if (state == PAE_INMASK)
937 {
938 if (t_iseq(str, '>'))
939 {
940 *pmask = '\0';
942 }
943 else if (!isspace((unsigned char) *str))
944 {
945 COPYCHAR(pmask, str);
946 pmask += pg_mblen(str);
947 }
948 }
949 else if (state == PAE_WAIT_FIND)
950 {
951 if (t_iseq(str, '-'))
952 {
954 }
955 else if (t_isalpha(str) || t_iseq(str, '\'') /* english 's */ )
956 {
957 COPYCHAR(prepl, str);
958 prepl += pg_mblen(str);
960 }
961 else if (!isspace((unsigned char) *str))
963 (errcode(ERRCODE_CONFIG_FILE_ERROR),
964 errmsg("syntax error")));
965 }
966 else if (state == PAE_INFIND)
967 {
968 if (t_iseq(str, ','))
969 {
970 *pfind = '\0';
972 }
973 else if (t_isalpha(str))
974 {
975 COPYCHAR(pfind, str);
976 pfind += pg_mblen(str);
977 }
978 else if (!isspace((unsigned char) *str))
980 (errcode(ERRCODE_CONFIG_FILE_ERROR),
981 errmsg("syntax error")));
982 }
983 else if (state == PAE_WAIT_REPL)
984 {
985 if (t_iseq(str, '-'))
986 {
987 break; /* void repl */
988 }
989 else if (t_isalpha(str))
990 {
991 COPYCHAR(prepl, str);
992 prepl += pg_mblen(str);
994 }
995 else if (!isspace((unsigned char) *str))
997 (errcode(ERRCODE_CONFIG_FILE_ERROR),
998 errmsg("syntax error")));
999 }
1000 else if (state == PAE_INREPL)
1001 {
1002 if (t_iseq(str, '#'))
1003 {
1004 *prepl = '\0';
1005 break;
1006 }
1007 else if (t_isalpha(str))
1008 {
1009 COPYCHAR(prepl, str);
1010 prepl += pg_mblen(str);
1011 }
1012 else if (!isspace((unsigned char) *str))
1013 ereport(ERROR,
1014 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1015 errmsg("syntax error")));
1016 }
1017 else
1018 elog(ERROR, "unrecognized state in parse_affentry: %d", state);
1019
1020 str += pg_mblen(str);
1021 }
1022
1023 *pmask = *pfind = *prepl = '\0';
1024
1025 return (*mask && (*find || *repl));
1026}
1027
1028/*
1029 * Sets a Hunspell options depending on flag type.
1030 */
1031static void
1033 char *s, uint32 val)
1034{
1035 if (Conf->flagMode == FM_NUM)
1036 {
1037 char *next;
1038 int i;
1039
1040 i = strtol(s, &next, 10);
1041 if (s == next || errno == ERANGE)
1042 ereport(ERROR,
1043 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1044 errmsg("invalid affix flag \"%s\"", s)));
1045 if (i < 0 || i > FLAGNUM_MAXSIZE)
1046 ereport(ERROR,
1047 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1048 errmsg("affix flag \"%s\" is out of range", s)));
1049
1050 entry->flag.i = i;
1051 }
1052 else
1053 entry->flag.s = cpstrdup(Conf, s);
1054
1055 entry->flagMode = Conf->flagMode;
1056 entry->value = val;
1057}
1058
1059/*
1060 * Sets up a correspondence for the affix parameter with the affix flag.
1061 *
1062 * Conf: current dictionary.
1063 * s: affix flag in string.
1064 * val: affix parameter.
1065 */
1066static void
1068{
1069 CompoundAffixFlag *newValue;
1070 char sbuf[BUFSIZ];
1071 char *sflag;
1072 int clen;
1073
1074 while (*s && isspace((unsigned char) *s))
1075 s += pg_mblen(s);
1076
1077 if (!*s)
1078 ereport(ERROR,
1079 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1080 errmsg("syntax error")));
1081
1082 /* Get flag without \n */
1083 sflag = sbuf;
1084 while (*s && !isspace((unsigned char) *s) && *s != '\n')
1085 {
1086 clen = pg_mblen(s);
1087 COPYCHAR(sflag, s);
1088 sflag += clen;
1089 s += clen;
1090 }
1091 *sflag = '\0';
1092
1093 /* Resize array or allocate memory for array CompoundAffixFlag */
1094 if (Conf->nCompoundAffixFlag >= Conf->mCompoundAffixFlag)
1095 {
1096 if (Conf->mCompoundAffixFlag)
1097 {
1098 Conf->mCompoundAffixFlag *= 2;
1101 Conf->mCompoundAffixFlag * sizeof(CompoundAffixFlag));
1102 }
1103 else
1104 {
1105 Conf->mCompoundAffixFlag = 10;
1108 }
1109 }
1110
1111 newValue = Conf->CompoundAffixFlags + Conf->nCompoundAffixFlag;
1112
1113 setCompoundAffixFlagValue(Conf, newValue, sbuf, val);
1114
1115 Conf->usecompound = true;
1116 Conf->nCompoundAffixFlag++;
1117}
1118
1119/*
1120 * Returns a set of affix parameters which correspondence to the set of affix
1121 * flags s.
1122 */
1123static int
1125{
1126 uint32 flag = 0;
1127 CompoundAffixFlag *found,
1128 key;
1129 char sflag[BUFSIZ];
1130 const char *flagcur;
1131
1132 if (Conf->nCompoundAffixFlag == 0)
1133 return 0;
1134
1135 flagcur = s;
1136 while (*flagcur)
1137 {
1138 getNextFlagFromString(Conf, &flagcur, sflag);
1139 setCompoundAffixFlagValue(Conf, &key, sflag, 0);
1140
1141 found = (CompoundAffixFlag *)
1142 bsearch(&key, Conf->CompoundAffixFlags,
1144 cmpcmdflag);
1145 if (found != NULL)
1146 flag |= found->value;
1147 }
1148
1149 return flag;
1150}
1151
1152/*
1153 * Returns a flag set using the s parameter.
1154 *
1155 * If Conf->useFlagAliases is true then the s parameter is index of the
1156 * Conf->AffixData array and function returns its entry.
1157 * Else function returns the s parameter.
1158 */
1159static const char *
1161{
1162 if (Conf->useFlagAliases && *s != '\0')
1163 {
1164 int curaffix;
1165 char *end;
1166
1167 curaffix = strtol(s, &end, 10);
1168 if (s == end || errno == ERANGE)
1169 ereport(ERROR,
1170 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1171 errmsg("invalid affix alias \"%s\"", s)));
1172
1173 if (curaffix > 0 && curaffix < Conf->nAffixData)
1174
1175 /*
1176 * Do not subtract 1 from curaffix because empty string was added
1177 * in NIImportOOAffixes
1178 */
1179 return Conf->AffixData[curaffix];
1180 else if (curaffix > Conf->nAffixData)
1181 ereport(ERROR,
1182 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1183 errmsg("invalid affix alias \"%s\"", s)));
1184 return VoidString;
1185 }
1186 else
1187 return s;
1188}
1189
1190/*
1191 * Import an affix file that follows MySpell or Hunspell format.
1192 *
1193 * Conf: current dictionary.
1194 * filename: path to the .affix file.
1195 */
1196static void
1198{
1199 char type[BUFSIZ],
1200 *ptype = NULL;
1201 char sflag[BUFSIZ];
1202 char mask[BUFSIZ],
1203 *pmask;
1204 char find[BUFSIZ],
1205 *pfind;
1206 char repl[BUFSIZ],
1207 *prepl;
1208 bool isSuffix = false;
1209 int naffix = 0,
1210 curaffix = 0;
1211 int sflaglen = 0;
1212 char flagflags = 0;
1214 char *recoded;
1215
1216 /* read file to find any flag */
1217 Conf->usecompound = false;
1218 Conf->useFlagAliases = false;
1219 Conf->flagMode = FM_CHAR;
1220
1221 if (!tsearch_readline_begin(&trst, filename))
1222 ereport(ERROR,
1223 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1224 errmsg("could not open affix file \"%s\": %m",
1225 filename)));
1226
1227 while ((recoded = tsearch_readline(&trst)) != NULL)
1228 {
1229 if (*recoded == '\0' || isspace((unsigned char) *recoded) || t_iseq(recoded, '#'))
1230 {
1231 pfree(recoded);
1232 continue;
1233 }
1234
1235 if (STRNCMP(recoded, "COMPOUNDFLAG") == 0)
1236 addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"),
1238 else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0)
1239 addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"),
1241 else if (STRNCMP(recoded, "COMPOUNDLAST") == 0)
1242 addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDLAST"),
1244 /* COMPOUNDLAST and COMPOUNDEND are synonyms */
1245 else if (STRNCMP(recoded, "COMPOUNDEND") == 0)
1246 addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDEND"),
1248 else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0)
1249 addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"),
1251 else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0)
1252 addCompoundAffixFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"),
1254 else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0)
1256 recoded + strlen("COMPOUNDPERMITFLAG"),
1258 else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0)
1260 recoded + strlen("COMPOUNDFORBIDFLAG"),
1262 else if (STRNCMP(recoded, "FLAG") == 0)
1263 {
1264 char *s = recoded + strlen("FLAG");
1265
1266 while (*s && isspace((unsigned char) *s))
1267 s += pg_mblen(s);
1268
1269 if (*s)
1270 {
1271 if (STRNCMP(s, "long") == 0)
1272 Conf->flagMode = FM_LONG;
1273 else if (STRNCMP(s, "num") == 0)
1274 Conf->flagMode = FM_NUM;
1275 else if (STRNCMP(s, "default") != 0)
1276 ereport(ERROR,
1277 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1278 errmsg("Ispell dictionary supports only "
1279 "\"default\", \"long\", "
1280 "and \"num\" flag values")));
1281 }
1282 }
1283
1284 pfree(recoded);
1285 }
1286 tsearch_readline_end(&trst);
1287
1288 if (Conf->nCompoundAffixFlag > 1)
1290 sizeof(CompoundAffixFlag), cmpcmdflag);
1291
1292 if (!tsearch_readline_begin(&trst, filename))
1293 ereport(ERROR,
1294 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1295 errmsg("could not open affix file \"%s\": %m",
1296 filename)));
1297
1298 while ((recoded = tsearch_readline(&trst)) != NULL)
1299 {
1300 int fields_read;
1301
1302 if (*recoded == '\0' || isspace((unsigned char) *recoded) || t_iseq(recoded, '#'))
1303 goto nextline;
1304
1305 fields_read = parse_ooaffentry(recoded, type, sflag, find, repl, mask);
1306
1307 if (ptype)
1308 pfree(ptype);
1309 ptype = lowerstr_ctx(Conf, type);
1310
1311 /* First try to parse AF parameter (alias compression) */
1312 if (STRNCMP(ptype, "af") == 0)
1313 {
1314 /* First line is the number of aliases */
1315 if (!Conf->useFlagAliases)
1316 {
1317 Conf->useFlagAliases = true;
1318 naffix = atoi(sflag);
1319 if (naffix <= 0)
1320 ereport(ERROR,
1321 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1322 errmsg("invalid number of flag vector aliases")));
1323
1324 /* Also reserve place for empty flag set */
1325 naffix++;
1326
1327 Conf->AffixData = (const char **) palloc0(naffix * sizeof(char *));
1328 Conf->lenAffixData = Conf->nAffixData = naffix;
1329
1330 /* Add empty flag set into AffixData */
1331 Conf->AffixData[curaffix] = VoidString;
1332 curaffix++;
1333 }
1334 /* Other lines are aliases */
1335 else
1336 {
1337 if (curaffix < naffix)
1338 {
1339 Conf->AffixData[curaffix] = cpstrdup(Conf, sflag);
1340 curaffix++;
1341 }
1342 else
1343 ereport(ERROR,
1344 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1345 errmsg("number of aliases exceeds specified number %d",
1346 naffix - 1)));
1347 }
1348 goto nextline;
1349 }
1350 /* Else try to parse prefixes and suffixes */
1351 if (fields_read < 4 ||
1352 (STRNCMP(ptype, "sfx") != 0 && STRNCMP(ptype, "pfx") != 0))
1353 goto nextline;
1354
1355 sflaglen = strlen(sflag);
1356 if (sflaglen == 0
1357 || (sflaglen > 1 && Conf->flagMode == FM_CHAR)
1358 || (sflaglen > 2 && Conf->flagMode == FM_LONG))
1359 goto nextline;
1360
1361 /*--------
1362 * Affix header. For example:
1363 * SFX \ N 1
1364 *--------
1365 */
1366 if (fields_read == 4)
1367 {
1368 isSuffix = (STRNCMP(ptype, "sfx") == 0);
1369 if (t_iseq(find, 'y') || t_iseq(find, 'Y'))
1370 flagflags = FF_CROSSPRODUCT;
1371 else
1372 flagflags = 0;
1373 }
1374 /*--------
1375 * Affix fields. For example:
1376 * SFX \ 0 Y/L [^Y]
1377 *--------
1378 */
1379 else
1380 {
1381 char *ptr;
1382 int aflg = 0;
1383
1384 /* Get flags after '/' (flags are case sensitive) */
1385 if ((ptr = strchr(repl, '/')) != NULL)
1386 aflg |= getCompoundAffixFlagValue(Conf,
1387 getAffixFlagSet(Conf,
1388 ptr + 1));
1389 /* Get lowercased version of string before '/' */
1390 prepl = lowerstr_ctx(Conf, repl);
1391 if ((ptr = strchr(prepl, '/')) != NULL)
1392 *ptr = '\0';
1393 pfind = lowerstr_ctx(Conf, find);
1394 pmask = lowerstr_ctx(Conf, mask);
1395 if (t_iseq(find, '0'))
1396 *pfind = '\0';
1397 if (t_iseq(repl, '0'))
1398 *prepl = '\0';
1399
1400 NIAddAffix(Conf, sflag, flagflags | aflg, pmask, pfind, prepl,
1401 isSuffix ? FF_SUFFIX : FF_PREFIX);
1402 pfree(prepl);
1403 pfree(pfind);
1404 pfree(pmask);
1405 }
1406
1407nextline:
1408 pfree(recoded);
1409 }
1410
1411 tsearch_readline_end(&trst);
1412 if (ptype)
1413 pfree(ptype);
1414}
1415
1416/*
1417 * import affixes
1418 *
1419 * Note caller must already have applied get_tsearch_config_filename
1420 *
1421 * This function is responsible for parsing ispell ("old format") affix files.
1422 * If we realize that the file contains new-format commands, we pass off the
1423 * work to NIImportOOAffixes(), which will re-read the whole file.
1424 */
1425void
/*
 * NOTE(review): listing line 1426 (function name and parameter list) is
 * missing from this extract.  The body uses `Conf` and `filename`, so those
 * are presumably the parameters -- confirm against the real source.
 *
 * Reads the affix file line by line.  Old-format keywords handled here are
 * "compoundwords", "suffixes", "prefixes" and "flag"; as soon as a new-format
 * command is recognised, control jumps to the isnewformat label at the bottom.
 */
1427{
1428 char *pstr = NULL;
1429 char flag[BUFSIZ];
1430 char mask[BUFSIZ];
1431 char find[BUFSIZ];
1432 char repl[BUFSIZ];
1433 char *s;
1434 bool suffixes = false;
1435 bool prefixes = false;
1436 char flagflags = 0;
/*
 * NOTE(review): listing line 1437 is missing here; `trst` is used below but
 * has no visible declaration, so it was presumably declared on that line.
 */
1438 bool oldformat = false;
1439 char *recoded = NULL;
1440
1441 if (!tsearch_readline_begin(&trst, filename))
1442 ereport(ERROR,
1443 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1444 errmsg("could not open affix file \"%s\": %m",
1445 filename)));
1446
/* Reset the settings this parser may change before reading any line */
1447 Conf->usecompound = false;
1448 Conf->useFlagAliases = false;
1449 Conf->flagMode = FM_CHAR;
1450
1451 while ((recoded = tsearch_readline(&trst)) != NULL)
1452 {
/*
 * pstr is a lowercased copy used for keyword matching; recoded keeps the
 * original case for the places that need case-sensitive flag characters.
 * Both are released at the nextline label.
 */
1453 pstr = str_tolower(recoded, strlen(recoded), DEFAULT_COLLATION_OID);
1454
1455 /* Skip comments and empty lines */
1456 if (*pstr == '#' || *pstr == '\n')
1457 goto nextline;
1458
1459 if (STRNCMP(pstr, "compoundwords") == 0)
1460 {
1461 /* Find case-insensitive L flag in non-lowercased string */
1462 s = findchar2(recoded, 'l', 'L');
1463 if (s)
1464 {
/* Skip the rest of the current token, then the whitespace after it */
1465 while (*s && !isspace((unsigned char) *s))
1466 s += pg_mblen(s);
1467 while (*s && isspace((unsigned char) *s))
1468 s += pg_mblen(s);
1469
/* Only a single-byte character is accepted as the compound flag */
1470 if (*s && pg_mblen(s) == 1)
1471 {
/*
 * NOTE(review): listing line 1472 is missing here; presumably it records
 * the flag character pointed to by s -- confirm against the real source.
 */
1473 Conf->usecompound = true;
1474 }
1475 oldformat = true;
1476 goto nextline;
1477 }
1478 }
/* Section headers: following affix entries are suffixes or prefixes */
1479 if (STRNCMP(pstr, "suffixes") == 0)
1480 {
1481 suffixes = true;
1482 prefixes = false;
1483 oldformat = true;
1484 goto nextline;
1485 }
1486 if (STRNCMP(pstr, "prefixes") == 0)
1487 {
1488 suffixes = false;
1489 prefixes = true;
1490 oldformat = true;
1491 goto nextline;
1492 }
1493 if (STRNCMP(pstr, "flag") == 0)
1494 {
/* 4 == strlen("flag"): step over the keyword itself */
1495 s = recoded + 4; /* we need non-lowercased string */
1496 flagflags = 0;
1497
1498 while (*s && isspace((unsigned char) *s))
1499 s += pg_mblen(s);
1500
/* Optional modifier: '*' sets FF_CROSSPRODUCT, '~' sets FF_COMPOUNDONLY */
1501 if (*s == '*')
1502 {
1503 flagflags |= FF_CROSSPRODUCT;
1504 s++;
1505 }
1506 else if (*s == '~')
1507 {
1508 flagflags |= FF_COMPOUNDONLY;
1509 s++;
1510 }
1511
/* A backslash before the flag character is skipped */
1512 if (*s == '\\')
1513 s++;
1514
1515 /*
1516 * An old-format flag is a single ASCII character; we expect it to
1517 * be followed by EOL, whitespace, or ':'. Otherwise this is a
1518 * new-format flag command.
1519 */
1520 if (*s && pg_mblen(s) == 1)
1521 {
1522 COPYCHAR(flag, s);
1523 flag[1] = '\0';
1524
1525 s++;
1526 if (*s == '\0' || *s == '#' || *s == '\n' || *s == ':' ||
1527 isspace((unsigned char) *s))
1528 {
1529 oldformat = true;
1530 goto nextline;
1531 }
1532 }
1533 goto isnewformat;
1534 }
/* These keywords are matched case-sensitively against the original line */
1535 if (STRNCMP(recoded, "COMPOUNDFLAG") == 0 ||
1536 STRNCMP(recoded, "COMPOUNDMIN") == 0 ||
1537 STRNCMP(recoded, "PFX") == 0 ||
1538 STRNCMP(recoded, "SFX") == 0)
1539 goto isnewformat;
1540
/* Affix entries are ignored until a suffixes/prefixes section has started */
1541 if ((!suffixes) && (!prefixes))
1542 goto nextline;
1543
1544 if (!parse_affentry(pstr, mask, find, repl))
1545 goto nextline;
1546
/*
 * NOTE(review): flag[] is written only in the "flag" branch above.  If an
 * affix entry appears in a suffixes/prefixes section before any flag line,
 * its uninitialized contents would be passed here -- confirm whether the
 * file format rules that out.
 */
1547 NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
1548
1549nextline:
1550 pfree(recoded);
1551 pfree(pstr);
1552 }
1553 tsearch_readline_end(&trst);
1554 return;
1555
/*
 * Reached by goto from inside the loop, so the current recoded/pstr buffers
 * are not pfree'd on this path (presumably reclaimed with the surrounding
 * memory context -- TODO confirm).
 */
1556isnewformat:
1557 if (oldformat)
1558 ereport(ERROR,
1559 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1560 errmsg("affix file contains both old-style and new-style commands")));
1561 tsearch_readline_end(&trst);
1562
/*
 * NOTE(review): listing line 1563 is missing here.  Per the function's header
 * comment the work is handed off to NIImportOOAffixes() at this point.
 */
1564}
1565
1566/*
1567 * Merges two affix flag sets and stores a new affix flag set into
1568 * Conf->AffixData.
1569 *
1570 * Returns index of a new affix flag set.
1571 */
1572static int
1573MergeAffix(IspellDict *Conf, int a1, int a2)
1574{
1575 const char **ptr;
1576
1577 Assert(a1 < Conf->nAffixData && a2 < Conf->nAffixData);
1578
1579 /* Do not merge affix flags if one of affix flags is empty */
1580 if (*Conf->AffixData[a1] == '\0')
1581 return a2;
1582 else if (*Conf->AffixData[a2] == '\0')
1583 return a1;
1584
1585 /* Double the size of AffixData if there's not enough space */
1586 if (Conf->nAffixData + 1 >= Conf->lenAffixData)
1587 {
1588 Conf->lenAffixData *= 2;
1589 Conf->AffixData = (const char **) repalloc(Conf->AffixData,
1590 sizeof(char *) * Conf->lenAffixData);
1591 }
1592
1593 ptr = Conf->AffixData + Conf->nAffixData;
1594 if (Conf->flagMode == FM_NUM)
1595 {
1596 char *p = cpalloc(strlen(Conf->AffixData[a1]) +
1597 strlen(Conf->AffixData[a2]) +
1598 1 /* comma */ + 1 /* \0 */ );
1599
1600 sprintf(p, "%s,%s", Conf->AffixData[a1], Conf->AffixData[a2]);
1601 *ptr = p;
1602 }
1603 else
1604 {
1605 char *p = cpalloc(strlen(Conf->AffixData[a1]) +
1606 strlen(Conf->AffixData[a2]) +
1607 1 /* \0 */ );
1608
1609 sprintf(p, "%s%s", Conf->AffixData[a1], Conf->AffixData[a2]);
1610 *ptr = p;
1611 }
1612 ptr++;
1613 *ptr = NULL;
1614 Conf->nAffixData++;
1615
1616 return Conf->nAffixData - 1;
1617}
1618
1619/*
1620 * Returns the set of compound affix parameters that corresponds to the set
1621 * of affix flags with the given index.
1622 */
1623static uint32
/*
 * NOTE(review): listing line 1624 (function name and parameter list) is
 * missing from this extract.  The body uses `Conf` and `affix`, and callers
 * in this file invoke makeCompoundFlags(Conf, <affix index>).
 */
1625{
/* affix is an index into Conf->AffixData and must be in range */
1626 Assert(affix < Conf->nAffixData);
1627
1628 return (getCompoundAffixFlagValue(Conf, Conf->AffixData[affix]) &
/*
 * NOTE(review): listing line 1629 is missing here -- it holds the right-hand
 * operand of '&' (a mask applied to the flag value) and closes the return
 * expression.
 */
1630}
1631
1632/*
1633 * Makes a prefix tree for the given level.
1634 *
1635 * Conf: current dictionary.
1636 * low: lower index of the Conf->Spell array.
1637 * high: upper index of the Conf->Spell array.
1638 * level: current prefix tree level.
1639 */
1640static SPNode *
1641mkSPNode(IspellDict *Conf, int low, int high, int level)
1642{
1643 int i;
1644 int nchar = 0;
1645 char lastchar = '\0';
1646 SPNode *rs;
/*
 * NOTE(review): listing line 1647 is missing here; `data` is used below
 * (assigned from rs->data) but has no visible declaration.
 */
1648 int lownew = low;
1649
/*
 * Pass 1: count how many distinct bytes occur at position `level` among the
 * words in [low, high) that are longer than `level`.  Counting changes of
 * lastchar only works if equal bytes are adjacent, i.e. Conf->Spell is sorted.
 */
1650 for (i = low; i < high; i++)
1651 if (Conf->Spell[i]->p.d.len > level && lastchar != Conf->Spell[i]->word[level])
1652 {
1653 nchar++;
1654 lastchar = Conf->Spell[i]->word[level];
1655 }
1656
/* No word reaches this depth: there is no node at this level */
1657 if (!nchar)
1658 return NULL;
1659
/* One zero-initialised SPNodeData slot per distinct byte */
1660 rs = (SPNode *) cpalloc0(SPNHDRSZ + nchar * sizeof(SPNodeData));
1661 rs->length = nchar;
1662 data = rs->data;
1663
/* Pass 2: fill the slots, recursing for each group of words sharing a byte */
1664 lastchar = '\0';
1665 for (i = low; i < high; i++)
1666 if (Conf->Spell[i]->p.d.len > level)
1667 {
1668 if (lastchar != Conf->Spell[i]->word[level])
1669 {
/* A new byte starts: close the previous group (if any) first */
1670 if (lastchar)
1671 {
1672 /* Next level of the prefix tree */
1673 data->node = mkSPNode(Conf, lownew, i, level + 1);
1674 lownew = i;
1675 data++;
1676 }
1677 lastchar = Conf->Spell[i]->word[level];
1678 }
1679 data->val = ((uint8 *) (Conf->Spell[i]->word))[level];
/* The word ends exactly at this byte, so this slot marks a complete word */
1680 if (Conf->Spell[i]->p.d.len == level + 1)
1681 {
1682 bool clearCompoundOnly = false;
1683
/* Same word seen again with a different affix set: merge the two sets */
1684 if (data->isword && data->affix != Conf->Spell[i]->p.d.affix)
1685 {
1686 /*
1687 * MergeAffix called a few times. If one of word is
1688 * allowed to be in compound word and another isn't, then
1689 * clear FF_COMPOUNDONLY flag.
1690 */
1691
1692 clearCompoundOnly = (FF_COMPOUNDONLY & data->compoundflag
1693 & makeCompoundFlags(Conf, Conf->Spell[i]->p.d.affix))
1694 ? false : true;
1695 data->affix = MergeAffix(Conf, data->affix, Conf->Spell[i]->p.d.affix);
1696 }
1697 else
1698 data->affix = Conf->Spell[i]->p.d.affix;
1699 data->isword = 1;
1700
/* Recompute the compound flags from the (possibly merged) affix set */
1701 data->compoundflag = makeCompoundFlags(Conf, data->affix);
1702
/* FF_COMPOUNDONLY without FF_COMPOUNDFLAG gets FF_COMPOUNDFLAG added */
1703 if ((data->compoundflag & FF_COMPOUNDONLY) &&
1704 (data->compoundflag & FF_COMPOUNDFLAG) == 0)
1705 data->compoundflag |= FF_COMPOUNDFLAG;
1706
1707 if (clearCompoundOnly)
1708 data->compoundflag &= ~FF_COMPOUNDONLY;
1709 }
1710 }
1711
/* Close the last group, which the loop above leaves open */
1712 /* Next level of the prefix tree */
1713 data->node = mkSPNode(Conf, lownew, high, level + 1);
1714
1715 return rs;
1716}
1717
1718/*
1719 * Builds the Conf->Dictionary tree and AffixData from the imported dictionary
1720 * and affixes.
1721 */
1722void
/*
 * NOTE(review): listing line 1723 (function name and parameter list) is
 * missing from this extract.  The body only uses `Conf`.
 */
1724{
1725 int i;
1726 int naffix;
1727 int curaffix;
1728
1729 /* compress affixes */
1730
1731 /*
1732 * If we use flag aliases then we need to use Conf->AffixData filled in
1733 * the NIImportOOAffixes().
1734 */
1735 if (Conf->useFlagAliases)
1736 {
1737 for (i = 0; i < Conf->nspell; i++)
1738 {
1739 char *end;
1740
1741 if (*Conf->Spell[i]->p.flag != '\0')
1742 {
/*
 * The flag text is parsed as a decimal alias number, an index into
 * Conf->AffixData.
 * NOTE(review): errno is tested for ERANGE below but is not reset to 0
 * before strtol() in the visible code, so a stale ERANGE from an earlier
 * call could be misread as a failure here.  Also the long result is
 * narrowed to int before the range check -- confirm both are acceptable.
 */
1743 curaffix = strtol(Conf->Spell[i]->p.flag, &end, 10);
1744 if (Conf->Spell[i]->p.flag == end || errno == ERANGE)
1745 ereport(ERROR,
1746 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1747 errmsg("invalid affix alias \"%s\"",
1748 Conf->Spell[i]->p.flag)));
/* The alias must refer to an existing AffixData entry */
1749 if (curaffix < 0 || curaffix >= Conf->nAffixData)
1750 ereport(ERROR,
1751 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1752 errmsg("invalid affix alias \"%s\"",
1753 Conf->Spell[i]->p.flag)));
/* After the number only end-of-string, a digit or whitespace is accepted */
1754 if (*end != '\0' && !isdigit((unsigned char) *end) && !isspace((unsigned char) *end))
1755 ereport(ERROR,
1756 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1757 errmsg("invalid affix alias \"%s\"",
1758 Conf->Spell[i]->p.flag)));
1759 }
1760 else
1761 {
1762 /*
1763 * If Conf->Spell[i]->p.flag is empty, then get empty value of
1764 * Conf->AffixData (0 index).
1765 */
1766 curaffix = 0;
1767 }
1768
/*
 * p.flag is not read again after p.d.* is written; the p.flag / p.d naming
 * suggests the two share storage -- TODO confirm.
 */
1769 Conf->Spell[i]->p.d.affix = curaffix;
1770 Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
1771 }
1772 }
1773 /* Otherwise fill Conf->AffixData here */
1774 else
1775 {
1776 /* Count the number of different flags used in the dictionary */
1777 qsort(Conf->Spell, Conf->nspell, sizeof(SPELL *),
/*
 * NOTE(review): listing line 1778 (the comparator argument closing this
 * qsort call) is missing.  The counting loop below relies on entries with
 * equal p.flag being adjacent, so the comparator presumably orders by flag.
 */
1779
1780 naffix = 0;
1781 for (i = 0; i < Conf->nspell; i++)
1782 {
1783 if (i == 0 ||
1784 strcmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag) != 0)
1785 naffix++;
1786 }
1787
1788 /*
1789 * Fill in Conf->AffixData with the affixes that were used in the
1790 * dictionary. Replace textual flag-field of Conf->Spell entries with
1791 * indexes into Conf->AffixData array.
1792 */
1793 Conf->AffixData = (const char **) palloc0(naffix * sizeof(const char *));
1794
/* curaffix becomes 0 at i == 0, before AffixData[curaffix] is ever read */
1795 curaffix = -1;
1796 for (i = 0; i < Conf->nspell; i++)
1797 {
1798 if (i == 0 ||
1799 strcmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix]) != 0)
1800 {
1801 curaffix++;
1802 Assert(curaffix < naffix);
1803 Conf->AffixData[curaffix] = cpstrdup(Conf,
1804 Conf->Spell[i]->p.flag);
1805 }
1806
1807 Conf->Spell[i]->p.d.affix = curaffix;
1808 Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
1809 }
1810
1811 Conf->lenAffixData = Conf->nAffixData = naffix;
1812 }
1813
/* Re-sort with cmpspell, then build the tree over the whole Spell array */
1814 /* Start build a prefix tree */
1815 qsort(Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
1816 Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
1817}
1818
1819/*
1820 * Makes a prefix tree for the given level using the repl string of an affix
1821 * rule. Affixes with an empty replace string are not included in the prefix
1822 * tree. These affixes are added by mkVoidAffix().
1823 *
1824 * Conf: current dictionary.
1825 * low: lower index of the Conf->Affix array.
1826 * high: upper index of the Conf->Affix array.
1827 * level: current prefix tree level.
1828 * type: FF_SUFFIX or FF_PREFIX.
1829 */
1830static AffixNode *
1831mkANode(IspellDict *Conf, int low, int high, int level, int type)
1832{
1833 int i;
1834 int nchar = 0;
1835 uint8 lastchar = '\0';
1836 AffixNode *rs;
/*
 * NOTE(review): listing line 1837 is missing here; `data` is used below
 * (assigned from rs->data) but has no visible declaration.
 */
1838 int lownew = low;
1839 int naff;
1840 AFFIX **aff;
1841
/*
 * Pass 1: count the distinct characters (as produced by GETCHAR for this
 * affix type) at position `level` among affixes whose repl is longer than
 * `level`.
 */
1842 for (i = low; i < high; i++)
1843 if (Conf->Affix[i].replen > level && lastchar != GETCHAR(Conf->Affix + i, level, type))
1844 {
1845 nchar++;
1846 lastchar = GETCHAR(Conf->Affix + i, level, type);
1847 }
1848
/* No affix reaches this depth */
1849 if (!nchar)
1850 return NULL;
1851
/* Scratch array collecting affixes that end at the current character */
1852 aff = (AFFIX **) tmpalloc(sizeof(AFFIX *) * (high - low + 1));
1853 naff = 0;
1854
1855 rs = (AffixNode *) cpalloc0(ANHRDSZ + nchar * sizeof(AffixNodeData));
1856 rs->length = nchar;
1857 data = rs->data;
1858
/* Pass 2: fill one AffixNodeData slot per distinct character */
1859 lastchar = '\0';
1860 for (i = low; i < high; i++)
1861 if (Conf->Affix[i].replen > level)
1862 {
1863 if (lastchar != GETCHAR(Conf->Affix + i, level, type))
1864 {
/* A new character starts: finish the previous slot first */
1865 if (lastchar)
1866 {
1867 /* Next level of the prefix tree */
1868 data->node = mkANode(Conf, lownew, i, level + 1, type);
1869 if (naff)
1870 {
/* Copy the collected affixes into storage owned by the slot */
1871 data->naff = naff;
1872 data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
1873 memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
1874 naff = 0;
1875 }
1876 data++;
1877 lownew = i;
1878 }
1879 lastchar = GETCHAR(Conf->Affix + i, level, type);
1880 }
1881 data->val = GETCHAR(Conf->Affix + i, level, type);
1882 if (Conf->Affix[i].replen == level + 1)
1883 { /* affix stopped */
1884 aff[naff++] = Conf->Affix + i;
1885 }
1886 }
1887
/* Finish the last slot, which the loop above leaves open */
1888 /* Next level of the prefix tree */
1889 data->node = mkANode(Conf, lownew, high, level + 1, type);
1890 if (naff)
1891 {
1892 data->naff = naff;
1893 data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
1894 memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
1895 naff = 0;
1896 }
1897
1898 pfree(aff);
1899
1900 return rs;
1901}
1902
1903/*
1904 * Makes the root void node in the prefix tree. The root void node is created
1905 * for affixes which have empty replace string ("repl" field).
1906 */
1907static void
1908mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix)
1909{
1910 int i,
1911 cnt = 0;
1912 int start = (issuffix) ? startsuffix : 0;
1913 int end = (issuffix) ? Conf->naffixes : startsuffix;
1914 AffixNode *Affix = (AffixNode *) palloc0(ANHRDSZ + sizeof(AffixNodeData));
1915
1916 Affix->length = 1;
1917 Affix->isvoid = 1;
1918
1919 if (issuffix)
1920 {
1921 Affix->data->node = Conf->Suffix;
1922 Conf->Suffix = Affix;
1923 }
1924 else
1925 {
1926 Affix->data->node = Conf->Prefix;
1927 Conf->Prefix = Affix;
1928 }
1929
1930 /* Count affixes with empty replace string */
1931 for (i = start; i < end; i++)
1932 if (Conf->Affix[i].replen == 0)
1933 cnt++;
1934
1935 /* There is not affixes with empty replace string */
1936 if (cnt == 0)
1937 return;
1938
1939 Affix->data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * cnt);
1940 Affix->data->naff = (uint32) cnt;
1941
1942 cnt = 0;
1943 for (i = start; i < end; i++)
1944 if (Conf->Affix[i].replen == 0)
1945 {
1946 Affix->data->aff[cnt] = Conf->Affix + i;
1947 cnt++;
1948 }
1949}
1950
1951/*
1952 * Checks if the affixflag is used by dictionary. Conf->AffixData does not
1953 * contain affixflag if this flag is not used actually by the .dict file.
1954 *
1955 * Conf: current dictionary.
1956 * affixflag: affix flag.
1957 *
1958 * Returns true if the Conf->AffixData array contains affixflag, otherwise
1959 * returns false.
1960 */
1961static bool
1962isAffixInUse(IspellDict *Conf, const char *affixflag)
1963{
1964 int i;
1965
1966 for (i = 0; i < Conf->nAffixData; i++)
1967 if (IsAffixFlagInUse(Conf, i, affixflag))
1968 return true;
1969
1970 return false;
1971}
1972
1973/*
1974 * Builds Conf->Prefix and Conf->Suffix trees from the imported affixes.
1975 */
1976void
/*
 * NOTE(review): listing line 1977 (function name and parameter list) is
 * missing from this extract.  The body only uses `Conf`.
 */
1978{
1979 AFFIX *Affix;
1980 size_t i;
1981 CMPDAffix *ptr;
/* Index of the first suffix; stays at naffixes if no suffix is found */
1982 int firstsuffix = Conf->naffixes;
1983
1984 if (Conf->naffixes == 0)
1985 return;
1986
1987 /* Store compound affixes in the Conf->CompoundAffix array */
1988 if (Conf->naffixes > 1)
1989 qsort(Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix);
/* Allocated at the maximum possible size here, shrunk by repalloc below */
1990 Conf->CompoundAffix = ptr = (CMPDAffix *) palloc(sizeof(CMPDAffix) * Conf->naffixes);
1991 ptr->affix = NULL;
1992
1993 for (i = 0; i < Conf->naffixes; i++)
1994 {
1995 Affix = &(((AFFIX *) Conf->Affix)[i]);
/*
 * Remember where the suffixes start.  The mkANode() calls below treat
 * [0, firstsuffix) as prefixes and the rest as suffixes, which presumes
 * cmpaffix sorts prefixes first -- TODO confirm.
 */
1996 if (Affix->type == FF_SUFFIX && i < firstsuffix)
1997 firstsuffix = i;
1998
/* Only non-empty, compound-flagged affixes that the dictionary uses */
1999 if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
2000 isAffixInUse(Conf, Affix->flag))
2001 {
2002 bool issuffix = (Affix->type == FF_SUFFIX);
2003
/*
 * Append unless the previously stored entry has the same suffix/prefix
 * kind and strbncmp() finds it equal to this repl over the previous
 * entry's length.
 */
2004 if (ptr == Conf->CompoundAffix ||
2005 issuffix != (ptr - 1)->issuffix ||
2006 strbncmp((const unsigned char *) (ptr - 1)->affix,
2007 (const unsigned char *) Affix->repl,
2008 (ptr - 1)->len))
2009 {
2010 /* leave only unique and minimal suffixes */
2011 ptr->affix = Affix->repl;
2012 ptr->len = Affix->replen;
2013 ptr->issuffix = issuffix;
2014 ptr++;
2015 }
2016 }
2017 }
/* Terminate the list with an entry whose affix is NULL, then trim */
2018 ptr->affix = NULL;
2019 Conf->CompoundAffix = (CMPDAffix *) repalloc(Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr - Conf->CompoundAffix + 1));
2020
2021 /* Start build a prefix tree */
2022 Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
2023 Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
/* Put a void root for empty-repl affixes in front of each tree */
2024 mkVoidAffix(Conf, true, firstsuffix);
2025 mkVoidAffix(Conf, false, firstsuffix);
2026}
2027
2028static AffixNodeData *
2029FindAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
2030{
2031 AffixNodeData *StopLow,
2032 *StopHigh,
2033 *StopMiddle;
2034 uint8 symbol;
2035
2036 if (node->isvoid)
2037 { /* search void affixes */
2038 if (node->data->naff)
2039 return node->data;
2040 node = node->data->node;
2041 }
2042
2043 while (node && *level < wrdlen)
2044 {
2045 StopLow = node->data;
2046 StopHigh = node->data + node->length;
2047 while (StopLow < StopHigh)
2048 {
2049 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
2050 symbol = GETWCHAR(word, wrdlen, *level, type);
2051
2052 if (StopMiddle->val == symbol)
2053 {
2054 (*level)++;
2055 if (StopMiddle->naff)
2056 return StopMiddle;
2057 node = StopMiddle->node;
2058 break;
2059 }
2060 else if (StopMiddle->val < symbol)
2061 StopLow = StopMiddle + 1;
2062 else
2063 StopHigh = StopMiddle;
2064 }
2065 if (StopLow >= StopHigh)
2066 break;
2067 }
2068 return NULL;
2069}
2070
/*
 * Try to undo one affix on word: check that the affix is allowed in the
 * compound position described by flagflags, build the candidate base form in
 * newword, and verify it against the affix's condition.  Returns newword on
 * success, NULL otherwise.  The caller supplies the newword buffer.
 */
2071static char *
2072CheckAffix(const char *word, size_t len, AFFIX *Affix, int flagflags, char *newword, int *baselen)
2073{
2074 /*
2075 * Check compound allow flags
2076 */
2077
/* flagflags == 0: compound-only affixes are rejected */
2078 if (flagflags == 0)
2079 {
2080 if (Affix->flagflags & FF_COMPOUNDONLY)
2081 return NULL;
2082 }
2083 else if (flagflags & FF_COMPOUNDBEGIN)
2084 {
2085 if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
2086 return NULL;
/* A suffix is rejected here unless the affix carries FF_COMPOUNDBEGIN */
2087 if ((Affix->flagflags & FF_COMPOUNDBEGIN) == 0)
2088 if (Affix->type == FF_SUFFIX)
2089 return NULL;
2090 }
2091 else if (flagflags & FF_COMPOUNDMIDDLE)
2092 {
2093 if ((Affix->flagflags & FF_COMPOUNDMIDDLE) == 0 ||
/*
 * NOTE(review): listing line 2094 (the second operand of this '||' and the
 * closing parenthesis of the condition) is missing from this extract.
 */
2095 return NULL;
2096 }
2097 else if (flagflags & FF_COMPOUNDLAST)
2098 {
2099 if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
2100 return NULL;
/* A prefix is rejected here unless the affix carries FF_COMPOUNDLAST */
2101 if ((Affix->flagflags & FF_COMPOUNDLAST) == 0)
2102 if (Affix->type == FF_PREFIX)
2103 return NULL;
2104 }
2105
2106 /*
2107 * make replace pattern of affix
2108 */
2109 if (Affix->type == FF_SUFFIX)
2110 {
/*
 * Copy the word, then overwrite its last replen bytes with Affix->find.
 * Assumes len >= Affix->replen -- presumably guaranteed by how the affix
 * was matched; verify against callers.
 */
2111 strcpy(newword, word);
2112 strcpy(newword + len - Affix->replen, Affix->find);
2113 if (baselen) /* store length of non-changed part of word */
2114 *baselen = len - Affix->replen;
2115 }
2116 else
2117 {
2118 /*
2119 * if prefix is an all non-changed part's length then all word
2120 * contains only prefix and suffix, so out
2121 */
2122 if (baselen && *baselen + strlen(Affix->find) <= Affix->replen)
2123 return NULL;
/* Replace the first replen bytes of word with Affix->find */
2124 strcpy(newword, Affix->find);
2125 strcat(newword, word + Affix->replen);
2126 }
2127
2128 /*
2129 * check resulting word
2130 */
/* issimple: no condition to test, the candidate is accepted as-is */
2131 if (Affix->issimple)
2132 return newword;
2133 else if (Affix->isregis)
2134 {
2135 if (RS_execute(&(Affix->reg.regis), newword))
2136 return newword;
2137 }
2138 else
2139 {
2140 pg_wchar *data;
2141 size_t data_len;
2142 int newword_len;
2143
2144 /* Convert data string to wide characters */
2145 newword_len = strlen(newword);
2146 data = (pg_wchar *) palloc((newword_len + 1) * sizeof(pg_wchar));
2147 data_len = pg_mb2wchar_with_len(newword, data, newword_len);
2148
2149 if (pg_regexec(Affix->reg.pregex, data, data_len,
2150 0, NULL, 0, NULL, 0) == REG_OKAY)
2151 {
2152 pfree(data);
2153 return newword;
2154 }
2155 pfree(data);
2156 }
2157
2158 return NULL;
2159}
2160
2161static int
2162addToResult(char **forms, char **cur, char *word)
2163{
2164 if (cur - forms >= MAX_NORM - 1)
2165 return 0;
2166 if (forms == cur || strcmp(word, *(cur - 1)) != 0)
2167 {
2168 *cur = pstrdup(word);
2169 *(cur + 1) = NULL;
2170 return 1;
2171 }
2172
2173 return 0;
2174}
2175
2176static char **
2177NormalizeSubWord(IspellDict *Conf, const char *word, int flag)
2178{
2179 AffixNodeData *suffix = NULL,
2180 *prefix = NULL;
2181 int slevel = 0,
2182 plevel = 0;
2183 int wrdlen = strlen(word),
2184 swrdlen;
2185 char **forms;
2186 char **cur;
2187 char newword[2 * MAXNORMLEN] = "";
2188 char pnewword[2 * MAXNORMLEN] = "";
2189 AffixNode *snode = Conf->Suffix,
2190 *pnode;
2191 int i,
2192 j;
2193
2194 if (wrdlen > MAXNORMLEN)
2195 return NULL;
2196 cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
2197 *cur = NULL;
2198
2199
2200 /* Check that the word itself is normal form */
2201 if (FindWord(Conf, word, VoidString, flag))
2202 {
2203 *cur = pstrdup(word);
2204 cur++;
2205 *cur = NULL;
2206 }
2207
2208 /* Find all other NORMAL forms of the 'word' (check only prefix) */
2209 pnode = Conf->Prefix;
2210 plevel = 0;
2211 while (pnode)
2212 {
2213 prefix = FindAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
2214 if (!prefix)
2215 break;
2216 for (j = 0; j < prefix->naff; j++)
2217 {
2218 if (CheckAffix(word, wrdlen, prefix->aff[j], flag, newword, NULL))
2219 {
2220 /* prefix success */
2221 if (FindWord(Conf, newword, prefix->aff[j]->flag, flag))
2222 cur += addToResult(forms, cur, newword);
2223 }
2224 }
2225 pnode = prefix->node;
2226 }
2227
2228 /*
2229 * Find all other NORMAL forms of the 'word' (check suffix and then
2230 * prefix)
2231 */
2232 while (snode)
2233 {
2234 int baselen = 0;
2235
2236 /* find possible suffix */
2237 suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
2238 if (!suffix)
2239 break;
2240 /* foreach suffix check affix */
2241 for (i = 0; i < suffix->naff; i++)
2242 {
2243 if (CheckAffix(word, wrdlen, suffix->aff[i], flag, newword, &baselen))
2244 {
2245 /* suffix success */
2246 if (FindWord(Conf, newword, suffix->aff[i]->flag, flag))
2247 cur += addToResult(forms, cur, newword);
2248
2249 /* now we will look changed word with prefixes */
2250 pnode = Conf->Prefix;
2251 plevel = 0;
2252 swrdlen = strlen(newword);
2253 while (pnode)
2254 {
2255 prefix = FindAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
2256 if (!prefix)
2257 break;
2258 for (j = 0; j < prefix->naff; j++)
2259 {
2260 if (CheckAffix(newword, swrdlen, prefix->aff[j], flag, pnewword, &baselen))
2261 {
2262 /* prefix success */
2263 const char *ff = (prefix->aff[j]->flagflags & suffix->aff[i]->flagflags & FF_CROSSPRODUCT) ?
2264 VoidString : prefix->aff[j]->flag;
2265
2266 if (FindWord(Conf, pnewword, ff, flag))
2267 cur += addToResult(forms, cur, pnewword);
2268 }
2269 }
2270 pnode = prefix->node;
2271 }
2272 }
2273 }
2274
2275 snode = suffix->node;
2276 }
2277
2278 if (cur == forms)
2279 {
2280 pfree(forms);
2281 return NULL;
2282 }
2283 return forms;
2284}
2285
/*
 * One way of splitting a word into stems; values are chained through `next`.
 * NOTE(review): listing lines 2288-2289 and 2291-2292 are missing from this
 * extract, so only the `stem` member is visible.  Other code in this file
 * also accesses `nstem`, `lenstem` and `next` on this type.
 */
2286typedef struct SplitVar
2287{
/* Array of stem strings; grown by doubling lenstem where stems are added */
2290 char **stem;
2293
2294static int
2295CheckCompoundAffixes(CMPDAffix **ptr, const char *word, int len, bool CheckInPlace)
2296{
2297 bool issuffix;
2298
2299 /* in case CompoundAffix is null: */
2300 if (*ptr == NULL)
2301 return -1;
2302
2303 if (CheckInPlace)
2304 {
2305 while ((*ptr)->affix)
2306 {
2307 if (len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len) == 0)
2308 {
2309 len = (*ptr)->len;
2310 issuffix = (*ptr)->issuffix;
2311 (*ptr)++;
2312 return (issuffix) ? len : 0;
2313 }
2314 (*ptr)++;
2315 }
2316 }
2317 else
2318 {
2319 char *affbegin;
2320
2321 while ((*ptr)->affix)
2322 {
2323 if (len > (*ptr)->len && (affbegin = strstr(word, (*ptr)->affix)) != NULL)
2324 {
2325 len = (*ptr)->len + (affbegin - word);
2326 issuffix = (*ptr)->issuffix;
2327 (*ptr)++;
2328 return (issuffix) ? len : 0;
2329 }
2330 (*ptr)++;
2331 }
2332 }
2333 return -1;
2334}
2335
2336static SplitVar *
2337CopyVar(SplitVar *s, int makedup)
2338{
2339 SplitVar *v = (SplitVar *) palloc(sizeof(SplitVar));
2340
2341 v->next = NULL;
2342 if (s)
2343 {
2344 int i;
2345
2346 v->lenstem = s->lenstem;
2347 v->stem = (char **) palloc(sizeof(char *) * v->lenstem);
2348 v->nstem = s->nstem;
2349 for (i = 0; i < s->nstem; i++)
2350 v->stem[i] = (makedup) ? pstrdup(s->stem[i]) : s->stem[i];
2351 }
2352 else
2353 {
2354 v->lenstem = 16;
2355 v->stem = (char **) palloc(sizeof(char *) * v->lenstem);
2356 v->nstem = 0;
2357 }
2358 return v;
2359}
2360
2361static void
/*
 * NOTE(review): listing line 2362 (function name and parameter list) is
 * missing from this extract.  Callers in this file invoke
 * AddStem(<SplitVar *>, <char *>), and the body uses `v` and `word`.
 *
 * Appends word to v->stem.  The pointer is stored as-is, not copied.
 */
2363{
/* Array full: double its capacity before storing */
2364 if (v->nstem >= v->lenstem)
2365 {
2366 v->lenstem *= 2;
2367 v->stem = (char **) repalloc(v->stem, sizeof(char *) * v->lenstem);
2368 }
2369
2370 v->stem[v->nstem] = word;
2371 v->nstem++;
2372}
2373
/*
 * Builds a linked list of SplitVar values, each holding one way of cutting
 * word into stems from startpos onwards.  orig supplies stems already chosen
 * by the caller and is duplicated, not modified.  snode is NULL for a fresh
 * start at the dictionary root, or a tree node to continue from, in which
 * case matching resumes at position minpos.
 */
2374static SplitVar *
2375SplitToVariants(IspellDict *Conf, SPNode *snode, SplitVar *orig, const char *word, int wordlen, int startpos, int minpos)
2376{
2377 SplitVar *var = NULL;
2378 SPNodeData *StopLow,
2379 *StopHigh,
2380 *StopMiddle = NULL;
2381 SPNode *node = (snode) ? snode : Conf->Dictionary;
2382 int level = (snode) ? minpos : startpos; /* recursive
2383 * minpos==level */
2384 int lenaff;
2385 CMPDAffix *caff;
2386 char *notprobed;
2387 int compoundflag = 0;
2388
2389 /* since this function recurses, it could be driven to stack overflow */
/*
 * NOTE(review): listing line 2390 is missing here; given the comment above
 * it is presumably a stack-depth check call -- confirm against the real
 * source.
 */
2391
/* notprobed[i] != 0 means no split ending at byte i has been recorded yet */
2392 notprobed = (char *) palloc(wordlen);
2393 memset(notprobed, 1, wordlen);
2394 var = CopyVar(orig, 1);
2395
2396 while (level < wordlen)
2397 {
2398 /* find word with epenthetic or/and compound affix */
/* The affix cursor restarts from the head of the list at every position */
2399 caff = Conf->CompoundAffix;
2400 while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level, (node) ? true : false)) >= 0)
2401 {
2402 /*
2403 * there is one of compound affixes, so check word for existings
2404 */
2405 char buf[MAXNORMLEN];
2406 char **subres;
2407
/* From here on lenaff is the candidate sub-word length from startpos */
2408 lenaff = level - startpos + lenaff;
2409
/* A split ending at this byte was already recorded */
2410 if (!notprobed[startpos + lenaff - 1])
2411 continue;
2412
2413 if (level + lenaff - 1 <= minpos)
2414 continue;
2415
2416 if (lenaff >= MAXNORMLEN)
2417 continue; /* skip too big value */
2418 if (lenaff > 0)
2419 memcpy(buf, word + startpos, lenaff);
2420 buf[lenaff] = '\0';
2421
/*
 * NOTE(review): the enclosing loop requires level > startpos, so
 * level == 0 can hold here only if startpos is negative -- confirm
 * whether testing startpos (as the dictionary-walk branch below does)
 * was intended.
 */
2422 if (level == 0)
2423 compoundflag = FF_COMPOUNDBEGIN;
2424 else if (level == wordlen - 1)
2425 compoundflag = FF_COMPOUNDLAST;
2426 else
2427 compoundflag = FF_COMPOUNDMIDDLE;
2428 subres = NormalizeSubWord(Conf, buf, compoundflag);
2429 if (subres)
2430 {
2431 /* Yes, it was a word from dictionary */
/* `new` shares var's stem strings (makedup == 0), so only its array is freed */
2432 SplitVar *new = CopyVar(var, 0);
2433 SplitVar *ptr = var;
2434 char **sptr = subres;
2435
2436 notprobed[startpos + lenaff - 1] = 0;
2437
2438 while (*sptr)
2439 {
2440 AddStem(new, *sptr);
2441 sptr++;
2442 }
2443 pfree(subres);
2444
/* Append the variants found by the recursive call to the end of the list */
2445 while (ptr->next)
2446 ptr = ptr->next;
2447 ptr->next = SplitToVariants(Conf, NULL, new, word, wordlen, startpos + lenaff, startpos + lenaff);
2448
2449 pfree(new->stem);
2450 pfree(new);
2451 }
2452 }
2453
/* The dictionary walk has already failed; nothing more can match */
2454 if (!node)
2455 break;
2456
/* Binary search for the byte word[level] among this node's children */
2457 StopLow = node->data;
2458 StopHigh = node->data + node->length;
2459 while (StopLow < StopHigh)
2460 {
2461 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
2462 if (StopMiddle->val == ((uint8 *) (word))[level])
2463 break;
2464 else if (StopMiddle->val < ((uint8 *) (word))[level])
2465 StopLow = StopMiddle + 1;
2466 else
2467 StopHigh = StopMiddle;
2468 }
2469
/* StopLow < StopHigh here means the search stopped on a match */
2470 if (StopLow < StopHigh)
2471 {
2472 if (startpos == 0)
2473 compoundflag = FF_COMPOUNDBEGIN;
2474 else if (level == wordlen - 1)
2475 compoundflag = FF_COMPOUNDLAST;
2476 else
2477 compoundflag = FF_COMPOUNDMIDDLE;
2478
2479 /* find infinitive */
2480 if (StopMiddle->isword &&
2481 (StopMiddle->compoundflag & compoundflag) &&
2482 notprobed[level])
2483 {
2484 /* ok, we found full compoundallowed word */
2485 if (level > minpos)
2486 {
2487 /* and its length more than minimal */
2488 if (wordlen == level + 1)
2489 {
2490 /* well, it was last word */
2491 AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
2492 pfree(notprobed);
2493 return var;
2494 }
2495 else
2496 {
2497 /* then we will search more big word at the same point */
2498 SplitVar *ptr = var;
2499
2500 while (ptr->next)
2501 ptr = ptr->next;
2502 ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
2503 /* we can find next word */
/* Accept the stem ending here and restart from the dictionary root */
2504 level++;
2505 AddStem(var, pnstrdup(word + startpos, level - startpos));
2506 node = Conf->Dictionary;
2507 startpos = level;
2508 continue;
2509 }
2510 }
2511 }
2512 node = StopMiddle->node;
2513 }
2514 else
2515 node = NULL;
2516 level++;
2517 }
2518
/* Whatever remains from startpos to the end becomes the final stem */
2519 AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
2520 pfree(notprobed);
2521 return var;
2522}
2523
2524static void
2525addNorm(TSLexeme **lres, TSLexeme **lcur, char *word, int flags, uint16 NVariant)
2526{
2527 if (*lres == NULL)
2528 *lcur = *lres = (TSLexeme *) palloc(MAX_NORM * sizeof(TSLexeme));
2529
2530 if (*lcur - *lres < MAX_NORM - 1)
2531 {
2532 (*lcur)->lexeme = word;
2533 (*lcur)->flags = flags;
2534 (*lcur)->nvariant = NVariant;
2535 (*lcur)++;
2536 (*lcur)->lexeme = NULL;
2537 }
2538}
2539
2540TSLexeme *
/*
 * NOTE(review): listing line 2541 (function name and parameter list) is
 * missing from this extract.  The body uses `Conf` and `word`.
 *
 * Returns an array of lexemes terminated by an entry with a NULL lexeme, or
 * NULL if nothing was found.  Whole-word base forms come first, each with
 * its own variant number; if Conf->usecompound is set, compound splits
 * follow, with all parts of one split sharing a variant number.
 */
2542{
2543 char **res;
2544 TSLexeme *lcur = NULL,
2545 *lres = NULL;
2546 uint16 NVariant = 1;
2547
/* First try the word as a whole (flag 0 = not part of a compound) */
2548 res = NormalizeSubWord(Conf, word, 0);
2549
2550 if (res)
2551 {
2552 char **ptr = res;
2553
2554 while (*ptr && (lcur - lres) < MAX_NORM)
2555 {
2556 addNorm(&lres, &lcur, *ptr, 0, NVariant++);
2557 ptr++;
2558 }
/* Only the pointer array is freed; the strings are now held by lres */
2559 pfree(res);
2560 }
2561
2562 if (Conf->usecompound)
2563 {
2564 int wordlen = strlen(word);
2565 SplitVar *ptr,
2566 *var = SplitToVariants(Conf, NULL, NULL, word, wordlen, 0, -1);
2567 int i;
2568
/* Walk the list of split variants, freeing each one after use */
2569 while (var)
2570 {
/* A variant with a single stem is not a compound split */
2571 if (var->nstem > 1)
2572 {
/* Only the last stem is normalized; the leading stems are emitted as-is */
2573 char **subres = NormalizeSubWord(Conf, var->stem[var->nstem - 1], FF_COMPOUNDLAST);
2574
2575 if (subres)
2576 {
2577 char **subptr = subres;
2578
2579 while (*subptr)
2580 {
/*
 * The first normal form reuses the stem strings themselves; later
 * forms get their own copies so each result entry owns its string.
 */
2581 for (i = 0; i < var->nstem - 1; i++)
2582 {
2583 addNorm(&lres, &lcur, (subptr == subres) ? var->stem[i] : pstrdup(var->stem[i]), 0, NVariant);
2584 }
2585
2586 addNorm(&lres, &lcur, *subptr, 0, NVariant);
2587 subptr++;
2588 NVariant++;
2589 }
2590
2591 pfree(subres);
/*
 * The leading stems now belong to the result.  Setting stem[0] to NULL
 * makes the cleanup loop below stop at once; the last stem, replaced by
 * its normal forms, is freed here.
 */
2592 var->stem[0] = NULL;
2593 pfree(var->stem[var->nstem - 1]);
2594 }
2595 }
2596
/* Frees stems still owned by the variant (none if stem[0] was cleared) */
2597 for (i = 0; i < var->nstem && var->stem[i]; i++)
2598 pfree(var->stem[i]);
2599 ptr = var->next;
2600 pfree(var->stem);
2601 pfree(var);
2602 var = ptr;
2603 }
2604 }
2605
2606 return lres;
2607}
unsigned char symbol
Definition: api.h:2
static int32 next
Definition: blutils.c:219
#define MAXALIGN(LEN)
Definition: c.h:765
uint8_t uint8
Definition: c.h:483
#define Assert(condition)
Definition: c.h:812
int32_t int32
Definition: c.h:481
uint16_t uint16
Definition: c.h:484
uint32_t uint32
Definition: c.h:485
struct cursor * cur
Definition: ecpg.c:29
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
void err(int eval, const char *fmt,...)
Definition: err.c:43
char * str_tolower(const char *buff, size_t nbytes, Oid collid)
Definition: formatting.c:1591
return str start
const char * str
static const FormData_pg_attribute a1
Definition: heap.c:142
static const FormData_pg_attribute a2
Definition: heap.c:155
long val
Definition: informix.c:689
int j
Definition: isn.c:73
int i
Definition: isn.c:72
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:76
unsigned int pg_wchar
Definition: mbprint.c:31
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
Definition: mbutils.c:986
int pg_mblen(const char *mbstr)
Definition: mbutils.c:1023
char * pstrdup(const char *in)
Definition: mcxt.c:1696
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1541
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc0(Size size)
Definition: mcxt.c:1347
void * palloc(Size size)
Definition: mcxt.c:1317
MemoryContext CurTransactionContext
Definition: mcxt.c:155
char * pnstrdup(const char *in, Size len)
Definition: mcxt.c:1707
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:454
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:160
const void size_t len
const void * data
static char * filename
Definition: pg_dumpall.c:119
static XLogRecPtr startpos
static char * buf
Definition: pg_test_fsync.c:72
#define sprintf
Definition: port.h:240
#define qsort(a, b, c, d)
Definition: port.h:447
char * c
char * s1
char * s2
MemoryContextSwitchTo(old_ctx)
static void prefixes(struct vars *v)
Definition: regc_lex.c:99
int pg_regcomp(regex_t *re, const chr *string, size_t len, int flags, Oid collation)
Definition: regcomp.c:372
static void word(struct vars *v, int dir, struct state *lp, struct state *rp)
Definition: regcomp.c:1476
size_t pg_regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
Definition: regerror.c:60
#define REG_ADVANCED
Definition: regex.h:181
#define REG_OKAY
Definition: regex.h:215
#define REG_NOSUB
Definition: regex.h:185
#define regex_t
Definition: regex.h:245
static int find(struct vars *v, struct cnfa *cnfa, struct colormap *cm)
Definition: regexec.c:419
int pg_regexec(regex_t *re, const chr *string, size_t len, size_t search_start, rm_detail_t *details, size_t nmatch, regmatch_t pmatch[], int flags)
Definition: regexec.c:185
void RS_compile(Regis *r, bool issuffix, const char *str)
Definition: regis.c:85
bool RS_execute(Regis *r, char *str)
Definition: regis.c:213
bool RS_isRegis(const char *str)
Definition: regis.c:31
static pg_noinline void Size size
Definition: slab.c:607
void NIStartBuild(IspellDict *Conf)
Definition: spell.c:89
#define GETWCHAR(W, L, N, T)
Definition: spell.c:192
static int strbcmp(const unsigned char *s1, const unsigned char *s2)
Definition: spell.c:258
void NIFinishBuild(IspellDict *Conf)
Definition: spell.c:104
void NIImportAffixes(IspellDict *Conf, const char *filename)
Definition: spell.c:1426
static char * cpstrdup(IspellDict *Conf, const char *str)
Definition: spell.c:163
#define GETCHAR(A, N, T)
Definition: spell.c:193
static int parse_ooaffentry(char *str, char *type, char *flag, char *find, char *repl, char *mask)
Definition: spell.c:858
static const char * getAffixFlagSet(IspellDict *Conf, char *s)
Definition: spell.c:1160
static SPNode * mkSPNode(IspellDict *Conf, int low, int high, int level)
Definition: spell.c:1641
static int cmpspell(const void *s1, const void *s2)
Definition: spell.c:198
#define MAX_NORM
Definition: spell.c:188
#define PAE_WAIT_REPL
Definition: spell.c:776
static bool get_nextfield(char **str, char *next)
Definition: spell.c:792
void NISortDictionary(IspellDict *Conf)
Definition: spell.c:1723
static int FindWord(IspellDict *Conf, const char *word, const char *affixflag, int flag)
Definition: spell.c:603
static AffixNodeData * FindAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
Definition: spell.c:2029
static char * CheckAffix(const char *word, size_t len, AFFIX *Affix, int flagflags, char *newword, int *baselen)
Definition: spell.c:2072
static SplitVar * CopyVar(SplitVar *s, int makedup)
Definition: spell.c:2337
static void NIAddSpell(IspellDict *Conf, const char *word, const char *flag)
Definition: spell.c:487
static void NIImportOOAffixes(IspellDict *Conf, const char *filename)
Definition: spell.c:1197
#define COMPACT_ALLOC_CHUNK
Definition: spell.c:127
static void addNorm(TSLexeme **lres, TSLexeme **lcur, char *word, int flags, uint16 NVariant)
Definition: spell.c:2525
static void NIAddAffix(IspellDict *Conf, const char *flag, char flagflags, const char *mask, const char *find, const char *repl, int type)
Definition: spell.c:678
static char * findchar(char *str, int c)
Definition: spell.c:230
static const char * VoidString
Definition: spell.c:195
static char ** NormalizeSubWord(IspellDict *Conf, const char *word, int flag)
Definition: spell.c:2177
static int CheckCompoundAffixes(CMPDAffix **ptr, const char *word, int len, bool CheckInPlace)
Definition: spell.c:2295
#define MAXNORMLEN
Definition: spell.c:189
#define STRNCMP(s, p)
Definition: spell.c:191
static void getNextFlagFromString(IspellDict *Conf, const char **sflagset, char *sflag)
Definition: spell.c:350
void NISortAffixes(IspellDict *Conf)
Definition: spell.c:1977
static int cmpcmdflag(const void *f1, const void *f2)
Definition: spell.c:211
static char * findchar2(char *str, int c1, int c2)
Definition: spell.c:243
#define PAE_INREPL
Definition: spell.c:777
struct SplitVar SplitVar
static void setCompoundAffixFlagValue(IspellDict *Conf, CompoundAffixFlag *entry, char *s, uint32 val)
Definition: spell.c:1032
void NIImportDictionary(IspellDict *Conf, const char *filename)
Definition: spell.c:518
static AffixNode * mkANode(IspellDict *Conf, int low, int high, int level, int type)
Definition: spell.c:1831
static bool IsAffixFlagInUse(IspellDict *Conf, int affix, const char *affixflag)
Definition: spell.c:455
static bool parse_affentry(char *str, char *mask, char *find, char *repl)
Definition: spell.c:914
static SplitVar * SplitToVariants(IspellDict *Conf, SPNode *snode, SplitVar *orig, const char *word, int wordlen, int startpos, int minpos)
Definition: spell.c:2375
static uint32 makeCompoundFlags(IspellDict *Conf, int affix)
Definition: spell.c:1624
static int cmpspellaffix(const void *s1, const void *s2)
Definition: spell.c:204
#define cpalloc(size)
Definition: spell.c:159
static int strbncmp(const unsigned char *s1, const unsigned char *s2, size_t count)
Definition: spell.c:281
TSLexeme * NINormalizeWord(IspellDict *Conf, const char *word)
Definition: spell.c:2541
#define tmpalloc(sz)
Definition: spell.c:80
#define cpalloc0(size)
Definition: spell.c:160
static int MergeAffix(IspellDict *Conf, int a1, int a2)
Definition: spell.c:1573
#define PAE_WAIT_FIND
Definition: spell.c:774
static void * compact_palloc0(IspellDict *Conf, size_t size)
Definition: spell.c:131
static int getCompoundAffixFlagValue(IspellDict *Conf, const char *s)
Definition: spell.c:1124
static void AddStem(SplitVar *v, char *word)
Definition: spell.c:2362
#define PAE_INMASK
Definition: spell.c:773
#define PAE_INFIND
Definition: spell.c:775
static int addToResult(char **forms, char **cur, char *word)
Definition: spell.c:2162
#define COMPACT_MAX_REQ
Definition: spell.c:128
#define PAE_WAIT_FLAG
Definition: spell.c:779
#define PAE_WAIT_MASK
Definition: spell.c:772
static void mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix)
Definition: spell.c:1908
static bool isAffixInUse(IspellDict *Conf, const char *affixflag)
Definition: spell.c:1962
static void addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val)
Definition: spell.c:1067
static char * lowerstr_ctx(IspellDict *Conf, const char *src)
Definition: spell.c:176
#define PAE_WAIT_TYPE
Definition: spell.c:778
static int cmpaffix(const void *s1, const void *s2)
Definition: spell.c:312
#define FLAGNUM_MAXSIZE
Definition: spell.h:182
#define FF_SUFFIX
Definition: spell.h:121
#define FF_COMPOUNDFLAG
Definition: spell.h:46
#define FF_PREFIX
Definition: spell.h:122
#define ANHRDSZ
Definition: spell.h:145
#define FF_COMPOUNDFLAGMASK
Definition: spell.h:48
#define SPELLHDRSZ
Definition: spell.h:82
#define FF_COMPOUNDFORBIDFLAG
Definition: spell.h:114
#define FF_COMPOUNDBEGIN
Definition: spell.h:43
#define FF_COMPOUNDPERMITFLAG
Definition: spell.h:113
#define FF_CROSSPRODUCT
Definition: spell.h:115
#define FF_COMPOUNDMIDDLE
Definition: spell.h:44
@ FM_LONG
Definition: spell.h:160
@ FM_CHAR
Definition: spell.h:159
@ FM_NUM
Definition: spell.h:161
#define SPNHDRSZ
Definition: spell.h:56
#define FF_COMPOUNDONLY
Definition: spell.h:42
#define FF_COMPOUNDLAST
Definition: spell.h:45
int f1[ARRAY_SIZE]
Definition: sql-declare.c:113
int f2[ARRAY_SIZE]
Definition: sql-declare.c:116
void check_stack_depth(void)
Definition: stack_depth.c:95
uint32 naff
Definition: spell.h:133
AFFIX ** aff
Definition: spell.h:134
uint32 val
Definition: spell.h:132
struct AffixNode * node
Definition: spell.h:135
uint32 isvoid
Definition: spell.h:140
AffixNodeData data[FLEXIBLE_ARRAY_MEMBER]
Definition: spell.h:142
uint32 length
Definition: spell.h:141
int len
Definition: spell.h:150
bool issuffix
Definition: spell.h:151
const char * affix
Definition: spell.h:149
uint32 value
Definition: spell.h:179
union CompoundAffixFlag::@135 flag
FlagMode flagMode
Definition: spell.h:178
const char * s
Definition: spell.h:173
int maffixes
Definition: spell.h:186
int lenAffixData
Definition: spell.h:196
MemoryContext buildCxt
Definition: spell.h:220
int mspell
Definition: spell.h:225
AffixNode * Suffix
Definition: spell.h:190
int naffixes
Definition: spell.h:187
bool usecompound
Definition: spell.h:202
CompoundAffixFlag * CompoundAffixFlags
Definition: spell.h:210
AFFIX * Affix
Definition: spell.h:188
int nAffixData
Definition: spell.h:197
int nCompoundAffixFlag
Definition: spell.h:212
CMPDAffix * CompoundAffix
Definition: spell.h:200
bool useFlagAliases
Definition: spell.h:198
SPNode * Dictionary
Definition: spell.h:193
int mCompoundAffixFlag
Definition: spell.h:214
int nspell
Definition: spell.h:224
char * firstfree
Definition: spell.h:228
const char ** AffixData
Definition: spell.h:195
FlagMode flagMode
Definition: spell.h:203
size_t avail
Definition: spell.h:229
AffixNode * Prefix
Definition: spell.h:191
SPELL ** Spell
Definition: spell.h:223
struct SPNode * node
Definition: spell.h:35
uint32 val
Definition: spell.h:29
uint32 compoundflag
Definition: spell.h:32
uint32 isword
Definition: spell.h:30
uint32 affix
Definition: spell.h:34
Definition: spell.h:51
SPNodeData data[FLEXIBLE_ARRAY_MEMBER]
Definition: spell.h:53
uint32 length
Definition: spell.h:52
int nstem
Definition: spell.c:2288
struct SplitVar * next
Definition: spell.c:2291
int lenstem
Definition: spell.c:2289
char ** stem
Definition: spell.c:2290
const char * find
Definition: spell.h:96
uint32 isregis
Definition: spell.h:94
uint32 type
Definition: spell.h:91
Regis regis
Definition: spell.h:106
const char * flag
Definition: spell.h:89
uint32 replen
Definition: spell.h:95
regex_t * pregex
Definition: spell.h:105
uint32 flagflags
Definition: spell.h:92
const char * repl
Definition: spell.h:97
uint32 issimple
Definition: spell.h:93
union aff_struct::@134 reg
int len
Definition: spell.h:76
struct spell_struct::@132::@133 d
union spell_struct::@132 p
const char * flag
Definition: spell.h:69
char word[FLEXIBLE_ARRAY_MEMBER]
Definition: spell.h:79
int affix
Definition: spell.h:74
Definition: regguts.h:323
char * flag(int b)
Definition: test-ctype.c:33
bool tsearch_readline_begin(tsearch_readline_state *stp, const char *filename)
Definition: ts_locale.c:89
char * tsearch_readline(tsearch_readline_state *stp)
Definition: ts_locale.c:112
int t_isalpha(const char *ptr)
Definition: ts_locale.c:35
void tsearch_readline_end(tsearch_readline_state *stp)
Definition: ts_locale.c:157
#define t_iseq(x, c)
Definition: ts_locale.h:38
#define COPYCHAR(d, s)
Definition: ts_locale.h:40
const char * type