PostgreSQL Source Code git master
Loading...
Searching...
No Matches
spell.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * spell.c
4 * Normalizing word with ISpell
5 *
6 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 *
8 * Ispell dictionary
9 * -----------------
10 *
11 * Rules of dictionaries are defined in two files with .affix and .dict
12 * extensions. They are used by spell checker programs Ispell and Hunspell.
13 *
14 * An .affix file declares morphological rules to get a basic form of words.
15 * The format of an .affix file has different structure for Ispell and Hunspell
16 * dictionaries. The Hunspell format is more complicated. But when an .affix
17 * file is imported and compiled, it is stored in the same structure AffixNode.
18 *
19 * A .dict file stores a list of basic forms of words with references to
20 * affix rules. The format of a .dict file has the same structure for Ispell
21 * and Hunspell dictionaries.
22 *
23 * Compilation of a dictionary
24 * ---------------------------
25 *
26 * A compiled dictionary is stored in the IspellDict structure. Compilation of
27 * a dictionary is divided into several steps:
28 * - NIImportDictionary() - stores each word of a .dict file in the
29 * temporary Spell field.
30 * - NIImportAffixes() - stores affix rules of an .affix file in the
31 * Affix field (not temporary) if an .affix file has the Ispell format.
32 * -> NIImportOOAffixes() - stores affix rules if an .affix file has the
33 * Hunspell format. The AffixData field is initialized if AF parameter
34 * is defined.
35 * - NISortDictionary() - builds a prefix tree (Trie) from the words list
36 * and stores it in the Dictionary field. The words list is got from the
37 * Spell field. The AffixData field is initialized if AF parameter is not
38 * defined.
39 * - NISortAffixes():
40 * - builds a list of compound affixes from the affix list and stores it
41 * in the CompoundAffix.
42 * - builds prefix trees (Trie) from the affix list for prefixes and suffixes
43 * and stores them in Suffix and Prefix fields.
44 * The affix list is got from the Affix field.
45 *
46 * Memory management
47 * -----------------
48 *
49 * The IspellDict structure has the Spell field which is used only in compile
50 * time. The Spell field stores a words list. It can take a lot of memory.
51 * Therefore when a dictionary is compiled this field is cleared by
52 * NIFinishBuild().
53 *
54 * All resources which should be cleared by NIFinishBuild() are initialized
55 * using tmpalloc() and tmpalloc0().
56 *
57 * IDENTIFICATION
58 * src/backend/tsearch/spell.c
59 *
60 *-------------------------------------------------------------------------
61 */
62
63#include "postgres.h"
64
66#include "miscadmin.h"
67#include "tsearch/dicts/spell.h"
68#include "tsearch/ts_locale.h"
69#include "utils/formatting.h"
70#include "utils/memutils.h"
71
72
73/*
74 * Initialization requires a lot of memory that's not needed
75 * after the initialization is done. During initialization,
76 * CurrentMemoryContext is the long-lived memory context associated
77 * with the dictionary cache entry. We keep the short-lived stuff
78 * in the Conf->buildCxt context.
79 */
80#define tmpalloc(sz) MemoryContextAlloc(Conf->buildCxt, (sz))
81#define tmpalloc0(sz) MemoryContextAllocZero(Conf->buildCxt, (sz))
82
83/*
84 * Prepare for constructing an ISpell dictionary.
85 *
86 * The IspellDict struct is assumed to be zeroed when allocated.
87 */
88void
90{
91 /*
92 * The temp context is a child of CurTransactionContext, so that it will
93 * go away automatically on error.
94 */
96 "Ispell dictionary init context",
98}
99
100/*
101 * Clean up when dictionary construction is complete.
102 */
103void
105{
106 /* Release no-longer-needed temp memory */
107 MemoryContextDelete(Conf->buildCxt);
108 /* Just for cleanliness, zero the now-dangling pointers */
109 Conf->buildCxt = NULL;
110 Conf->Spell = NULL;
111 Conf->firstfree = NULL;
112 Conf->CompoundAffixFlags = NULL;
113}
114
115
116/*
117 * "Compact" palloc: allocate without extra palloc overhead.
118 *
119 * Since we have no need to free the ispell data items individually, there's
120 * not much value in the per-chunk overhead normally consumed by palloc.
121 * Getting rid of it is helpful since ispell can allocate a lot of small nodes.
122 *
123 * We currently pre-zero all data allocated this way, even though some of it
124 * doesn't need that. The cpalloc and cpalloc0 macros are just documentation
125 * to indicate which allocations actually require zeroing.
126 */
127#define COMPACT_ALLOC_CHUNK 8192 /* amount to get from palloc at once */
128#define COMPACT_MAX_REQ 1024 /* must be < COMPACT_ALLOC_CHUNK */
129
130static void *
132{
133 void *result;
134
135 /* Should only be called during init */
136 Assert(Conf->buildCxt != NULL);
137
138 /* No point in this for large chunks */
139 if (size > COMPACT_MAX_REQ)
140 return palloc0(size);
141
142 /* Keep everything maxaligned */
143 size = MAXALIGN(size);
144
145 /* Need more space? */
146 if (size > Conf->avail)
147 {
148 Conf->firstfree = palloc0(COMPACT_ALLOC_CHUNK);
149 Conf->avail = COMPACT_ALLOC_CHUNK;
150 }
151
152 result = Conf->firstfree;
153 Conf->firstfree += size;
154 Conf->avail -= size;
155
156 return result;
157}
158
159#define cpalloc(size) compact_palloc0(Conf, size)
160#define cpalloc0(size) compact_palloc0(Conf, size)
161
162static char *
164{
165 char *res = cpalloc(strlen(str) + 1);
166
167 strcpy(res, str);
168 return res;
169}
170
171
172/*
173 * Apply str_tolower(), producing a temporary result (in the buildCxt).
174 */
175static char *
176lowerstr_ctx(IspellDict *Conf, const char *src)
177{
179 char *dst;
180
184
185 return dst;
186}
187
188#define MAX_NORM 1024
189#define MAXNORMLEN 256
190
/*
 * STRNCMP: compare the leading strlen(p) bytes of s against p, i.e. a
 * "does s start with p" test (result is 0 on a match).
 *
 * GETWCHAR: fetch byte N of the string W of length L, counting from the
 * start when T is FF_PREFIX and from the end otherwise.
 *
 * GETCHAR: GETWCHAR applied to the repl/replen fields of the affix A.
 */
191#define STRNCMP(s,p) strncmp( (s), (p), strlen(p) )
192#define GETWCHAR(W,L,N,T) ( ((const uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] )
193#define GETCHAR(A,N,T) GETWCHAR( (A)->repl, (A)->replen, N, T )
194
/* Shared empty string, used where an affix field has no content */
195static const char *VoidString = "";
196
197static int
198cmpspell(const void *s1, const void *s2)
199{
200 return strcmp((*(SPELL *const *) s1)->word, (*(SPELL *const *) s2)->word);
201}
202
203static int
204cmpspellaffix(const void *s1, const void *s2)
205{
206 return strcmp((*(SPELL *const *) s1)->p.flag,
207 (*(SPELL *const *) s2)->p.flag);
208}
209
210static int
211cmpcmdflag(const void *f1, const void *f2)
212{
213 const CompoundAffixFlag *fv1 = f1;
214 const CompoundAffixFlag *fv2 = f2;
215
216 Assert(fv1->flagMode == fv2->flagMode);
217
218 if (fv1->flagMode == FM_NUM)
219 {
220 if (fv1->flag.i == fv2->flag.i)
221 return 0;
222
223 return (fv1->flag.i > fv2->flag.i) ? 1 : -1;
224 }
225
226 return strcmp(fv1->flag.s, fv2->flag.s);
227}
228
/*
 * Finds the first character of str that t_iseq() reports as equal to c.
 *
 * The string is walked one (possibly multibyte) character at a time, using
 * pg_mblen_cstr() to step over each character.
 *
 * Returns a pointer to the matching character, or NULL if the end of the
 * string is reached without a match.
 */
static char *
findchar(char *str, int c)
{
	while (*str)
	{
		if (t_iseq(str, c))
			return str;

		/* Advance to the next character; without this the loop never ends */
		str += pg_mblen_cstr(str);
	}

	return NULL;
}
241
/*
 * Finds the first character of str that t_iseq() reports as equal to
 * either c1 or c2.
 *
 * The string is walked one (possibly multibyte) character at a time, using
 * pg_mblen_cstr() to step over each character.
 *
 * Returns a pointer to the matching character, or NULL if the end of the
 * string is reached without a match.
 */
static char *
findchar2(char *str, int c1, int c2)
{
	while (*str)
	{
		if (t_iseq(str, c1) || t_iseq(str, c2))
			return str;

		/* Advance to the next character; without this the loop never ends */
		str += pg_mblen_cstr(str);
	}

	return NULL;
}
254
255
256/* backward string compare for suffix tree operations */
/*
 * Compares two NUL-terminated strings byte by byte starting from their last
 * bytes and moving toward the front.
 *
 * Returns -1, 0 or 1.  If one string is a proper suffix of the other, the
 * shorter string sorts first.
 */
static int
strbcmp(const unsigned char *s1, const unsigned char *s2)
{
	int			i1 = strlen((const char *) s1) - 1;
	int			i2 = strlen((const char *) s2) - 1;

	for (; i1 >= 0 && i2 >= 0; i1--, i2--)
	{
		if (s1[i1] != s2[i2])
			return (s1[i1] < s2[i2]) ? -1 : 1;
	}

	/* All compared bytes matched: decide on what is left over */
	if (i1 != i2)
		return (i1 < i2) ? -1 : 1;

	return 0;
}
279
/*
 * Backward counterpart of strncmp(): compares at most count bytes of two
 * NUL-terminated strings, starting from their last bytes and moving toward
 * the front.
 *
 * Returns -1, 0 or 1.  If count bytes were compared without finding a
 * difference the strings are considered equal; otherwise, if one string
 * runs out first, the shorter one sorts first.
 *
 * Lengths and the remaining count are kept as size_t so that a large count
 * (above INT_MAX) is not truncated into a negative number, which would skip
 * the comparison loop entirely.
 */
static int
strbncmp(const unsigned char *s1, const unsigned char *s2, size_t count)
{
	size_t		len1 = strlen((const char *) s1);
	size_t		len2 = strlen((const char *) s2);

	while (len1 > 0 && len2 > 0 && count > 0)
	{
		unsigned char c1 = s1[len1 - 1];
		unsigned char c2 = s2[len2 - 1];

		if (c1 != c2)
			return (c1 < c2) ? -1 : 1;

		len1--;
		len2--;
		count--;
	}

	/* Compared the requested number of bytes without a difference */
	if (count == 0)
		return 0;

	/* One string was exhausted first: the shorter one sorts first */
	if (len1 != len2)
		return (len1 < len2) ? -1 : 1;

	return 0;
}
305
306/*
307 * Compares affixes.
308 * First compares the type of an affix. Prefixes should go before suffixes.
309 * If types are equal then compares replaceable string.
310 */
311static int
312cmpaffix(const void *s1, const void *s2)
313{
314 const AFFIX *a1 = (const AFFIX *) s1;
315 const AFFIX *a2 = (const AFFIX *) s2;
316
317 if (a1->type < a2->type)
318 return -1;
319 if (a1->type > a2->type)
320 return 1;
321 if (a1->type == FF_PREFIX)
322 return strcmp(a1->repl, a2->repl);
323 else
324 return strbcmp((const unsigned char *) a1->repl,
325 (const unsigned char *) a2->repl);
326}
327
328/*
329 * Gets an affix flag from the set of affix flags (sflagset).
330 *
331 * Several flags can be stored in a single string. Flags can be represented by:
332 * - 1 character (FM_CHAR). A character may be Unicode.
333 * - 2 characters (FM_LONG). A character may be Unicode.
334 * - numbers from 1 to 65000 (FM_NUM).
335 *
336 * Depending on the flagMode an affix string can have the following format:
337 * - FM_CHAR: ABCD
338 * Here we have 4 flags: A, B, C and D
339 * - FM_LONG: ABCDE*
340 * Here we have 3 flags: AB, CD and E*
341 * - FM_NUM: 200,205,50
342 * Here we have 3 flags: 200, 205 and 50
343 *
344 * Conf: current dictionary.
345 * sflagset: the set of affix flags. Returns a reference to the start of a next
346 * affix flag.
347 * sflag: returns an affix flag from sflagset.
348 */
349static void
351{
352 int32 s;
353 char *next;
354 const char *sbuf = *sflagset;
355 int maxstep;
356 int clen;
357 bool stop = false;
358 bool met_comma = false;
359
360 maxstep = (Conf->flagMode == FM_LONG) ? 2 : 1;
361
362 while (**sflagset)
363 {
364 switch (Conf->flagMode)
365 {
366 case FM_LONG:
367 case FM_CHAR:
369 sflag += clen;
370
371 /* Go to start of the next flag */
372 *sflagset += clen;
373
374 /* Check if we get all characters of flag */
375 maxstep--;
376 stop = (maxstep == 0);
377 break;
378 case FM_NUM:
379 errno = 0;
380 s = strtol(*sflagset, &next, 10);
381 if (*sflagset == next || errno == ERANGE)
384 errmsg("invalid affix flag \"%s\"", *sflagset)));
388 errmsg("affix flag \"%s\" is out of range",
389 *sflagset)));
390 sflag += sprintf(sflag, "%0d", s);
391
392 /* Go to start of the next flag */
393 *sflagset = next;
394 while (**sflagset)
395 {
396 if (isdigit((unsigned char) **sflagset))
397 {
398 if (!met_comma)
401 errmsg("invalid affix flag \"%s\"",
402 *sflagset)));
403 break;
404 }
405 else if (t_iseq(*sflagset, ','))
406 {
407 if (met_comma)
410 errmsg("invalid affix flag \"%s\"",
411 *sflagset)));
412 met_comma = true;
413 }
414 else if (!isspace((unsigned char) **sflagset))
415 {
418 errmsg("invalid character in affix flag \"%s\"",
419 *sflagset)));
420 }
421
423 }
424 stop = true;
425 break;
426 default:
427 elog(ERROR, "unrecognized type of Conf->flagMode: %d",
428 Conf->flagMode);
429 }
430
431 if (stop)
432 break;
433 }
434
435 if (Conf->flagMode == FM_LONG && maxstep > 0)
438 errmsg("invalid affix flag \"%s\" with \"long\" flag value",
439 sbuf)));
440
441 *sflag = '\0';
442}
443
444/*
445 * Checks if the affix set Conf->AffixData[affix] contains affixflag.
446 * Conf->AffixData[affix] does not contain affixflag if this flag is not used
447 * actually by the .dict file.
448 *
449 * Conf: current dictionary.
450 * affix: index of the Conf->AffixData array.
451 * affixflag: the affix flag.
452 *
453 * Returns true if the string Conf->AffixData[affix] contains affixflag,
454 * otherwise returns false.
455 */
456static bool
457IsAffixFlagInUse(IspellDict *Conf, int affix, const char *affixflag)
458{
459 const char *flagcur;
460 char flag[BUFSIZ];
461
462 if (*affixflag == 0)
463 return true;
464
465 Assert(affix < Conf->nAffixData);
466
467 flagcur = Conf->AffixData[affix];
468
469 while (*flagcur)
470 {
472 /* Compare first affix flag in flagcur with affixflag */
473 if (strcmp(flag, affixflag) == 0)
474 return true;
475 }
476
477 /* Could not find affixflag */
478 return false;
479}
480
481/*
482 * Adds the new word into the temporary array Spell.
483 *
484 * Conf: current dictionary.
485 * word: new word.
486 * flag: set of affix flags. A single flag can be obtained with getNextFlagFromString().
487 */
488static void
489NIAddSpell(IspellDict *Conf, const char *word, const char *flag)
490{
491 if (Conf->nspell >= Conf->mspell)
492 {
493 if (Conf->mspell)
494 {
495 Conf->mspell *= 2;
496 Conf->Spell = (SPELL **) repalloc(Conf->Spell, Conf->mspell * sizeof(SPELL *));
497 }
498 else
499 {
500 Conf->mspell = 1024 * 20;
501 Conf->Spell = (SPELL **) tmpalloc(Conf->mspell * sizeof(SPELL *));
502 }
503 }
504 Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
505 strcpy(Conf->Spell[Conf->nspell]->word, word);
506 Conf->Spell[Conf->nspell]->p.flag = (*flag != '\0')
508 Conf->nspell++;
509}
510
511/*
512 * Imports dictionary into the temporary array Spell.
513 *
514 * Note caller must already have applied get_tsearch_config_filename.
515 *
516 * Conf: current dictionary.
517 * filename: path to the .dict file.
518 */
519void
521{
523 char *line;
524
528 errmsg("could not open dictionary file \"%s\": %m",
529 filename)));
530
531 while ((line = tsearch_readline(&trst)) != NULL)
532 {
533 char *s,
534 *pstr;
535
536 /* Set of affix flags */
537 const char *flag;
538
539 /* Extract flag from the line */
540 flag = NULL;
541 if ((s = findchar(line, '/')))
542 {
543 *s++ = '\0';
544 flag = s;
545 while (*s)
546 {
547 /* we allow only single encoded flags for faster works */
548 if (pg_mblen_cstr(s) == 1 && isprint((unsigned char) *s) && !isspace((unsigned char) *s))
549 s++;
550 else
551 {
552 *s = '\0';
553 break;
554 }
555 }
556 }
557 else
558 flag = "";
559
560 /* Remove trailing spaces */
561 s = line;
562 while (*s)
563 {
564 if (isspace((unsigned char) *s))
565 {
566 *s = '\0';
567 break;
568 }
569 s += pg_mblen_cstr(s);
570 }
571 pstr = lowerstr_ctx(Conf, line);
572
574 pfree(pstr);
575
576 pfree(line);
577 }
579}
580
581/*
582 * Searches a basic form of word in the prefix tree. This word was generated
583 * using an affix rule. This rule may not be present in the affix set of
584 * the basic form of the word.
585 *
586 * For example, we have the entry in the .dict file:
587 * meter/GMD
588 *
589 * The affix rule with the flag S:
590 * SFX S y ies [^aeiou]y
591 * is not presented here.
592 *
593 * The affix rule with the flag M:
594 * SFX M 0 's .
595 * is presented here.
596 *
597 * Conf: current dictionary.
598 * word: basic form of word.
599 * affixflag: affix flag, by which a basic form of word was generated.
600 * flag: compound flag used to compare with StopMiddle->compoundflag.
601 *
602 * Returns 1 if the word was found in the prefix tree, else returns 0.
603 */
604static int
605FindWord(IspellDict *Conf, const char *word, const char *affixflag, int flag)
606{
607 SPNode *node = Conf->Dictionary;
609 *StopHigh,
610 *StopMiddle;
611 const uint8 *ptr = (const uint8 *) word;
612
614
615 while (node && *ptr)
616 {
617 StopLow = node->data;
618 StopHigh = node->data + node->length;
619 while (StopLow < StopHigh)
620 {
621 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
622 if (StopMiddle->val == *ptr)
623 {
624 if (*(ptr + 1) == '\0' && StopMiddle->isword)
625 {
626 if (flag == 0)
627 {
628 /*
629 * The word can be formed only with another word. And
630 * in the flag parameter there is not a sign that we
631 * search compound words.
632 */
633 if (StopMiddle->compoundflag & FF_COMPOUNDONLY)
634 return 0;
635 }
636 else if ((flag & StopMiddle->compoundflag) == 0)
637 return 0;
638
639 /*
640 * Check if this affix rule is presented in the affix set
641 * with index StopMiddle->affix.
642 */
644 return 1;
645 }
646 node = StopMiddle->node;
647 ptr++;
648 break;
649 }
650 else if (StopMiddle->val < *ptr)
651 StopLow = StopMiddle + 1;
652 else
654 }
655 if (StopLow >= StopHigh)
656 break;
657 }
658 return 0;
659}
660
661/*
662 * Adds a new affix rule to the Affix field.
663 *
664 * Conf: current dictionary.
665 * flag: affix flag ('\' in the below example).
666 * flagflags: set of flags from the flagval field for this affix rule. This set
667 * is listed after '/' character in the added string (repl).
668 *
669 * For example L flag in the hunspell_sample.affix:
670 * SFX \ 0 Y/L [^Y]
671 *
672 * mask: condition for search ('[^Y]' in the above example).
673 * find: stripping characters from beginning (at prefix) or end (at suffix)
674 * of the word ('0' in the above example, 0 means that there is not
675 * stripping character).
676 * repl: adding string after stripping ('Y' in the above example).
677 * type: FF_SUFFIX or FF_PREFIX.
678 */
679static void
680NIAddAffix(IspellDict *Conf, const char *flag, char flagflags, const char *mask,
681 const char *find, const char *repl, int type)
682{
683 AFFIX *Affix;
684
685 if (Conf->naffixes >= Conf->maffixes)
686 {
687 if (Conf->maffixes)
688 {
689 Conf->maffixes *= 2;
690 Conf->Affix = (AFFIX *) repalloc(Conf->Affix, Conf->maffixes * sizeof(AFFIX));
691 }
692 else
693 {
694 Conf->maffixes = 16;
695 Conf->Affix = palloc_array(AFFIX, Conf->maffixes);
696 }
697 }
698
699 Affix = Conf->Affix + Conf->naffixes;
700
701 /* This affix rule can be applied for words with any ending */
702 if (strcmp(mask, ".") == 0 || *mask == '\0')
703 {
704 Affix->issimple = 1;
705 Affix->isregis = 0;
706 }
707 /* This affix rule will use regis to search word ending */
708 else if (RS_isRegis(mask))
709 {
710 Affix->issimple = 0;
711 Affix->isregis = 1;
712 RS_compile(&(Affix->reg.regis), (type == FF_SUFFIX),
713 *mask ? mask : VoidString);
714 }
715 /* This affix rule will use regex_t to search word ending */
716 else
717 {
718 int masklen;
719 int wmasklen;
720 int err;
722 char *tmask;
723
724 Affix->issimple = 0;
725 Affix->isregis = 0;
726 tmask = (char *) tmpalloc(strlen(mask) + 3);
727 if (type == FF_SUFFIX)
728 sprintf(tmask, "%s$", mask);
729 else
730 sprintf(tmask, "^%s", mask);
731
732 masklen = strlen(tmask);
733 wmask = (pg_wchar *) tmpalloc((masklen + 1) * sizeof(pg_wchar));
735
736 /*
737 * The regex and all internal state created by pg_regcomp are
738 * allocated in the dictionary's memory context, and will be freed
739 * automatically when it is destroyed.
740 */
745 if (err)
746 {
747 char errstr[100];
748
749 pg_regerror(err, Affix->reg.pregex, errstr, sizeof(errstr));
752 errmsg("invalid regular expression: %s", errstr)));
753 }
754 }
755
756 Affix->flagflags = flagflags;
757 if ((Affix->flagflags & FF_COMPOUNDONLY) || (Affix->flagflags & FF_COMPOUNDPERMITFLAG))
758 {
759 if ((Affix->flagflags & FF_COMPOUNDFLAG) == 0)
760 Affix->flagflags |= FF_COMPOUNDFLAG;
761 }
762 Affix->flag = cpstrdup(Conf, flag);
763 Affix->type = type;
764
765 Affix->find = (find && *find) ? cpstrdup(Conf, find) : VoidString;
766 if ((Affix->replen = strlen(repl)) > 0)
767 Affix->repl = cpstrdup(Conf, repl);
768 else
769 Affix->repl = VoidString;
770 Conf->naffixes++;
771}
772
773/* Parsing states for parse_affentry() and friends */
774#define PAE_WAIT_MASK 0
775#define PAE_INMASK 1
776#define PAE_WAIT_FIND 2
777#define PAE_INFIND 3
778#define PAE_WAIT_REPL 4
779#define PAE_INREPL 5
780#define PAE_WAIT_TYPE 6
781#define PAE_WAIT_FLAG 7
782
783/*
784 * Parse next space-separated field of an .affix file line.
785 *
786 * *str is the input pointer (will be advanced past field)
787 * next is where to copy the field value to, with null termination
788 *
789 * The buffer at "next" must be of size BUFSIZ; we truncate the input to fit.
790 *
791 * Returns true if we found a field, false if not.
792 */
793static bool
794get_nextfield(char **str, char *next)
795{
796 int state = PAE_WAIT_MASK;
797 int avail = BUFSIZ;
798
799 while (**str)
800 {
801 int clen = pg_mblen_cstr(*str);
802
803 if (state == PAE_WAIT_MASK)
804 {
805 if (t_iseq(*str, '#'))
806 return false;
807 else if (!isspace((unsigned char) **str))
808 {
809 if (clen < avail)
810 {
812 next += clen;
813 avail -= clen;
814 }
816 }
817 }
818 else /* state == PAE_INMASK */
819 {
820 if (isspace((unsigned char) **str))
821 {
822 *next = '\0';
823 return true;
824 }
825 else
826 {
827 if (clen < avail)
828 {
830 next += clen;
831 avail -= clen;
832 }
833 }
834 }
835 *str += clen;
836 }
837
838 *next = '\0';
839
840 return (state == PAE_INMASK); /* OK if we got a nonempty field */
841}
842
843/*
844 * Parses entry of an .affix file of MySpell or Hunspell format.
845 *
846 * An .affix file entry has the following format:
847 * - header
848 * <type> <flag> <cross_flag> <flag_count>
849 * - fields after header:
850 * <type> <flag> <find> <replace> <mask>
851 *
852 * str is the input line
853 * field values are returned to type etc, which must be buffers of size BUFSIZ.
854 *
855 * Returns number of fields found; any omitted fields are set to empty strings.
856 */
857static int
858parse_ooaffentry(char *str, char *type, char *flag, char *find,
859 char *repl, char *mask)
860{
861 int state = PAE_WAIT_TYPE;
862 int fields_read = 0;
863 bool valid = false;
864
865 *type = *flag = *find = *repl = *mask = '\0';
866
867 while (*str)
868 {
869 switch (state)
870 {
871 case PAE_WAIT_TYPE:
872 valid = get_nextfield(&str, type);
874 break;
875 case PAE_WAIT_FLAG:
876 valid = get_nextfield(&str, flag);
878 break;
879 case PAE_WAIT_FIND:
880 valid = get_nextfield(&str, find);
882 break;
883 case PAE_WAIT_REPL:
884 valid = get_nextfield(&str, repl);
886 break;
887 case PAE_WAIT_MASK:
888 valid = get_nextfield(&str, mask);
889 state = -1; /* force loop exit */
890 break;
891 default:
892 elog(ERROR, "unrecognized state in parse_ooaffentry: %d",
893 state);
894 break;
895 }
896 if (valid)
897 fields_read++;
898 else
899 break; /* early EOL */
900 if (state < 0)
901 break; /* got all fields */
902 }
903
904 return fields_read;
905}
906
907/*
908 * Parses entry of an .affix file of Ispell format
909 *
910 * An .affix file entry has the following format:
911 * <mask> > [-<find>,]<replace>
912 *
913 * Output buffers mask, find, repl must be of length BUFSIZ;
914 * we truncate the input to fit.
915 */
916static bool
917parse_affentry(const char *str, char *mask, char *find, char *repl)
918{
919 int state = PAE_WAIT_MASK;
920 char *pmask = mask,
921 *pfind = find,
922 *prepl = repl;
923 char *emask = mask + BUFSIZ;
924 char *efind = find + BUFSIZ;
925 char *erepl = repl + BUFSIZ;
926
927 *mask = *find = *repl = '\0';
928
929 while (*str)
930 {
931 int clen = pg_mblen_cstr(str);
932
933 if (state == PAE_WAIT_MASK)
934 {
935 if (t_iseq(str, '#'))
936 return false;
937 else if (!isspace((unsigned char) *str))
938 {
939 if (pmask < emask - clen)
942 }
943 }
944 else if (state == PAE_INMASK)
945 {
946 if (t_iseq(str, '>'))
947 {
948 *pmask = '\0';
950 }
951 else if (!isspace((unsigned char) *str))
952 {
953 if (pmask < emask - clen)
955 }
956 }
957 else if (state == PAE_WAIT_FIND)
958 {
959 if (t_iseq(str, '-'))
960 {
962 }
963 else if (t_isalpha_cstr(str) || t_iseq(str, '\'') /* english 's */ )
964 {
965 if (prepl < erepl - clen)
968 }
969 else if (!isspace((unsigned char) *str))
972 errmsg("syntax error")));
973 }
974 else if (state == PAE_INFIND)
975 {
976 if (t_iseq(str, ','))
977 {
978 *pfind = '\0';
980 }
981 else if (t_isalpha_cstr(str))
982 {
983 if (pfind < efind - clen)
985 }
986 else if (!isspace((unsigned char) *str))
989 errmsg("syntax error")));
990 }
991 else if (state == PAE_WAIT_REPL)
992 {
993 if (t_iseq(str, '-'))
994 {
995 break; /* void repl */
996 }
997 else if (t_isalpha_cstr(str))
998 {
999 if (prepl < erepl - clen)
1001 state = PAE_INREPL;
1002 }
1003 else if (!isspace((unsigned char) *str))
1004 ereport(ERROR,
1006 errmsg("syntax error")));
1007 }
1008 else if (state == PAE_INREPL)
1009 {
1010 if (t_iseq(str, '#'))
1011 {
1012 *prepl = '\0';
1013 break;
1014 }
1015 else if (t_isalpha_cstr(str))
1016 {
1017 if (prepl < erepl - clen)
1019 }
1020 else if (!isspace((unsigned char) *str))
1021 ereport(ERROR,
1023 errmsg("syntax error")));
1024 }
1025 else
1026 elog(ERROR, "unrecognized state in parse_affentry: %d", state);
1027
1028 str += clen;
1029 }
1030
1031 *pmask = *pfind = *prepl = '\0';
1032
1033 return (*mask && (*find || *repl));
1034}
1035
1036/*
1037 * Sets Hunspell options depending on flag type.
1038 */
1039static void
1041 char *s, uint32 val)
1042{
1043 if (Conf->flagMode == FM_NUM)
1044 {
1045 char *next;
1046 int i;
1047
1048 errno = 0;
1049 i = strtol(s, &next, 10);
1050 if (s == next || errno == ERANGE)
1051 ereport(ERROR,
1053 errmsg("invalid affix flag \"%s\"", s)));
1055 ereport(ERROR,
1057 errmsg("affix flag \"%s\" is out of range", s)));
1058
1059 entry->flag.i = i;
1060 }
1061 else
1062 entry->flag.s = cpstrdup(Conf, s);
1063
1064 entry->flagMode = Conf->flagMode;
1065 entry->value = val;
1066}
1067
1068/*
1069 * Sets up a correspondence for the affix parameter with the affix flag.
1070 *
1071 * Conf: current dictionary.
1072 * s: affix flag in string.
1073 * val: affix parameter.
1074 */
1075static void
1077{
1079 char sbuf[BUFSIZ];
1080 char *sflag;
1081
1082 while (*s && isspace((unsigned char) *s))
1083 s += pg_mblen_cstr(s);
1084
1085 if (!*s)
1086 ereport(ERROR,
1088 errmsg("syntax error")));
1089
1090 /* Get flag without \n */
1091 sflag = sbuf;
1092 while (*s && !isspace((unsigned char) *s) && *s != '\n')
1093 {
1094 int clen = pg_mblen_cstr(s);
1095
1096 /* Truncate the input to fit in BUFSIZ */
1097 if (sflag < sbuf + BUFSIZ - clen)
1099 s += clen;
1100 }
1101 *sflag = '\0';
1102
1103 /* Resize array or allocate memory for array CompoundAffixFlag */
1104 if (Conf->nCompoundAffixFlag >= Conf->mCompoundAffixFlag)
1105 {
1106 if (Conf->mCompoundAffixFlag)
1107 {
1108 Conf->mCompoundAffixFlag *= 2;
1109 Conf->CompoundAffixFlags = (CompoundAffixFlag *)
1110 repalloc(Conf->CompoundAffixFlags,
1111 Conf->mCompoundAffixFlag * sizeof(CompoundAffixFlag));
1112 }
1113 else
1114 {
1115 Conf->mCompoundAffixFlag = 10;
1116 Conf->CompoundAffixFlags = (CompoundAffixFlag *)
1117 tmpalloc(Conf->mCompoundAffixFlag * sizeof(CompoundAffixFlag));
1118 }
1119 }
1120
1121 newValue = Conf->CompoundAffixFlags + Conf->nCompoundAffixFlag;
1122
1124
1125 Conf->usecompound = true;
1126 Conf->nCompoundAffixFlag++;
1127}
1128
1129/*
1130 * Returns a set of affix parameters which correspond to the set of affix
1131 * flags s.
1132 */
1133static int
1135{
1136 uint32 flag = 0;
1137 CompoundAffixFlag *found,
1138 key;
1139 char sflag[BUFSIZ];
1140 const char *flagcur;
1141
1142 if (Conf->nCompoundAffixFlag == 0)
1143 return 0;
1144
1145 flagcur = s;
1146 while (*flagcur)
1147 {
1150
1151 found = (CompoundAffixFlag *)
1152 bsearch(&key, Conf->CompoundAffixFlags,
1153 Conf->nCompoundAffixFlag, sizeof(CompoundAffixFlag),
1154 cmpcmdflag);
1155 if (found != NULL)
1156 flag |= found->value;
1157 }
1158
1159 return flag;
1160}
1161
1162/*
1163 * Returns a flag set using the s parameter.
1164 *
1165 * If Conf->useFlagAliases is true then the s parameter is index of the
1166 * Conf->AffixData array and function returns its entry.
1167 * Else function returns the s parameter.
1168 */
1169static const char *
1171{
1172 if (Conf->useFlagAliases && *s != '\0')
1173 {
1174 int curaffix;
1175 char *end;
1176
1177 errno = 0;
1178 curaffix = strtol(s, &end, 10);
1179 if (s == end || errno == ERANGE)
1180 ereport(ERROR,
1182 errmsg("invalid affix alias \"%s\"", s)));
1183
1184 if (curaffix > 0 && curaffix < Conf->nAffixData)
1185
1186 /*
1187 * Do not subtract 1 from curaffix because empty string was added
1188 * in NIImportOOAffixes
1189 */
1190 return Conf->AffixData[curaffix];
1191 else if (curaffix > Conf->nAffixData)
1192 ereport(ERROR,
1194 errmsg("invalid affix alias \"%s\"", s)));
1195 return VoidString;
1196 }
1197 else
1198 return s;
1199}
1200
1201/*
1202 * Import an affix file that follows MySpell or Hunspell format.
1203 *
1204 * Conf: current dictionary.
1205 * filename: path to the .affix file.
1206 */
1207static void
1209{
1210 char type[BUFSIZ],
1211 *ptype = NULL;
1212 char sflag[BUFSIZ];
1213 char mask[BUFSIZ],
1214 *pmask;
1215 char find[BUFSIZ],
1216 *pfind;
1217 char repl[BUFSIZ],
1218 *prepl;
1219 bool isSuffix = false;
1220 int naffix = 0,
1221 curaffix = 0;
1222 int sflaglen = 0;
1223 char flagflags = 0;
1225 char *recoded;
1226
1227 /* read file to find any flag */
1228 Conf->usecompound = false;
1229 Conf->useFlagAliases = false;
1230 Conf->flagMode = FM_CHAR;
1231
1233 ereport(ERROR,
1235 errmsg("could not open affix file \"%s\": %m",
1236 filename)));
1237
1238 while ((recoded = tsearch_readline(&trst)) != NULL)
1239 {
1240 if (*recoded == '\0' || isspace((unsigned char) *recoded) || t_iseq(recoded, '#'))
1241 {
1242 pfree(recoded);
1243 continue;
1244 }
1245
1246 if (STRNCMP(recoded, "COMPOUNDFLAG") == 0)
1247 addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"),
1249 else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0)
1250 addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"),
1252 else if (STRNCMP(recoded, "COMPOUNDLAST") == 0)
1253 addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDLAST"),
1255 /* COMPOUNDLAST and COMPOUNDEND are synonyms */
1256 else if (STRNCMP(recoded, "COMPOUNDEND") == 0)
1257 addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDEND"),
1259 else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0)
1260 addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"),
1262 else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0)
1263 addCompoundAffixFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"),
1265 else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0)
1267 recoded + strlen("COMPOUNDPERMITFLAG"),
1269 else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0)
1271 recoded + strlen("COMPOUNDFORBIDFLAG"),
1273 else if (STRNCMP(recoded, "FLAG") == 0)
1274 {
1275 char *s = recoded + strlen("FLAG");
1276
1277 while (*s && isspace((unsigned char) *s))
1278 s += pg_mblen_cstr(s);
1279
1280 if (*s)
1281 {
1282 if (STRNCMP(s, "long") == 0)
1283 Conf->flagMode = FM_LONG;
1284 else if (STRNCMP(s, "num") == 0)
1285 Conf->flagMode = FM_NUM;
1286 else if (STRNCMP(s, "default") != 0)
1287 ereport(ERROR,
1289 errmsg("Ispell dictionary supports only "
1290 "\"default\", \"long\", "
1291 "and \"num\" flag values")));
1292 }
1293 }
1294
1295 pfree(recoded);
1296 }
1298
1299 if (Conf->nCompoundAffixFlag > 1)
1300 qsort(Conf->CompoundAffixFlags, Conf->nCompoundAffixFlag,
1301 sizeof(CompoundAffixFlag), cmpcmdflag);
1302
1304 ereport(ERROR,
1306 errmsg("could not open affix file \"%s\": %m",
1307 filename)));
1308
1309 while ((recoded = tsearch_readline(&trst)) != NULL)
1310 {
1311 int fields_read;
1312
1313 if (*recoded == '\0' || isspace((unsigned char) *recoded) || t_iseq(recoded, '#'))
1314 goto nextline;
1315
1317
1318 if (ptype)
1319 pfree(ptype);
1320 ptype = lowerstr_ctx(Conf, type);
1321
1322 /* First try to parse AF parameter (alias compression) */
1323 if (STRNCMP(ptype, "af") == 0)
1324 {
1325 /* First line is the number of aliases */
1326 if (!Conf->useFlagAliases)
1327 {
1328 Conf->useFlagAliases = true;
1329 naffix = atoi(sflag);
1330 if (naffix <= 0)
1331 ereport(ERROR,
1333 errmsg("invalid number of flag vector aliases")));
1334
1335 /* Also reserve place for empty flag set */
1336 naffix++;
1337
1338 Conf->AffixData = palloc0_array(const char *, naffix);
1339 Conf->lenAffixData = Conf->nAffixData = naffix;
1340
1341 /* Add empty flag set into AffixData */
1342 Conf->AffixData[curaffix] = VoidString;
1343 curaffix++;
1344 }
1345 /* Other lines are aliases */
1346 else
1347 {
1348 if (curaffix < naffix)
1349 {
1350 Conf->AffixData[curaffix] = cpstrdup(Conf, sflag);
1351 curaffix++;
1352 }
1353 else
1354 ereport(ERROR,
1356 errmsg("number of aliases exceeds specified number %d",
1357 naffix - 1)));
1358 }
1359 goto nextline;
1360 }
1361 /* Else try to parse prefixes and suffixes */
1362 if (fields_read < 4 ||
1363 (STRNCMP(ptype, "sfx") != 0 && STRNCMP(ptype, "pfx") != 0))
1364 goto nextline;
1365
1367 if (sflaglen == 0
1368 || (sflaglen > 1 && Conf->flagMode == FM_CHAR)
1369 || (sflaglen > 2 && Conf->flagMode == FM_LONG))
1370 goto nextline;
1371
1372 /*--------
1373 * Affix header. For example:
1374 * SFX \ N 1
1375 *--------
1376 */
1377 if (fields_read == 4)
1378 {
1379 isSuffix = (STRNCMP(ptype, "sfx") == 0);
1380 if (t_iseq(find, 'y') || t_iseq(find, 'Y'))
1381 flagflags = FF_CROSSPRODUCT;
1382 else
1383 flagflags = 0;
1384 }
1385 /*--------
1386 * Affix fields. For example:
1387 * SFX \ 0 Y/L [^Y]
1388 *--------
1389 */
1390 else
1391 {
1392 char *ptr;
1393 int aflg = 0;
1394
1395 /* Get flags after '/' (flags are case sensitive) */
1396 if ((ptr = strchr(repl, '/')) != NULL)
1399 ptr + 1));
1400 /* Get lowercased version of string before '/' */
1401 prepl = lowerstr_ctx(Conf, repl);
1402 if ((ptr = strchr(prepl, '/')) != NULL)
1403 *ptr = '\0';
1405 pmask = lowerstr_ctx(Conf, mask);
1406 if (t_iseq(find, '0'))
1407 *pfind = '\0';
1408 if (t_iseq(repl, '0'))
1409 *prepl = '\0';
1410
1411 NIAddAffix(Conf, sflag, flagflags | aflg, pmask, pfind, prepl,
1413 pfree(prepl);
1414 pfree(pfind);
1415 pfree(pmask);
1416 }
1417
1418nextline:
1419 pfree(recoded);
1420 }
1421
1423 if (ptype)
1424 pfree(ptype);
1425}
1426
1427/*
1428 * import affixes
1429 *
1430 * Note caller must already have applied get_tsearch_config_filename
1431 *
1432 * This function is responsible for parsing ispell ("old format") affix files.
1433 * If we realize that the file contains new-format commands, we pass off the
1434 * work to NIImportOOAffixes(), which will re-read the whole file.
1435 */
1436void
1438{
1439 char *pstr = NULL;
1440 char flag[BUFSIZ];
1441 char mask[BUFSIZ];
1442 char find[BUFSIZ];
1443 char repl[BUFSIZ];
1444 char *s;
1445 bool suffixes = false;
1446 bool prefixes = false;
1447 char flagflags = 0;
1449 bool oldformat = false;
1450 char *recoded = NULL;
1451
1453 ereport(ERROR,
1455 errmsg("could not open affix file \"%s\": %m",
1456 filename)));
1457
1458 Conf->usecompound = false;
1459 Conf->useFlagAliases = false;
1460 Conf->flagMode = FM_CHAR;
1461
1462 while ((recoded = tsearch_readline(&trst)) != NULL)
1463 {
1465
1466 /* Skip comments and empty lines */
1467 if (*pstr == '#' || *pstr == '\n')
1468 goto nextline;
1469
1470 if (STRNCMP(pstr, "compoundwords") == 0)
1471 {
1472 /* Find case-insensitive L flag in non-lowercased string */
1473 s = findchar2(recoded, 'l', 'L');
1474 if (s)
1475 {
1476 while (*s && !isspace((unsigned char) *s))
1477 s += pg_mblen_cstr(s);
1478 while (*s && isspace((unsigned char) *s))
1479 s += pg_mblen_cstr(s);
1480
1481 if (*s && pg_mblen_cstr(s) == 1)
1482 {
1484 Conf->usecompound = true;
1485 }
1486 oldformat = true;
1487 goto nextline;
1488 }
1489 }
1490 if (STRNCMP(pstr, "suffixes") == 0)
1491 {
1492 suffixes = true;
1493 prefixes = false;
1494 oldformat = true;
1495 goto nextline;
1496 }
1497 if (STRNCMP(pstr, "prefixes") == 0)
1498 {
1499 suffixes = false;
1500 prefixes = true;
1501 oldformat = true;
1502 goto nextline;
1503 }
1504 if (STRNCMP(pstr, "flag") == 0)
1505 {
1506 s = recoded + 4; /* we need non-lowercased string */
1507 flagflags = 0;
1508
1509 while (*s && isspace((unsigned char) *s))
1510 s += pg_mblen_cstr(s);
1511
1512 if (*s == '*')
1513 {
1514 flagflags |= FF_CROSSPRODUCT;
1515 s++;
1516 }
1517 else if (*s == '~')
1518 {
1519 flagflags |= FF_COMPOUNDONLY;
1520 s++;
1521 }
1522
1523 if (*s == '\\')
1524 s++;
1525
1526 /*
1527 * An old-format flag is a single ASCII character; we expect it to
1528 * be followed by EOL, whitespace, or ':'. Otherwise this is a
1529 * new-format flag command.
1530 */
1531 if (*s && pg_mblen_cstr(s) == 1)
1532 {
1533 flag[0] = *s++;
1534 flag[1] = '\0';
1535
1536 if (*s == '\0' || *s == '#' || *s == '\n' || *s == ':' ||
1537 isspace((unsigned char) *s))
1538 {
1539 oldformat = true;
1540 goto nextline;
1541 }
1542 }
1543 goto isnewformat;
1544 }
1545 if (STRNCMP(recoded, "COMPOUNDFLAG") == 0 ||
1546 STRNCMP(recoded, "COMPOUNDMIN") == 0 ||
1547 STRNCMP(recoded, "PFX") == 0 ||
1548 STRNCMP(recoded, "SFX") == 0)
1549 goto isnewformat;
1550
1551 if ((!suffixes) && (!prefixes))
1552 goto nextline;
1553
1554 if (!parse_affentry(pstr, mask, find, repl))
1555 goto nextline;
1556
1557 NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
1558
1559nextline:
1560 pfree(recoded);
1561 pfree(pstr);
1562 }
1564 return;
1565
1567 if (oldformat)
1568 ereport(ERROR,
1570 errmsg("affix file contains both old-style and new-style commands")));
1572
1574}
1575
1576/*
1577 * Merges two affix flag sets and stores a new affix flag set into
1578 * Conf->AffixData.
1579 *
1580 * Returns index of a new affix flag set.
1581 */
1582static int
1584{
1585 const char **ptr;
1586
1587 Assert(a1 < Conf->nAffixData && a2 < Conf->nAffixData);
1588
1589 /* Do not merge affix flags if one of affix flags is empty */
1590 if (*Conf->AffixData[a1] == '\0')
1591 return a2;
1592 else if (*Conf->AffixData[a2] == '\0')
1593 return a1;
1594
1595 /* Double the size of AffixData if there's not enough space */
1596 if (Conf->nAffixData + 1 >= Conf->lenAffixData)
1597 {
1598 Conf->lenAffixData *= 2;
1599 Conf->AffixData = (const char **) repalloc(Conf->AffixData,
1600 sizeof(char *) * Conf->lenAffixData);
1601 }
1602
1603 ptr = Conf->AffixData + Conf->nAffixData;
1604 if (Conf->flagMode == FM_NUM)
1605 {
1606 char *p = cpalloc(strlen(Conf->AffixData[a1]) +
1607 strlen(Conf->AffixData[a2]) +
1608 1 /* comma */ + 1 /* \0 */ );
1609
1610 sprintf(p, "%s,%s", Conf->AffixData[a1], Conf->AffixData[a2]);
1611 *ptr = p;
1612 }
1613 else
1614 {
1615 char *p = cpalloc(strlen(Conf->AffixData[a1]) +
1616 strlen(Conf->AffixData[a2]) +
1617 1 /* \0 */ );
1618
1619 sprintf(p, "%s%s", Conf->AffixData[a1], Conf->AffixData[a2]);
1620 *ptr = p;
1621 }
1622 ptr++;
1623 *ptr = NULL;
1624 Conf->nAffixData++;
1625
1626 return Conf->nAffixData - 1;
1627}
1628
1629/*
1630 * Returns the set of affix parameters which corresponds to the set of affix
1631 * flags with the given index.
1632 */
1633static uint32
1635{
1636 Assert(affix < Conf->nAffixData);
1637
1638 return (getCompoundAffixFlagValue(Conf, Conf->AffixData[affix]) &
1640}
1641
1642/*
1643 * Makes a prefix tree for the given level.
1644 *
1645 * Conf: current dictionary.
1646 * low: lower index of the Conf->Spell array.
1647 * high: upper index of the Conf->Spell array.
1648 * level: current prefix tree level.
1649 */
1650static SPNode *
1651mkSPNode(IspellDict *Conf, int low, int high, int level)
1652{
1653 int i;
1654 int nchar = 0;
1655 char lastchar = '\0';
1656 SPNode *rs;
1658 int lownew = low;
1659
1660 for (i = low; i < high; i++)
1661 if (Conf->Spell[i]->p.d.len > level && lastchar != Conf->Spell[i]->word[level])
1662 {
1663 nchar++;
1664 lastchar = Conf->Spell[i]->word[level];
1665 }
1666
1667 if (!nchar)
1668 return NULL;
1669
1670 rs = (SPNode *) cpalloc0(SPNHDRSZ + nchar * sizeof(SPNodeData));
1671 rs->length = nchar;
1672 data = rs->data;
1673
1674 lastchar = '\0';
1675 for (i = low; i < high; i++)
1676 if (Conf->Spell[i]->p.d.len > level)
1677 {
1678 if (lastchar != Conf->Spell[i]->word[level])
1679 {
1680 if (lastchar)
1681 {
1682 /* Next level of the prefix tree */
1683 data->node = mkSPNode(Conf, lownew, i, level + 1);
1684 lownew = i;
1685 data++;
1686 }
1687 lastchar = Conf->Spell[i]->word[level];
1688 }
1689 data->val = ((uint8 *) (Conf->Spell[i]->word))[level];
1690 if (Conf->Spell[i]->p.d.len == level + 1)
1691 {
1692 bool clearCompoundOnly = false;
1693
1694 if (data->isword && data->affix != Conf->Spell[i]->p.d.affix)
1695 {
1696 /*
1697 * MergeAffix called a few times. If one of word is
1698 * allowed to be in compound word and another isn't, then
1699 * clear FF_COMPOUNDONLY flag.
1700 */
1701
1702 clearCompoundOnly = (FF_COMPOUNDONLY & data->compoundflag
1703 & makeCompoundFlags(Conf, Conf->Spell[i]->p.d.affix))
1704 ? false : true;
1705 data->affix = MergeAffix(Conf, data->affix, Conf->Spell[i]->p.d.affix);
1706 }
1707 else
1708 data->affix = Conf->Spell[i]->p.d.affix;
1709 data->isword = 1;
1710
1711 data->compoundflag = makeCompoundFlags(Conf, data->affix);
1712
1713 if ((data->compoundflag & FF_COMPOUNDONLY) &&
1714 (data->compoundflag & FF_COMPOUNDFLAG) == 0)
1715 data->compoundflag |= FF_COMPOUNDFLAG;
1716
1718 data->compoundflag &= ~FF_COMPOUNDONLY;
1719 }
1720 }
1721
1722 /* Next level of the prefix tree */
1723 data->node = mkSPNode(Conf, lownew, high, level + 1);
1724
1725 return rs;
1726}
1727
1728/*
1729 * Builds the Conf->Dictionary tree and AffixData from the imported dictionary
1730 * and affixes.
1731 */
1732void
1734{
1735 int i;
1736 int naffix;
1737 int curaffix;
1738
1739 /* compress affixes */
1740
1741 /*
1742 * If we use flag aliases then we need to use Conf->AffixData filled in
1743 * the NIImportOOAffixes().
1744 */
1745 if (Conf->useFlagAliases)
1746 {
1747 for (i = 0; i < Conf->nspell; i++)
1748 {
1749 char *end;
1750
1751 if (*Conf->Spell[i]->p.flag != '\0')
1752 {
1753 errno = 0;
1754 curaffix = strtol(Conf->Spell[i]->p.flag, &end, 10);
1755 if (Conf->Spell[i]->p.flag == end || errno == ERANGE)
1756 ereport(ERROR,
1758 errmsg("invalid affix alias \"%s\"",
1759 Conf->Spell[i]->p.flag)));
1760 if (curaffix < 0 || curaffix >= Conf->nAffixData)
1761 ereport(ERROR,
1763 errmsg("invalid affix alias \"%s\"",
1764 Conf->Spell[i]->p.flag)));
1765 if (*end != '\0' && !isdigit((unsigned char) *end) && !isspace((unsigned char) *end))
1766 ereport(ERROR,
1768 errmsg("invalid affix alias \"%s\"",
1769 Conf->Spell[i]->p.flag)));
1770 }
1771 else
1772 {
1773 /*
1774 * If Conf->Spell[i]->p.flag is empty, then get empty value of
1775 * Conf->AffixData (0 index).
1776 */
1777 curaffix = 0;
1778 }
1779
1780 Conf->Spell[i]->p.d.affix = curaffix;
1781 Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
1782 }
1783 }
1784 /* Otherwise fill Conf->AffixData here */
1785 else
1786 {
1787 /* Count the number of different flags used in the dictionary */
1788 qsort(Conf->Spell, Conf->nspell, sizeof(SPELL *),
1790
1791 naffix = 0;
1792 for (i = 0; i < Conf->nspell; i++)
1793 {
1794 if (i == 0 ||
1795 strcmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag) != 0)
1796 naffix++;
1797 }
1798
1799 /*
1800 * Fill in Conf->AffixData with the affixes that were used in the
1801 * dictionary. Replace textual flag-field of Conf->Spell entries with
1802 * indexes into Conf->AffixData array.
1803 */
1804 Conf->AffixData = palloc0_array(const char *, naffix);
1805
1806 curaffix = -1;
1807 for (i = 0; i < Conf->nspell; i++)
1808 {
1809 if (i == 0 ||
1810 strcmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix]) != 0)
1811 {
1812 curaffix++;
1814 Conf->AffixData[curaffix] = cpstrdup(Conf,
1815 Conf->Spell[i]->p.flag);
1816 }
1817
1818 Conf->Spell[i]->p.d.affix = curaffix;
1819 Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
1820 }
1821
1822 Conf->lenAffixData = Conf->nAffixData = naffix;
1823 }
1824
1825 /* Start build a prefix tree */
1826 qsort(Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
1827 Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
1828}
1829
1830/*
1831 * Makes a prefix tree for the given level using the repl string of an affix
1832 * rule. Affixes with an empty replace string are not included in the prefix
1833 * tree. These affixes are included by mkVoidAffix().
1834 *
1835 * Conf: current dictionary.
1836 * low: lower index of the Conf->Affix array.
1837 * high: upper index of the Conf->Affix array.
1838 * level: current prefix tree level.
1839 * type: FF_SUFFIX or FF_PREFIX.
1840 */
1841static AffixNode *
1842mkANode(IspellDict *Conf, int low, int high, int level, int type)
1843{
1844 int i;
1845 int nchar = 0;
1846 uint8 lastchar = '\0';
1847 AffixNode *rs;
1849 int lownew = low;
1850 int naff;
1851 AFFIX **aff;
1852
1853 for (i = low; i < high; i++)
1854 if (Conf->Affix[i].replen > level && lastchar != GETCHAR(Conf->Affix + i, level, type))
1855 {
1856 nchar++;
1857 lastchar = GETCHAR(Conf->Affix + i, level, type);
1858 }
1859
1860 if (!nchar)
1861 return NULL;
1862
1863 aff = (AFFIX **) tmpalloc(sizeof(AFFIX *) * (high - low + 1));
1864 naff = 0;
1865
1866 rs = (AffixNode *) cpalloc0(ANHRDSZ + nchar * sizeof(AffixNodeData));
1867 rs->length = nchar;
1868 data = rs->data;
1869
1870 lastchar = '\0';
1871 for (i = low; i < high; i++)
1872 if (Conf->Affix[i].replen > level)
1873 {
1874 if (lastchar != GETCHAR(Conf->Affix + i, level, type))
1875 {
1876 if (lastchar)
1877 {
1878 /* Next level of the prefix tree */
1879 data->node = mkANode(Conf, lownew, i, level + 1, type);
1880 if (naff)
1881 {
1882 data->naff = naff;
1883 data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
1884 memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
1885 naff = 0;
1886 }
1887 data++;
1888 lownew = i;
1889 }
1890 lastchar = GETCHAR(Conf->Affix + i, level, type);
1891 }
1892 data->val = GETCHAR(Conf->Affix + i, level, type);
1893 if (Conf->Affix[i].replen == level + 1)
1894 { /* affix stopped */
1895 aff[naff++] = Conf->Affix + i;
1896 }
1897 }
1898
1899 /* Next level of the prefix tree */
1900 data->node = mkANode(Conf, lownew, high, level + 1, type);
1901 if (naff)
1902 {
1903 data->naff = naff;
1904 data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
1905 memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
1906 naff = 0;
1907 }
1908
1909 pfree(aff);
1910
1911 return rs;
1912}
1913
1914/*
1915 * Makes the root void node in the prefix tree. The root void node is created
1916 * for affixes which have empty replace string ("repl" field).
1917 */
1918static void
1920{
1921 int i,
1922 cnt = 0;
1923 int start = (issuffix) ? startsuffix : 0;
1924 int end = (issuffix) ? Conf->naffixes : startsuffix;
1925 AffixNode *Affix = (AffixNode *) palloc0(ANHRDSZ + sizeof(AffixNodeData));
1926
1927 Affix->length = 1;
1928 Affix->isvoid = 1;
1929
1930 if (issuffix)
1931 {
1932 Affix->data->node = Conf->Suffix;
1933 Conf->Suffix = Affix;
1934 }
1935 else
1936 {
1937 Affix->data->node = Conf->Prefix;
1938 Conf->Prefix = Affix;
1939 }
1940
1941 /* Count affixes with empty replace string */
1942 for (i = start; i < end; i++)
1943 if (Conf->Affix[i].replen == 0)
1944 cnt++;
1945
1946 /* There is not affixes with empty replace string */
1947 if (cnt == 0)
1948 return;
1949
1950 Affix->data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * cnt);
1951 Affix->data->naff = (uint32) cnt;
1952
1953 cnt = 0;
1954 for (i = start; i < end; i++)
1955 if (Conf->Affix[i].replen == 0)
1956 {
1957 Affix->data->aff[cnt] = Conf->Affix + i;
1958 cnt++;
1959 }
1960}
1961
1962/*
1963 * Checks if the affixflag is used by the dictionary. Conf->AffixData does not
1964 * contain affixflag if this flag is not actually used by the .dict file.
1965 *
1966 * Conf: current dictionary.
1967 * affixflag: affix flag.
1968 *
1969 * Returns true if the Conf->AffixData array contains affixflag, otherwise
1970 * returns false.
1971 */
1972static bool
1974{
1975 int i;
1976
1977 for (i = 0; i < Conf->nAffixData; i++)
1979 return true;
1980
1981 return false;
1982}
1983
1984/*
1985 * Builds Conf->Prefix and Conf->Suffix trees from the imported affixes.
1986 */
1987void
1989{
1990 AFFIX *Affix;
1991 size_t i;
1992 CMPDAffix *ptr;
1993 int firstsuffix = Conf->naffixes;
1994
1995 if (Conf->naffixes == 0)
1996 return;
1997
1998 /* Store compound affixes in the Conf->CompoundAffix array */
1999 if (Conf->naffixes > 1)
2000 qsort(Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix);
2001 Conf->CompoundAffix = ptr = palloc_array(CMPDAffix, Conf->naffixes);
2002 ptr->affix = NULL;
2003
2004 for (i = 0; i < Conf->naffixes; i++)
2005 {
2006 Affix = &(((AFFIX *) Conf->Affix)[i]);
2007 if (Affix->type == FF_SUFFIX && i < firstsuffix)
2008 firstsuffix = i;
2009
2010 if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
2011 isAffixInUse(Conf, Affix->flag))
2012 {
2013 bool issuffix = (Affix->type == FF_SUFFIX);
2014
2015 if (ptr == Conf->CompoundAffix ||
2016 issuffix != (ptr - 1)->issuffix ||
2017 strbncmp((const unsigned char *) (ptr - 1)->affix,
2018 (const unsigned char *) Affix->repl,
2019 (ptr - 1)->len))
2020 {
2021 /* leave only unique and minimal suffixes */
2022 ptr->affix = Affix->repl;
2023 ptr->len = Affix->replen;
2024 ptr->issuffix = issuffix;
2025 ptr++;
2026 }
2027 }
2028 }
2029 ptr->affix = NULL;
2030 Conf->CompoundAffix = (CMPDAffix *) repalloc(Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr - Conf->CompoundAffix + 1));
2031
2032 /* Start build a prefix tree */
2033 Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
2034 Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
2036 mkVoidAffix(Conf, false, firstsuffix);
2037}
2038
2039static AffixNodeData *
2040FindAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
2041{
2043 *StopHigh,
2044 *StopMiddle;
2045 uint8 symbol;
2046
2047 if (node->isvoid)
2048 { /* search void affixes */
2049 if (node->data->naff)
2050 return node->data;
2051 node = node->data->node;
2052 }
2053
2054 while (node && *level < wrdlen)
2055 {
2056 StopLow = node->data;
2057 StopHigh = node->data + node->length;
2058 while (StopLow < StopHigh)
2059 {
2060 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
2061 symbol = GETWCHAR(word, wrdlen, *level, type);
2062
2063 if (StopMiddle->val == symbol)
2064 {
2065 (*level)++;
2066 if (StopMiddle->naff)
2067 return StopMiddle;
2068 node = StopMiddle->node;
2069 break;
2070 }
2071 else if (StopMiddle->val < symbol)
2072 StopLow = StopMiddle + 1;
2073 else
2075 }
2076 if (StopLow >= StopHigh)
2077 break;
2078 }
2079 return NULL;
2080}
2081
2082/*
2083 * Checks to see if affix applies to word, transforms word if so.
2084 * The transformation consists of replacing Affix->replen leading or
2085 * trailing bytes with the Affix->find string.
2086 *
2087 * word: input word
2088 * len: length of input word
2089 * Affix: affix to consider
2090 * flagflags: context flags showing whether we are handling a compound word
2091 * newword: output buffer (MUST be of length 2 * MAXNORMLEN)
2092 * baselen: input/output argument
2093 *
2094 * If baselen isn't NULL, then *baselen is used to return the length of
2095 * the non-changed part of the word when applying a suffix, and is used
2096 * to detect whether the input contained only a prefix and suffix when
2097 * later applying a prefix.
2098 *
2099 * Returns newword on success, or NULL if the affix can't be applied.
2100 * On success, the modified word is stored into newword.
2101 */
2102static char *
2103CheckAffix(const char *word, size_t len, AFFIX *Affix, int flagflags, char *newword, int *baselen)
2104{
2105 size_t keeplen,
2106 findlen;
2107
2108 /*
2109 * Check compound allow flags
2110 */
2111
2112 if (flagflags == 0)
2113 {
2114 if (Affix->flagflags & FF_COMPOUNDONLY)
2115 return NULL;
2116 }
2117 else if (flagflags & FF_COMPOUNDBEGIN)
2118 {
2119 if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
2120 return NULL;
2121 if ((Affix->flagflags & FF_COMPOUNDBEGIN) == 0)
2122 if (Affix->type == FF_SUFFIX)
2123 return NULL;
2124 }
2125 else if (flagflags & FF_COMPOUNDMIDDLE)
2126 {
2127 if ((Affix->flagflags & FF_COMPOUNDMIDDLE) == 0 ||
2129 return NULL;
2130 }
2131 else if (flagflags & FF_COMPOUNDLAST)
2132 {
2133 if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
2134 return NULL;
2135 if ((Affix->flagflags & FF_COMPOUNDLAST) == 0)
2136 if (Affix->type == FF_PREFIX)
2137 return NULL;
2138 }
2139
2140 /*
2141 * Protect against output buffer overrun (len < Affix->replen would be
2142 * caller error, but check anyway)
2143 */
2144 Assert(len == strlen(word));
2145 if (len < Affix->replen)
2146 return NULL;
2147 keeplen = len - Affix->replen; /* how much of word we will keep */
2148 findlen = strlen(Affix->find);
2149 if (keeplen + findlen >= 2 * MAXNORMLEN)
2150 return NULL;
2151
2152 /*
2153 * make replace pattern of affix
2154 */
2155 if (Affix->type == FF_SUFFIX)
2156 {
2158 strcpy(newword + keeplen, Affix->find);
2159 if (baselen) /* store length of non-changed part of word */
2160 *baselen = keeplen;
2161 }
2162 else
2163 {
2164 /*
2165 * if prefix is an all non-changed part's length then all word
2166 * contains only prefix and suffix, so out
2167 */
2168 if (baselen && *baselen + findlen <= Affix->replen)
2169 return NULL;
2170 memcpy(newword, Affix->find, findlen);
2171 strcpy(newword + findlen, word + Affix->replen);
2172 }
2173
2174 /*
2175 * check resulting word
2176 */
2177 if (Affix->issimple)
2178 return newword;
2179 else if (Affix->isregis)
2180 {
2181 if (RS_execute(&(Affix->reg.regis), newword))
2182 return newword;
2183 }
2184 else
2185 {
2186 pg_wchar *data;
2187 size_t data_len;
2188 int newword_len;
2189
2190 /* Convert data string to wide characters */
2194
2195 if (pg_regexec(Affix->reg.pregex, data, data_len,
2196 0, NULL, 0, NULL, 0) == REG_OKAY)
2197 {
2198 pfree(data);
2199 return newword;
2200 }
2201 pfree(data);
2202 }
2203
2204 return NULL;
2205}
2206
2207static int
2208addToResult(char **forms, char **cur, char *word)
2209{
2210 if (cur - forms >= MAX_NORM - 1)
2211 return 0;
2212 if (forms == cur || strcmp(word, *(cur - 1)) != 0)
2213 {
2214 *cur = pstrdup(word);
2215 *(cur + 1) = NULL;
2216 return 1;
2217 }
2218
2219 return 0;
2220}
2221
2222static char **
2224{
2225 AffixNodeData *suffix = NULL,
2226 *prefix = NULL;
2227 int slevel = 0,
2228 plevel = 0;
2229 int wrdlen = strlen(word),
2230 swrdlen;
2231 char **forms;
2232 char **cur;
2233 char newword[2 * MAXNORMLEN] = "";
2234 char pnewword[2 * MAXNORMLEN] = "";
2235 AffixNode *snode = Conf->Suffix,
2236 *pnode;
2237 int i,
2238 j;
2239
2240 if (wrdlen > MAXNORMLEN)
2241 return NULL;
2242 cur = forms = palloc_array(char *, MAX_NORM);
2243 *cur = NULL;
2244
2245
2246 /* Check that the word itself is normal form */
2248 {
2249 *cur = pstrdup(word);
2250 cur++;
2251 *cur = NULL;
2252 }
2253
2254 /* Find all other NORMAL forms of the 'word' (check only prefix) */
2255 pnode = Conf->Prefix;
2256 plevel = 0;
2257 while (pnode)
2258 {
2260 if (!prefix)
2261 break;
2262 for (j = 0; j < prefix->naff; j++)
2263 {
2264 if (CheckAffix(word, wrdlen, prefix->aff[j], flag, newword, NULL))
2265 {
2266 /* prefix success */
2267 if (FindWord(Conf, newword, prefix->aff[j]->flag, flag))
2269 }
2270 }
2271 pnode = prefix->node;
2272 }
2273
2274 /*
2275 * Find all other NORMAL forms of the 'word' (check suffix and then
2276 * prefix)
2277 */
2278 while (snode)
2279 {
2280 int baselen = 0;
2281
2282 /* find possible suffix */
2284 if (!suffix)
2285 break;
2286 /* foreach suffix check affix */
2287 for (i = 0; i < suffix->naff; i++)
2288 {
2289 if (CheckAffix(word, wrdlen, suffix->aff[i], flag, newword, &baselen))
2290 {
2291 /* suffix success */
2292 if (FindWord(Conf, newword, suffix->aff[i]->flag, flag))
2294
2295 /* now we will look changed word with prefixes */
2296 pnode = Conf->Prefix;
2297 plevel = 0;
2299 while (pnode)
2300 {
2302 if (!prefix)
2303 break;
2304 for (j = 0; j < prefix->naff; j++)
2305 {
2306 if (CheckAffix(newword, swrdlen, prefix->aff[j], flag, pnewword, &baselen))
2307 {
2308 /* prefix success */
2309 const char *ff = (prefix->aff[j]->flagflags & suffix->aff[i]->flagflags & FF_CROSSPRODUCT) ?
2310 VoidString : prefix->aff[j]->flag;
2311
2312 if (FindWord(Conf, pnewword, ff, flag))
2314 }
2315 }
2316 pnode = prefix->node;
2317 }
2318 }
2319 }
2320
2321 snode = suffix->node;
2322 }
2323
2324 if (cur == forms)
2325 {
2326 pfree(forms);
2327 return NULL;
2328 }
2329 return forms;
2330}
2331
2332typedef struct SplitVar
2333{
2336 char **stem;
2339
2340static int
2342{
2343 bool issuffix;
2344
2345 /* in case CompoundAffix is null: */
2346 if (*ptr == NULL)
2347 return -1;
2348
2349 if (CheckInPlace)
2350 {
2351 while ((*ptr)->affix)
2352 {
2353 if (len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len) == 0)
2354 {
2355 len = (*ptr)->len;
2356 issuffix = (*ptr)->issuffix;
2357 (*ptr)++;
2358 return (issuffix) ? len : 0;
2359 }
2360 (*ptr)++;
2361 }
2362 }
2363 else
2364 {
2365 const char *affbegin;
2366
2367 while ((*ptr)->affix)
2368 {
2369 if (len > (*ptr)->len && (affbegin = strstr(word, (*ptr)->affix)) != NULL)
2370 {
2371 len = (*ptr)->len + (affbegin - word);
2372 issuffix = (*ptr)->issuffix;
2373 (*ptr)++;
2374 return (issuffix) ? len : 0;
2375 }
2376 (*ptr)++;
2377 }
2378 }
2379 return -1;
2380}
2381
2382static SplitVar *
2384{
2386
2387 v->next = NULL;
2388 if (s)
2389 {
2390 int i;
2391
2392 v->lenstem = s->lenstem;
2393 v->stem = palloc_array(char *, v->lenstem);
2394 v->nstem = s->nstem;
2395 for (i = 0; i < s->nstem; i++)
2396 v->stem[i] = (makedup) ? pstrdup(s->stem[i]) : s->stem[i];
2397 }
2398 else
2399 {
2400 v->lenstem = 16;
2401 v->stem = palloc_array(char *, v->lenstem);
2402 v->nstem = 0;
2403 }
2404 return v;
2405}
2406
2407static void
2409{
2410 if (v->nstem >= v->lenstem)
2411 {
2412 v->lenstem *= 2;
2413 v->stem = (char **) repalloc(v->stem, sizeof(char *) * v->lenstem);
2414 }
2415
2416 v->stem[v->nstem] = word;
2417 v->nstem++;
2418}
2419
2420static SplitVar *
2421SplitToVariants(IspellDict *Conf, SPNode *snode, SplitVar *orig, const char *word, int wordlen, int startpos, int minpos)
2422{
2423 SplitVar *var = NULL;
2425 *StopHigh,
2426 *StopMiddle = NULL;
2427 SPNode *node = (snode) ? snode : Conf->Dictionary;
2428 int level = (snode) ? minpos : startpos; /* recursive
2429 * minpos==level */
2430 int lenaff;
2431 CMPDAffix *caff;
2432 char *notprobed;
2433 int compoundflag = 0;
2434
2435 /* since this function recurses, it could be driven to stack overflow */
2437
2438 notprobed = (char *) palloc(wordlen);
2439 memset(notprobed, 1, wordlen);
2440 var = CopyVar(orig, 1);
2441
2442 while (level < wordlen)
2443 {
2444 /* find word with epenthetic or/and compound affix */
2445 caff = Conf->CompoundAffix;
2446 while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level, (node) ? true : false)) >= 0)
2447 {
2448 /*
2449 * there is one of compound affixes, so check word for existings
2450 */
2451 char buf[MAXNORMLEN];
2452 char **subres;
2453
2454 lenaff = level - startpos + lenaff;
2455
2456 if (!notprobed[startpos + lenaff - 1])
2457 continue;
2458
2459 if (level + lenaff - 1 <= minpos)
2460 continue;
2461
2462 if (lenaff >= MAXNORMLEN)
2463 continue; /* skip too big value */
2464 if (lenaff > 0)
2466 buf[lenaff] = '\0';
2467
2468 if (level == 0)
2469 compoundflag = FF_COMPOUNDBEGIN;
2470 else if (level == wordlen - 1)
2471 compoundflag = FF_COMPOUNDLAST;
2472 else
2473 compoundflag = FF_COMPOUNDMIDDLE;
2474 subres = NormalizeSubWord(Conf, buf, compoundflag);
2475 if (subres)
2476 {
2477 /* Yes, it was a word from dictionary */
2478 SplitVar *new = CopyVar(var, 0);
2479 SplitVar *ptr = var;
2480 char **sptr = subres;
2481
2482 notprobed[startpos + lenaff - 1] = 0;
2483
2484 while (*sptr)
2485 {
2486 AddStem(new, *sptr);
2487 sptr++;
2488 }
2489 pfree(subres);
2490
2491 while (ptr->next)
2492 ptr = ptr->next;
2493 ptr->next = SplitToVariants(Conf, NULL, new, word, wordlen, startpos + lenaff, startpos + lenaff);
2494
2495 pfree(new->stem);
2496 pfree(new);
2497 }
2498 }
2499
2500 if (!node)
2501 break;
2502
2503 StopLow = node->data;
2504 StopHigh = node->data + node->length;
2505 while (StopLow < StopHigh)
2506 {
2507 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
2508 if (StopMiddle->val == ((const uint8 *) (word))[level])
2509 break;
2510 else if (StopMiddle->val < ((const uint8 *) (word))[level])
2511 StopLow = StopMiddle + 1;
2512 else
2514 }
2515
2516 if (StopLow < StopHigh)
2517 {
2518 if (startpos == 0)
2519 compoundflag = FF_COMPOUNDBEGIN;
2520 else if (level == wordlen - 1)
2521 compoundflag = FF_COMPOUNDLAST;
2522 else
2523 compoundflag = FF_COMPOUNDMIDDLE;
2524
2525 /* find infinitive */
2526 if (StopMiddle->isword &&
2527 (StopMiddle->compoundflag & compoundflag) &&
2528 notprobed[level])
2529 {
2530 /* ok, we found full compoundallowed word */
2531 if (level > minpos)
2532 {
2533 /* and its length more than minimal */
2534 if (wordlen == level + 1)
2535 {
2536 /* well, it was last word */
2537 AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
2539 return var;
2540 }
2541 else
2542 {
2543 /* then we will search more big word at the same point */
2544 SplitVar *ptr = var;
2545
2546 while (ptr->next)
2547 ptr = ptr->next;
2548 ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
2549 /* we can find next word */
2550 level++;
2551 AddStem(var, pnstrdup(word + startpos, level - startpos));
2552 node = Conf->Dictionary;
2553 startpos = level;
2554 continue;
2555 }
2556 }
2557 }
2558 node = StopMiddle->node;
2559 }
2560 else
2561 node = NULL;
2562 level++;
2563 }
2564
2565 AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
2567 return var;
2568}
2569
2570static void
2572{
2573 if (*lres == NULL)
2575
2576 if (*lcur - *lres < MAX_NORM - 1)
2577 {
2578 (*lcur)->lexeme = word;
2579 (*lcur)->flags = flags;
2580 (*lcur)->nvariant = NVariant;
2581 (*lcur)++;
2582 (*lcur)->lexeme = NULL;
2583 }
2584}
2585
2586TSLexeme *
2588{
2589 char **res;
2590 TSLexeme *lcur = NULL,
2591 *lres = NULL;
2592 uint16 NVariant = 1;
2593
2594 res = NormalizeSubWord(Conf, word, 0);
2595
2596 if (res)
2597 {
2598 char **ptr = res;
2599
2600 while (*ptr && (lcur - lres) < MAX_NORM)
2601 {
2602 addNorm(&lres, &lcur, *ptr, 0, NVariant++);
2603 ptr++;
2604 }
2605 pfree(res);
2606 }
2607
2608 if (Conf->usecompound)
2609 {
2610 int wordlen = strlen(word);
2611 SplitVar *ptr,
2612 *var = SplitToVariants(Conf, NULL, NULL, word, wordlen, 0, -1);
2613 int i;
2614
2615 while (var)
2616 {
2617 if (var->nstem > 1)
2618 {
2619 char **subres = NormalizeSubWord(Conf, var->stem[var->nstem - 1], FF_COMPOUNDLAST);
2620
2621 if (subres)
2622 {
2623 char **subptr = subres;
2624
2625 while (*subptr)
2626 {
2627 for (i = 0; i < var->nstem - 1; i++)
2628 {
2629 addNorm(&lres, &lcur, (subptr == subres) ? var->stem[i] : pstrdup(var->stem[i]), 0, NVariant);
2630 }
2631
2632 addNorm(&lres, &lcur, *subptr, 0, NVariant);
2633 subptr++;
2634 NVariant++;
2635 }
2636
2637 pfree(subres);
2638 var->stem[0] = NULL;
2639 pfree(var->stem[var->nstem - 1]);
2640 }
2641 }
2642
2643 for (i = 0; i < var->nstem && var->stem[i]; i++)
2644 pfree(var->stem[i]);
2645 ptr = var->next;
2646 pfree(var->stem);
2647 pfree(var);
2648 var = ptr;
2649 }
2650 }
2651
2652 return lres;
2653}
unsigned char symbol
Definition api.h:4
static int32 next
Definition blutils.c:225
#define MAXALIGN(LEN)
Definition c.h:896
uint8_t uint8
Definition c.h:622
#define Assert(condition)
Definition c.h:943
int32_t int32
Definition c.h:620
uint16_t uint16
Definition c.h:623
uint32_t uint32
Definition c.h:624
uint32 result
memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets))
struct cursor * cur
Definition ecpg.c:29
int errcode(int sqlerrcode)
Definition elog.c:875
#define ERROR
Definition elog.h:40
#define elog(elevel,...)
Definition elog.h:228
#define ereport(elevel,...)
Definition elog.h:152
void err(int eval, const char *fmt,...)
Definition err.c:43
#define palloc_object(type)
Definition fe_memutils.h:74
#define palloc_array(type, count)
Definition fe_memutils.h:76
#define palloc0_array(type, count)
Definition fe_memutils.h:77
char * str_tolower(const char *buff, size_t nbytes, Oid collid)
return str start
const char * str
static const FormData_pg_attribute a1
Definition heap.c:144
static const FormData_pg_attribute a2
Definition heap.c:157
long val
Definition informix.c:689
int j
Definition isn.c:78
int i
Definition isn.c:77
unsigned int pg_wchar
Definition mbprint.c:31
int pg_mblen_cstr(const char *mbstr)
Definition mbutils.c:1045
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
Definition mbutils.c:997
char * pstrdup(const char *in)
Definition mcxt.c:1781
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc0(Size size)
Definition mcxt.c:1417
void * palloc(Size size)
Definition mcxt.c:1387
MemoryContext CurTransactionContext
Definition mcxt.c:172
char * pnstrdup(const char *in, Size len)
Definition mcxt.c:1792
void MemoryContextDelete(MemoryContext context)
Definition mcxt.c:472
#define AllocSetContextCreate
Definition memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition memutils.h:160
static char * errmsg
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition palloc.h:124
const void size_t len
const void * data
static char * filename
Definition pg_dumpall.c:133
static XLogRecPtr startpos
static char buf[DEFAULT_XLOG_SEG_SIZE]
#define sprintf
Definition port.h:262
#define qsort(a, b, c, d)
Definition port.h:495
char * c
static int fb(int x)
char * s1
char * s2
static void prefixes(struct vars *v)
Definition regc_lex.c:99
int pg_regcomp(regex_t *re, const chr *string, size_t len, int flags, Oid collation)
Definition regcomp.c:372
static void word(struct vars *v, int dir, struct state *lp, struct state *rp)
Definition regcomp.c:1476
size_t pg_regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
Definition regerror.c:60
#define REG_ADVANCED
Definition regex.h:181
#define REG_OKAY
Definition regex.h:215
#define REG_NOSUB
Definition regex.h:185
#define regex_t
Definition regex.h:245
static int find(struct vars *v, struct cnfa *cnfa, struct colormap *cm)
Definition regexec.c:419
int pg_regexec(regex_t *re, const chr *string, size_t len, size_t search_start, rm_detail_t *details, size_t nmatch, regmatch_t pmatch[], int flags)
Definition regexec.c:185
void RS_compile(Regis *r, bool issuffix, const char *str)
Definition regis.c:85
bool RS_execute(Regis *r, char *str)
Definition regis.c:208
bool RS_isRegis(const char *str)
Definition regis.c:31
void NIStartBuild(IspellDict *Conf)
Definition spell.c:89
#define GETWCHAR(W, L, N, T)
Definition spell.c:192
static int strbcmp(const unsigned char *s1, const unsigned char *s2)
Definition spell.c:258
void NIFinishBuild(IspellDict *Conf)
Definition spell.c:104
void NIImportAffixes(IspellDict *Conf, const char *filename)
Definition spell.c:1437
static char * cpstrdup(IspellDict *Conf, const char *str)
Definition spell.c:163
#define GETCHAR(A, N, T)
Definition spell.c:193
static int parse_ooaffentry(char *str, char *type, char *flag, char *find, char *repl, char *mask)
Definition spell.c:858
static const char * getAffixFlagSet(IspellDict *Conf, char *s)
Definition spell.c:1170
static SPNode * mkSPNode(IspellDict *Conf, int low, int high, int level)
Definition spell.c:1651
static void addCompoundAffixFlagValue(IspellDict *Conf, const char *s, uint32 val)
Definition spell.c:1076
static int cmpspell(const void *s1, const void *s2)
Definition spell.c:198
#define MAX_NORM
Definition spell.c:188
#define PAE_WAIT_REPL
Definition spell.c:778
static bool get_nextfield(char **str, char *next)
Definition spell.c:794
void NISortDictionary(IspellDict *Conf)
Definition spell.c:1733
static int FindWord(IspellDict *Conf, const char *word, const char *affixflag, int flag)
Definition spell.c:605
static AffixNodeData * FindAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
Definition spell.c:2040
static char * CheckAffix(const char *word, size_t len, AFFIX *Affix, int flagflags, char *newword, int *baselen)
Definition spell.c:2103
static SplitVar * CopyVar(SplitVar *s, int makedup)
Definition spell.c:2383
static void NIAddSpell(IspellDict *Conf, const char *word, const char *flag)
Definition spell.c:489
static void NIImportOOAffixes(IspellDict *Conf, const char *filename)
Definition spell.c:1208
#define COMPACT_ALLOC_CHUNK
Definition spell.c:127
static void addNorm(TSLexeme **lres, TSLexeme **lcur, char *word, int flags, uint16 NVariant)
Definition spell.c:2571
static void NIAddAffix(IspellDict *Conf, const char *flag, char flagflags, const char *mask, const char *find, const char *repl, int type)
Definition spell.c:680
static char * findchar(char *str, int c)
Definition spell.c:230
static const char * VoidString
Definition spell.c:195
static char ** NormalizeSubWord(IspellDict *Conf, const char *word, int flag)
Definition spell.c:2223
static int CheckCompoundAffixes(CMPDAffix **ptr, const char *word, int len, bool CheckInPlace)
Definition spell.c:2341
#define MAXNORMLEN
Definition spell.c:189
#define STRNCMP(s, p)
Definition spell.c:191
static void getNextFlagFromString(IspellDict *Conf, const char **sflagset, char *sflag)
Definition spell.c:350
void NISortAffixes(IspellDict *Conf)
Definition spell.c:1988
static int cmpcmdflag(const void *f1, const void *f2)
Definition spell.c:211
static char * findchar2(char *str, int c1, int c2)
Definition spell.c:243
#define PAE_INREPL
Definition spell.c:779
static void setCompoundAffixFlagValue(IspellDict *Conf, CompoundAffixFlag *entry, char *s, uint32 val)
Definition spell.c:1040
void NIImportDictionary(IspellDict *Conf, const char *filename)
Definition spell.c:520
static AffixNode * mkANode(IspellDict *Conf, int low, int high, int level, int type)
Definition spell.c:1842
static bool IsAffixFlagInUse(IspellDict *Conf, int affix, const char *affixflag)
Definition spell.c:457
static SplitVar * SplitToVariants(IspellDict *Conf, SPNode *snode, SplitVar *orig, const char *word, int wordlen, int startpos, int minpos)
Definition spell.c:2421
static uint32 makeCompoundFlags(IspellDict *Conf, int affix)
Definition spell.c:1634
static int cmpspellaffix(const void *s1, const void *s2)
Definition spell.c:204
#define cpalloc(size)
Definition spell.c:159
static int strbncmp(const unsigned char *s1, const unsigned char *s2, size_t count)
Definition spell.c:281
TSLexeme * NINormalizeWord(IspellDict *Conf, const char *word)
Definition spell.c:2587
static bool parse_affentry(const char *str, char *mask, char *find, char *repl)
Definition spell.c:917
#define tmpalloc(sz)
Definition spell.c:80
#define cpalloc0(size)
Definition spell.c:160
static int MergeAffix(IspellDict *Conf, int a1, int a2)
Definition spell.c:1583
#define PAE_WAIT_FIND
Definition spell.c:776
static void * compact_palloc0(IspellDict *Conf, size_t size)
Definition spell.c:131
static int getCompoundAffixFlagValue(IspellDict *Conf, const char *s)
Definition spell.c:1134
static void AddStem(SplitVar *v, char *word)
Definition spell.c:2408
#define PAE_INMASK
Definition spell.c:775
#define PAE_INFIND
Definition spell.c:777
static int addToResult(char **forms, char **cur, char *word)
Definition spell.c:2208
#define COMPACT_MAX_REQ
Definition spell.c:128
#define PAE_WAIT_FLAG
Definition spell.c:781
#define PAE_WAIT_MASK
Definition spell.c:774
static void mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix)
Definition spell.c:1919
static bool isAffixInUse(IspellDict *Conf, const char *affixflag)
Definition spell.c:1973
static char * lowerstr_ctx(IspellDict *Conf, const char *src)
Definition spell.c:176
#define PAE_WAIT_TYPE
Definition spell.c:780
static int cmpaffix(const void *s1, const void *s2)
Definition spell.c:312
#define FLAGNUM_MAXSIZE
Definition spell.h:182
#define FF_SUFFIX
Definition spell.h:121
#define FF_COMPOUNDFLAG
Definition spell.h:46
#define FF_PREFIX
Definition spell.h:122
#define ANHRDSZ
Definition spell.h:145
#define FF_COMPOUNDFLAGMASK
Definition spell.h:48
#define SPELLHDRSZ
Definition spell.h:82
#define FF_COMPOUNDFORBIDFLAG
Definition spell.h:114
#define FF_COMPOUNDBEGIN
Definition spell.h:43
#define FF_COMPOUNDPERMITFLAG
Definition spell.h:113
#define FF_CROSSPRODUCT
Definition spell.h:115
#define FF_COMPOUNDMIDDLE
Definition spell.h:44
@ FM_LONG
Definition spell.h:160
@ FM_CHAR
Definition spell.h:159
@ FM_NUM
Definition spell.h:161
#define SPNHDRSZ
Definition spell.h:56
#define FF_COMPOUNDONLY
Definition spell.h:42
#define FF_COMPOUNDLAST
Definition spell.h:45
int f1[ARRAY_SIZE]
int f2[ARRAY_SIZE]
void check_stack_depth(void)
Definition stack_depth.c:95
uint32 naff
Definition spell.h:133
AFFIX ** aff
Definition spell.h:134
struct AffixNode * node
Definition spell.h:135
uint32 isvoid
Definition spell.h:140
AffixNodeData data[FLEXIBLE_ARRAY_MEMBER]
Definition spell.h:142
uint32 length
Definition spell.h:141
int len
Definition spell.h:150
bool issuffix
Definition spell.h:151
const char * affix
Definition spell.h:149
union CompoundAffixFlag::@148 flag
FlagMode flagMode
Definition spell.h:178
const char * s
Definition spell.h:173
struct SPNode * node
Definition spell.h:35
Definition spell.h:51
SPNodeData data[FLEXIBLE_ARRAY_MEMBER]
Definition spell.h:53
uint32 length
Definition spell.h:52
int nstem
Definition spell.c:2334
struct SplitVar * next
Definition spell.c:2337
int lenstem
Definition spell.c:2335
char ** stem
Definition spell.c:2336
union aff_struct::@147 reg
const char * find
Definition spell.h:96
uint32 isregis
Definition spell.h:94
uint32 type
Definition spell.h:91
Regis regis
Definition spell.h:106
const char * flag
Definition spell.h:89
uint32 replen
Definition spell.h:95
regex_t * pregex
Definition spell.h:105
uint32 flagflags
Definition spell.h:92
const char * repl
Definition spell.h:97
uint32 issimple
Definition spell.h:93
char * flag(int b)
Definition test-ctype.c:33
bool tsearch_readline_begin(tsearch_readline_state *stp, const char *filename)
Definition ts_locale.c:85
char * tsearch_readline(tsearch_readline_state *stp)
Definition ts_locale.c:108
void tsearch_readline_end(tsearch_readline_state *stp)
Definition ts_locale.c:153
static int ts_copychar_with_len(void *dest, const void *src, int length)
Definition ts_locale.h:42
static int ts_copychar_cstr(void *dest, const void *src)
Definition ts_locale.h:50
#define t_iseq(x, c)
Definition ts_locale.h:38
const char * type