PostgreSQL Source Code git master
Loading...
Searching...
No Matches
wparser_def.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * wparser_def.c
4 * Default text search parser
5 *
6 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/tsearch/wparser_def.c
11 *
12 *-------------------------------------------------------------------------
13 */
14
15#include "postgres.h"
16
17#include <limits.h>
18#include <wctype.h>
19
20#include "commands/defrem.h"
21#include "mb/pg_wchar.h"
22#include "miscadmin.h"
23#include "tsearch/ts_public.h"
24#include "tsearch/ts_type.h"
25#include "tsearch/ts_utils.h"
26#include "utils/builtins.h"
27#include "utils/pg_locale.h"
28
29
30/* Define me to enable tracing of parser behavior */
31/* #define WPARSER_TRACE */
32
33
34/* Output token categories */
35
36#define ASCIIWORD 1
37#define WORD_T 2
38#define NUMWORD 3
39#define EMAIL 4
40#define URL_T 5
41#define HOST 6
42#define SCIENTIFIC 7
43#define VERSIONNUMBER 8
44#define NUMPARTHWORD 9
45#define PARTHWORD 10
46#define ASCIIPARTHWORD 11
47#define SPACE 12
48#define TAG_T 13
49#define PROTOCOL 14
50#define NUMHWORD 15
51#define ASCIIHWORD 16
52#define HWORD 17
53#define URLPATH 18
54#define FILEPATH 19
55#define DECIMAL_T 20
56#define SIGNEDINT 21
57#define UNSIGNEDINT 22
58#define XMLENTITY 23
59
/* Highest token category number in use; equal to XMLENTITY, the last one above */
60#define LASTNUM 23
61
/*
 * Short alias string for each output token category.  The entries are in
 * category-number order (ASCIIWORD = 1 ... XMLENTITY = 23), so the array can
 * be indexed directly by a token type, as TParserGet's trace output does
 * with tok_alias[item->type].  Slot 0 is an empty placeholder because the
 * category numbers start at 1.
 */
62static const char *const tok_alias[] = {
63 "",
64 "asciiword",
65 "word",
66 "numword",
67 "email",
68 "url",
69 "host",
70 "sfloat",
71 "version",
72 "hword_numpart",
73 "hword_part",
74 "hword_asciipart",
75 "blank",
76 "tag",
77 "protocol",
78 "numhword",
79 "asciihword",
80 "hword",
81 "url_path",
82 "file",
83 "float",
84 "int",
85 "uint",
86 "entity"
87};
88
/*
 * Human-readable description of each output token category, listed in
 * category-number order (ASCIIWORD = 1 ... XMLENTITY = 23).  Slot 0 is an
 * empty placeholder because the category numbers start at 1.
 */
89static const char *const lex_descr[] = {
90 "",
91 "Word, all ASCII",
92 "Word, all letters",
93 "Word, letters and digits",
94 "Email address",
95 "URL",
96 "Host",
97 "Scientific notation",
98 "Version number",
99 "Hyphenated word part, letters and digits",
100 "Hyphenated word part, all letters",
101 "Hyphenated word part, all ASCII",
102 "Space symbols",
103 "XML tag",
104 "Protocol head",
105 "Hyphenated word, letters and digits",
106 "Hyphenated word, all ASCII",
107 "Hyphenated word, all letters",
108 "URL path",
109 "File or path name",
110 "Decimal notation",
111 "Signed integer",
112 "Unsigned integer",
113 "XML entity"
114};
115
116
117/* Parser states */
118
119typedef enum
120{
198 TPS_Null /* last state (fake value) */
200
201/* forward declaration */
202struct TParser;
203
204typedef int (*TParserCharTest) (struct TParser *); /* any p_is* functions
205 * except p_iseq */
206typedef void (*TParserSpecial) (struct TParser *); /* special handler for
207 * special cases... */
208
218
219/* Flag bits in TParserStateActionItem.flags */
220#define A_NEXT 0x0000 /* no flag bits set */
221#define A_BINGO 0x0001 /* token found: TParserGet records its type and
 * byte/char lengths and resets the running lengths */
222#define A_POP 0x0002 /* TParserGet frees the current position and
 * continues from the one it was stacked on (prev) */
223#define A_PUSH 0x0004
224#define A_RERUN 0x0008
225#define A_CLEAR 0x0010
226#define A_MERGE 0x0020
227#define A_CLRALL 0x0040
228
229typedef struct TParserPosition
230{
231 int posbyte; /* position of parser in bytes */
232 int poschar; /* position of parser in characters */
233 int charlen; /* length of current char */
234 int lenbytetoken; /* length of token-so-far in bytes */
235 int lenchartoken; /* and in chars */
240
241typedef struct TParser
242{
243 /* string and position information */
244 char *str; /* multibyte string */
245 int lenstr; /* length of mbstring */
246 pg_wchar *pgwstr; /* wide character string for C-locale */
247
248 /* State of parse */
251 bool ignore;
253
254 /* silly char */
255 char c;
256
257 /* out */
258 char *token;
261 int type;
263
264
265/* forward decls here */
266static bool TParserGet(TParser *prs);
267
268
269static TParserPosition *
271{
273
274 if (prev)
275 memcpy(res, prev, sizeof(TParserPosition));
276 else
277 memset(res, 0, sizeof(TParserPosition));
278
279 res->prev = prev;
280
281 res->pushedAtAction = NULL;
282
283 return res;
284}
285
286static TParser *
287TParserInit(char *str, int len)
288{
290
292 prs->str = str;
293 prs->lenstr = len;
294 prs->pgwstr = palloc_array(pg_wchar, prs->lenstr + 1);
295 pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
296
298 prs->state->state = TPS_Base;
299
300#ifdef WPARSER_TRACE
301 fprintf(stderr, "parsing \"%.*s\"\n", len, str);
302#endif
303
304 return prs;
305}
306
307/*
308 * As an alternative to a full TParserInit one can create a
309 * TParserCopy which basically is a regular TParser without a private
310 * copy of the string - instead it uses the one from another TParser.
311 * This is useful because at some places TParsers are created
312 * recursively and the repeated copying around of the strings can
313 * cause major inefficiency if the source string is long.
314 * The new parser starts parsing at the original's current position.
315 *
316 * Obviously one must not close the original TParser before the copy.
317 */
318static TParser *
320{
322
323 prs->charmaxlen = orig->charmaxlen;
324 prs->str = orig->str + orig->state->posbyte;
325 prs->lenstr = orig->lenstr - orig->state->posbyte;
326
327 if (orig->pgwstr)
328 prs->pgwstr = orig->pgwstr + orig->state->poschar;
329
331 prs->state->state = TPS_Base;
332
333#ifdef WPARSER_TRACE
334 fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
335#endif
336
337 return prs;
338}
339
340
341static void
343{
344 while (prs->state)
345 {
346 TParserPosition *ptr = prs->state->prev;
347
348 pfree(prs->state);
349 prs->state = ptr;
350 }
351
352 if (prs->pgwstr)
353 pfree(prs->pgwstr);
354
355#ifdef WPARSER_TRACE
356 fprintf(stderr, "closing parser\n");
357#endif
358 pfree(prs);
359}
360
361/*
362 * Close a parser created with TParserCopyInit
363 */
364static void
366{
367 while (prs->state)
368 {
369 TParserPosition *ptr = prs->state->prev;
370
371 pfree(prs->state);
372 prs->state = ptr;
373 }
374
375#ifdef WPARSER_TRACE
376 fprintf(stderr, "closing parser copy\n");
377#endif
378 pfree(prs);
379}
380
381
/*
 * Character-type support functions using the database default locale.  If the
 * locale is C, and the input character is non-ascii, the value to be returned
 * is determined by the 'nonascii' macro argument.
 *
 * p_iswhat(type, nonascii) expands to two static functions:
 *
 *	p_is<type>(prs)    - classifies the wide character at the parser's
 *	                     current character position with pg_isw<type>();
 *	                     returns 'nonascii' instead when the encoding is
 *	                     multibyte (charmaxlen > 1), the locale's ctype is C,
 *	                     and the character is above 0x7f.
 *	p_isnot<type>(prs) - logical negation of p_is<type>(prs).
 *
 * The database locale is fetched once per call and reused for both the
 * C-ctype test and the classification call.
 */

#define p_iswhat(type, nonascii) \
 \
static int \
p_is##type(TParser *prs) \
{ \
	pg_locale_t locale = pg_database_locale(); \
	pg_wchar	wc; \
 \
	Assert(prs->state); \
	wc = prs->pgwstr[prs->state->poschar]; \
	if (prs->charmaxlen > 1 && locale->ctype_is_c && wc > 0x7f) \
		return nonascii; \
	return pg_isw##type(wc, locale); \
} \
 \
static int \
p_isnot##type(TParser *prs) \
{ \
	return !p_is##type(prs); \
}
407
408/*
409 * In C locale with a multibyte encoding, any non-ASCII symbol is considered
410 * an alpha character, but not a member of other char classes.
411 */
413p_iswhat(alpha, 1)
414p_iswhat(digit, 0)
415p_iswhat(lower, 0)
416p_iswhat(print, 0)
417p_iswhat(punct, 0)
418p_iswhat(space, 0)
419p_iswhat(upper, 0)
421
422/* p_iseq should be used only for ascii symbols */
423
424static int
425p_iseq(TParser *prs, char c)
426{
427 Assert(prs->state);
428 return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
429}
430
431static int
433{
434 Assert(prs->state);
435 return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
436}
437
438static int
440{
441 return p_iseq(prs, prs->c);
442}
443
444static int
446{
447 return !p_iseq(prs, prs->c);
448}
449
450static int
452{
453 return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
454}
455
456static int
458{
459 return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
460}
461
462static int
464{
465 char ch;
466
467 /* no non-ASCII need apply */
468 if (prs->state->charlen != 1)
469 return 0;
470 ch = *(prs->str + prs->state->posbyte);
471 /* no spaces or control characters */
472 if (ch <= 0x20 || ch >= 0x7F)
473 return 0;
474 /* reject characters disallowed by RFC 3986 */
475 switch (ch)
476 {
477 case '"':
478 case '<':
479 case '>':
480 case '\\':
481 case '^':
482 case '`':
483 case '{':
484 case '|':
485 case '}':
486 return 0;
487 }
488 return 1;
489}
490
491
492/* deliberately suppress unused-function complaints for the above */
493void _make_compiler_happy(void);
494void
519
520
521static void
523{
524 switch (prs->state->lenchartoken)
525 {
526 case 8: /* </script */
527 if (pg_strncasecmp(prs->token, "</script", 8) == 0)
528 prs->ignore = false;
529 break;
530 case 7: /* <script || </style */
531 if (pg_strncasecmp(prs->token, "</style", 7) == 0)
532 prs->ignore = false;
533 else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
534 prs->ignore = true;
535 break;
536 case 6: /* <style */
537 if (pg_strncasecmp(prs->token, "<style", 6) == 0)
538 prs->ignore = true;
539 break;
540 default:
541 break;
542 }
543}
544
545static void
547{
548 prs->wanthost = true;
549 prs->state->posbyte -= prs->state->lenbytetoken;
550 prs->state->poschar -= prs->state->lenchartoken;
551}
552
553static void
555{
556 prs->state->posbyte -= prs->state->lenbytetoken;
557 prs->state->poschar -= prs->state->lenchartoken;
558}
559
560static void
562{
563 prs->state->posbyte -= prs->state->lenbytetoken;
564 prs->state->poschar -= prs->state->lenchartoken;
565 prs->state->lenbytetoken = 0;
566 prs->state->lenchartoken = 0;
567}
568
569static int
571{
572 if (prs->wanthost)
573 {
574 prs->wanthost = false;
575 return 1;
576 }
577 return 0;
578}
579
580static int
582{
583 return (prs->ignore) ? 1 : 0;
584}
585
586static int
588{
590 int res = 0;
591
592 tmpprs->wanthost = true;
593
594 /*
595 * Check stack depth before recursing. (Since TParserGet() doesn't
596 * normally recurse, we put the cost of checking here not there.)
597 */
599
600 if (TParserGet(tmpprs) && tmpprs->type == HOST)
601 {
602 prs->state->posbyte += tmpprs->lenbytetoken;
603 prs->state->poschar += tmpprs->lenchartoken;
604 prs->state->lenbytetoken += tmpprs->lenbytetoken;
605 prs->state->lenchartoken += tmpprs->lenchartoken;
606 prs->state->charlen = tmpprs->state->charlen;
607 res = 1;
608 }
610
611 return res;
612}
613
614static int
616{
618 int res = 0;
619
620 tmpprs->state = newTParserPosition(tmpprs->state);
621 tmpprs->state->state = TPS_InURLPathFirst;
622
623 /*
624 * Check stack depth before recursing. (Since TParserGet() doesn't
625 * normally recurse, we put the cost of checking here not there.)
626 */
628
629 if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
630 {
631 prs->state->posbyte += tmpprs->lenbytetoken;
632 prs->state->poschar += tmpprs->lenchartoken;
633 prs->state->lenbytetoken += tmpprs->lenbytetoken;
634 prs->state->lenchartoken += tmpprs->lenchartoken;
635 prs->state->charlen = tmpprs->state->charlen;
636 res = 1;
637 }
639
640 return res;
641}
642
643/*
644 * Returns true if the current character has zero display length or
645 * is a special sign used in several languages. Such characters
646 * are not word breakers, even though they are not isalpha either.
647 * At the beginning of a word they are not considered part of it.
648 */
649static int
651{
652 /*
653 * pg_dsplen could return -1 which means error or control character
654 */
655 if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
656 return 1;
657
658 /*
659 * Unicode characters in the 'Mark, Spacing Combining' category. Those
660 * characters are not alpha, but they are not word breakers either.
661 * Check this only in UTF-8 encoding, because other encodings aren't
662 * supported by postgres or don't even exist.
663 */
665 {
666 static const pg_wchar strange_letter[] = {
667 /*
668 * use binary search, so elements should be ordered
669 */
670 0x0903, /* DEVANAGARI SIGN VISARGA */
671 0x093E, /* DEVANAGARI VOWEL SIGN AA */
672 0x093F, /* DEVANAGARI VOWEL SIGN I */
673 0x0940, /* DEVANAGARI VOWEL SIGN II */
674 0x0949, /* DEVANAGARI VOWEL SIGN CANDRA O */
675 0x094A, /* DEVANAGARI VOWEL SIGN SHORT O */
676 0x094B, /* DEVANAGARI VOWEL SIGN O */
677 0x094C, /* DEVANAGARI VOWEL SIGN AU */
678 0x0982, /* BENGALI SIGN ANUSVARA */
679 0x0983, /* BENGALI SIGN VISARGA */
680 0x09BE, /* BENGALI VOWEL SIGN AA */
681 0x09BF, /* BENGALI VOWEL SIGN I */
682 0x09C0, /* BENGALI VOWEL SIGN II */
683 0x09C7, /* BENGALI VOWEL SIGN E */
684 0x09C8, /* BENGALI VOWEL SIGN AI */
685 0x09CB, /* BENGALI VOWEL SIGN O */
686 0x09CC, /* BENGALI VOWEL SIGN AU */
687 0x09D7, /* BENGALI AU LENGTH MARK */
688 0x0A03, /* GURMUKHI SIGN VISARGA */
689 0x0A3E, /* GURMUKHI VOWEL SIGN AA */
690 0x0A3F, /* GURMUKHI VOWEL SIGN I */
691 0x0A40, /* GURMUKHI VOWEL SIGN II */
692 0x0A83, /* GUJARATI SIGN VISARGA */
693 0x0ABE, /* GUJARATI VOWEL SIGN AA */
694 0x0ABF, /* GUJARATI VOWEL SIGN I */
695 0x0AC0, /* GUJARATI VOWEL SIGN II */
696 0x0AC9, /* GUJARATI VOWEL SIGN CANDRA O */
697 0x0ACB, /* GUJARATI VOWEL SIGN O */
698 0x0ACC, /* GUJARATI VOWEL SIGN AU */
699 0x0B02, /* ORIYA SIGN ANUSVARA */
700 0x0B03, /* ORIYA SIGN VISARGA */
701 0x0B3E, /* ORIYA VOWEL SIGN AA */
702 0x0B40, /* ORIYA VOWEL SIGN II */
703 0x0B47, /* ORIYA VOWEL SIGN E */
704 0x0B48, /* ORIYA VOWEL SIGN AI */
705 0x0B4B, /* ORIYA VOWEL SIGN O */
706 0x0B4C, /* ORIYA VOWEL SIGN AU */
707 0x0B57, /* ORIYA AU LENGTH MARK */
708 0x0BBE, /* TAMIL VOWEL SIGN AA */
709 0x0BBF, /* TAMIL VOWEL SIGN I */
710 0x0BC1, /* TAMIL VOWEL SIGN U */
711 0x0BC2, /* TAMIL VOWEL SIGN UU */
712 0x0BC6, /* TAMIL VOWEL SIGN E */
713 0x0BC7, /* TAMIL VOWEL SIGN EE */
714 0x0BC8, /* TAMIL VOWEL SIGN AI */
715 0x0BCA, /* TAMIL VOWEL SIGN O */
716 0x0BCB, /* TAMIL VOWEL SIGN OO */
717 0x0BCC, /* TAMIL VOWEL SIGN AU */
718 0x0BD7, /* TAMIL AU LENGTH MARK */
719 0x0C01, /* TELUGU SIGN CANDRABINDU */
720 0x0C02, /* TELUGU SIGN ANUSVARA */
721 0x0C03, /* TELUGU SIGN VISARGA */
722 0x0C41, /* TELUGU VOWEL SIGN U */
723 0x0C42, /* TELUGU VOWEL SIGN UU */
724 0x0C43, /* TELUGU VOWEL SIGN VOCALIC R */
725 0x0C44, /* TELUGU VOWEL SIGN VOCALIC RR */
726 0x0C82, /* KANNADA SIGN ANUSVARA */
727 0x0C83, /* KANNADA SIGN VISARGA */
728 0x0CBE, /* KANNADA VOWEL SIGN AA */
729 0x0CC0, /* KANNADA VOWEL SIGN II */
730 0x0CC1, /* KANNADA VOWEL SIGN U */
731 0x0CC2, /* KANNADA VOWEL SIGN UU */
732 0x0CC3, /* KANNADA VOWEL SIGN VOCALIC R */
733 0x0CC4, /* KANNADA VOWEL SIGN VOCALIC RR */
734 0x0CC7, /* KANNADA VOWEL SIGN EE */
735 0x0CC8, /* KANNADA VOWEL SIGN AI */
736 0x0CCA, /* KANNADA VOWEL SIGN O */
737 0x0CCB, /* KANNADA VOWEL SIGN OO */
738 0x0CD5, /* KANNADA LENGTH MARK */
739 0x0CD6, /* KANNADA AI LENGTH MARK */
740 0x0D02, /* MALAYALAM SIGN ANUSVARA */
741 0x0D03, /* MALAYALAM SIGN VISARGA */
742 0x0D3E, /* MALAYALAM VOWEL SIGN AA */
743 0x0D3F, /* MALAYALAM VOWEL SIGN I */
744 0x0D40, /* MALAYALAM VOWEL SIGN II */
745 0x0D46, /* MALAYALAM VOWEL SIGN E */
746 0x0D47, /* MALAYALAM VOWEL SIGN EE */
747 0x0D48, /* MALAYALAM VOWEL SIGN AI */
748 0x0D4A, /* MALAYALAM VOWEL SIGN O */
749 0x0D4B, /* MALAYALAM VOWEL SIGN OO */
750 0x0D4C, /* MALAYALAM VOWEL SIGN AU */
751 0x0D57, /* MALAYALAM AU LENGTH MARK */
752 0x0D82, /* SINHALA SIGN ANUSVARAYA */
753 0x0D83, /* SINHALA SIGN VISARGAYA */
754 0x0DCF, /* SINHALA VOWEL SIGN AELA-PILLA */
755 0x0DD0, /* SINHALA VOWEL SIGN KETTI AEDA-PILLA */
756 0x0DD1, /* SINHALA VOWEL SIGN DIGA AEDA-PILLA */
757 0x0DD8, /* SINHALA VOWEL SIGN GAETTA-PILLA */
758 0x0DD9, /* SINHALA VOWEL SIGN KOMBUVA */
759 0x0DDA, /* SINHALA VOWEL SIGN DIGA KOMBUVA */
760 0x0DDB, /* SINHALA VOWEL SIGN KOMBU DEKA */
761 0x0DDC, /* SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
762 0x0DDD, /* SINHALA VOWEL SIGN KOMBUVA HAA DIGA
763 * AELA-PILLA */
764 0x0DDE, /* SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
765 0x0DDF, /* SINHALA VOWEL SIGN GAYANUKITTA */
766 0x0DF2, /* SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
767 0x0DF3, /* SINHALA VOWEL SIGN DIGA GAYANUKITTA */
768 0x0F3E, /* TIBETAN SIGN YAR TSHES */
769 0x0F3F, /* TIBETAN SIGN MAR TSHES */
770 0x0F7F, /* TIBETAN SIGN RNAM BCAD */
771 0x102B, /* MYANMAR VOWEL SIGN TALL AA */
772 0x102C, /* MYANMAR VOWEL SIGN AA */
773 0x1031, /* MYANMAR VOWEL SIGN E */
774 0x1038, /* MYANMAR SIGN VISARGA */
775 0x103B, /* MYANMAR CONSONANT SIGN MEDIAL YA */
776 0x103C, /* MYANMAR CONSONANT SIGN MEDIAL RA */
777 0x1056, /* MYANMAR VOWEL SIGN VOCALIC R */
778 0x1057, /* MYANMAR VOWEL SIGN VOCALIC RR */
779 0x1062, /* MYANMAR VOWEL SIGN SGAW KAREN EU */
780 0x1063, /* MYANMAR TONE MARK SGAW KAREN HATHI */
781 0x1064, /* MYANMAR TONE MARK SGAW KAREN KE PHO */
782 0x1067, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
783 0x1068, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
784 0x1069, /* MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
785 0x106A, /* MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
786 0x106B, /* MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
787 0x106C, /* MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
788 0x106D, /* MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
789 0x1083, /* MYANMAR VOWEL SIGN SHAN AA */
790 0x1084, /* MYANMAR VOWEL SIGN SHAN E */
791 0x1087, /* MYANMAR SIGN SHAN TONE-2 */
792 0x1088, /* MYANMAR SIGN SHAN TONE-3 */
793 0x1089, /* MYANMAR SIGN SHAN TONE-5 */
794 0x108A, /* MYANMAR SIGN SHAN TONE-6 */
795 0x108B, /* MYANMAR SIGN SHAN COUNCIL TONE-2 */
796 0x108C, /* MYANMAR SIGN SHAN COUNCIL TONE-3 */
797 0x108F, /* MYANMAR SIGN RUMAI PALAUNG TONE-5 */
798 0x17B6, /* KHMER VOWEL SIGN AA */
799 0x17BE, /* KHMER VOWEL SIGN OE */
800 0x17BF, /* KHMER VOWEL SIGN YA */
801 0x17C0, /* KHMER VOWEL SIGN IE */
802 0x17C1, /* KHMER VOWEL SIGN E */
803 0x17C2, /* KHMER VOWEL SIGN AE */
804 0x17C3, /* KHMER VOWEL SIGN AI */
805 0x17C4, /* KHMER VOWEL SIGN OO */
806 0x17C5, /* KHMER VOWEL SIGN AU */
807 0x17C7, /* KHMER SIGN REAHMUK */
808 0x17C8, /* KHMER SIGN YUUKALEAPINTU */
809 0x1923, /* LIMBU VOWEL SIGN EE */
810 0x1924, /* LIMBU VOWEL SIGN AI */
811 0x1925, /* LIMBU VOWEL SIGN OO */
812 0x1926, /* LIMBU VOWEL SIGN AU */
813 0x1929, /* LIMBU SUBJOINED LETTER YA */
814 0x192A, /* LIMBU SUBJOINED LETTER RA */
815 0x192B, /* LIMBU SUBJOINED LETTER WA */
816 0x1930, /* LIMBU SMALL LETTER KA */
817 0x1931, /* LIMBU SMALL LETTER NGA */
818 0x1933, /* LIMBU SMALL LETTER TA */
819 0x1934, /* LIMBU SMALL LETTER NA */
820 0x1935, /* LIMBU SMALL LETTER PA */
821 0x1936, /* LIMBU SMALL LETTER MA */
822 0x1937, /* LIMBU SMALL LETTER RA */
823 0x1938, /* LIMBU SMALL LETTER LA */
824 0x19B0, /* NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
825 0x19B1, /* NEW TAI LUE VOWEL SIGN AA */
826 0x19B2, /* NEW TAI LUE VOWEL SIGN II */
827 0x19B3, /* NEW TAI LUE VOWEL SIGN U */
828 0x19B4, /* NEW TAI LUE VOWEL SIGN UU */
829 0x19B5, /* NEW TAI LUE VOWEL SIGN E */
830 0x19B6, /* NEW TAI LUE VOWEL SIGN AE */
831 0x19B7, /* NEW TAI LUE VOWEL SIGN O */
832 0x19B8, /* NEW TAI LUE VOWEL SIGN OA */
833 0x19B9, /* NEW TAI LUE VOWEL SIGN UE */
834 0x19BA, /* NEW TAI LUE VOWEL SIGN AY */
835 0x19BB, /* NEW TAI LUE VOWEL SIGN AAY */
836 0x19BC, /* NEW TAI LUE VOWEL SIGN UY */
837 0x19BD, /* NEW TAI LUE VOWEL SIGN OY */
838 0x19BE, /* NEW TAI LUE VOWEL SIGN OAY */
839 0x19BF, /* NEW TAI LUE VOWEL SIGN UEY */
840 0x19C0, /* NEW TAI LUE VOWEL SIGN IY */
841 0x19C8, /* NEW TAI LUE TONE MARK-1 */
842 0x19C9, /* NEW TAI LUE TONE MARK-2 */
843 0x1A19, /* BUGINESE VOWEL SIGN E */
844 0x1A1A, /* BUGINESE VOWEL SIGN O */
845 0x1A1B, /* BUGINESE VOWEL SIGN AE */
846 0x1B04, /* BALINESE SIGN BISAH */
847 0x1B35, /* BALINESE VOWEL SIGN TEDUNG */
848 0x1B3B, /* BALINESE VOWEL SIGN RA REPA TEDUNG */
849 0x1B3D, /* BALINESE VOWEL SIGN LA LENGA TEDUNG */
850 0x1B3E, /* BALINESE VOWEL SIGN TALING */
851 0x1B3F, /* BALINESE VOWEL SIGN TALING REPA */
852 0x1B40, /* BALINESE VOWEL SIGN TALING TEDUNG */
853 0x1B41, /* BALINESE VOWEL SIGN TALING REPA TEDUNG */
854 0x1B43, /* BALINESE VOWEL SIGN PEPET TEDUNG */
855 0x1B44, /* BALINESE ADEG ADEG */
856 0x1B82, /* SUNDANESE SIGN PANGWISAD */
857 0x1BA1, /* SUNDANESE CONSONANT SIGN PAMINGKAL */
858 0x1BA6, /* SUNDANESE VOWEL SIGN PANAELAENG */
859 0x1BA7, /* SUNDANESE VOWEL SIGN PANOLONG */
860 0x1BAA, /* SUNDANESE SIGN PAMAAEH */
861 0x1C24, /* LEPCHA SUBJOINED LETTER YA */
862 0x1C25, /* LEPCHA SUBJOINED LETTER RA */
863 0x1C26, /* LEPCHA VOWEL SIGN AA */
864 0x1C27, /* LEPCHA VOWEL SIGN I */
865 0x1C28, /* LEPCHA VOWEL SIGN O */
866 0x1C29, /* LEPCHA VOWEL SIGN OO */
867 0x1C2A, /* LEPCHA VOWEL SIGN U */
868 0x1C2B, /* LEPCHA VOWEL SIGN UU */
869 0x1C34, /* LEPCHA CONSONANT SIGN NYIN-DO */
870 0x1C35, /* LEPCHA CONSONANT SIGN KANG */
871 0xA823, /* SYLOTI NAGRI VOWEL SIGN A */
872 0xA824, /* SYLOTI NAGRI VOWEL SIGN I */
873 0xA827, /* SYLOTI NAGRI VOWEL SIGN OO */
874 0xA880, /* SAURASHTRA SIGN ANUSVARA */
875 0xA881, /* SAURASHTRA SIGN VISARGA */
876 0xA8B4, /* SAURASHTRA CONSONANT SIGN HAARU */
877 0xA8B5, /* SAURASHTRA VOWEL SIGN AA */
878 0xA8B6, /* SAURASHTRA VOWEL SIGN I */
879 0xA8B7, /* SAURASHTRA VOWEL SIGN II */
880 0xA8B8, /* SAURASHTRA VOWEL SIGN U */
881 0xA8B9, /* SAURASHTRA VOWEL SIGN UU */
882 0xA8BA, /* SAURASHTRA VOWEL SIGN VOCALIC R */
883 0xA8BB, /* SAURASHTRA VOWEL SIGN VOCALIC RR */
884 0xA8BC, /* SAURASHTRA VOWEL SIGN VOCALIC L */
885 0xA8BD, /* SAURASHTRA VOWEL SIGN VOCALIC LL */
886 0xA8BE, /* SAURASHTRA VOWEL SIGN E */
887 0xA8BF, /* SAURASHTRA VOWEL SIGN EE */
888 0xA8C0, /* SAURASHTRA VOWEL SIGN AI */
889 0xA8C1, /* SAURASHTRA VOWEL SIGN O */
890 0xA8C2, /* SAURASHTRA VOWEL SIGN OO */
891 0xA8C3, /* SAURASHTRA VOWEL SIGN AU */
892 0xA952, /* REJANG CONSONANT SIGN H */
893 0xA953, /* REJANG VIRAMA */
894 0xAA2F, /* CHAM VOWEL SIGN O */
895 0xAA30, /* CHAM VOWEL SIGN AI */
896 0xAA33, /* CHAM CONSONANT SIGN YA */
897 0xAA34, /* CHAM CONSONANT SIGN RA */
898 0xAA4D /* CHAM CONSONANT SIGN FINAL H */
899 };
902 *StopMiddle;
903 pg_wchar c;
904
905 c = *(prs->pgwstr + prs->state->poschar);
906
907 while (StopLow < StopHigh)
908 {
909 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
910 if (*StopMiddle == c)
911 return 1;
912 else if (*StopMiddle < c)
913 StopLow = StopMiddle + 1;
914 else
916 }
917 }
918
919 return 0;
920}
921
922/*
923 * Table of state/action of parser
924 */
925
941
942
953
971
980
997
999 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1001 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1002};
1003
1012
1015 {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
1016 {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
1017 {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
1018 {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
1019 {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
1020 {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
1022 {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
1023};
1024
1026 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1028 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1029};
1030
1039
1041 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1043 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1044};
1045
1054
1060
1066
1067
1069 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1071 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1072};
1073
1080
1082 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1086 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1087};
1088
1090 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1092 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1093};
1094
1100
1102 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1105 {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1106 {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1107 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1108};
1109
1111 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1113 {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1114 {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1115 {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
1116 {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
1118 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1119};
1120
1128
1134
1136 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1139 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1140};
1141
1148
1152
1154 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1157 {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
1159 {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
1160 {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
1161 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1162};
1163
1165 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1166 /* <?xml ... */
1167 /* XXX do we want states for the m and l? Right now this accepts <?xZ */
1168 {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
1169 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1170};
1171
1173 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1175 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1176};
1177
1179 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1180 /* <br/> case */
1181 {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
1184 {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
1185 {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1186 {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1187 {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1188 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1189 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1190};
1191
1193 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1194 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
1195 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1196};
1197
1199 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1201 {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
1202 {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
1203 {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1204 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1205 {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
1206 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1207 {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1208 {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
1209 {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
1210 {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1211 {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1212 {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
1213 {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
1214 {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
1215 {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
1217 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1218};
1219
1221 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1223 {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
1224 {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
1225};
1226
1228 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1230 {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
1232};
1233
1235 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1236 {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
1237};
1238
1242
1244 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1245 {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
1246 /* <!DOCTYPE ...> */
1247 {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
1248 {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
1249 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1250};
1251
1253 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1254 {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
1255 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1256};
1257
1259 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1261 {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1262};
1263
1269
1271 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1272 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1273 {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
1274 {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1275};
1276
1280
1287
1289 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1291 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1292 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1293 {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1295 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1296 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1297};
1298
1313
1315 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1316 {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1317 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1318};
1319
1327
1329 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1330 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1331 {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1332 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1333};
1334
1336 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1337 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1338 {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1339 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1341 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1342 {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1343 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1344};
1345
1347 {p_isstophost, 0, A_POP, TPS_Null, 0, NULL},
1349 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1350};
1351
1353 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1354 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1355 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1356 {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1357 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1358 {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1359 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1360};
1361
1363 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1364 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1365 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1366 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1367 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1368 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1369};
1370
1372 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1373 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1374 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1375 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1376 {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1377 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1378 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1379};
1380
1382 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1383 {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1384 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1385 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1386};
1387
1395
1398 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1399 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1400 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1401 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1402 {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1403 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1405};
1406
1408 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1409 {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1410 {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1411 {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1412 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1413};
1414
1416 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1418 {NULL, 0, A_POP, TPS_Null, 0, NULL},
1419};
1420
1424
1430
1432 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1434 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1435};
1436
1438 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1440 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1441};
1442
1444 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1445 {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1446 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1447};
1448
1452
1460
1470
1477
1486
1493
1501
1509
1518
1525
1533
1542
1549
1557
1558
1559/*
1560 * main table of per-state parser actions
1561 */
1562typedef struct
1563{
1564 const TParserStateActionItem *action; /* the actual state info */
1565 TParserState state; /* only for Assert crosscheck */
1566#ifdef WPARSER_TRACE
1567 const char *state_name; /* only for debug printout */
1568#endif
1570
/*
 * Build one brace-enclosed initializer for a parser state: the identifier
 * formed by pasting "action" onto the state name, then the state value
 * itself, and, only when WPARSER_TRACE is defined, the state name as a
 * string.  Presumably used to fill in Actions[] below, whose entries have
 * matching fields (action, state, state_name).
 */
1571#ifdef WPARSER_TRACE
1572#define TPARSERSTATEACTION(state) \
1573 { CppConcat(action,state), state, CppAsString(state) }
1574#else
1575#define TPARSERSTATEACTION(state) \
1576 { CppConcat(action,state), state }
1577#endif
1578
1579/*
1580 * order must be the same as in typedef enum {} TParserState!!
1581 */
1582
1583static const TParserStateAction Actions[] = {
1661};
1662
1663
/*
 * Core scanning loop of the parser state machine.
 *
 * NOTE(review): the declarator line is missing from this listing;
 * prsd_nexttoken() invokes this as TParserGet(p), so it is presumably
 * TParserGet(TParser *prs).
 *
 * Starting at prs->state->posbyte, each iteration picks the first rule in the
 * current state's action list whose isclass() test succeeds (or the final
 * rule, whose isclass is NULL), runs the rule's optional special handler,
 * applies its A_* flags, switches to item->tostate unless that is TPS_Null,
 * and then consumes one character unless the rule was flagged A_RERUN or
 * A_POP.
 *
 * Returns true when the loop stopped on a rule flagged A_BINGO; prs->token,
 * prs->lenbytetoken, prs->lenchartoken and prs->type then describe the token.
 * Returns false if the position is already at or beyond prs->lenstr on entry
 * or if the loop ends on a rule without A_BINGO.
 */
1664static bool
1666{
1667 const TParserStateActionItem *item = NULL;
1668
1670
1671 Assert(prs->state);
1672
 /* nothing to do once the scan position has reached the end of input */
1673 if (prs->state->posbyte >= prs->lenstr)
1674 return false;
1675
 /* the candidate token begins at the current byte position */
1676 prs->token = prs->str + prs->state->posbyte;
1677 prs->state->pushedAtAction = NULL;
1678
1679 /* look at string */
1680 while (prs->state->posbyte <= prs->lenstr)
1681 {
 /*
  * charlen is 0 at end of input, 1 when charmaxlen is 1, and otherwise
  * whatever pg_mblen_range() reports for the current character.
  */
1682 if (prs->state->posbyte == prs->lenstr)
1683 prs->state->charlen = 0;
1684 else
1685 prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
1686 pg_mblen_range(prs->str + prs->state->posbyte,
1687 prs->str + prs->lenstr);
1688
1689 Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
1690 Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
1691 Assert(Actions[prs->state->state].state == prs->state->state);
1692
 /* decide which rule of the current state to start testing from */
1693 if (prs->state->pushedAtAction)
1694 {
1695 /* After a POP, pick up at the next test */
1696 item = prs->state->pushedAtAction + 1;
1697 prs->state->pushedAtAction = NULL;
1698 }
1699 else
1700 {
1701 item = Actions[prs->state->state].action;
1702 Assert(item != NULL);
1703 }
1704
1705 /* find action by character class */
1706 while (item->isclass)
1707 {
 /* the test receives only prs, so the rule's character goes via prs->c */
1708 prs->c = item->c;
1709 if (item->isclass(prs) != 0)
1710 break;
1711 item++;
1712 }
1713
1714#ifdef WPARSER_TRACE
1715 {
1716 TParserPosition *ptr;
1717
1718 fprintf(stderr, "state ");
1719 /* indent according to stack depth */
1720 for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
1721 fprintf(stderr, " ");
1722 fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
1723 if (prs->state->posbyte < prs->lenstr)
1724 fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
1725 else
1726 fprintf(stderr, "at EOF");
1727 fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1728 (int) (item - Actions[prs->state->state].action),
1729 (item->flags & A_BINGO) ? " BINGO" : "",
1730 (item->flags & A_POP) ? " POP" : "",
1731 (item->flags & A_PUSH) ? " PUSH" : "",
1732 (item->flags & A_RERUN) ? " RERUN" : "",
1733 (item->flags & A_CLEAR) ? " CLEAR" : "",
1734 (item->flags & A_MERGE) ? " MERGE" : "",
1735 (item->flags & A_CLRALL) ? " CLRALL" : "",
1736 (item->tostate != TPS_Null) ? " tostate " : "",
1737 (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
1738 (item->type > 0) ? " type " : "",
1739 tok_alias[item->type]);
1740 }
1741#endif
1742
1743 /* call special handler if exists */
1744 if (item->special)
1745 item->special(prs);
1746
1747 /* BINGO, token is found */
1748 if (item->flags & A_BINGO)
1749 {
1750 Assert(item->type > 0);
 /* publish the accumulated token length and reset the per-state counters */
1751 prs->lenbytetoken = prs->state->lenbytetoken;
1752 prs->lenchartoken = prs->state->lenchartoken;
1753 prs->state->lenbytetoken = prs->state->lenchartoken = 0;
1754 prs->type = item->type;
1755 }
1756
1757 /* do various actions by flags */
 /*
  * The branches below form one if/else-if chain, so at most one of them
  * runs per rule; the first set flag in the order POP, PUSH, CLEAR,
  * CLRALL, MERGE wins.
  */
1758 if (item->flags & A_POP)
1759 { /* pop stored state in stack */
1760 TParserPosition *ptr = prs->state->prev;
1761
1762 pfree(prs->state);
1763 prs->state = ptr;
1764 Assert(prs->state);
1765 }
1766 else if (item->flags & A_PUSH)
1767 { /* push (store) state in stack */
1768 prs->state->pushedAtAction = item; /* remember where we push */
1769 prs->state = newTParserPosition(prs->state);
1770 }
1771 else if (item->flags & A_CLEAR)
1772 { /* clear previous pushed state */
1773 TParserPosition *ptr;
1774
1775 Assert(prs->state->prev);
1776 ptr = prs->state->prev->prev;
1777 pfree(prs->state->prev);
1778 prs->state->prev = ptr;
1779 }
1780 else if (item->flags & A_CLRALL)
1781 { /* clear all previous pushed state */
1782 TParserPosition *ptr;
1783
1784 while (prs->state->prev)
1785 {
1786 ptr = prs->state->prev->prev;
1787 pfree(prs->state->prev);
1788 prs->state->prev = ptr;
1789 }
1790 }
1791 else if (item->flags & A_MERGE)
1792 { /* merge posinfo with current and pushed state */
1793 TParserPosition *ptr = prs->state;
1794
1795 Assert(prs->state->prev);
1796 prs->state = prs->state->prev;
1797
 /* the pushed entry keeps its own state but adopts the current position */
1798 prs->state->posbyte = ptr->posbyte;
1799 prs->state->poschar = ptr->poschar;
1800 prs->state->charlen = ptr->charlen;
1801 prs->state->lenbytetoken = ptr->lenbytetoken;
1802 prs->state->lenchartoken = ptr->lenchartoken;
1803 pfree(ptr);
1804 }
1805
1806 /* set new state if pointed */
1807 if (item->tostate != TPS_Null)
1808 prs->state->state = item->tostate;
1809
1810 /* check for go away */
 /* stop on a found token, or at end of input unless the rule wants a rerun */
1811 if ((item->flags & A_BINGO) ||
1812 (prs->state->posbyte >= prs->lenstr &&
1813 (item->flags & A_RERUN) == 0))
1814 break;
1815
1816 /* go to beginning of loop if we should rerun or we just restore state */
1817 if (item->flags & (A_RERUN | A_POP))
1818 continue;
1819
1820 /* move forward */
 /* charlen is 0 at end of input (see top of loop); nothing to consume then */
1821 if (prs->state->charlen)
1822 {
1823 prs->state->posbyte += prs->state->charlen;
1824 prs->state->lenbytetoken += prs->state->charlen;
1825 prs->state->poschar++;
1826 prs->state->lenchartoken++;
1827 }
1828 }
1829
 /* item is NULL only if the loop body never executed */
1830 return (item && (item->flags & A_BINGO));
1831}
1832
/*
 * Build and return the table of token types known to this parser.
 *
 * NOTE(review): the declarator line is missing from this listing.
 *
 * The result is a palloc'd array of LASTNUM + 1 LexDescr entries.  Entry
 * i - 1 describes token type i (1 .. LASTNUM) with freshly pstrdup'd copies
 * of tok_alias[i] and lex_descr[i]; index 0 of those tables is not used.
 * The last entry only has lexid set to 0 (presumably an end marker for the
 * caller; its alias and descr members are left unset).
 */
1833Datum
1835{
1836 LexDescr *descr = palloc_array(LexDescr, LASTNUM + 1);
1837 int i;
1838
1839 for (i = 1; i <= LASTNUM; i++)
1840 {
1841 descr[i - 1].lexid = i;
1842 descr[i - 1].alias = pstrdup(tok_alias[i]);
1843 descr[i - 1].descr = pstrdup(lex_descr[i]);
1844 }
1845
 /* final entry: only lexid is initialized */
1846 descr[LASTNUM].lexid = 0;
1847
1848 PG_RETURN_POINTER(descr);
1849}
1850
1851Datum
1856
/*
 * Fetch the next token from the parser.
 *
 * Argument 0 is the TParser; argument 1 points to a char * that receives the
 * start of the token; argument 2 points to an int that receives the token
 * length in bytes.  Returns 0 when TParserGet() finds no further token, in
 * which case the two output locations are left untouched.
 *
 * NOTE(review): the declarator line and the final return statement are
 * missing from this listing; the success path presumably returns the token
 * type.
 */
1857Datum
1859{
1860 TParser *p = (TParser *) PG_GETARG_POINTER(0);
1861 char **t = (char **) PG_GETARG_POINTER(1);
1862 int *tlen = (int *) PG_GETARG_POINTER(2);
1863
1864 if (!TParserGet(p))
1865 PG_RETURN_INT32(0);
1866
 /* the token is reported as a pointer into the parser's string, not a copy */
1867 *t = p->token;
1868 *tlen = p->lenbytetoken;
1869
1871}
1872
/*
 * Shut down the parser passed as argument 0 by handing it to TParserClose().
 *
 * NOTE(review): the declarator line and the return statement are missing
 * from this listing.
 */
1873Datum
1875{
1876 TParser *p = (TParser *) PG_GETARG_POINTER(0);
1877
1878 TParserClose(p);
1880}
1881
1882
1883/*
1884 * ts_headline support begins here
1885 */
1886
1887/* token type classification macros */
/* true for tag, protocol, blank and XML-entity tokens */
1888#define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
/* token types that mark_fragment flags "replace" when HighlightAll is off */
1889#define HLIDREPLACE(x) ( (x)==TAG_T )
/* token types that mark_fragment flags "skip" when HighlightAll is off */
1890#define HLIDSKIP(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* token types flagged "skip" when HighlightAll is on; currently the same set as HLIDSKIP */
1891#define XMLHLIDSKIP(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* tokens that the headline selectors do not count as words */
1892#define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )
/* token types that BADENDPOINT rejects as a headline boundary (x is expanded several times) */
1893#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
1894
1895/*
1896 * Macros useful in headline selection. These rely on availability of
1897 * "HeadlineParsedText *prs" describing some text, and "int shortword"
1898 * describing the "short word" length parameter.
1899 */
1900
1901/* Interesting words are non-repeated search terms */
/* Note: j is expanded more than once and is not parenthesized; pass a plain index. */
1902#define INTERESTINGWORD(j) \
1903 (prs->words[j].item && !prs->words[j].repeated)
1904
1905/* Don't want to end at a non-word or a short word, unless interesting */
/* "Short" means a length of at most shortword; an interesting word is never a bad endpoint. */
1906#define BADENDPOINT(j) \
1907 ((NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword) && \
1908 !INTERESTINGWORD(j))
1909
/*
 * Bookkeeping record for one candidate fragment.  mark_hl_fragments fills an
 * array of these from get_next_fragment's results, then repeatedly picks the
 * best entry that is neither chosen nor excluded.
 */
1910typedef struct
1911{
1912 /* one cover (well, really one fragment) for mark_hl_fragments */
1913 int32 startpos; /* fragment's starting word index */
1914 int32 endpos; /* ending word index (inclusive) */
1915 int32 poslen; /* number of interesting words */
1916 int32 curlen; /* total number of words */
 /* set once the fragment has been picked for the headline */
1917 bool chosen; /* chosen? */
 /* set when the fragment overlaps one that was picked */
1918 bool excluded; /* excluded? */
1919} CoverPos;
1920
/*
 * NOTE(review): one member line is missing from this listing; the code using
 * this struct also accesses a "words" member (an array of headline word
 * entries indexed from 0 to len - 1).
 */
1921typedef struct
1922{
1923 /* callback data for checkcondition_HL */
 /* number of words[] entries the callback scans */
1925 int len;
1926} hlCheck;
1927
1928
1929/*
1930 * TS_execute callback for matching a tsquery operand to headline words
1931 *
1932 * Note: it's tempting to report words[] indexes as pos values to save
1933 * searching in hlCover; but that would screw up phrase matching, which
1934 * expects to measure distances in lexemes not tokens.
1935 */
/*
 * NOTE(review): the declarator line is missing from this listing; the body
 * uses the parameters opaque (an hlCheck *), val (the query operand being
 * looked for) and data (optional position output, may be NULL).
 *
 * Returns TS_YES if some word in the checked range refers to val, else
 * TS_NO.  When data is non-NULL, the positions of the matching words are
 * collected into data->pos.
 */
1936static TSTernaryValue
1938{
1939 hlCheck *checkval = (hlCheck *) opaque;
1940 int i;
1941
1942 /* scan words array for matching items */
1943 for (i = 0; i < checkval->len; i++)
1944 {
1945 if (checkval->words[i].item == val)
1946 {
1947 /* if data == NULL, don't need to report positions */
1948 if (!data)
1949 return TS_YES;
1950
1951 if (!data->pos)
1952 {
 /* each scanned word adds at most one position, so len entries suffice */
1953 data->pos = palloc_array(WordEntryPos, checkval->len);
1954 data->allocated = true;
1955 data->npos = 1;
1956 data->pos[0] = checkval->words[i].pos;
1957 }
1958 else if (data->pos[data->npos - 1] < checkval->words[i].pos)
1959 {
 /* append only positions greater than the last one stored */
1960 data->pos[data->npos++] = checkval->words[i].pos;
1961 }
1962 }
1963 }
1964
 /* data can be non-NULL here with npos still 0 if no word matched */
1965 if (data && data->npos > 0)
1966 return TS_YES;
1967
1968 return TS_NO;
1969}
1970
1971/*
1972 * hlCover: try to find a substring of prs' word list that satisfies query
1973 *
1974 * locations is the result of TS_execute_locations() for the query.
1975 * We use this to identify plausible subranges of the query.
1976 *
1977 * *nextpos is the lexeme position (NOT word index) to start the search
1978 * at. Caller should initialize this to zero. If successful, we'll
1979 * advance it to the next place to search at.
1980 *
1981 * On success, sets *p to first word index and *q to last word index of the
1982 * cover substring, and returns true.
1983 *
1984 * The result is a minimal cover, in the sense that both *p and *q will be
1985 * words used in the query.
1986 */
/*
 * NOTE(review): the first declarator line is missing from this listing; the
 * body uses the leading parameters prs, query and locations in addition to
 * the nextpos, p and q pointers shown below.
 */
1987static bool
1989 int *nextpos, int *p, int *q)
1990{
1991 int pos = *nextpos;
1992
1993 /* This loop repeats when our selected word-range fails the query */
1994 for (;;)
1995 {
1996 int posb,
1997 pose;
1998 ListCell *lc;
1999
2000 /*
2001 * For each AND'ed query term or phrase, find its first occurrence at
2002 * or after pos; set pose to the maximum of those positions.
2003 *
2004 * We need not consider ORs or NOTs here; see the comments for
2005 * TS_execute_locations(). Rechecking the match with TS_execute(),
2006 * below, will deal with any ensuing imprecision.
2007 */
2008 pose = -1;
2009 foreach(lc, locations)
2010 {
 /*
  * NOTE(review): the declaration of pdata is missing from this
  * listing; presumably it is the current element of "locations".
  */
2012 int first = -1;
2013
 /* the early break below assumes pdata->pos[] is in ascending order */
2014 for (int i = 0; i < pdata->npos; i++)
2015 {
2016 /* For phrase matches, use the ending lexeme */
2017 int endp = pdata->pos[i];
2018
2019 if (endp >= pos)
2020 {
2021 first = endp;
2022 break;
2023 }
2024 }
2025 if (first < 0)
2026 return false; /* no more matches for this term */
2027 if (first > pose)
2028 pose = first;
2029 }
2030
2031 if (pose < 0)
2032 return false; /* we only get here if empty list */
2033
2034 /*
2035 * Now, for each AND'ed query term or phrase, find its last occurrence
2036 * at or before pose; set posb to the minimum of those positions.
2037 *
2038 * We start posb at INT_MAX - 1 to guarantee no overflow if we compute
2039 * posb + 1 below.
2040 */
2041 posb = INT_MAX - 1;
2042 foreach(lc, locations)
2043 {
 /* NOTE(review): the declaration of pdata is missing here as well */
2045 int last = -1;
2046
2047 for (int i = pdata->npos - 1; i >= 0; i--)
2048 {
2049 /* For phrase matches, use the starting lexeme */
2050 int startp = pdata->pos[i] - pdata->width;
2051
2052 if (startp <= pose)
2053 {
2054 last = startp;
2055 break;
2056 }
2057 }
 /* if nothing starts at or before pose, last is still -1 and drags posb to -1 */
2058 if (last < posb)
2059 posb = last;
2060 }
2061
2062 /*
2063 * We could end up with posb to the left of pos, in case some phrase
2064 * match crosses pos. Try the match starting at pos anyway, since the
2065 * result of TS_execute_locations is imprecise for phrase matches OR'd
2066 * with plain matches; that is, if the query is "(A <-> B) | C" then C
2067 * could match at pos even though the phrase match would have to
2068 * extend to the left of pos.
2069 */
2070 posb = Max(posb, pos);
2071
2072 /* This test probably always succeeds, but be paranoid */
2073 if (posb <= pose)
2074 {
2075 /*
2076 * posb .. pose is now the shortest, earliest-after-pos range of
2077 * lexeme positions containing all the query terms. It will
2078 * contain all phrase matches, too, except in the corner case
2079 * described just above.
2080 *
2081 * Now convert these lexeme positions to indexes in prs->words[].
2082 */
2083 int idxb = -1;
2084 int idxe = -1;
2085
2086 for (int i = 0; i < prs->curwords; i++)
2087 {
 /* only words attached to a query operand take part in the mapping */
2088 if (prs->words[i].item == NULL)
2089 continue;
 /* idxb: first such word at or after posb; idxe: last one at or before pose */
2090 if (idxb < 0 && prs->words[i].pos >= posb)
2091 idxb = i;
2092 if (prs->words[i].pos <= pose)
2093 idxe = i;
2094 else
2095 break;
2096 }
2097
2098 /* This test probably always succeeds, but be paranoid */
2099 if (idxb >= 0 && idxe >= idxb)
2100 {
2101 /*
2102 * Finally, check that the selected range satisfies the query.
2103 * This should succeed in all simple cases; but odd cases
2104 * involving non-top-level NOT conditions or phrase matches
2105 * OR'd with other things could fail, since the result of
2106 * TS_execute_locations doesn't fully represent such things.
2107 */
2108 hlCheck ch;
2109
 /* the recheck sees only the candidate word range idxb .. idxe */
2110 ch.words = &(prs->words[idxb]);
2111 ch.len = idxe - idxb + 1;
 /*
  * NOTE(review): the remaining TS_execute() arguments are on a
  * line that is missing from this listing.
  */
2112 if (TS_execute(GETQUERY(query), &ch,
2114 {
2115 /* Match! Advance *nextpos and return the word range. */
2116 *nextpos = posb + 1;
2117 *p = idxb;
2118 *q = idxe;
2119 return true;
2120 }
2121 }
2122 }
2123
2124 /*
2125 * Advance pos and try again. Any later workable match must start
2126 * beyond posb.
2127 */
2128 pos = posb + 1;
2129 }
2130 /* Can't get here, but stupider compilers complain if we leave it off */
2131 return false;
2132}
2133
2134/*
2135 * Apply suitable highlight marking to words selected by headline selector
2136 *
2137 * The words from startpos to endpos inclusive are marked per highlightall
2138 */
/*
 * NOTE(review): the first declarator line is missing from this listing; the
 * body uses the leading parameters prs and highlightall in addition to
 * startpos and endpos.
 */
2139static void
2141 int startpos, int endpos)
2142{
2143 int i;
2144
2145 for (i = startpos; i <= endpos; i++)
2146 {
 /* words attached to a query operand are flagged as selected */
2147 if (prs->words[i].item)
2148 prs->words[i].selected = 1;
 /* which token types get the replace/skip flags depends on highlightall */
2149 if (!highlightall)
2150 {
2151 if (HLIDREPLACE(prs->words[i].type))
2152 prs->words[i].replace = 1;
2153 else if (HLIDSKIP(prs->words[i].type))
2154 prs->words[i].skip = 1;
2155 }
2156 else
2157 {
2158 if (XMLHLIDSKIP(prs->words[i].type))
2159 prs->words[i].skip = 1;
2160 }
2161
 /* every word in the range is marked "in", except those flagged repeated */
2162 prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
2163 }
2164}
2165
2166/*
2167 * split a cover substring into fragments not longer than max_words
2168 *
2169 * At entry, *startpos and *endpos are the (remaining) bounds of the cover
2170 * substring. They are updated to hold the bounds of the next fragment.
2171 *
2172 * *curlen and *poslen are set to the fragment's length, in words and
2173 * interesting words respectively.
2174 */
/*
 * NOTE(review): the first declarator line is missing from this listing.  The
 * caller passes (prs, &startpos, &endpos, &curlen, &poslen, max_words), and
 * the body dereferences startpos and endpos as int pointers.
 */
2175static void
2177 int *curlen, int *poslen, int max_words)
2178{
2179 int i;
2180
2181 /*
2182 * Objective: select a fragment of words between startpos and endpos such
2183 * that it has at most max_words and both ends have query words. If the
2184 * startpos and endpos are the endpoints of the cover and the cover has
2185 * fewer words than max_words, then this function should just return the
2186 * cover.
2187 */
2188 /* first move startpos to an item */
 /* if the range holds no interesting word, *startpos ends up at *endpos */
2189 for (i = *startpos; i <= *endpos; i++)
2190 {
2191 *startpos = i;
2192 if (INTERESTINGWORD(i))
2193 break;
2194 }
2195 /* cut endpos to have only max_words */
2196 *curlen = 0;
2197 *poslen = 0;
2198 for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
2199 {
2200 if (!NONWORDTOKEN(prs->words[i].type))
2201 *curlen += 1;
2202 if (INTERESTINGWORD(i))
2203 *poslen += 1;
2204 }
 /* i is now the index of the first word that was not examined above */
2205 /* if the cover was cut then move back endpos to a query item */
2206 if (*endpos > i)
2207 {
2208 *endpos = i;
 /* walk left until an interesting word is found, uncounting words passed */
2209 for (i = *endpos; i >= *startpos; i--)
2210 {
2211 *endpos = i;
2212 if (INTERESTINGWORD(i))
2213 break;
2214 if (!NONWORDTOKEN(prs->words[i].type))
2215 *curlen -= 1;
2216 }
2217 }
2218}
2219
2220/*
2221 * Headline selector used when MaxFragments > 0
2222 *
2223 * Note: in this mode, highlightall is disregarded for phrase selection;
2224 * it only controls presentation details.
2225 */
/*
 * NOTE(review): the first declarator line and several statements are missing
 * from this listing (each gap is pointed out below).  The body uses the
 * leading parameters prs, query and locations in addition to those shown.
 *
 * Outline: collect every cover reported by hlCover, split each into
 * fragments of at most max_words words, then up to max_fragments times pick
 * the remaining fragment with the most interesting words (ties go to the
 * one with fewer words), stretch it towards max_words, and exclude the
 * fragments that overlap it.
 */
2226static void
2228 bool highlightall,
2229 int shortword, int min_words,
2230 int max_words, int max_fragments)
2231{
2232 int32 poslen,
2233 curlen,
2234 i,
2235 f,
2236 num_f = 0;
2237 int32 stretch,
2238 maxstretch,
2239 posmarker;
2240
2241 int32 startpos = 0,
2242 endpos = 0,
2243 nextpos = 0,
2244 p = 0,
2245 q = 0;
2246
2247 int32 numcovers = 0,
2248 maxcovers = 32;
2249
2250 int32 minI,
2251 minwords,
2252 maxitems;
 /* NOTE(review): the declaration of "covers" is missing from this listing */
2254
2255 covers = palloc(maxcovers * sizeof(CoverPos));
2256
2257 /* get all covers */
2258 while (hlCover(prs, query, locations, &nextpos, &p, &q))
2259 {
2260 startpos = p;
2261 endpos = q;
2262
2263 /*
2264 * Break the cover into smaller fragments such that each fragment has
2265 * at most max_words. Also ensure that each end of each fragment is a
2266 * query word. This will allow us to stretch the fragment in either
2267 * direction
2268 */
2269
2270 while (startpos <= endpos)
2271 {
2272 get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
 /* double the array when it is full */
2273 if (numcovers >= maxcovers)
2274 {
2275 maxcovers *= 2;
2276 covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
2277 }
2278 covers[numcovers].startpos = startpos;
2279 covers[numcovers].endpos = endpos;
2280 covers[numcovers].curlen = curlen;
2281 covers[numcovers].poslen = poslen;
2282 covers[numcovers].chosen = false;
2283 covers[numcovers].excluded = false;
2284 numcovers++;
 /* continue with whatever part of the cover lies beyond this fragment */
2285 startpos = endpos + 1;
2286 endpos = q;
2287 }
2288 }
2289
2290 /* choose best covers */
2291 for (f = 0; f < max_fragments; f++)
2292 {
2293 maxitems = 0;
 /*
  * NOTE(review): the statement initializing minwords is missing from
  * this listing.
  */
2295 minI = -1;
2296
2297 /*
2298 * Choose the cover that contains max items. In case of tie choose the
2299 * one with smaller number of words.
2300 */
2301 for (i = 0; i < numcovers; i++)
2302 {
2303 if (!covers[i].chosen && !covers[i].excluded &&
2304 (maxitems < covers[i].poslen ||
2305 (maxitems == covers[i].poslen &&
2306 minwords > covers[i].curlen)))
2307 {
2308 maxitems = covers[i].poslen;
2309 minwords = covers[i].curlen;
2310 minI = i;
2311 }
2312 }
2313 /* if a cover was found mark it */
2314 if (minI >= 0)
2315 {
2316 covers[minI].chosen = true;
2317 /* adjust the size of cover */
2318 startpos = covers[minI].startpos;
2319 endpos = covers[minI].endpos;
2320 curlen = covers[minI].curlen;
2321 /* stretch the cover if cover size is lower than max_words */
2322 if (curlen < max_words)
2323 {
2324 /* divide the stretch on both sides of cover */
2325 maxstretch = (max_words - curlen) / 2;
2326
2327 /*
2328 * First stretch startpos.  Stop stretching when (1) we hit the
2329 * beginning of the document, (2) we exceed maxstretch, or (3) we
2330 * hit an already marked fragment.
2331 */
2332 stretch = 0;
 /*
  * NOTE(review): the statement initializing posmarker for the
  * leftward stretch is missing from this listing.
  */
2334 for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
2335 {
2336 if (!NONWORDTOKEN(prs->words[i].type))
2337 {
2338 curlen++;
2339 stretch++;
2340 }
2341 posmarker = i;
2342 }
2343 /* cut back startpos till we find a good endpoint */
2344 for (i = posmarker; i < startpos && BADENDPOINT(i); i++)
2345 {
2346 if (!NONWORDTOKEN(prs->words[i].type))
2347 curlen--;
2348 }
2349 startpos = i;
2350 /* now stretch the endpos as much as possible */
2351 posmarker = endpos;
2352 for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
2353 {
2354 if (!NONWORDTOKEN(prs->words[i].type))
2355 curlen++;
2356 posmarker = i;
2357 }
2358 /* cut back endpos till we find a good endpoint */
2359 for (i = posmarker; i > endpos && BADENDPOINT(i); i--)
2360 {
2361 if (!NONWORDTOKEN(prs->words[i].type))
2362 curlen--;
2363 }
2364 endpos = i;
2365 }
 /* record the (possibly stretched) bounds for the overlap test below */
2366 covers[minI].startpos = startpos;
2367 covers[minI].endpos = endpos;
2368 covers[minI].curlen = curlen;
2369 /* Mark the chosen fragments (covers) */
 /*
  * NOTE(review): the statement that does the marking is missing from
  * this listing (presumably a mark_fragment() call).
  */
2371 num_f++;
2372 /* Exclude covers overlapping this one from future consideration */
2373 for (i = 0; i < numcovers; i++)
2374 {
 /* overlap: either end lies inside the range, or it encloses the range */
2375 if (i != minI &&
2376 ((covers[i].startpos >= startpos &&
2377 covers[i].startpos <= endpos) ||
2378 (covers[i].endpos >= startpos &&
2379 covers[i].endpos <= endpos) ||
2380 (covers[i].startpos < startpos &&
2381 covers[i].endpos > endpos)))
2382 covers[i].excluded = true;
2383 }
2384 }
2385 else
2386 break; /* no selectable covers remain */
2387 }
2388
2389 /* show the first min_words words if we have not marked anything */
2390 if (num_f <= 0)
2391 {
2392 startpos = curlen = 0;
2393 endpos = -1;
2394 for (i = 0; i < prs->curwords && curlen < min_words; i++)
2395 {
2396 if (!NONWORDTOKEN(prs->words[i].type))
2397 curlen++;
2398 endpos = i;
2399 }
 /*
  * NOTE(review): the statement that marks startpos .. endpos is missing
  * from this listing.
  */
2401 }
2402
2403 pfree(covers);
2404}
2405
2406/*
2407 * Headline selector used when MaxFragments == 0
2408 */
/*
 * NOTE(review): the first declarator line and several statements are missing
 * from this listing (each gap is pointed out below).  The body uses the
 * leading parameters prs, query and locations in addition to those shown.
 *
 * Outline: unless highlightall is set, build one candidate headline per
 * cover reported by hlCover and remember the best one in bestb .. beste; if
 * there is no cover at all, fall back to the first min_words words.  With
 * highlightall the range is the whole document.
 */
2409static void
2411 bool highlightall,
2412 int shortword, int min_words, int max_words)
2413{
2414 int nextpos = 0,
2415 p = 0,
2416 q = 0;
2417 int bestb = -1,
2418 beste = -1;
2419 int bestlen = -1;
2420 bool bestcover = false;
2421 int pose,
2422 posb,
2423 poslen,
2424 curlen;
2425 bool poscover;
2426 int i;
2427
2428 if (!highlightall)
2429 {
2430 /* examine all covers, select a headline using the best one */
2431 while (hlCover(prs, query, locations, &nextpos, &p, &q))
2432 {
2433 /*
2434 * Count words (curlen) and interesting words (poslen) within
2435 * cover, but stop once we reach max_words. This step doesn't
2436 * consider whether that's a good stopping point. posb and pose
2437 * are set to the start and end indexes of the possible headline.
2438 */
2439 curlen = 0;
2440 poslen = 0;
2441 posb = pose = p;
2442 for (i = p; i <= q && curlen < max_words; i++)
2443 {
2444 if (!NONWORDTOKEN(prs->words[i].type))
2445 curlen++;
2446 if (INTERESTINGWORD(i))
2447 poslen++;
2448 pose = i;
2449 }
2450
2451 if (curlen < max_words)
2452 {
2453 /*
2454 * We have room to lengthen the headline, so search forward
2455 * until it's full or we find a good stopping point. We'll
2456 * reconsider the word at "q", then move forward.
2457 */
2458 for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
2459 {
 /* words up to q were already counted by the loop above */
2460 if (i > q)
2461 {
2462 if (!NONWORDTOKEN(prs->words[i].type))
2463 curlen++;
2464 if (INTERESTINGWORD(i))
2465 poslen++;
2466 }
2467 pose = i;
2468 if (BADENDPOINT(i))
2469 continue;
2470 if (curlen >= min_words)
2471 break;
2472 }
2473 if (curlen < min_words)
2474 {
2475 /*
2476 * Reached end of text and our headline is still shorter
2477 * than min_words, so try to extend it to the left.
2478 */
2479 for (i = p - 1; i >= 0; i--)
2480 {
2481 if (!NONWORDTOKEN(prs->words[i].type))
2482 curlen++;
2483 if (INTERESTINGWORD(i))
2484 poslen++;
2485 if (curlen >= max_words)
2486 break;
2487 if (BADENDPOINT(i))
2488 continue;
2489 if (curlen >= min_words)
2490 break;
2491 }
 /* i is -1 if the scan ran off the start of the document */
2492 posb = (i >= 0) ? i : 0;
2493 }
2494 }
2495 else
2496 {
2497 /*
2498 * Can't make headline longer, so consider making it shorter
2499 * if needed to avoid a bad endpoint.
2500 */
2501 if (i > q)
2502 i = q;
2503 for (; curlen > min_words; i--)
2504 {
2505 if (!BADENDPOINT(i))
2506 break;
2507 if (!NONWORDTOKEN(prs->words[i].type))
2508 curlen--;
2509 if (INTERESTINGWORD(i))
2510 poslen--;
2511 pose = i - 1;
2512 }
2513 }
2514
2515 /*
2516 * Check whether the proposed headline includes the original
2517 * cover; it might not if we trimmed it due to max_words.
2518 */
 /*
  * NOTE(review): the statement that sets poscover is missing from
  * this listing.
  */
2520
2521 /*
2522 * Adopt this headline if it's better than the last one, giving
2523 * highest priority to headlines including the cover, then to
2524 * headlines with more interesting words, then to headlines with
2525 * good stopping points. (Since bestlen is initially -1, we will
2526 * certainly adopt the first headline.)
2527 */
 /*
  * NOTE(review): the last line of this condition and one assignment
  * in the block below are missing from this listing.
  */
2528 if (poscover > bestcover ||
2529 (poscover == bestcover && poslen > bestlen) ||
2530 (poscover == bestcover && poslen == bestlen &&
2532 {
2533 bestb = posb;
2534 beste = pose;
2535 bestlen = poslen;
2537 }
2538 }
2539
2540 /*
2541 * If we found nothing acceptable, select min_words words starting at
2542 * the beginning.
2543 */
2544 if (bestlen < 0)
2545 {
2546 curlen = 0;
2547 pose = -1;
2548 for (i = 0; i < prs->curwords && curlen < min_words; i++)
2549 {
2550 if (!NONWORDTOKEN(prs->words[i].type))
2551 curlen++;
2552 pose = i;
2553 }
 /* pose stays -1 (an empty range) if the document has no words at all */
2554 bestb = 0;
2555 beste = pose;
2556 }
2557 }
2558 else
2559 {
2560 /* highlightall mode: headline is whole document */
2561 bestb = 0;
2562 beste = prs->curwords - 1;
2563 }
2564
 /*
  * NOTE(review): the final statement, which presumably marks the range
  * bestb .. beste, is missing from this listing.
  */
2566}
2567
2568/*
2569 * Default parser's prsheadline function
2570 */
/*
 * NOTE(review): the declarator line, the declaration of prs, the bodies of
 * the four numeric-option branches, the errcode() lines of the ereport()
 * calls and the calls that compute "locations" and run the headline selector
 * are all missing from this listing.
 *
 * Argument 1 is the list of options (DefElem nodes) and argument 2 is the
 * tsquery.  Option names are compared case-insensitively.  The function
 * returns prs with startsel, stopsel, fragdelim and their lengths filled in.
 */
2571Datum
2573{
2575 List *prsoptions = (List *) PG_GETARG_POINTER(1);
2576 TSQuery query = PG_GETARG_TSQUERY(2);
2577 List *locations;
2578
2579 /* default option values: */
2580 int min_words = 15;
2581 int max_words = 35;
2582 int shortword = 3;
2583 int max_fragments = 0;
2584 bool highlightall = false;
2585 ListCell *l;
2586
2587 /* Extract configuration option values */
 /* the string options start as NULL so defaults can be applied at the end */
2588 prs->startsel = NULL;
2589 prs->stopsel = NULL;
2590 prs->fragdelim = NULL;
2591 foreach(l, prsoptions)
2592 {
2593 DefElem *defel = (DefElem *) lfirst(l);
2594 char *val = defGetString(defel);
2595
 /*
  * HighlightAll is true only for "1", "on", "true", "t", "y" or "yes"
  * (any letter case); every other value silently means false.
  */
2596 if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
2598 else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
2600 else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
2602 else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
2604 else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
2605 prs->startsel = pstrdup(val);
2606 else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
2607 prs->stopsel = pstrdup(val);
2608 else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
2609 prs->fragdelim = pstrdup(val);
2610 else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
2611 highlightall = (pg_strcasecmp(val, "1") == 0 ||
2612 pg_strcasecmp(val, "on") == 0 ||
2613 pg_strcasecmp(val, "true") == 0 ||
2614 pg_strcasecmp(val, "t") == 0 ||
2615 pg_strcasecmp(val, "y") == 0 ||
2616 pg_strcasecmp(val, "yes") == 0);
2617 else
2618 ereport(ERROR,
2620 errmsg("unrecognized headline parameter: \"%s\"",
2621 defel->defname)));
2622 }
2623
2624 /* in HighlightAll mode these parameters are ignored */
2625 if (!highlightall)
2626 {
 /* require 0 < MinWords < MaxWords, ShortWord >= 0 and MaxFragments >= 0 */
2627 if (min_words >= max_words)
2628 ereport(ERROR,
2630 errmsg("%s must be less than %s", "MinWords", "MaxWords")));
2631 if (min_words <= 0)
2632 ereport(ERROR,
2634 errmsg("%s must be positive", "MinWords")));
2635 if (shortword < 0)
2636 ereport(ERROR,
2638 errmsg("%s must be >= 0", "ShortWord")));
2639 if (max_fragments < 0)
2640 ereport(ERROR,
2642 errmsg("%s must be >= 0", "MaxFragments")));
2643 }
2644
2645 /* Locate words and phrases matching the query */
2646 if (query->size > 0)
2647 {
2648 hlCheck ch;
2649
 /* the callback data covers the entire document */
2650 ch.words = prs->words;
2651 ch.len = prs->curwords;
2654 }
2655 else
2656 locations = NIL; /* empty query matches nothing */
2657
2658 /* Apply appropriate headline selector */
2659 if (max_fragments == 0)
2662 else
2665
2666 /* Fill in default values for string options */
2667 if (!prs->startsel)
2668 prs->startsel = pstrdup("<b>");
2669 if (!prs->stopsel)
2670 prs->stopsel = pstrdup("</b>");
2671 if (!prs->fragdelim)
2672 prs->fragdelim = pstrdup(" ... ");
2673
2674 /* Caller will need these lengths, too */
2675 prs->startsellen = strlen(prs->startsel);
2676 prs->stopsellen = strlen(prs->stopsel);
2677 prs->fragdelimlen = strlen(prs->fragdelim);
2678
2679 PG_RETURN_POINTER(prs);
2680}
#define GETQUERY(x)
Definition _int.h:157
void print(const void *obj)
Definition print.c:36
#define PG_INT32_MAX
Definition c.h:603
#define Max(x, y)
Definition c.h:991
#define Assert(condition)
Definition c.h:873
int32_t int32
Definition c.h:542
uint16_t uint16
Definition c.h:545
#define lengthof(array)
Definition c.h:803
#define fprintf(file, fmt, msg)
Definition cubescan.l:21
char * defGetString(DefElem *def)
Definition define.c:34
int errcode(int sqlerrcode)
Definition elog.c:863
int errmsg(const char *fmt,...)
Definition elog.c:1080
#define ERROR
Definition elog.h:39
#define ereport(elevel,...)
Definition elog.h:150
#define palloc_object(type)
Definition fe_memutils.h:74
#define palloc_array(type, count)
Definition fe_memutils.h:76
#define palloc0_object(type)
Definition fe_memutils.h:75
#define PG_RETURN_VOID()
Definition fmgr.h:350
#define PG_GETARG_POINTER(n)
Definition fmgr.h:277
#define PG_RETURN_INT32(x)
Definition fmgr.h:355
#define PG_GETARG_INT32(n)
Definition fmgr.h:269
#define PG_RETURN_POINTER(x)
Definition fmgr.h:363
#define PG_FUNCTION_ARGS
Definition fmgr.h:193
const char * str
long val
Definition informix.c:689
int i
Definition isn.c:77
#define PG_UTF8
Definition mbprint.c:43
unsigned int pg_wchar
Definition mbprint.c:31
int GetDatabaseEncoding(void)
Definition mbutils.c:1387
int pg_dsplen(const char *mbstr)
Definition mbutils.c:1156
int pg_mblen_range(const char *mbstr, const char *end)
Definition mbutils.c:1084
int pg_database_encoding_max_length(void)
Definition mbutils.c:1672
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
Definition mbutils.c:997
char * pstrdup(const char *in)
Definition mcxt.c:1781
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:123
int32 pg_strtoint32(const char *s)
Definition numutils.c:382
Datum lower(PG_FUNCTION_ARGS)
Datum upper(PG_FUNCTION_ARGS)
const void size_t len
const void * data
#define lfirst(lc)
Definition pg_list.h:172
#define NIL
Definition pg_list.h:68
static XLogRecPtr endpos
static XLogRecPtr startpos
int pg_strcasecmp(const char *s1, const char *s2)
int pg_strncasecmp(const char *s1, const char *s2, size_t n)
uint64_t Datum
Definition postgres.h:70
char * c
static int fb(int x)
void check_stack_depth(void)
Definition stack_depth.c:95
int32 endpos
int32 curlen
int32 startpos
bool excluded
int32 poslen
WordEntryPos * pos
Definition ts_utils.h:164
HeadlineWordEntry * words
Definition ts_public.h:76
WordEntryPos pos
Definition ts_public.h:68
QueryOperand * item
Definition ts_public.h:70
char * alias
Definition ts_public.h:28
int lexid
Definition ts_public.h:27
char * descr
Definition ts_public.h:29
Definition pg_list.h:54
const TParserStateActionItem * pushedAtAction
struct TParserPosition * prev
TParserState state
TParserCharTest isclass
TParserSpecial special
const TParserStateActionItem * action
TParserState state
char * str
pg_wchar * pgwstr
char * token
int charmaxlen
bool wanthost
int lenbytetoken
bool ignore
TParserPosition * state
int lenchartoken
int32 size
Definition ts_type.h:221
HeadlineWordEntry * words
#define PG_GETARG_TSQUERY(n)
Definition ts_type.h:266
uint16 WordEntryPos
Definition ts_type.h:63
TSTernaryValue
Definition ts_utils.h:131
@ TS_NO
Definition ts_utils.h:132
@ TS_YES
Definition ts_utils.h:133
#define TS_EXEC_EMPTY
Definition ts_utils.h:186
bool TS_execute(QueryItem *curitem, void *arg, uint32 flags, TSExecuteCallback chkcond)
List * TS_execute_locations(QueryItem *curitem, void *arg, uint32 flags, TSExecuteCallback chkcond)
static const TParserStateActionItem actionTPS_InParseHyphen[]
static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[]
static const TParserStateActionItem actionTPS_InHyphenWordFirst[]
#define NONWORDTOKEN(x)
static const TParserStateActionItem actionTPS_InXMLEntityFirst[]
static const TParserStateActionItem actionTPS_InHostFirstAN[]
#define VERSIONNUMBER
Definition wparser_def.c:43
static const TParserStateActionItem actionTPS_InHyphenNumWordPart[]
#define BADENDPOINT(j)
#define ASCIIWORD
Definition wparser_def.c:36
#define PROTOCOL
Definition wparser_def.c:49
static const TParserStateActionItem actionTPS_InPathSecond[]
static const TParserStateActionItem actionTPS_InPathFirst[]
static const TParserStateActionItem actionTPS_InHostDomainSecond[]
static const TParserStateActionItem actionTPS_InCloseCommentFirst[]
static void SpecialFURL(TParser *prs)
static const TParserStateActionItem actionTPS_InCommentEnd[]
static TSTernaryValue checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
void _make_compiler_happy(void)
static const TParserStateActionItem actionTPS_InURLPathStart[]
static const TParserStateActionItem actionTPS_InHostFirstDomain[]
static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[]
static const TParserStateActionItem actionTPS_InHostDomain[]
static const TParserStateActionItem actionTPS_InVersion[]
#define XMLHLIDSKIP(x)
static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[]
Datum prsd_nexttoken(PG_FUNCTION_ARGS)
static const TParserStateActionItem actionTPS_InTagName[]
#define DECIMAL_T
Definition wparser_def.c:55
static const TParserStateActionItem actionTPS_InFileNext[]
static const TParserStateActionItem actionTPS_InXMLEntity[]
#define ASCIIPARTHWORD
Definition wparser_def.c:46
static const TParserStateActionItem actionTPS_InFURL[]
#define p_iswhat(type, nonascii)
static const TParserStateActionItem actionTPS_InMantissaSign[]
static void mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, List *locations, bool highlightall, int shortword, int min_words, int max_words, int max_fragments)
#define WORD_T
Definition wparser_def.c:37
TParserState
@ TPS_InXMLEntityHexNumFirst
@ TPS_InPort
@ TPS_InXMLEntityHexNum
@ TPS_InHostDomainSecond
@ TPS_InMantissaFirst
@ TPS_InTagName
@ TPS_InHyphenAsciiWordFirst
@ TPS_Null
@ TPS_InPathFirstFirst
@ TPS_InSignedIntFirst
@ TPS_InSignedInt
@ TPS_InUnsignedInt
@ TPS_InMantissa
@ TPS_InProtocolFirst
@ TPS_InFURL
@ TPS_InMantissaSign
@ TPS_InXMLBegin
@ TPS_InCommentEnd
@ TPS_InHyphenWordFirst
@ TPS_InHyphenNumWordPart
@ TPS_InPortFirst
@ TPS_InProtocolEnd
@ TPS_InXMLEntityFirst
@ TPS_InHyphenNumWordFirst
@ TPS_InCommentLast
@ TPS_InFileTwiddle
@ TPS_InURLPathStart
@ TPS_InURLPathFirst
@ TPS_InPathFirst
@ TPS_InPathSecond
@ TPS_InHyphenUnsignedInt
@ TPS_InFileFirst
@ TPS_InXMLEntityNumFirst
@ TPS_InHyphenWordPart
@ TPS_InNumWord
@ TPS_InAsciiWord
@ TPS_InVersion
@ TPS_InHost
@ TPS_InFile
@ TPS_InProtocolSecond
@ TPS_InCloseCommentFirst
@ TPS_InTagEscapeK
@ TPS_InParseHyphenHyphen
@ TPS_InTagBackSleshed
@ TPS_InTagFirst
@ TPS_InTagEnd
@ TPS_InComment
@ TPS_InHyphenWord
@ TPS_InHyphenAsciiWord
@ TPS_InWord
@ TPS_InXMLEntityEnd
@ TPS_InTagEscapeKK
@ TPS_InSpace
@ TPS_InFileNext
@ TPS_InURLPath
@ TPS_Base
@ TPS_InUDecimal
@ TPS_InParseHyphen
@ TPS_InHostFirstAN
@ TPS_InEmail
@ TPS_InDecimalFirst
@ TPS_InVersionFirst
@ TPS_InCloseCommentLast
@ TPS_InSVerVersion
@ TPS_InHyphenAsciiWordPart
@ TPS_InCommentFirst
@ TPS_InUDecimalFirst
@ TPS_InHostFirstDomain
@ TPS_InHostDomain
@ TPS_InHyphenDigitLookahead
@ TPS_InVerVersion
@ TPS_InXMLEntityNum
@ TPS_InTag
@ TPS_InDecimal
@ TPS_InTagCloseFirst
@ TPS_InXMLEntity
@ TPS_InHyphenNumWord
@ TPS_InTagBeginEnd
static void mark_fragment(HeadlineParsedText *prs, bool highlightall, int startpos, int endpos)
static const TParserStateActionItem actionTPS_InXMLEntityEnd[]
static const TParserStateActionItem actionTPS_InHyphenNumWord[]
static const TParserStateActionItem actionTPS_InDecimal[]
#define A_POP
static const TParserStateActionItem actionTPS_InSignedIntFirst[]
static const TParserStateActionItem actionTPS_InTagEscapeK[]
static const TParserStateActionItem actionTPS_InSpace[]
static const TParserStateActionItem actionTPS_InFile[]
static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[]
#define LASTNUM
Definition wparser_def.c:60
static int p_iseqC(TParser *prs)
Datum prsd_headline(PG_FUNCTION_ARGS)
#define NUMHWORD
Definition wparser_def.c:50
static bool hlCover(HeadlineParsedText *prs, TSQuery query, List *locations, int *nextpos, int *p, int *q)
#define SPACE
Definition wparser_def.c:47
static const TParserStateActionItem actionTPS_InUDecimal[]
int(* TParserCharTest)(struct TParser *)
static const TParserStateActionItem actionTPS_InSignedInt[]
static int p_isurlchar(TParser *prs)
static const TParserStateActionItem actionTPS_InTagBeginEnd[]
static const TParserStateActionItem actionTPS_InTagFirst[]
#define NUMWORD
Definition wparser_def.c:38
#define FILEPATH
Definition wparser_def.c:54
static const TParserStateActionItem actionTPS_InTagEscapeKK[]
static int p_isneC(TParser *prs)
#define EMAIL
Definition wparser_def.c:39
static const TParserStateActionItem actionTPS_InCommentLast[]
static TParserPosition * newTParserPosition(TParserPosition *prev)
static const TParserStateActionItem actionTPS_InHyphenWordPart[]
static const TParserStateActionItem actionTPS_InMantissaFirst[]
static const TParserStateActionItem actionTPS_Base[]
static void SpecialHyphen(TParser *prs)
static void mark_hl_words(HeadlineParsedText *prs, TSQuery query, List *locations, bool highlightall, int shortword, int min_words, int max_words)
static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[]
#define UNSIGNEDINT
Definition wparser_def.c:57
void(* TParserSpecial)(struct TParser *)
static const TParserStateActionItem actionTPS_InEmail[]
static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[]
static const TParserStateActionItem actionTPS_InURLPath[]
#define A_RERUN
static const TParserStateActionItem actionTPS_InSVerVersion[]
static const TParserStateActionItem actionTPS_InAsciiWord[]
static const char *const tok_alias[]
Definition wparser_def.c:62
static int p_isstophost(TParser *prs)
#define HLIDSKIP(x)
static void get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos, int *curlen, int *poslen, int max_words)
static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[]
#define SIGNEDINT
Definition wparser_def.c:56
static int p_isasclet(TParser *prs)
static const TParserStateAction Actions[]
static const TParserStateActionItem actionTPS_InXMLBegin[]
#define PARTHWORD
Definition wparser_def.c:45
#define HLIDREPLACE(x)
#define A_MERGE
static TParser * TParserInit(char *str, int len)
static const TParserStateActionItem actionTPS_InMantissa[]
static const TParserStateActionItem actionTPS_InVersionFirst[]
static int p_isascii(TParser *prs)
static const TParserStateActionItem actionTPS_InCommentFirst[]
static const TParserStateActionItem actionTPS_InHyphenWord[]
static int p_isignore(TParser *prs)
static const TParserStateActionItem actionTPS_InParseHyphenHyphen[]
static const TParserStateActionItem actionTPS_InPort[]
#define TAG_T
Definition wparser_def.c:48
static const TParserStateActionItem actionTPS_InDecimalFirst[]
#define URLPATH
Definition wparser_def.c:53
Datum prsd_lextype(PG_FUNCTION_ARGS)
#define ASCIIHWORD
Definition wparser_def.c:51
#define HOST
Definition wparser_def.c:41
static const TParserStateActionItem actionTPS_InTag[]
Datum prsd_start(PG_FUNCTION_ARGS)
#define A_BINGO
#define TPARSERSTATEACTION(state)
static bool TParserGet(TParser *prs)
#define XMLENTITY
Definition wparser_def.c:58
static int p_ishost(TParser *prs)
Datum prsd_end(PG_FUNCTION_ARGS)
#define A_CLRALL
static int p_isURLPath(TParser *prs)
static void SpecialVerVersion(TParser *prs)
static const TParserStateActionItem actionTPS_InProtocolFirst[]
static const TParserStateActionItem actionTPS_InUnsignedInt[]
static const TParserStateActionItem actionTPS_InUDecimalFirst[]
static const TParserStateActionItem actionTPS_InTagCloseFirst[]
static int p_isEOF(TParser *prs)
static const TParserStateActionItem actionTPS_InCloseCommentLast[]
static void TParserCopyClose(TParser *prs)
#define A_CLEAR
static const TParserStateActionItem actionTPS_InFileFirst[]
static const TParserStateActionItem actionTPS_InNumWord[]
static const TParserStateActionItem actionTPS_InFileTwiddle[]
static TParser * TParserCopyInit(const TParser *orig)
static const TParserStateActionItem actionTPS_InHost[]
#define A_PUSH
static const TParserStateActionItem actionTPS_InTagBackSleshed[]
static const TParserStateActionItem actionTPS_InProtocolSecond[]
static const TParserStateActionItem actionTPS_InWord[]
static int p_isspecial(TParser *prs)
static void TParserClose(TParser *prs)
#define URL_T
Definition wparser_def.c:40
static const TParserStateActionItem actionTPS_InXMLEntityNum[]
static const TParserStateActionItem actionTPS_InVerVersion[]
static const TParserStateActionItem actionTPS_InHyphenAsciiWord[]
static const TParserStateActionItem actionTPS_InXMLEntityHexNum[]
#define A_NEXT
static const TParserStateActionItem actionTPS_InPortFirst[]
#define HWORD
Definition wparser_def.c:52
#define NUMPARTHWORD
Definition wparser_def.c:44
static const char *const lex_descr[]
Definition wparser_def.c:89
#define INTERESTINGWORD(j)
#define SCIENTIFIC
Definition wparser_def.c:42
static void SpecialTags(TParser *prs)
static const TParserStateActionItem actionTPS_InTagEnd[]
static const TParserStateActionItem actionTPS_InComment[]
static const TParserStateActionItem actionTPS_InProtocolEnd[]
static const TParserStateActionItem actionTPS_InURLPathFirst[]
static const TParserStateActionItem actionTPS_InPathFirstFirst[]