PostgreSQL Source Code git master
wparser_def.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * wparser_def.c
4 * Default text search parser
5 *
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/tsearch/wparser_def.c
11 *
12 *-------------------------------------------------------------------------
13 */
14
15#include "postgres.h"
16
17#include <limits.h>
18#include <wctype.h>
19
20#include "commands/defrem.h"
21#include "mb/pg_wchar.h"
22#include "miscadmin.h"
23#include "tsearch/ts_public.h"
24#include "tsearch/ts_type.h"
25#include "tsearch/ts_utils.h"
26#include "utils/builtins.h"
27#include "utils/pg_locale.h"
28
29
30/* Define me to enable tracing of parser behavior */
31/* #define WPARSER_TRACE */
32
33
/*
 * Output token categories.
 *
 * Numbering starts at 1 and runs up to LASTNUM.  Each value is also the
 * position of the category's entry in tok_alias[] and lex_descr[].
 */

#define ASCIIWORD		1
#define WORD_T			2
#define NUMWORD			3
#define EMAIL			4
#define URL_T			5
#define HOST			6
#define SCIENTIFIC		7
#define VERSIONNUMBER	8
#define NUMPARTHWORD	9
#define PARTHWORD		10
#define ASCIIPARTHWORD	11
#define SPACE			12
#define TAG_T			13
#define PROTOCOL		14
#define NUMHWORD		15
#define ASCIIHWORD		16
#define HWORD			17
#define URLPATH			18
#define FILEPATH		19
#define DECIMAL_T		20
#define SIGNEDINT		21
#define UNSIGNEDINT		22
#define XMLENTITY		23

/* highest category number in use */
#define LASTNUM			23
61
/*
 * Short alias of each token category, indexed by category number
 * (ASCIIWORD .. LASTNUM).  Entry 0 is an empty placeholder because the
 * category numbering starts at 1.
 */
static const char *const tok_alias[] = {
	"",
	"asciiword",
	"word",
	"numword",
	"email",
	"url",
	"host",
	"sfloat",
	"version",
	"hword_numpart",
	"hword_part",
	"hword_asciipart",
	"blank",
	"tag",
	"protocol",
	"numhword",
	"asciihword",
	"hword",
	"url_path",
	"file",
	"float",
	"int",
	"uint",
	"entity"
};
88
/*
 * Human-readable description of each token category, indexed by category
 * number (ASCIIWORD .. LASTNUM) in the same order as tok_alias[].  Entry 0
 * is an empty placeholder because the category numbering starts at 1.
 */
static const char *const lex_descr[] = {
	"",
	"Word, all ASCII",
	"Word, all letters",
	"Word, letters and digits",
	"Email address",
	"URL",
	"Host",
	"Scientific notation",
	"Version number",
	"Hyphenated word part, letters and digits",
	"Hyphenated word part, all letters",
	"Hyphenated word part, all ASCII",
	"Space symbols",
	"XML tag",
	"Protocol head",
	"Hyphenated word, letters and digits",
	"Hyphenated word, all ASCII",
	"Hyphenated word, all letters",
	"URL path",
	"File or path name",
	"Decimal notation",
	"Signed integer",
	"Unsigned integer",
	"XML entity"
};
115
116
117/* Parser states */
118
119typedef enum
120{
198 TPS_Null /* last state (fake value) */
200
/* forward declaration */
struct TParser;

/* character test callback: any p_is* functions except p_iseq */
typedef int (*TParserCharTest) (struct TParser *);

/* special handler for special cases... */
typedef void (*TParserSpecial) (struct TParser *);
208
209typedef struct
210{
212 char c;
215 int type;
218
/*
 * Flag bits in TParserStateActionItem.flags
 *
 * A_NEXT is zero, i.e. the absence of every other flag; the remaining
 * values are distinct single bits and can be OR'ed together.
 */
#define A_NEXT		0x0000
#define A_BINGO		0x0001
#define A_POP		0x0002
#define A_PUSH		0x0004
#define A_RERUN		0x0008
#define A_CLEAR		0x0010
#define A_MERGE		0x0020
#define A_CLRALL	0x0040
228
229typedef struct TParserPosition
230{
231 int posbyte; /* position of parser in bytes */
232 int poschar; /* position of parser in characters */
233 int charlen; /* length of current char */
234 int lenbytetoken; /* length of token-so-far in bytes */
235 int lenchartoken; /* and in chars */
240
241typedef struct TParser
242{
243 /* string and position information */
244 char *str; /* multibyte string */
245 int lenstr; /* length of mbstring */
246 wchar_t *wstr; /* wide character string */
247 pg_wchar *pgwstr; /* wide character string for C-locale */
249
250 /* State of parse */
253 bool ignore;
255
256 /* silly char */
257 char c;
258
259 /* out */
260 char *token;
263 int type;
265
266
267/* forward decls here */
268static bool TParserGet(TParser *prs);
269
270
271static TParserPosition *
273{
275
276 if (prev)
277 memcpy(res, prev, sizeof(TParserPosition));
278 else
279 memset(res, 0, sizeof(TParserPosition));
280
281 res->prev = prev;
282
283 res->pushedAtAction = NULL;
284
285 return res;
286}
287
288static TParser *
289TParserInit(char *str, int len)
290{
291 TParser *prs = (TParser *) palloc0(sizeof(TParser));
292
294 prs->str = str;
295 prs->lenstr = len;
296
297 /*
298 * Use wide char code only when max encoding length > 1.
299 */
300 if (prs->charmaxlen > 1)
301 {
302 pg_locale_t mylocale = 0; /* TODO */
303
304 prs->usewide = true;
306 {
307 /*
308 * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
309 * be different from sizeof(wchar_t)
310 */
311 prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
312 pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
313 }
314 else
315 {
316 prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
317 char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
318 mylocale);
319 }
320 }
321 else
322 prs->usewide = false;
323
324 prs->state = newTParserPosition(NULL);
325 prs->state->state = TPS_Base;
326
327#ifdef WPARSER_TRACE
328 fprintf(stderr, "parsing \"%.*s\"\n", len, str);
329#endif
330
331 return prs;
332}
333
334/*
335 * As an alternative to a full TParserInit one can create a
336 * TParserCopy which basically is a regular TParser without a private
337 * copy of the string - instead it uses the one from another TParser.
338 * This is useful because at some places TParsers are created
339 * recursively and the repeated copying around of the strings can
340 * cause major inefficiency if the source string is long.
341 * The new parser starts parsing at the original's current position.
342 *
343 * Obviously one must not close the original TParser before the copy.
344 */
345static TParser *
347{
348 TParser *prs = (TParser *) palloc0(sizeof(TParser));
349
350 prs->charmaxlen = orig->charmaxlen;
351 prs->str = orig->str + orig->state->posbyte;
352 prs->lenstr = orig->lenstr - orig->state->posbyte;
353 prs->usewide = orig->usewide;
354
355 if (orig->pgwstr)
356 prs->pgwstr = orig->pgwstr + orig->state->poschar;
357 if (orig->wstr)
358 prs->wstr = orig->wstr + orig->state->poschar;
359
360 prs->state = newTParserPosition(NULL);
361 prs->state->state = TPS_Base;
362
363#ifdef WPARSER_TRACE
364 fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
365#endif
366
367 return prs;
368}
369
370
371static void
373{
374 while (prs->state)
375 {
376 TParserPosition *ptr = prs->state->prev;
377
378 pfree(prs->state);
379 prs->state = ptr;
380 }
381
382 if (prs->wstr)
383 pfree(prs->wstr);
384 if (prs->pgwstr)
385 pfree(prs->pgwstr);
386
387#ifdef WPARSER_TRACE
388 fprintf(stderr, "closing parser\n");
389#endif
390 pfree(prs);
391}
392
393/*
394 * Close a parser created with TParserCopyInit
395 */
396static void
398{
399 while (prs->state)
400 {
401 TParserPosition *ptr = prs->state->prev;
402
403 pfree(prs->state);
404 prs->state = ptr;
405 }
406
407#ifdef WPARSER_TRACE
408 fprintf(stderr, "closing parser copy\n");
409#endif
410 pfree(prs);
411}
412
413
/*
 * Character-type support functions, equivalent to is* macros, but
 * working with any possible encodings and locales. Notes:
 *	- with multibyte encoding and C-locale isw* function may fail
 *	  or give wrong result.
 *	- multibyte encoding and C-locale often are used for
 *	  Asian languages.
 *	- if locale is C then we use pgwstr instead of wstr.
 *
 * p_iswhat(type, nonascii) defines two functions, p_is<type>() and
 * p_isnot<type>().  p_is<type>() classifies the character at the parser's
 * current position:
 *	- wide mode with pgwstr set: a character above 0x7f yields "nonascii",
 *	  any other character is passed to is<type>();
 *	- wide mode without pgwstr: isw<type>() is applied to the wstr character;
 *	- otherwise is<type>() is applied to the current byte of str, read as
 *	  unsigned char.
 * p_isnot<type>() is the logical negation of p_is<type>().
 */

#define p_iswhat(type, nonascii)											\
																			\
static int																	\
p_is##type(TParser *prs)													\
{																			\
	Assert(prs->state);														\
	if (prs->usewide)														\
	{																		\
		if (prs->pgwstr)													\
		{																	\
			unsigned int c = *(prs->pgwstr + prs->state->poschar);			\
			if (c > 0x7f)													\
				return nonascii;											\
			return is##type(c);												\
		}																	\
		return isw##type(*(prs->wstr + prs->state->poschar));				\
	}																		\
	return is##type(*(unsigned char *) (prs->str + prs->state->posbyte));	\
}																			\
																			\
static int																	\
p_isnot##type(TParser *prs)													\
{																			\
	return !p_is##type(prs);												\
}
449
450/*
451 * In C locale with a multibyte encoding, any non-ASCII symbol is considered
452 * an alpha character, but not a member of other char classes.
453 */
454p_iswhat(alnum, 1)
455p_iswhat(alpha, 1)
456p_iswhat(digit, 0)
457p_iswhat(lower, 0)
458p_iswhat(print, 0)
459p_iswhat(punct, 0)
460p_iswhat(space, 0)
461p_iswhat(upper, 0)
462p_iswhat(xdigit, 0)
463
464/* p_iseq should be used only for ascii symbols */
465
466static int
467p_iseq(TParser *prs, char c)
468{
469 Assert(prs->state);
470 return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
471}
472
473static int
475{
476 Assert(prs->state);
477 return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
478}
479
480static int
482{
483 return p_iseq(prs, prs->c);
484}
485
486static int
488{
489 return !p_iseq(prs, prs->c);
490}
491
492static int
494{
495 return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
496}
497
498static int
500{
501 return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
502}
503
504static int
506{
507 char ch;
508
509 /* no non-ASCII need apply */
510 if (prs->state->charlen != 1)
511 return 0;
512 ch = *(prs->str + prs->state->posbyte);
513 /* no spaces or control characters */
514 if (ch <= 0x20 || ch >= 0x7F)
515 return 0;
516 /* reject characters disallowed by RFC 3986 */
517 switch (ch)
518 {
519 case '"':
520 case '<':
521 case '>':
522 case '\\':
523 case '^':
524 case '`':
525 case '{':
526 case '|':
527 case '}':
528 return 0;
529 }
530 return 1;
531}
532
533
/*
 * deliberately suppress unused-function complaints for the above
 *
 * The body references every p_is<type>/p_isnot<type> function generated by
 * p_iswhat, plus p_isEOF, p_iseqC and p_isneC.
 *
 * NOTE(review): every call passes NULL, so this function presumably must
 * never actually be invoked -- confirm that no caller exists.
 */
void		_make_compiler_happy(void);
void
_make_compiler_happy(void)
{
	p_isalnum(NULL);
	p_isnotalnum(NULL);
	p_isalpha(NULL);
	p_isnotalpha(NULL);
	p_isdigit(NULL);
	p_isnotdigit(NULL);
	p_islower(NULL);
	p_isnotlower(NULL);
	p_isprint(NULL);
	p_isnotprint(NULL);
	p_ispunct(NULL);
	p_isnotpunct(NULL);
	p_isspace(NULL);
	p_isnotspace(NULL);
	p_isupper(NULL);
	p_isnotupper(NULL);
	p_isxdigit(NULL);
	p_isnotxdigit(NULL);
	p_isEOF(NULL);
	p_iseqC(NULL);
	p_isneC(NULL);
}
561
562
563static void
565{
566 switch (prs->state->lenchartoken)
567 {
568 case 8: /* </script */
569 if (pg_strncasecmp(prs->token, "</script", 8) == 0)
570 prs->ignore = false;
571 break;
572 case 7: /* <script || </style */
573 if (pg_strncasecmp(prs->token, "</style", 7) == 0)
574 prs->ignore = false;
575 else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
576 prs->ignore = true;
577 break;
578 case 6: /* <style */
579 if (pg_strncasecmp(prs->token, "<style", 6) == 0)
580 prs->ignore = true;
581 break;
582 default:
583 break;
584 }
585}
586
587static void
589{
590 prs->wanthost = true;
591 prs->state->posbyte -= prs->state->lenbytetoken;
592 prs->state->poschar -= prs->state->lenchartoken;
593}
594
595static void
597{
598 prs->state->posbyte -= prs->state->lenbytetoken;
599 prs->state->poschar -= prs->state->lenchartoken;
600}
601
602static void
604{
605 prs->state->posbyte -= prs->state->lenbytetoken;
606 prs->state->poschar -= prs->state->lenchartoken;
607 prs->state->lenbytetoken = 0;
608 prs->state->lenchartoken = 0;
609}
610
611static int
613{
614 if (prs->wanthost)
615 {
616 prs->wanthost = false;
617 return 1;
618 }
619 return 0;
620}
621
622static int
624{
625 return (prs->ignore) ? 1 : 0;
626}
627
628static int
630{
631 TParser *tmpprs = TParserCopyInit(prs);
632 int res = 0;
633
634 tmpprs->wanthost = true;
635
636 /*
637 * Check stack depth before recursing. (Since TParserGet() doesn't
638 * normally recurse, we put the cost of checking here not there.)
639 */
641
642 if (TParserGet(tmpprs) && tmpprs->type == HOST)
643 {
644 prs->state->posbyte += tmpprs->lenbytetoken;
645 prs->state->poschar += tmpprs->lenchartoken;
646 prs->state->lenbytetoken += tmpprs->lenbytetoken;
647 prs->state->lenchartoken += tmpprs->lenchartoken;
648 prs->state->charlen = tmpprs->state->charlen;
649 res = 1;
650 }
651 TParserCopyClose(tmpprs);
652
653 return res;
654}
655
656static int
658{
659 TParser *tmpprs = TParserCopyInit(prs);
660 int res = 0;
661
662 tmpprs->state = newTParserPosition(tmpprs->state);
663 tmpprs->state->state = TPS_InURLPathFirst;
664
665 /*
666 * Check stack depth before recursing. (Since TParserGet() doesn't
667 * normally recurse, we put the cost of checking here not there.)
668 */
670
671 if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
672 {
673 prs->state->posbyte += tmpprs->lenbytetoken;
674 prs->state->poschar += tmpprs->lenchartoken;
675 prs->state->lenbytetoken += tmpprs->lenbytetoken;
676 prs->state->lenchartoken += tmpprs->lenchartoken;
677 prs->state->charlen = tmpprs->state->charlen;
678 res = 1;
679 }
680 TParserCopyClose(tmpprs);
681
682 return res;
683}
684
/*
 * Returns true if the current character has zero display length or
 * is a special sign used in several languages.  Such characters
 * are not word breakers even though they are not alphabetic.
 * At the beginning of a word they are not part of it.
 */
691static int
693{
694 /*
695 * pg_dsplen could return -1 which means error or control character
696 */
697 if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
698 return 1;
699
	/*
	 * Unicode characters in the 'Mark, Spacing Combining' category: these
	 * characters are not alphabetic, but they do not break words either.
	 * Check this only in UTF-8 encoding, because other encodings either
	 * aren't supported by postgres or don't even exist.
	 */
706 if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
707 {
708 static const pg_wchar strange_letter[] = {
709 /*
710 * use binary search, so elements should be ordered
711 */
712 0x0903, /* DEVANAGARI SIGN VISARGA */
713 0x093E, /* DEVANAGARI VOWEL SIGN AA */
714 0x093F, /* DEVANAGARI VOWEL SIGN I */
715 0x0940, /* DEVANAGARI VOWEL SIGN II */
716 0x0949, /* DEVANAGARI VOWEL SIGN CANDRA O */
717 0x094A, /* DEVANAGARI VOWEL SIGN SHORT O */
718 0x094B, /* DEVANAGARI VOWEL SIGN O */
719 0x094C, /* DEVANAGARI VOWEL SIGN AU */
720 0x0982, /* BENGALI SIGN ANUSVARA */
721 0x0983, /* BENGALI SIGN VISARGA */
722 0x09BE, /* BENGALI VOWEL SIGN AA */
723 0x09BF, /* BENGALI VOWEL SIGN I */
724 0x09C0, /* BENGALI VOWEL SIGN II */
725 0x09C7, /* BENGALI VOWEL SIGN E */
726 0x09C8, /* BENGALI VOWEL SIGN AI */
727 0x09CB, /* BENGALI VOWEL SIGN O */
728 0x09CC, /* BENGALI VOWEL SIGN AU */
729 0x09D7, /* BENGALI AU LENGTH MARK */
730 0x0A03, /* GURMUKHI SIGN VISARGA */
731 0x0A3E, /* GURMUKHI VOWEL SIGN AA */
732 0x0A3F, /* GURMUKHI VOWEL SIGN I */
733 0x0A40, /* GURMUKHI VOWEL SIGN II */
734 0x0A83, /* GUJARATI SIGN VISARGA */
735 0x0ABE, /* GUJARATI VOWEL SIGN AA */
736 0x0ABF, /* GUJARATI VOWEL SIGN I */
737 0x0AC0, /* GUJARATI VOWEL SIGN II */
738 0x0AC9, /* GUJARATI VOWEL SIGN CANDRA O */
739 0x0ACB, /* GUJARATI VOWEL SIGN O */
740 0x0ACC, /* GUJARATI VOWEL SIGN AU */
741 0x0B02, /* ORIYA SIGN ANUSVARA */
742 0x0B03, /* ORIYA SIGN VISARGA */
743 0x0B3E, /* ORIYA VOWEL SIGN AA */
744 0x0B40, /* ORIYA VOWEL SIGN II */
745 0x0B47, /* ORIYA VOWEL SIGN E */
746 0x0B48, /* ORIYA VOWEL SIGN AI */
747 0x0B4B, /* ORIYA VOWEL SIGN O */
748 0x0B4C, /* ORIYA VOWEL SIGN AU */
749 0x0B57, /* ORIYA AU LENGTH MARK */
750 0x0BBE, /* TAMIL VOWEL SIGN AA */
751 0x0BBF, /* TAMIL VOWEL SIGN I */
752 0x0BC1, /* TAMIL VOWEL SIGN U */
753 0x0BC2, /* TAMIL VOWEL SIGN UU */
754 0x0BC6, /* TAMIL VOWEL SIGN E */
755 0x0BC7, /* TAMIL VOWEL SIGN EE */
756 0x0BC8, /* TAMIL VOWEL SIGN AI */
757 0x0BCA, /* TAMIL VOWEL SIGN O */
758 0x0BCB, /* TAMIL VOWEL SIGN OO */
759 0x0BCC, /* TAMIL VOWEL SIGN AU */
760 0x0BD7, /* TAMIL AU LENGTH MARK */
761 0x0C01, /* TELUGU SIGN CANDRABINDU */
762 0x0C02, /* TELUGU SIGN ANUSVARA */
763 0x0C03, /* TELUGU SIGN VISARGA */
764 0x0C41, /* TELUGU VOWEL SIGN U */
765 0x0C42, /* TELUGU VOWEL SIGN UU */
766 0x0C43, /* TELUGU VOWEL SIGN VOCALIC R */
767 0x0C44, /* TELUGU VOWEL SIGN VOCALIC RR */
768 0x0C82, /* KANNADA SIGN ANUSVARA */
769 0x0C83, /* KANNADA SIGN VISARGA */
770 0x0CBE, /* KANNADA VOWEL SIGN AA */
771 0x0CC0, /* KANNADA VOWEL SIGN II */
772 0x0CC1, /* KANNADA VOWEL SIGN U */
773 0x0CC2, /* KANNADA VOWEL SIGN UU */
774 0x0CC3, /* KANNADA VOWEL SIGN VOCALIC R */
775 0x0CC4, /* KANNADA VOWEL SIGN VOCALIC RR */
776 0x0CC7, /* KANNADA VOWEL SIGN EE */
777 0x0CC8, /* KANNADA VOWEL SIGN AI */
778 0x0CCA, /* KANNADA VOWEL SIGN O */
779 0x0CCB, /* KANNADA VOWEL SIGN OO */
780 0x0CD5, /* KANNADA LENGTH MARK */
781 0x0CD6, /* KANNADA AI LENGTH MARK */
782 0x0D02, /* MALAYALAM SIGN ANUSVARA */
783 0x0D03, /* MALAYALAM SIGN VISARGA */
784 0x0D3E, /* MALAYALAM VOWEL SIGN AA */
785 0x0D3F, /* MALAYALAM VOWEL SIGN I */
786 0x0D40, /* MALAYALAM VOWEL SIGN II */
787 0x0D46, /* MALAYALAM VOWEL SIGN E */
788 0x0D47, /* MALAYALAM VOWEL SIGN EE */
789 0x0D48, /* MALAYALAM VOWEL SIGN AI */
790 0x0D4A, /* MALAYALAM VOWEL SIGN O */
791 0x0D4B, /* MALAYALAM VOWEL SIGN OO */
792 0x0D4C, /* MALAYALAM VOWEL SIGN AU */
793 0x0D57, /* MALAYALAM AU LENGTH MARK */
794 0x0D82, /* SINHALA SIGN ANUSVARAYA */
795 0x0D83, /* SINHALA SIGN VISARGAYA */
796 0x0DCF, /* SINHALA VOWEL SIGN AELA-PILLA */
797 0x0DD0, /* SINHALA VOWEL SIGN KETTI AEDA-PILLA */
798 0x0DD1, /* SINHALA VOWEL SIGN DIGA AEDA-PILLA */
799 0x0DD8, /* SINHALA VOWEL SIGN GAETTA-PILLA */
800 0x0DD9, /* SINHALA VOWEL SIGN KOMBUVA */
801 0x0DDA, /* SINHALA VOWEL SIGN DIGA KOMBUVA */
802 0x0DDB, /* SINHALA VOWEL SIGN KOMBU DEKA */
803 0x0DDC, /* SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
804 0x0DDD, /* SINHALA VOWEL SIGN KOMBUVA HAA DIGA
805 * AELA-PILLA */
806 0x0DDE, /* SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
807 0x0DDF, /* SINHALA VOWEL SIGN GAYANUKITTA */
808 0x0DF2, /* SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
809 0x0DF3, /* SINHALA VOWEL SIGN DIGA GAYANUKITTA */
810 0x0F3E, /* TIBETAN SIGN YAR TSHES */
811 0x0F3F, /* TIBETAN SIGN MAR TSHES */
812 0x0F7F, /* TIBETAN SIGN RNAM BCAD */
813 0x102B, /* MYANMAR VOWEL SIGN TALL AA */
814 0x102C, /* MYANMAR VOWEL SIGN AA */
815 0x1031, /* MYANMAR VOWEL SIGN E */
816 0x1038, /* MYANMAR SIGN VISARGA */
817 0x103B, /* MYANMAR CONSONANT SIGN MEDIAL YA */
818 0x103C, /* MYANMAR CONSONANT SIGN MEDIAL RA */
819 0x1056, /* MYANMAR VOWEL SIGN VOCALIC R */
820 0x1057, /* MYANMAR VOWEL SIGN VOCALIC RR */
821 0x1062, /* MYANMAR VOWEL SIGN SGAW KAREN EU */
822 0x1063, /* MYANMAR TONE MARK SGAW KAREN HATHI */
823 0x1064, /* MYANMAR TONE MARK SGAW KAREN KE PHO */
824 0x1067, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
825 0x1068, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
826 0x1069, /* MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
827 0x106A, /* MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
828 0x106B, /* MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
829 0x106C, /* MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
830 0x106D, /* MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
831 0x1083, /* MYANMAR VOWEL SIGN SHAN AA */
832 0x1084, /* MYANMAR VOWEL SIGN SHAN E */
833 0x1087, /* MYANMAR SIGN SHAN TONE-2 */
834 0x1088, /* MYANMAR SIGN SHAN TONE-3 */
835 0x1089, /* MYANMAR SIGN SHAN TONE-5 */
836 0x108A, /* MYANMAR SIGN SHAN TONE-6 */
837 0x108B, /* MYANMAR SIGN SHAN COUNCIL TONE-2 */
838 0x108C, /* MYANMAR SIGN SHAN COUNCIL TONE-3 */
839 0x108F, /* MYANMAR SIGN RUMAI PALAUNG TONE-5 */
840 0x17B6, /* KHMER VOWEL SIGN AA */
841 0x17BE, /* KHMER VOWEL SIGN OE */
842 0x17BF, /* KHMER VOWEL SIGN YA */
843 0x17C0, /* KHMER VOWEL SIGN IE */
844 0x17C1, /* KHMER VOWEL SIGN E */
845 0x17C2, /* KHMER VOWEL SIGN AE */
846 0x17C3, /* KHMER VOWEL SIGN AI */
847 0x17C4, /* KHMER VOWEL SIGN OO */
848 0x17C5, /* KHMER VOWEL SIGN AU */
849 0x17C7, /* KHMER SIGN REAHMUK */
850 0x17C8, /* KHMER SIGN YUUKALEAPINTU */
851 0x1923, /* LIMBU VOWEL SIGN EE */
852 0x1924, /* LIMBU VOWEL SIGN AI */
853 0x1925, /* LIMBU VOWEL SIGN OO */
854 0x1926, /* LIMBU VOWEL SIGN AU */
855 0x1929, /* LIMBU SUBJOINED LETTER YA */
856 0x192A, /* LIMBU SUBJOINED LETTER RA */
857 0x192B, /* LIMBU SUBJOINED LETTER WA */
858 0x1930, /* LIMBU SMALL LETTER KA */
859 0x1931, /* LIMBU SMALL LETTER NGA */
860 0x1933, /* LIMBU SMALL LETTER TA */
861 0x1934, /* LIMBU SMALL LETTER NA */
862 0x1935, /* LIMBU SMALL LETTER PA */
863 0x1936, /* LIMBU SMALL LETTER MA */
864 0x1937, /* LIMBU SMALL LETTER RA */
865 0x1938, /* LIMBU SMALL LETTER LA */
866 0x19B0, /* NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
867 0x19B1, /* NEW TAI LUE VOWEL SIGN AA */
868 0x19B2, /* NEW TAI LUE VOWEL SIGN II */
869 0x19B3, /* NEW TAI LUE VOWEL SIGN U */
870 0x19B4, /* NEW TAI LUE VOWEL SIGN UU */
871 0x19B5, /* NEW TAI LUE VOWEL SIGN E */
872 0x19B6, /* NEW TAI LUE VOWEL SIGN AE */
873 0x19B7, /* NEW TAI LUE VOWEL SIGN O */
874 0x19B8, /* NEW TAI LUE VOWEL SIGN OA */
875 0x19B9, /* NEW TAI LUE VOWEL SIGN UE */
876 0x19BA, /* NEW TAI LUE VOWEL SIGN AY */
877 0x19BB, /* NEW TAI LUE VOWEL SIGN AAY */
878 0x19BC, /* NEW TAI LUE VOWEL SIGN UY */
879 0x19BD, /* NEW TAI LUE VOWEL SIGN OY */
880 0x19BE, /* NEW TAI LUE VOWEL SIGN OAY */
881 0x19BF, /* NEW TAI LUE VOWEL SIGN UEY */
882 0x19C0, /* NEW TAI LUE VOWEL SIGN IY */
883 0x19C8, /* NEW TAI LUE TONE MARK-1 */
884 0x19C9, /* NEW TAI LUE TONE MARK-2 */
885 0x1A19, /* BUGINESE VOWEL SIGN E */
886 0x1A1A, /* BUGINESE VOWEL SIGN O */
887 0x1A1B, /* BUGINESE VOWEL SIGN AE */
888 0x1B04, /* BALINESE SIGN BISAH */
889 0x1B35, /* BALINESE VOWEL SIGN TEDUNG */
890 0x1B3B, /* BALINESE VOWEL SIGN RA REPA TEDUNG */
891 0x1B3D, /* BALINESE VOWEL SIGN LA LENGA TEDUNG */
892 0x1B3E, /* BALINESE VOWEL SIGN TALING */
893 0x1B3F, /* BALINESE VOWEL SIGN TALING REPA */
894 0x1B40, /* BALINESE VOWEL SIGN TALING TEDUNG */
895 0x1B41, /* BALINESE VOWEL SIGN TALING REPA TEDUNG */
896 0x1B43, /* BALINESE VOWEL SIGN PEPET TEDUNG */
897 0x1B44, /* BALINESE ADEG ADEG */
898 0x1B82, /* SUNDANESE SIGN PANGWISAD */
899 0x1BA1, /* SUNDANESE CONSONANT SIGN PAMINGKAL */
900 0x1BA6, /* SUNDANESE VOWEL SIGN PANAELAENG */
901 0x1BA7, /* SUNDANESE VOWEL SIGN PANOLONG */
902 0x1BAA, /* SUNDANESE SIGN PAMAAEH */
903 0x1C24, /* LEPCHA SUBJOINED LETTER YA */
904 0x1C25, /* LEPCHA SUBJOINED LETTER RA */
905 0x1C26, /* LEPCHA VOWEL SIGN AA */
906 0x1C27, /* LEPCHA VOWEL SIGN I */
907 0x1C28, /* LEPCHA VOWEL SIGN O */
908 0x1C29, /* LEPCHA VOWEL SIGN OO */
909 0x1C2A, /* LEPCHA VOWEL SIGN U */
910 0x1C2B, /* LEPCHA VOWEL SIGN UU */
911 0x1C34, /* LEPCHA CONSONANT SIGN NYIN-DO */
912 0x1C35, /* LEPCHA CONSONANT SIGN KANG */
913 0xA823, /* SYLOTI NAGRI VOWEL SIGN A */
914 0xA824, /* SYLOTI NAGRI VOWEL SIGN I */
915 0xA827, /* SYLOTI NAGRI VOWEL SIGN OO */
916 0xA880, /* SAURASHTRA SIGN ANUSVARA */
917 0xA881, /* SAURASHTRA SIGN VISARGA */
918 0xA8B4, /* SAURASHTRA CONSONANT SIGN HAARU */
919 0xA8B5, /* SAURASHTRA VOWEL SIGN AA */
920 0xA8B6, /* SAURASHTRA VOWEL SIGN I */
921 0xA8B7, /* SAURASHTRA VOWEL SIGN II */
922 0xA8B8, /* SAURASHTRA VOWEL SIGN U */
923 0xA8B9, /* SAURASHTRA VOWEL SIGN UU */
924 0xA8BA, /* SAURASHTRA VOWEL SIGN VOCALIC R */
925 0xA8BB, /* SAURASHTRA VOWEL SIGN VOCALIC RR */
926 0xA8BC, /* SAURASHTRA VOWEL SIGN VOCALIC L */
927 0xA8BD, /* SAURASHTRA VOWEL SIGN VOCALIC LL */
928 0xA8BE, /* SAURASHTRA VOWEL SIGN E */
929 0xA8BF, /* SAURASHTRA VOWEL SIGN EE */
930 0xA8C0, /* SAURASHTRA VOWEL SIGN AI */
931 0xA8C1, /* SAURASHTRA VOWEL SIGN O */
932 0xA8C2, /* SAURASHTRA VOWEL SIGN OO */
933 0xA8C3, /* SAURASHTRA VOWEL SIGN AU */
934 0xA952, /* REJANG CONSONANT SIGN H */
935 0xA953, /* REJANG VIRAMA */
936 0xAA2F, /* CHAM VOWEL SIGN O */
937 0xAA30, /* CHAM VOWEL SIGN AI */
938 0xAA33, /* CHAM CONSONANT SIGN YA */
939 0xAA34, /* CHAM CONSONANT SIGN RA */
940 0xAA4D /* CHAM CONSONANT SIGN FINAL H */
941 };
942 const pg_wchar *StopLow = strange_letter,
943 *StopHigh = strange_letter + lengthof(strange_letter),
944 *StopMiddle;
945 pg_wchar c;
946
947 if (prs->pgwstr)
948 c = *(prs->pgwstr + prs->state->poschar);
949 else
950 c = (pg_wchar) *(prs->wstr + prs->state->poschar);
951
952 while (StopLow < StopHigh)
953 {
954 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
955 if (*StopMiddle == c)
956 return 1;
957 else if (*StopMiddle < c)
958 StopLow = StopMiddle + 1;
959 else
960 StopHigh = StopMiddle;
961 }
962 }
963
964 return 0;
965}
966
967/*
968 * Table of state/action of parser
969 */
970
972 {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
973 {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
974 {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
975 {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
976 {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
977 {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
978 {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
979 {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
980 {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
981 {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
982 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
983 {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
984 {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
985};
986
987
989 {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
990 {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
991 {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
992 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
993 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
994 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
995 {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
996 {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
997};
998
1000 {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
1001 {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1002 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1003 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1004 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1005 {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1006 {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1007 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1008 {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
1009 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1010 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1011 {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1012 {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
1013 {p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
1014 {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
1015};
1016
1018 {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
1019 {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
1020 {p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
1021 {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1022 {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1023 {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
1024};
1025
1027 {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
1028 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1029 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1030 {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
1031 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1032 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1033 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1034 {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1035 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1036 {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
1037 {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1038 {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1039 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1040 {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
1041};
1042
1044 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1045 {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
1046 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1047};
1048
1050 {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
1051 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1052 {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
1053 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1054 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1055 {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
1056};
1057
1059 {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
1060 {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
1061 {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
1062 {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
1063 {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
1064 {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
1065 {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
1066 {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
1067 {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
1068};
1069
1071 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1072 {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
1073 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1074};
1075
1077 {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1078 {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
1079 {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1080 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1081 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1082 {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1083};
1084
1086 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1087 {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
1088 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1089};
1090
1092 {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1093 {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
1094 {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
1095 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1096 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1097 {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1098};
1099
1101 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1102 {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
1103 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1104};
1105
1107 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1108 {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
1109 {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1110};
1111
1112
1114 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1115 {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
1116 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1117};
1118
1120 {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
1121 {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
1122 {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1123 {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
1124};
1125
1127 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1128 {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1129 {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
1130 {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
1131 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1132};
1133
1135 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1136 {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1137 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1138};
1139
1141 {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
1142 {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
1143 {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
1144};
1145
1147 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1148 {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
1149 {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1150 {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1151 {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1152 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1153};
1154
1156 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1157 {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1158 {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1159 {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1160 {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
1161 {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
1162 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1163 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1164};
1165
1167 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1168 {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1169 {p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1170 {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1171 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1172};
1173
1175 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1176 {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1177 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1178};
1179
1181 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1182 {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1183 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1184 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1185};
1186
1188 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1189 {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1190 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1191 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1192};
1193
1195 {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
1196};
1197
1199 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1200 {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
1201 {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
1202 {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
1203 {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
1204 {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
1205 {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
1206 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1207};
1208
1210 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1211 /* <?xml ... */
1212 /* XXX do we wants states for the m and l ? Right now this accepts <?xZ */
1213 {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
1214 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1215};
1216
1218 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1219 {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
1220 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1221};
1222
1224 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1225 /* <br/> case */
1226 {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
1228 {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
1229 {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
1230 {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1231 {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1232 {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1233 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1234 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1235};
1236
1238 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1239 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
1240 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1241};
1242
1244 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1246 {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
1247 {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
1248 {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1249 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1250 {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
1251 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1252 {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1253 {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
1254 {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
1255 {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1256 {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1257 {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
1258 {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
1259 {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
1260 {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
1261 {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
1262 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1263};
1264
1266 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1267 {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1268 {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
1269 {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
1270};
1271
1273 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1274 {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1275 {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
1276 {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
1277};
1278
1280 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1281 {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
1282};
1283
1285 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1286};
1287
1289 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1290 {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
1291 /* <!DOCTYPE ...> */
1292 {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
1293 {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
1294 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1295};
1296
1298 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1299 {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
1300 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1301};
1302
1304 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1305 {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
1306 {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1307};
1308
1310 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1311 {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
1312 {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1313};
1314
1316 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1317 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1318 {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
1319 {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1320};
1321
1323 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1324};
1325
1327 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1329 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1330 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1331};
1332
1334 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1335 {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1336 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1337 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1338 {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1339 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1340 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1341 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1342};
1343
1345 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1346 {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1347 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1348 {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
1349 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1350 {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1351 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1352 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1353 {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
1355 {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1356 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1357};
1358
1360 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1361 {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1362 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1363};
1364
1366 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1367 {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1369 {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1370 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1371};
1372
1374 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1375 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1376 {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1377 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1378};
1379
1381 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1382 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1383 {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1384 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1385 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1386 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1387 {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1388 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1389};
1390
1392 {p_isstophost, 0, A_POP, TPS_Null, 0, NULL},
1393 {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
1394 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1395};
1396
1398 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1399 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1400 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1401 {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1402 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1403 {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1404 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1405};
1406
1408 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1409 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1410 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1411 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1412 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1413 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1414};
1415
1417 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1418 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1419 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1420 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1421 {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1422 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1423 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1424};
1425
1427 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1428 {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1429 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1430 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1431};
1432
1434 {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1435 {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
1436 {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1437 {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1438 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1439};
1440
1442 {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
1443 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1444 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1445 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1446 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1447 {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1448 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1449 {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
1450};
1451
1453 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1454 {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1455 {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1456 {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1457 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1458};
1459
1461 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1462 {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1463 {NULL, 0, A_POP, TPS_Null, 0, NULL},
1464};
1465
1467 {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
1468};
1469
1471 {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
1472 {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1473 {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
1474};
1475
1477 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1479 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1480};
1481
1483 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1484 {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
1485 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1486};
1487
1489 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1490 {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1491 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1492};
1493
1495 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
1496};
1497
1499 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1500 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1501 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1502 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1503 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1504};
1505
1508 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1509 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1510 {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1511 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1512 {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1514};
1515
1517 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1518 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1519 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1520 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1521};
1522
1525 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1526 {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1527 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1528 {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1530};
1531
1533 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1534 {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1535 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1536 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1537};
1538
1541 {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1542 {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1543 {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1545};
1546
1548 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1549 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1550 {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1551 {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1552 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1553};
1554
1556 {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
1558 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1559 {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
1560 {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
1561 {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
1562};
1563
1565 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1566 {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1568 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1569};
1570
1572 {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
1573 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1574 {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1575 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1576 {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
1577};
1578
1580 {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
1582 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1583 {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1584 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1585 {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
1586};
1587
1589 {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
1590 {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1592 {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
1593};
1594
1596 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1597 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1598 {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1600 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1601};
1602
1603
1604/*
1605 * main table of per-state parser actions
1606 */
1607typedef struct
1608{
1609 const TParserStateActionItem *action; /* the actual state info */
1610 TParserState state; /* only for Assert crosscheck */
1611#ifdef WPARSER_TRACE
1612 const char *state_name; /* only for debug printout */
1613#endif
1615
/*
 * TPARSERSTATEACTION(state) expands to a brace-enclosed initializer with
 * CppConcat(action,state) first and the state value second.  Builds that
 * define WPARSER_TRACE get a third element, CppAsString(state), which the
 * trace output prints as the state's name.
 */
#ifndef WPARSER_TRACE
#define TPARSERSTATEACTION(state) { CppConcat(action,state), state }
#else
#define TPARSERSTATEACTION(state) { CppConcat(action,state), state, CppAsString(state) }
#endif
1623
1624/*
1625 * order must be the same as in typedef enum {} TParserState!!
1626 */
1627
1628static const TParserStateAction Actions[] = {
1706};
1707
1708
1709static bool
1711{
1712 const TParserStateActionItem *item = NULL;
1713
1715
1716 Assert(prs->state);
1717
1718 if (prs->state->posbyte >= prs->lenstr)
1719 return false;
1720
1721 prs->token = prs->str + prs->state->posbyte;
1722 prs->state->pushedAtAction = NULL;
1723
1724 /* look at string */
1725 while (prs->state->posbyte <= prs->lenstr)
1726 {
1727 if (prs->state->posbyte == prs->lenstr)
1728 prs->state->charlen = 0;
1729 else
1730 prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
1731 pg_mblen(prs->str + prs->state->posbyte);
1732
1733 Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
1734 Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
1735 Assert(Actions[prs->state->state].state == prs->state->state);
1736
1737 if (prs->state->pushedAtAction)
1738 {
1739 /* After a POP, pick up at the next test */
1740 item = prs->state->pushedAtAction + 1;
1741 prs->state->pushedAtAction = NULL;
1742 }
1743 else
1744 {
1745 item = Actions[prs->state->state].action;
1746 Assert(item != NULL);
1747 }
1748
1749 /* find action by character class */
1750 while (item->isclass)
1751 {
1752 prs->c = item->c;
1753 if (item->isclass(prs) != 0)
1754 break;
1755 item++;
1756 }
1757
1758#ifdef WPARSER_TRACE
1759 {
1760 TParserPosition *ptr;
1761
1762 fprintf(stderr, "state ");
1763 /* indent according to stack depth */
1764 for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
1765 fprintf(stderr, " ");
1766 fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
1767 if (prs->state->posbyte < prs->lenstr)
1768 fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
1769 else
1770 fprintf(stderr, "at EOF");
1771 fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1772 (int) (item - Actions[prs->state->state].action),
1773 (item->flags & A_BINGO) ? " BINGO" : "",
1774 (item->flags & A_POP) ? " POP" : "",
1775 (item->flags & A_PUSH) ? " PUSH" : "",
1776 (item->flags & A_RERUN) ? " RERUN" : "",
1777 (item->flags & A_CLEAR) ? " CLEAR" : "",
1778 (item->flags & A_MERGE) ? " MERGE" : "",
1779 (item->flags & A_CLRALL) ? " CLRALL" : "",
1780 (item->tostate != TPS_Null) ? " tostate " : "",
1781 (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
1782 (item->type > 0) ? " type " : "",
1783 tok_alias[item->type]);
1784 }
1785#endif
1786
1787 /* call special handler if exists */
1788 if (item->special)
1789 item->special(prs);
1790
1791 /* BINGO, token is found */
1792 if (item->flags & A_BINGO)
1793 {
1794 Assert(item->type > 0);
1795 prs->lenbytetoken = prs->state->lenbytetoken;
1796 prs->lenchartoken = prs->state->lenchartoken;
1797 prs->state->lenbytetoken = prs->state->lenchartoken = 0;
1798 prs->type = item->type;
1799 }
1800
1801 /* do various actions by flags */
1802 if (item->flags & A_POP)
1803 { /* pop stored state in stack */
1804 TParserPosition *ptr = prs->state->prev;
1805
1806 pfree(prs->state);
1807 prs->state = ptr;
1808 Assert(prs->state);
1809 }
1810 else if (item->flags & A_PUSH)
1811 { /* push (store) state in stack */
1812 prs->state->pushedAtAction = item; /* remember where we push */
1813 prs->state = newTParserPosition(prs->state);
1814 }
1815 else if (item->flags & A_CLEAR)
1816 { /* clear previous pushed state */
1817 TParserPosition *ptr;
1818
1819 Assert(prs->state->prev);
1820 ptr = prs->state->prev->prev;
1821 pfree(prs->state->prev);
1822 prs->state->prev = ptr;
1823 }
1824 else if (item->flags & A_CLRALL)
1825 { /* clear all previous pushed state */
1826 TParserPosition *ptr;
1827
1828 while (prs->state->prev)
1829 {
1830 ptr = prs->state->prev->prev;
1831 pfree(prs->state->prev);
1832 prs->state->prev = ptr;
1833 }
1834 }
1835 else if (item->flags & A_MERGE)
1836 { /* merge posinfo with current and pushed state */
1837 TParserPosition *ptr = prs->state;
1838
1839 Assert(prs->state->prev);
1840 prs->state = prs->state->prev;
1841
1842 prs->state->posbyte = ptr->posbyte;
1843 prs->state->poschar = ptr->poschar;
1844 prs->state->charlen = ptr->charlen;
1845 prs->state->lenbytetoken = ptr->lenbytetoken;
1846 prs->state->lenchartoken = ptr->lenchartoken;
1847 pfree(ptr);
1848 }
1849
1850 /* set new state if pointed */
1851 if (item->tostate != TPS_Null)
1852 prs->state->state = item->tostate;
1853
1854 /* check for go away */
1855 if ((item->flags & A_BINGO) ||
1856 (prs->state->posbyte >= prs->lenstr &&
1857 (item->flags & A_RERUN) == 0))
1858 break;
1859
1860 /* go to beginning of loop if we should rerun or we just restore state */
1861 if (item->flags & (A_RERUN | A_POP))
1862 continue;
1863
1864 /* move forward */
1865 if (prs->state->charlen)
1866 {
1867 prs->state->posbyte += prs->state->charlen;
1868 prs->state->lenbytetoken += prs->state->charlen;
1869 prs->state->poschar++;
1870 prs->state->lenchartoken++;
1871 }
1872 }
1873
1874 return (item && (item->flags & A_BINGO));
1875}
1876
1877Datum
1879{
1880 LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
1881 int i;
1882
1883 for (i = 1; i <= LASTNUM; i++)
1884 {
1885 descr[i - 1].lexid = i;
1886 descr[i - 1].alias = pstrdup(tok_alias[i]);
1887 descr[i - 1].descr = pstrdup(lex_descr[i]);
1888 }
1889
1890 descr[LASTNUM].lexid = 0;
1891
1892 PG_RETURN_POINTER(descr);
1893}
1894
1895Datum
1897{
1899}
1900
1901Datum
1903{
1904 TParser *p = (TParser *) PG_GETARG_POINTER(0);
1905 char **t = (char **) PG_GETARG_POINTER(1);
1906 int *tlen = (int *) PG_GETARG_POINTER(2);
1907
1908 if (!TParserGet(p))
1909 PG_RETURN_INT32(0);
1910
1911 *t = p->token;
1912 *tlen = p->lenbytetoken;
1913
1915}
1916
1917Datum
1919{
1920 TParser *p = (TParser *) PG_GETARG_POINTER(0);
1921
1922 TParserClose(p);
1924}
1925
1926
1927/*
1928 * ts_headline support begins here
1929 */
1930
/*
 * Token type classification macros
 *
 * Each macro takes a token category number and yields nonzero when that
 * category belongs to the named group.  The argument is evaluated more than
 * once, so it must be free of side effects.
 *
 * TS_IDIGNORE	tag, protocol, blank and XML-entity tokens
 * HLIDREPLACE	tag tokens; mark_fragment flags these "replace"
 * HLIDSKIP		url, numhword, asciihword and hword tokens; mark_fragment
 *				flags these "skip" when highlightall is off
 * XMLHLIDSKIP	list consulted instead of HLIDSKIP when highlightall is on;
 *				at present it names the same categories
 * NONWORDTOKEN	blanks plus everything in HLIDREPLACE and HLIDSKIP
 * NOENDTOKEN	NONWORDTOKEN and TS_IDIGNORE categories plus the scientific,
 *				version-number, decimal and integer categories
 */
#define TS_IDIGNORE(x) \
	((x) == TAG_T || (x) == PROTOCOL || (x) == SPACE || (x) == XMLENTITY)
#define HLIDREPLACE(x) \
	((x) == TAG_T)
#define HLIDSKIP(x) \
	((x) == URL_T || (x) == NUMHWORD || (x) == ASCIIHWORD || (x) == HWORD)
#define XMLHLIDSKIP(x) \
	((x) == URL_T || (x) == NUMHWORD || (x) == ASCIIHWORD || (x) == HWORD)
#define NONWORDTOKEN(x) \
	((x) == SPACE || HLIDREPLACE(x) || HLIDSKIP(x))
#define NOENDTOKEN(x) \
	(NONWORDTOKEN(x) || \
	 (x) == SCIENTIFIC || (x) == VERSIONNUMBER || (x) == DECIMAL_T || \
	 (x) == SIGNEDINT || (x) == UNSIGNEDINT || \
	 TS_IDIGNORE(x))
1938
/*
 * Helper macros for headline selection.
 *
 * Both expect two names to be visible at the point of use:
 * "HeadlineParsedText *prs", the text being worked on, and "int shortword",
 * the "short word" length parameter.  j is an index into prs->words[].
 */

/* A word is interesting when it is a search term that is not a repeat */
#define INTERESTINGWORD(j) \
	(prs->words[j].item != NULL && prs->words[j].repeated == 0)

/*
 * A fragment should not end on a no-end token or on a word of at most
 * shortword length; interesting words are exempt from both restrictions.
 */
#define BADENDPOINT(j) \
	(!INTERESTINGWORD(j) && \
	 (NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword))
1953
1954typedef struct
1955{
1956 /* one cover (well, really one fragment) for mark_hl_fragments */
1957 int32 startpos; /* fragment's starting word index */
1958 int32 endpos; /* ending word index (inclusive) */
1959 int32 poslen; /* number of interesting words */
1960 int32 curlen; /* total number of words */
1961 bool chosen; /* chosen? */
1962 bool excluded; /* excluded? */
1963} CoverPos;
1964
1965typedef struct
1966{
1967 /* callback data for checkcondition_HL */
1969 int len;
1970} hlCheck;
1971
1972
1973/*
1974 * TS_execute callback for matching a tsquery operand to headline words
1975 *
1976 * Note: it's tempting to report words[] indexes as pos values to save
1977 * searching in hlCover; but that would screw up phrase matching, which
1978 * expects to measure distances in lexemes not tokens.
1979 */
1980static TSTernaryValue
1982{
1983 hlCheck *checkval = (hlCheck *) opaque;
1984 int i;
1985
1986 /* scan words array for matching items */
1987 for (i = 0; i < checkval->len; i++)
1988 {
1989 if (checkval->words[i].item == val)
1990 {
1991 /* if data == NULL, don't need to report positions */
1992 if (!data)
1993 return TS_YES;
1994
1995 if (!data->pos)
1996 {
1997 data->pos = palloc(sizeof(WordEntryPos) * checkval->len);
1998 data->allocated = true;
1999 data->npos = 1;
2000 data->pos[0] = checkval->words[i].pos;
2001 }
2002 else if (data->pos[data->npos - 1] < checkval->words[i].pos)
2003 {
2004 data->pos[data->npos++] = checkval->words[i].pos;
2005 }
2006 }
2007 }
2008
2009 if (data && data->npos > 0)
2010 return TS_YES;
2011
2012 return TS_NO;
2013}
2014
2015/*
2016 * hlCover: try to find a substring of prs' word list that satisfies query
2017 *
2018 * locations is the result of TS_execute_locations() for the query.
2019 * We use this to identify plausible subranges of the query.
2020 *
2021 * *nextpos is the lexeme position (NOT word index) to start the search
2022 * at. Caller should initialize this to zero. If successful, we'll
2023 * advance it to the next place to search at.
2024 *
2025 * On success, sets *p to first word index and *q to last word index of the
2026 * cover substring, and returns true.
2027 *
2028 * The result is a minimal cover, in the sense that both *p and *q will be
2029 * words used in the query.
2030 */
2031static bool
2032hlCover(HeadlineParsedText *prs, TSQuery query, List *locations,
2033 int *nextpos, int *p, int *q)
2034{
2035 int pos = *nextpos;
2036
2037 /* This loop repeats when our selected word-range fails the query */
2038 for (;;)
2039 {
2040 int posb,
2041 pose;
2042 ListCell *lc;
2043
2044 /*
2045 * For each AND'ed query term or phrase, find its first occurrence at
2046 * or after pos; set pose to the maximum of those positions.
2047 *
2048 * We need not consider ORs or NOTs here; see the comments for
2049 * TS_execute_locations(). Rechecking the match with TS_execute(),
2050 * below, will deal with any ensuing imprecision.
2051 */
2052 pose = -1;
2053 foreach(lc, locations)
2054 {
2055 ExecPhraseData *pdata = (ExecPhraseData *) lfirst(lc);
2056 int first = -1;
2057
2058 for (int i = 0; i < pdata->npos; i++)
2059 {
2060 /* For phrase matches, use the ending lexeme */
2061 int endp = pdata->pos[i];
2062
2063 if (endp >= pos)
2064 {
2065 first = endp;
2066 break;
2067 }
2068 }
2069 if (first < 0)
2070 return false; /* no more matches for this term */
2071 if (first > pose)
2072 pose = first;
2073 }
2074
2075 if (pose < 0)
2076 return false; /* we only get here if empty list */
2077
2078 /*
2079 * Now, for each AND'ed query term or phrase, find its last occurrence
2080 * at or before pose; set posb to the minimum of those positions.
2081 *
2082 * We start posb at INT_MAX - 1 to guarantee no overflow if we compute
2083 * posb + 1 below.
2084 */
2085 posb = INT_MAX - 1;
2086 foreach(lc, locations)
2087 {
2088 ExecPhraseData *pdata = (ExecPhraseData *) lfirst(lc);
2089 int last = -1;
2090
2091 for (int i = pdata->npos - 1; i >= 0; i--)
2092 {
2093 /* For phrase matches, use the starting lexeme */
2094 int startp = pdata->pos[i] - pdata->width;
2095
2096 if (startp <= pose)
2097 {
2098 last = startp;
2099 break;
2100 }
2101 }
2102 if (last < posb)
2103 posb = last;
2104 }
2105
2106 /*
2107 * We could end up with posb to the left of pos, in case some phrase
2108 * match crosses pos. Try the match starting at pos anyway, since the
2109 * result of TS_execute_locations is imprecise for phrase matches OR'd
2110 * with plain matches; that is, if the query is "(A <-> B) | C" then C
2111 * could match at pos even though the phrase match would have to
2112 * extend to the left of pos.
2113 */
2114 posb = Max(posb, pos);
2115
2116 /* This test probably always succeeds, but be paranoid */
2117 if (posb <= pose)
2118 {
2119 /*
2120 * posb .. pose is now the shortest, earliest-after-pos range of
2121 * lexeme positions containing all the query terms. It will
2122 * contain all phrase matches, too, except in the corner case
2123 * described just above.
2124 *
2125 * Now convert these lexeme positions to indexes in prs->words[].
2126 */
2127 int idxb = -1;
2128 int idxe = -1;
2129
2130 for (int i = 0; i < prs->curwords; i++)
2131 {
2132 if (prs->words[i].item == NULL)
2133 continue;
2134 if (idxb < 0 && prs->words[i].pos >= posb)
2135 idxb = i;
2136 if (prs->words[i].pos <= pose)
2137 idxe = i;
2138 else
2139 break;
2140 }
2141
2142 /* This test probably always succeeds, but be paranoid */
2143 if (idxb >= 0 && idxe >= idxb)
2144 {
2145 /*
2146 * Finally, check that the selected range satisfies the query.
2147 * This should succeed in all simple cases; but odd cases
2148 * involving non-top-level NOT conditions or phrase matches
2149 * OR'd with other things could fail, since the result of
2150 * TS_execute_locations doesn't fully represent such things.
2151 */
2152 hlCheck ch;
2153
2154 ch.words = &(prs->words[idxb]);
2155 ch.len = idxe - idxb + 1;
2156 if (TS_execute(GETQUERY(query), &ch,
2158 {
2159 /* Match! Advance *nextpos and return the word range. */
2160 *nextpos = posb + 1;
2161 *p = idxb;
2162 *q = idxe;
2163 return true;
2164 }
2165 }
2166 }
2167
2168 /*
2169 * Advance pos and try again. Any later workable match must start
2170 * beyond posb.
2171 */
2172 pos = posb + 1;
2173 }
2174 /* Can't get here, but stupider compilers complain if we leave it off */
2175 return false;
2176}
2177
2178/*
2179 * Apply suitable highlight marking to words selected by headline selector
2180 *
2181 * The words from startpos to endpos inclusive are marked per highlightall
2182 */
2183static void
2184mark_fragment(HeadlineParsedText *prs, bool highlightall,
2185 int startpos, int endpos)
2186{
2187 int i;
2188
2189 for (i = startpos; i <= endpos; i++)
2190 {
2191 if (prs->words[i].item)
2192 prs->words[i].selected = 1;
2193 if (!highlightall)
2194 {
2195 if (HLIDREPLACE(prs->words[i].type))
2196 prs->words[i].replace = 1;
2197 else if (HLIDSKIP(prs->words[i].type))
2198 prs->words[i].skip = 1;
2199 }
2200 else
2201 {
2202 if (XMLHLIDSKIP(prs->words[i].type))
2203 prs->words[i].skip = 1;
2204 }
2205
2206 prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
2207 }
2208}
2209
2210/*
2211 * split a cover substring into fragments not longer than max_words
2212 *
2213 * At entry, *startpos and *endpos are the (remaining) bounds of the cover
2214 * substring. They are updated to hold the bounds of the next fragment.
2215 *
2216 * *curlen and *poslen are set to the fragment's length, in words and
2217 * interesting words respectively.
2218 */
2219static void
2221 int *curlen, int *poslen, int max_words)
2222{
2223 int i;
2224
2225 /*
2226 * Objective: select a fragment of words between startpos and endpos such
2227 * that it has at most max_words and both ends have query words. If the
2228 * startpos and endpos are the endpoints of the cover and the cover has
2229 * fewer words than max_words, then this function should just return the
2230 * cover
2231 */
2232 /* first move startpos to an item */
2233 for (i = *startpos; i <= *endpos; i++)
2234 {
2235 *startpos = i;
2236 if (INTERESTINGWORD(i))
2237 break;
2238 }
2239 /* cut endpos to have only max_words */
2240 *curlen = 0;
2241 *poslen = 0;
2242 for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
2243 {
2244 if (!NONWORDTOKEN(prs->words[i].type))
2245 *curlen += 1;
2246 if (INTERESTINGWORD(i))
2247 *poslen += 1;
2248 }
2249 /* if the cover was cut then move back endpos to a query item */
2250 if (*endpos > i)
2251 {
2252 *endpos = i;
2253 for (i = *endpos; i >= *startpos; i--)
2254 {
2255 *endpos = i;
2256 if (INTERESTINGWORD(i))
2257 break;
2258 if (!NONWORDTOKEN(prs->words[i].type))
2259 *curlen -= 1;
2260 }
2261 }
2262}
2263
2264/*
2265 * Headline selector used when MaxFragments > 0
2266 *
2267 * Note: in this mode, highlightall is disregarded for phrase selection;
2268 * it only controls presentation details.
2269 */
2270static void
2272 bool highlightall,
2273 int shortword, int min_words,
2274 int max_words, int max_fragments)
2275{
2276 int32 poslen,
2277 curlen,
2278 i,
2279 f,
2280 num_f = 0;
2281 int32 stretch,
2282 maxstretch,
2283 posmarker;
2284
2285 int32 startpos = 0,
2286 endpos = 0,
2287 nextpos = 0,
2288 p = 0,
2289 q = 0;
2290
2291 int32 numcovers = 0,
2292 maxcovers = 32;
2293
2294 int32 minI,
2295 minwords,
2296 maxitems;
2297 CoverPos *covers;
2298
2299 covers = palloc(maxcovers * sizeof(CoverPos));
2300
2301 /* get all covers */
2302 while (hlCover(prs, query, locations, &nextpos, &p, &q))
2303 {
2304 startpos = p;
2305 endpos = q;
2306
2307 /*
2308 * Break the cover into smaller fragments such that each fragment has
2309 * at most max_words. Also ensure that each end of each fragment is a
2310 * query word. This will allow us to stretch the fragment in either
2311 * direction
2312 */
2313
2314 while (startpos <= endpos)
2315 {
2316 get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
2317 if (numcovers >= maxcovers)
2318 {
2319 maxcovers *= 2;
2320 covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
2321 }
2322 covers[numcovers].startpos = startpos;
2323 covers[numcovers].endpos = endpos;
2324 covers[numcovers].curlen = curlen;
2325 covers[numcovers].poslen = poslen;
2326 covers[numcovers].chosen = false;
2327 covers[numcovers].excluded = false;
2328 numcovers++;
2329 startpos = endpos + 1;
2330 endpos = q;
2331 }
2332 }
2333
2334 /* choose best covers */
2335 for (f = 0; f < max_fragments; f++)
2336 {
2337 maxitems = 0;
2338 minwords = PG_INT32_MAX;
2339 minI = -1;
2340
2341 /*
2342 * Choose the cover that contains max items. In case of tie choose the
2343 * one with smaller number of words.
2344 */
2345 for (i = 0; i < numcovers; i++)
2346 {
2347 if (!covers[i].chosen && !covers[i].excluded &&
2348 (maxitems < covers[i].poslen ||
2349 (maxitems == covers[i].poslen &&
2350 minwords > covers[i].curlen)))
2351 {
2352 maxitems = covers[i].poslen;
2353 minwords = covers[i].curlen;
2354 minI = i;
2355 }
2356 }
2357 /* if a cover was found mark it */
2358 if (minI >= 0)
2359 {
2360 covers[minI].chosen = true;
2361 /* adjust the size of cover */
2362 startpos = covers[minI].startpos;
2363 endpos = covers[minI].endpos;
2364 curlen = covers[minI].curlen;
2365 /* stretch the cover if cover size is lower than max_words */
2366 if (curlen < max_words)
2367 {
2368 /* divide the stretch on both sides of cover */
2369 maxstretch = (max_words - curlen) / 2;
2370
2371 /*
2372 * first stretch the startpos stop stretching if 1. we hit the
2373 * beginning of document 2. exceed maxstretch 3. we hit an
2374 * already marked fragment
2375 */
2376 stretch = 0;
2377 posmarker = startpos;
2378 for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
2379 {
2380 if (!NONWORDTOKEN(prs->words[i].type))
2381 {
2382 curlen++;
2383 stretch++;
2384 }
2385 posmarker = i;
2386 }
2387 /* cut back startpos till we find a good endpoint */
2388 for (i = posmarker; i < startpos && BADENDPOINT(i); i++)
2389 {
2390 if (!NONWORDTOKEN(prs->words[i].type))
2391 curlen--;
2392 }
2393 startpos = i;
2394 /* now stretch the endpos as much as possible */
2395 posmarker = endpos;
2396 for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
2397 {
2398 if (!NONWORDTOKEN(prs->words[i].type))
2399 curlen++;
2400 posmarker = i;
2401 }
2402 /* cut back endpos till we find a good endpoint */
2403 for (i = posmarker; i > endpos && BADENDPOINT(i); i--)
2404 {
2405 if (!NONWORDTOKEN(prs->words[i].type))
2406 curlen--;
2407 }
2408 endpos = i;
2409 }
2410 covers[minI].startpos = startpos;
2411 covers[minI].endpos = endpos;
2412 covers[minI].curlen = curlen;
2413 /* Mark the chosen fragments (covers) */
2414 mark_fragment(prs, highlightall, startpos, endpos);
2415 num_f++;
2416 /* Exclude covers overlapping this one from future consideration */
2417 for (i = 0; i < numcovers; i++)
2418 {
2419 if (i != minI &&
2420 ((covers[i].startpos >= startpos &&
2421 covers[i].startpos <= endpos) ||
2422 (covers[i].endpos >= startpos &&
2423 covers[i].endpos <= endpos) ||
2424 (covers[i].startpos < startpos &&
2425 covers[i].endpos > endpos)))
2426 covers[i].excluded = true;
2427 }
2428 }
2429 else
2430 break; /* no selectable covers remain */
2431 }
2432
2433 /* show the first min_words words if we have not marked anything */
2434 if (num_f <= 0)
2435 {
2436 startpos = curlen = 0;
2437 endpos = -1;
2438 for (i = 0; i < prs->curwords && curlen < min_words; i++)
2439 {
2440 if (!NONWORDTOKEN(prs->words[i].type))
2441 curlen++;
2442 endpos = i;
2443 }
2444 mark_fragment(prs, highlightall, startpos, endpos);
2445 }
2446
2447 pfree(covers);
2448}
2449
2450/*
2451 * Headline selector used when MaxFragments == 0
2452 */
2453static void
2455 bool highlightall,
2456 int shortword, int min_words, int max_words)
2457{
2458 int nextpos = 0,
2459 p = 0,
2460 q = 0;
2461 int bestb = -1,
2462 beste = -1;
2463 int bestlen = -1;
2464 bool bestcover = false;
2465 int pose,
2466 posb,
2467 poslen,
2468 curlen;
2469 bool poscover;
2470 int i;
2471
2472 if (!highlightall)
2473 {
2474 /* examine all covers, select a headline using the best one */
2475 while (hlCover(prs, query, locations, &nextpos, &p, &q))
2476 {
2477 /*
2478 * Count words (curlen) and interesting words (poslen) within
2479 * cover, but stop once we reach max_words. This step doesn't
2480 * consider whether that's a good stopping point. posb and pose
2481 * are set to the start and end indexes of the possible headline.
2482 */
2483 curlen = 0;
2484 poslen = 0;
2485 posb = pose = p;
2486 for (i = p; i <= q && curlen < max_words; i++)
2487 {
2488 if (!NONWORDTOKEN(prs->words[i].type))
2489 curlen++;
2490 if (INTERESTINGWORD(i))
2491 poslen++;
2492 pose = i;
2493 }
2494
2495 if (curlen < max_words)
2496 {
2497 /*
2498 * We have room to lengthen the headline, so search forward
2499 * until it's full or we find a good stopping point. We'll
2500 * reconsider the word at "q", then move forward.
2501 */
2502 for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
2503 {
2504 if (i > q)
2505 {
2506 if (!NONWORDTOKEN(prs->words[i].type))
2507 curlen++;
2508 if (INTERESTINGWORD(i))
2509 poslen++;
2510 }
2511 pose = i;
2512 if (BADENDPOINT(i))
2513 continue;
2514 if (curlen >= min_words)
2515 break;
2516 }
2517 if (curlen < min_words)
2518 {
2519 /*
2520 * Reached end of text and our headline is still shorter
2521 * than min_words, so try to extend it to the left.
2522 */
2523 for (i = p - 1; i >= 0; i--)
2524 {
2525 if (!NONWORDTOKEN(prs->words[i].type))
2526 curlen++;
2527 if (INTERESTINGWORD(i))
2528 poslen++;
2529 if (curlen >= max_words)
2530 break;
2531 if (BADENDPOINT(i))
2532 continue;
2533 if (curlen >= min_words)
2534 break;
2535 }
2536 posb = (i >= 0) ? i : 0;
2537 }
2538 }
2539 else
2540 {
2541 /*
2542 * Can't make headline longer, so consider making it shorter
2543 * if needed to avoid a bad endpoint.
2544 */
2545 if (i > q)
2546 i = q;
2547 for (; curlen > min_words; i--)
2548 {
2549 if (!BADENDPOINT(i))
2550 break;
2551 if (!NONWORDTOKEN(prs->words[i].type))
2552 curlen--;
2553 if (INTERESTINGWORD(i))
2554 poslen--;
2555 pose = i - 1;
2556 }
2557 }
2558
2559 /*
2560 * Check whether the proposed headline includes the original
2561 * cover; it might not if we trimmed it due to max_words.
2562 */
2563 poscover = (posb <= p && pose >= q);
2564
2565 /*
2566 * Adopt this headline if it's better than the last one, giving
2567 * highest priority to headlines including the cover, then to
2568 * headlines with more interesting words, then to headlines with
2569 * good stopping points. (Since bestlen is initially -1, we will
2570 * certainly adopt the first headline.)
2571 */
2572 if (poscover > bestcover ||
2573 (poscover == bestcover && poslen > bestlen) ||
2574 (poscover == bestcover && poslen == bestlen &&
2575 !BADENDPOINT(pose) && BADENDPOINT(beste)))
2576 {
2577 bestb = posb;
2578 beste = pose;
2579 bestlen = poslen;
2580 bestcover = poscover;
2581 }
2582 }
2583
2584 /*
2585 * If we found nothing acceptable, select min_words words starting at
2586 * the beginning.
2587 */
2588 if (bestlen < 0)
2589 {
2590 curlen = 0;
2591 pose = -1;
2592 for (i = 0; i < prs->curwords && curlen < min_words; i++)
2593 {
2594 if (!NONWORDTOKEN(prs->words[i].type))
2595 curlen++;
2596 pose = i;
2597 }
2598 bestb = 0;
2599 beste = pose;
2600 }
2601 }
2602 else
2603 {
2604 /* highlightall mode: headline is whole document */
2605 bestb = 0;
2606 beste = prs->curwords - 1;
2607 }
2608
2609 mark_fragment(prs, highlightall, bestb, beste);
2610}
2611
2612/*
2613 * Default parser's prsheadline function
2614 */
2615Datum
2617{
2619 List *prsoptions = (List *) PG_GETARG_POINTER(1);
2620 TSQuery query = PG_GETARG_TSQUERY(2);
2621 List *locations;
2622
2623 /* default option values: */
2624 int min_words = 15;
2625 int max_words = 35;
2626 int shortword = 3;
2627 int max_fragments = 0;
2628 bool highlightall = false;
2629 ListCell *l;
2630
2631 /* Extract configuration option values */
2632 prs->startsel = NULL;
2633 prs->stopsel = NULL;
2634 prs->fragdelim = NULL;
2635 foreach(l, prsoptions)
2636 {
2637 DefElem *defel = (DefElem *) lfirst(l);
2638 char *val = defGetString(defel);
2639
2640 if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
2641 max_words = pg_strtoint32(val);
2642 else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
2643 min_words = pg_strtoint32(val);
2644 else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
2645 shortword = pg_strtoint32(val);
2646 else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
2647 max_fragments = pg_strtoint32(val);
2648 else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
2649 prs->startsel = pstrdup(val);
2650 else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
2651 prs->stopsel = pstrdup(val);
2652 else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
2653 prs->fragdelim = pstrdup(val);
2654 else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
2655 highlightall = (pg_strcasecmp(val, "1") == 0 ||
2656 pg_strcasecmp(val, "on") == 0 ||
2657 pg_strcasecmp(val, "true") == 0 ||
2658 pg_strcasecmp(val, "t") == 0 ||
2659 pg_strcasecmp(val, "y") == 0 ||
2660 pg_strcasecmp(val, "yes") == 0);
2661 else
2662 ereport(ERROR,
2663 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2664 errmsg("unrecognized headline parameter: \"%s\"",
2665 defel->defname)));
2666 }
2667
2668 /* in HighlightAll mode these parameters are ignored */
2669 if (!highlightall)
2670 {
2671 if (min_words >= max_words)
2672 ereport(ERROR,
2673 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2674 errmsg("%s must be less than %s", "MinWords", "MaxWords")));
2675 if (min_words <= 0)
2676 ereport(ERROR,
2677 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2678 errmsg("%s must be positive", "MinWords")));
2679 if (shortword < 0)
2680 ereport(ERROR,
2681 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2682 errmsg("%s must be >= 0", "ShortWord")));
2683 if (max_fragments < 0)
2684 ereport(ERROR,
2685 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2686 errmsg("%s must be >= 0", "MaxFragments")));
2687 }
2688
2689 /* Locate words and phrases matching the query */
2690 if (query->size > 0)
2691 {
2692 hlCheck ch;
2693
2694 ch.words = prs->words;
2695 ch.len = prs->curwords;
2696 locations = TS_execute_locations(GETQUERY(query), &ch, TS_EXEC_EMPTY,
2698 }
2699 else
2700 locations = NIL; /* empty query matches nothing */
2701
2702 /* Apply appropriate headline selector */
2703 if (max_fragments == 0)
2704 mark_hl_words(prs, query, locations, highlightall, shortword,
2705 min_words, max_words);
2706 else
2707 mark_hl_fragments(prs, query, locations, highlightall, shortword,
2708 min_words, max_words, max_fragments);
2709
2710 /* Fill in default values for string options */
2711 if (!prs->startsel)
2712 prs->startsel = pstrdup("<b>");
2713 if (!prs->stopsel)
2714 prs->stopsel = pstrdup("</b>");
2715 if (!prs->fragdelim)
2716 prs->fragdelim = pstrdup(" ... ");
2717
2718 /* Caller will need these lengths, too */
2719 prs->startsellen = strlen(prs->startsel);
2720 prs->stopsellen = strlen(prs->stopsel);
2721 prs->fragdelimlen = strlen(prs->fragdelim);
2722
2723 PG_RETURN_POINTER(prs);
2724}
#define GETQUERY(x)
Definition: _int.h:157
void print(const void *obj)
Definition: print.c:36
#define PG_INT32_MAX
Definition: c.h:560
#define Max(x, y)
Definition: c.h:969
int32_t int32
Definition: c.h:498
uint16_t uint16
Definition: c.h:501
#define lengthof(array)
Definition: c.h:759
#define fprintf(file, fmt, msg)
Definition: cubescan.l:21
char * defGetString(DefElem *def)
Definition: define.c:35
int errcode(int sqlerrcode)
Definition: elog.c:854
int errmsg(const char *fmt,...)
Definition: elog.c:1071
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
#define PG_RETURN_VOID()
Definition: fmgr.h:349
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276
#define PG_RETURN_INT32(x)
Definition: fmgr.h:354
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
Assert(PointerIsAligned(start, uint64))
const char * str
long val
Definition: informix.c:689
int i
Definition: isn.c:77
unsigned int pg_wchar
Definition: mbprint.c:31
int GetDatabaseEncoding(void)
Definition: mbutils.c:1261
int pg_dsplen(const char *mbstr)
Definition: mbutils.c:1030
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1546
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
Definition: mbutils.c:986
int pg_mblen(const char *mbstr)
Definition: mbutils.c:1023
char * pstrdup(const char *in)
Definition: mcxt.c:2322
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:2167
void pfree(void *pointer)
Definition: mcxt.c:2147
void * palloc0(Size size)
Definition: mcxt.c:1970
void * palloc(Size size)
Definition: mcxt.c:1940
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:123
int32 pg_strtoint32(const char *s)
Definition: numutils.c:383
Datum lower(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:49
Datum upper(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:80
const void size_t len
const void * data
#define lfirst(lc)
Definition: pg_list.h:172
#define NIL
Definition: pg_list.h:68
bool database_ctype_is_c
Definition: pg_locale.c:128
size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, pg_locale_t locale)
static XLogRecPtr endpos
Definition: pg_receivewal.c:56
static XLogRecPtr startpos
@ PG_UTF8
Definition: pg_wchar.h:232
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
int pg_strncasecmp(const char *s1, const char *s2, size_t n)
Definition: pgstrcasecmp.c:69
uintptr_t Datum
Definition: postgres.h:69
char * c
void check_stack_depth(void)
Definition: stack_depth.c:95
bool chosen
Definition: wparser_def.c:1961
int32 endpos
Definition: wparser_def.c:1958
int32 curlen
Definition: wparser_def.c:1960
int32 startpos
Definition: wparser_def.c:1957
bool excluded
Definition: wparser_def.c:1962
int32 poslen
Definition: wparser_def.c:1959
char * defname
Definition: parsenodes.h:826
WordEntryPos * pos
Definition: ts_utils.h:166
HeadlineWordEntry * words
Definition: ts_public.h:76
WordEntryPos pos
Definition: ts_public.h:68
QueryOperand * item
Definition: ts_public.h:70
char * alias
Definition: ts_public.h:28
int lexid
Definition: ts_public.h:27
char * descr
Definition: ts_public.h:29
Definition: pg_list.h:54
const TParserStateActionItem * pushedAtAction
Definition: wparser_def.c:238
struct TParserPosition * prev
Definition: wparser_def.c:237
TParserState state
Definition: wparser_def.c:236
TParserCharTest isclass
Definition: wparser_def.c:211
TParserState tostate
Definition: wparser_def.c:214
TParserSpecial special
Definition: wparser_def.c:216
const TParserStateActionItem * action
Definition: wparser_def.c:1609
TParserState state
Definition: wparser_def.c:1610
char * str
Definition: wparser_def.c:244
pg_wchar * pgwstr
Definition: wparser_def.c:247
wchar_t * wstr
Definition: wparser_def.c:246
int lenstr
Definition: wparser_def.c:245
char * token
Definition: wparser_def.c:260
int type
Definition: wparser_def.c:263
int charmaxlen
Definition: wparser_def.c:251
bool wanthost
Definition: wparser_def.c:254
int lenbytetoken
Definition: wparser_def.c:261
bool ignore
Definition: wparser_def.c:253
TParserPosition * state
Definition: wparser_def.c:252
int lenchartoken
Definition: wparser_def.c:262
char c
Definition: wparser_def.c:257
bool usewide
Definition: wparser_def.c:248
int32 size
Definition: ts_type.h:221
HeadlineWordEntry * words
Definition: wparser_def.c:1968
#define PG_GETARG_TSQUERY(n)
Definition: ts_type.h:266
uint16 WordEntryPos
Definition: ts_type.h:63
TSTernaryValue
Definition: ts_utils.h:133
@ TS_NO
Definition: ts_utils.h:134
@ TS_YES
Definition: ts_utils.h:135
#define TS_EXEC_EMPTY
Definition: ts_utils.h:188
bool TS_execute(QueryItem *curitem, void *arg, uint32 flags, TSExecuteCallback chkcond)
Definition: tsvector_op.c:1854
List * TS_execute_locations(QueryItem *curitem, void *arg, uint32 flags, TSExecuteCallback chkcond)
Definition: tsvector_op.c:2007
static const TParserStateActionItem actionTPS_InParseHyphen[]
Definition: wparser_def.c:1555
static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[]
Definition: wparser_def.c:1166
static const TParserStateActionItem actionTPS_InHyphenWordFirst[]
Definition: wparser_def.c:1516
#define NONWORDTOKEN(x)
Definition: wparser_def.c:1936
static const TParserStateActionItem actionTPS_InXMLEntityFirst[]
Definition: wparser_def.c:1146
static const TParserStateActionItem actionTPS_InHostFirstAN[]
Definition: wparser_def.c:1373
#define VERSIONNUMBER
Definition: wparser_def.c:43
static const TParserStateActionItem actionTPS_InHyphenNumWordPart[]
Definition: wparser_def.c:1588
#define BADENDPOINT(j)
Definition: wparser_def.c:1950
#define ASCIIWORD
Definition: wparser_def.c:36
#define PROTOCOL
Definition: wparser_def.c:49
static const TParserStateActionItem actionTPS_InPathSecond[]
Definition: wparser_def.c:1433
static const TParserStateActionItem actionTPS_InPathFirst[]
Definition: wparser_def.c:1416
static const TParserStateActionItem actionTPS_InHostDomainSecond[]
Definition: wparser_def.c:1333
static const TParserStateActionItem actionTPS_InCloseCommentFirst[]
Definition: wparser_def.c:1309
static void SpecialFURL(TParser *prs)
Definition: wparser_def.c:588
static const TParserStateActionItem actionTPS_InCommentEnd[]
Definition: wparser_def.c:1322
struct TParser TParser
static TSTernaryValue checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
Definition: wparser_def.c:1981
void _make_compiler_happy(void)
Definition: wparser_def.c:537
static const TParserStateActionItem actionTPS_InURLPathStart[]
Definition: wparser_def.c:1466
static const TParserStateActionItem actionTPS_InHostFirstDomain[]
Definition: wparser_def.c:1326
static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[]
Definition: wparser_def.c:1547
static const TParserStateActionItem actionTPS_InHostDomain[]
Definition: wparser_def.c:1344
static const TParserStateActionItem actionTPS_InVersion[]
Definition: wparser_def.c:1119
#define XMLHLIDSKIP(x)
Definition: wparser_def.c:1935
static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[]
Definition: wparser_def.c:1498
Datum prsd_nexttoken(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1902
static const TParserStateActionItem actionTPS_InTagName[]
Definition: wparser_def.c:1223
#define DECIMAL_T
Definition: wparser_def.c:55
static const TParserStateActionItem actionTPS_InFileNext[]
Definition: wparser_def.c:1452
static const TParserStateActionItem actionTPS_InXMLEntity[]
Definition: wparser_def.c:1155
#define ASCIIPARTHWORD
Definition: wparser_def.c:46
static const TParserStateActionItem actionTPS_InFURL[]
Definition: wparser_def.c:1476
#define p_iswhat(type, nonascii)
Definition: wparser_def.c:424
static const TParserStateActionItem actionTPS_InMantissaSign[]
Definition: wparser_def.c:1134
static void mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, List *locations, bool highlightall, int shortword, int min_words, int max_words, int max_fragments)
Definition: wparser_def.c:2271
#define WORD_T
Definition: wparser_def.c:37
TParserState
Definition: wparser_def.c:120
@ TPS_InXMLEntityHexNumFirst
Definition: wparser_def.c:144
@ TPS_InPort
Definition: wparser_def.c:167
@ TPS_InXMLEntityHexNum
Definition: wparser_def.c:145
@ TPS_InHostDomainSecond
Definition: wparser_def.c:164
@ TPS_InMantissaFirst
Definition: wparser_def.c:137
@ TPS_InTagName
Definition: wparser_def.c:150
@ TPS_InHyphenAsciiWordFirst
Definition: wparser_def.c:185
@ TPS_Null
Definition: wparser_def.c:198
@ TPS_InPathFirstFirst
Definition: wparser_def.c:174
@ TPS_InSignedIntFirst
Definition: wparser_def.c:126
@ TPS_InSignedInt
Definition: wparser_def.c:127
@ TPS_InUnsignedInt
Definition: wparser_def.c:125
@ TPS_InMantissa
Definition: wparser_def.c:139
@ TPS_InProtocolFirst
Definition: wparser_def.c:182
@ TPS_InFURL
Definition: wparser_def.c:181
@ TPS_InMantissaSign
Definition: wparser_def.c:138
@ TPS_InXMLBegin
Definition: wparser_def.c:148
@ TPS_InCommentEnd
Definition: wparser_def.c:162
@ TPS_InHyphenWordFirst
Definition: wparser_def.c:187
@ TPS_InHyphenNumWordPart
Definition: wparser_def.c:196
@ TPS_InPortFirst
Definition: wparser_def.c:166
@ TPS_InProtocolEnd
Definition: wparser_def.c:184
@ TPS_InXMLEntityFirst
Definition: wparser_def.c:140
@ TPS_InHyphenNumWordFirst
Definition: wparser_def.c:189
@ TPS_InCommentLast
Definition: wparser_def.c:158
@ TPS_InFileTwiddle
Definition: wparser_def.c:172
@ TPS_InURLPathStart
Definition: wparser_def.c:179
@ TPS_InURLPathFirst
Definition: wparser_def.c:178
@ TPS_InPathFirst
Definition: wparser_def.c:173
@ TPS_InPathSecond
Definition: wparser_def.c:175
@ TPS_InHyphenUnsignedInt
Definition: wparser_def.c:197
@ TPS_InFileFirst
Definition: wparser_def.c:171
@ TPS_InXMLEntityNumFirst
Definition: wparser_def.c:142
@ TPS_InHyphenWordPart
Definition: wparser_def.c:194
@ TPS_InNumWord
Definition: wparser_def.c:122
@ TPS_InAsciiWord
Definition: wparser_def.c:123
@ TPS_InVersion
Definition: wparser_def.c:136
@ TPS_InHost
Definition: wparser_def.c:169
@ TPS_InFile
Definition: wparser_def.c:176
@ TPS_InProtocolSecond
Definition: wparser_def.c:183
@ TPS_InCloseCommentFirst
Definition: wparser_def.c:160
@ TPS_InTagEscapeK
Definition: wparser_def.c:153
@ TPS_InParseHyphenHyphen
Definition: wparser_def.c:193
@ TPS_InTagBackSleshed
Definition: wparser_def.c:155
@ TPS_InTagFirst
Definition: wparser_def.c:147
@ TPS_InTagEnd
Definition: wparser_def.c:156
@ TPS_InComment
Definition: wparser_def.c:159
@ TPS_InHyphenWord
Definition: wparser_def.c:188
@ TPS_InHyphenAsciiWord
Definition: wparser_def.c:186
@ TPS_InWord
Definition: wparser_def.c:124
@ TPS_InXMLEntityEnd
Definition: wparser_def.c:146
@ TPS_InTagEscapeKK
Definition: wparser_def.c:154
@ TPS_InSpace
Definition: wparser_def.c:128
@ TPS_InFileNext
Definition: wparser_def.c:177
@ TPS_InURLPath
Definition: wparser_def.c:180
@ TPS_Base
Definition: wparser_def.c:121
@ TPS_InUDecimal
Definition: wparser_def.c:130
@ TPS_InParseHyphen
Definition: wparser_def.c:192
@ TPS_InHostFirstAN
Definition: wparser_def.c:168
@ TPS_InEmail
Definition: wparser_def.c:170
@ TPS_InDecimalFirst
Definition: wparser_def.c:131
@ TPS_InVersionFirst
Definition: wparser_def.c:135
@ TPS_InCloseCommentLast
Definition: wparser_def.c:161
@ TPS_InSVerVersion
Definition: wparser_def.c:134
@ TPS_InHyphenAsciiWordPart
Definition: wparser_def.c:195
@ TPS_InCommentFirst
Definition: wparser_def.c:157
@ TPS_InUDecimalFirst
Definition: wparser_def.c:129
@ TPS_InHostFirstDomain
Definition: wparser_def.c:163
@ TPS_InHostDomain
Definition: wparser_def.c:165
@ TPS_InHyphenDigitLookahead
Definition: wparser_def.c:191
@ TPS_InVerVersion
Definition: wparser_def.c:133
@ TPS_InXMLEntityNum
Definition: wparser_def.c:143
@ TPS_InTag
Definition: wparser_def.c:152
@ TPS_InDecimal
Definition: wparser_def.c:132
@ TPS_InTagCloseFirst
Definition: wparser_def.c:149
@ TPS_InXMLEntity
Definition: wparser_def.c:141
@ TPS_InHyphenNumWord
Definition: wparser_def.c:190
@ TPS_InTagBeginEnd
Definition: wparser_def.c:151
static void mark_fragment(HeadlineParsedText *prs, bool highlightall, int startpos, int endpos)
Definition: wparser_def.c:2184
static const TParserStateActionItem actionTPS_InXMLEntityEnd[]
Definition: wparser_def.c:1194
static const TParserStateActionItem actionTPS_InHyphenNumWord[]
Definition: wparser_def.c:1539
static const TParserStateActionItem actionTPS_InDecimal[]
Definition: wparser_def.c:1091
#define A_POP
Definition: wparser_def.c:222
static const TParserStateActionItem actionTPS_InSignedIntFirst[]
Definition: wparser_def.c:1043
static const TParserStateActionItem actionTPS_InTagEscapeK[]
Definition: wparser_def.c:1265
static const TParserStateActionItem actionTPS_InSpace[]
Definition: wparser_def.c:1058
static const TParserStateActionItem actionTPS_InFile[]
Definition: wparser_def.c:1441
static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[]
Definition: wparser_def.c:1579
#define LASTNUM
Definition: wparser_def.c:60
static int p_iseqC(TParser *prs)
Definition: wparser_def.c:481
Datum prsd_headline(PG_FUNCTION_ARGS)
Definition: wparser_def.c:2616
#define NUMHWORD
Definition: wparser_def.c:50
static bool hlCover(HeadlineParsedText *prs, TSQuery query, List *locations, int *nextpos, int *p, int *q)
Definition: wparser_def.c:2032
#define SPACE
Definition: wparser_def.c:47
static const TParserStateActionItem actionTPS_InUDecimal[]
Definition: wparser_def.c:1076
int(* TParserCharTest)(struct TParser *)
Definition: wparser_def.c:204
static const TParserStateActionItem actionTPS_InSignedInt[]
Definition: wparser_def.c:1049
static int p_isurlchar(TParser *prs)
Definition: wparser_def.c:505
static const TParserStateActionItem actionTPS_InTagBeginEnd[]
Definition: wparser_def.c:1237
static const TParserStateActionItem actionTPS_InTagFirst[]
Definition: wparser_def.c:1198
struct TParserPosition TParserPosition
#define NUMWORD
Definition: wparser_def.c:38
#define FILEPATH
Definition: wparser_def.c:54
static const TParserStateActionItem actionTPS_InTagEscapeKK[]
Definition: wparser_def.c:1272
static int p_isneC(TParser *prs)
Definition: wparser_def.c:487
#define EMAIL
Definition: wparser_def.c:39
static const TParserStateActionItem actionTPS_InCommentLast[]
Definition: wparser_def.c:1297
static TParserPosition * newTParserPosition(TParserPosition *prev)
Definition: wparser_def.c:272
static const TParserStateActionItem actionTPS_InHyphenWordPart[]
Definition: wparser_def.c:1571
static const TParserStateActionItem actionTPS_InMantissaFirst[]
Definition: wparser_def.c:1126
static const TParserStateActionItem actionTPS_Base[]
Definition: wparser_def.c:971
static void SpecialHyphen(TParser *prs)
Definition: wparser_def.c:596
static void mark_hl_words(HeadlineParsedText *prs, TSQuery query, List *locations, bool highlightall, int shortword, int min_words, int max_words)
Definition: wparser_def.c:2454
static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[]
Definition: wparser_def.c:1532
#define UNSIGNEDINT
Definition: wparser_def.c:57
void(* TParserSpecial)(struct TParser *)
Definition: wparser_def.c:206
static const TParserStateActionItem actionTPS_InEmail[]
Definition: wparser_def.c:1391
static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[]
Definition: wparser_def.c:1174
static const TParserStateActionItem actionTPS_InURLPath[]
Definition: wparser_def.c:1470
#define A_RERUN
Definition: wparser_def.c:224
static const TParserStateActionItem actionTPS_InSVerVersion[]
Definition: wparser_def.c:1106
static const TParserStateActionItem actionTPS_InAsciiWord[]
Definition: wparser_def.c:999
static const char *const tok_alias[]
Definition: wparser_def.c:62
static int p_isstophost(TParser *prs)
Definition: wparser_def.c:612
#define HLIDSKIP(x)
Definition: wparser_def.c:1934
static void get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos, int *curlen, int *poslen, int max_words)
Definition: wparser_def.c:2220
static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[]
Definition: wparser_def.c:1595
#define SIGNEDINT
Definition: wparser_def.c:56
static int p_isasclet(TParser *prs)
Definition: wparser_def.c:499
static const TParserStateAction Actions[]
Definition: wparser_def.c:1628
static const TParserStateActionItem actionTPS_InXMLBegin[]
Definition: wparser_def.c:1209
#define PARTHWORD
Definition: wparser_def.c:45
#define HLIDREPLACE(x)
Definition: wparser_def.c:1933
#define A_MERGE
Definition: wparser_def.c:226
static TParser * TParserInit(char *str, int len)
Definition: wparser_def.c:289
static const TParserStateActionItem actionTPS_InMantissa[]
Definition: wparser_def.c:1140
static const TParserStateActionItem actionTPS_InVersionFirst[]
Definition: wparser_def.c:1113
static int p_isascii(TParser *prs)
Definition: wparser_def.c:493
static const TParserStateActionItem actionTPS_InCommentFirst[]
Definition: wparser_def.c:1288
static const TParserStateActionItem actionTPS_InHyphenWord[]
Definition: wparser_def.c:1523
static int p_isignore(TParser *prs)
Definition: wparser_def.c:623
static const TParserStateActionItem actionTPS_InParseHyphenHyphen[]
Definition: wparser_def.c:1564
static const TParserStateActionItem actionTPS_InPort[]
Definition: wparser_def.c:1365
#define TAG_T
Definition: wparser_def.c:48
static const TParserStateActionItem actionTPS_InDecimalFirst[]
Definition: wparser_def.c:1085
#define URLPATH
Definition: wparser_def.c:53
Datum prsd_lextype(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1878
#define ASCIIHWORD
Definition: wparser_def.c:51
#define HOST
Definition: wparser_def.c:41
static const TParserStateActionItem actionTPS_InTag[]
Definition: wparser_def.c:1243
Datum prsd_start(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1896
#define A_BINGO
Definition: wparser_def.c:221
#define TPARSERSTATEACTION(state)
Definition: wparser_def.c:1620
static bool TParserGet(TParser *prs)
Definition: wparser_def.c:1710
#define XMLENTITY
Definition: wparser_def.c:58
static int p_ishost(TParser *prs)
Definition: wparser_def.c:629
Datum prsd_end(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1918
#define A_CLRALL
Definition: wparser_def.c:227
static int p_isURLPath(TParser *prs)
Definition: wparser_def.c:657
static void SpecialVerVersion(TParser *prs)
Definition: wparser_def.c:603
static const TParserStateActionItem actionTPS_InProtocolFirst[]
Definition: wparser_def.c:1482
static const TParserStateActionItem actionTPS_InUnsignedInt[]
Definition: wparser_def.c:1026
static const TParserStateActionItem actionTPS_InUDecimalFirst[]
Definition: wparser_def.c:1070
static const TParserStateActionItem actionTPS_InTagCloseFirst[]
Definition: wparser_def.c:1217
static int p_isEOF(TParser *prs)
Definition: wparser_def.c:474
static const TParserStateActionItem actionTPS_InCloseCommentLast[]
Definition: wparser_def.c:1315
static void TParserCopyClose(TParser *prs)
Definition: wparser_def.c:397
#define A_CLEAR
Definition: wparser_def.c:225
static const TParserStateActionItem actionTPS_InFileFirst[]
Definition: wparser_def.c:1397
static const TParserStateActionItem actionTPS_InNumWord[]
Definition: wparser_def.c:988
static const TParserStateActionItem actionTPS_InFileTwiddle[]
Definition: wparser_def.c:1407
static TParser * TParserCopyInit(const TParser *orig)
Definition: wparser_def.c:346
static const TParserStateActionItem actionTPS_InHost[]
Definition: wparser_def.c:1380
#define A_PUSH
Definition: wparser_def.c:223
static const TParserStateActionItem actionTPS_InTagBackSleshed[]
Definition: wparser_def.c:1279
static const TParserStateActionItem actionTPS_InProtocolSecond[]
Definition: wparser_def.c:1488
static const TParserStateActionItem actionTPS_InWord[]
Definition: wparser_def.c:1017
static int p_isspecial(TParser *prs)
Definition: wparser_def.c:692
static void TParserClose(TParser *prs)
Definition: wparser_def.c:372
#define URL_T
Definition: wparser_def.c:40
static const TParserStateActionItem actionTPS_InXMLEntityNum[]
Definition: wparser_def.c:1180
static const TParserStateActionItem actionTPS_InVerVersion[]
Definition: wparser_def.c:1100
static const TParserStateActionItem actionTPS_InHyphenAsciiWord[]
Definition: wparser_def.c:1506
static const TParserStateActionItem actionTPS_InXMLEntityHexNum[]
Definition: wparser_def.c:1187
#define A_NEXT
Definition: wparser_def.c:220
static const TParserStateActionItem actionTPS_InPortFirst[]
Definition: wparser_def.c:1359
#define HWORD
Definition: wparser_def.c:52
#define NUMPARTHWORD
Definition: wparser_def.c:44
static const char *const lex_descr[]
Definition: wparser_def.c:89
#define INTERESTINGWORD(j)
Definition: wparser_def.c:1946
#define SCIENTIFIC
Definition: wparser_def.c:42
static void SpecialTags(TParser *prs)
Definition: wparser_def.c:564
static const TParserStateActionItem actionTPS_InTagEnd[]
Definition: wparser_def.c:1284
static const TParserStateActionItem actionTPS_InComment[]
Definition: wparser_def.c:1303
static const TParserStateActionItem actionTPS_InProtocolEnd[]
Definition: wparser_def.c:1494
static const TParserStateActionItem actionTPS_InURLPathFirst[]
Definition: wparser_def.c:1460
static const TParserStateActionItem actionTPS_InPathFirstFirst[]
Definition: wparser_def.c:1426