PostgreSQL Source Code  git master
wparser_def.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * wparser_def.c
4  * Default text search parser
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  * src/backend/tsearch/wparser_def.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include <limits.h>
18 #include <wctype.h>
19 
20 #include "commands/defrem.h"
21 #include "mb/pg_wchar.h"
22 #include "miscadmin.h"
23 #include "tsearch/ts_public.h"
24 #include "tsearch/ts_type.h"
25 #include "tsearch/ts_utils.h"
26 #include "utils/builtins.h"
27 #include "utils/pg_locale.h"
28 
29 
30 /* Define me to enable tracing of parser behavior */
31 /* #define WPARSER_TRACE */
32 
33 
/*
 * Output token categories.
 *
 * The numbering lines up with the entries of tok_alias[] and lex_descr[]
 * below (whose slot 0 is an empty placeholder), so the values must stay
 * dense and in this order.
 */

#define ASCIIWORD		1
#define WORD_T			2
#define NUMWORD			3
#define EMAIL			4
#define URL_T			5
#define HOST			6
#define SCIENTIFIC		7
#define VERSIONNUMBER	8
#define NUMPARTHWORD	9
#define PARTHWORD		10
#define ASCIIPARTHWORD	11
#define SPACE			12
#define TAG_T			13
#define PROTOCOL		14
#define NUMHWORD		15
#define ASCIIHWORD		16
#define HWORD			17
#define URLPATH			18
#define FILEPATH		19
#define DECIMAL_T		20
#define SIGNEDINT		21
#define UNSIGNEDINT		22
#define XMLENTITY		23

/* Highest category number defined above */
#define LASTNUM			23
61 
/*
 * Short alias of every token category, listed in category-number order
 * (tok_alias[1] is the alias for category 1, and so on).  Slot 0 is an
 * empty placeholder because category numbers start at 1.
 */
static const char *const tok_alias[] = {
	"",
	"asciiword",
	"word",
	"numword",
	"email",
	"url",
	"host",
	"sfloat",
	"version",
	"hword_numpart",
	"hword_part",
	"hword_asciipart",
	"blank",
	"tag",
	"protocol",
	"numhword",
	"asciihword",
	"hword",
	"url_path",
	"file",
	"float",
	"int",
	"uint",
	"entity"
};
88 
/*
 * Human-readable description of every token category, listed in
 * category-number order, parallel to tok_alias[].  Slot 0 is an empty
 * placeholder because category numbers start at 1.
 */
static const char *const lex_descr[] = {
	"",
	"Word, all ASCII",
	"Word, all letters",
	"Word, letters and digits",
	"Email address",
	"URL",
	"Host",
	"Scientific notation",
	"Version number",
	"Hyphenated word part, letters and digits",
	"Hyphenated word part, all letters",
	"Hyphenated word part, all ASCII",
	"Space symbols",
	"XML tag",
	"Protocol head",
	"Hyphenated word, letters and digits",
	"Hyphenated word, all ASCII",
	"Hyphenated word, all letters",
	"URL path",
	"File or path name",
	"Decimal notation",
	"Signed integer",
	"Unsigned integer",
	"XML entity"
};
115 
116 
117 /* Parser states */
118 
119 typedef enum
120 {
121  TPS_Base = 0,
198  TPS_Null /* last state (fake value) */
200 
/* forward declaration */
struct TParser;

/*
 * Character-test callback: takes the parser and returns an int.  Any of the
 * p_is* functions except p_iseq (which takes an extra argument) fits.
 */
typedef int (*TParserCharTest) (struct TParser *);

/* Special handler for special cases; takes the parser, returns nothing. */
typedef void (*TParserSpecial) (struct TParser *);
208 
209 typedef struct
210 {
212  char c;
215  int type;
218 
/*
 * Flag bits in TParserStateActionItem.flags.
 *
 * A_NEXT is zero, i.e. the absence of every other bit; the remaining values
 * are distinct single bits and can be OR'ed together.
 */
#define A_NEXT		0x0000
#define A_BINGO		0x0001
#define A_POP		0x0002
#define A_PUSH		0x0004
#define A_RERUN		0x0008
#define A_CLEAR		0x0010
#define A_MERGE		0x0020
#define A_CLRALL	0x0040
228 
229 typedef struct TParserPosition
230 {
231  int posbyte; /* position of parser in bytes */
232  int poschar; /* position of parser in characters */
233  int charlen; /* length of current char */
234  int lenbytetoken; /* length of token-so-far in bytes */
235  int lenchartoken; /* and in chars */
240 
241 typedef struct TParser
242 {
243  /* string and position information */
244  char *str; /* multibyte string */
245  int lenstr; /* length of mbstring */
246  wchar_t *wstr; /* wide character string */
247  pg_wchar *pgwstr; /* wide character string for C-locale */
248  bool usewide;
249 
250  /* State of parse */
253  bool ignore;
254  bool wanthost;
255 
256  /* silly char */
257  char c;
258 
259  /* out */
260  char *token;
263  int type;
265 
266 
267 /* forward decls here */
268 static bool TParserGet(TParser *prs);
269 
270 
271 static TParserPosition *
273 {
275 
276  if (prev)
277  memcpy(res, prev, sizeof(TParserPosition));
278  else
279  memset(res, 0, sizeof(TParserPosition));
280 
281  res->prev = prev;
282 
283  res->pushedAtAction = NULL;
284 
285  return res;
286 }
287 
288 static TParser *
289 TParserInit(char *str, int len)
290 {
291  TParser *prs = (TParser *) palloc0(sizeof(TParser));
292 
294  prs->str = str;
295  prs->lenstr = len;
296 
297  /*
298  * Use wide char code only when max encoding length > 1.
299  */
300  if (prs->charmaxlen > 1)
301  {
302  pg_locale_t mylocale = 0; /* TODO */
303 
304  prs->usewide = true;
306  {
307  /*
308  * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
309  * be different from sizeof(wchar_t)
310  */
311  prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
312  pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
313  }
314  else
315  {
316  prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
317  char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
318  mylocale);
319  }
320  }
321  else
322  prs->usewide = false;
323 
324  prs->state = newTParserPosition(NULL);
325  prs->state->state = TPS_Base;
326 
327 #ifdef WPARSER_TRACE
328  fprintf(stderr, "parsing \"%.*s\"\n", len, str);
329 #endif
330 
331  return prs;
332 }
333 
334 /*
335  * As an alternative to a full TParserInit one can create a
336  * TParserCopy which basically is a regular TParser without a private
337  * copy of the string - instead it uses the one from another TParser.
338  * This is useful because at some places TParsers are created
339  * recursively and the repeated copying around of the strings can
340  * cause major inefficiency if the source string is long.
341  * The new parser starts parsing at the original's current position.
342  *
343  * Obviously one must not close the original TParser before the copy.
344  */
345 static TParser *
347 {
348  TParser *prs = (TParser *) palloc0(sizeof(TParser));
349 
350  prs->charmaxlen = orig->charmaxlen;
351  prs->str = orig->str + orig->state->posbyte;
352  prs->lenstr = orig->lenstr - orig->state->posbyte;
353  prs->usewide = orig->usewide;
354 
355  if (orig->pgwstr)
356  prs->pgwstr = orig->pgwstr + orig->state->poschar;
357  if (orig->wstr)
358  prs->wstr = orig->wstr + orig->state->poschar;
359 
360  prs->state = newTParserPosition(NULL);
361  prs->state->state = TPS_Base;
362 
363 #ifdef WPARSER_TRACE
364  fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
365 #endif
366 
367  return prs;
368 }
369 
370 
371 static void
373 {
374  while (prs->state)
375  {
376  TParserPosition *ptr = prs->state->prev;
377 
378  pfree(prs->state);
379  prs->state = ptr;
380  }
381 
382  if (prs->wstr)
383  pfree(prs->wstr);
384  if (prs->pgwstr)
385  pfree(prs->pgwstr);
386 
387 #ifdef WPARSER_TRACE
388  fprintf(stderr, "closing parser\n");
389 #endif
390  pfree(prs);
391 }
392 
393 /*
394  * Close a parser created with TParserCopyInit
395  */
396 static void
398 {
399  while (prs->state)
400  {
401  TParserPosition *ptr = prs->state->prev;
402 
403  pfree(prs->state);
404  prs->state = ptr;
405  }
406 
407 #ifdef WPARSER_TRACE
408  fprintf(stderr, "closing parser copy\n");
409 #endif
410  pfree(prs);
411 }
412 
413 
414 /*
415  * Character-type support functions, equivalent to is* macros, but
416  * working with any possible encodings and locales. Notes:
417  * - with multibyte encoding and C-locale isw* function may fail
418  * or give wrong result.
419  * - multibyte encoding and C-locale often are used for
420  * Asian languages.
421  * - if locale is C then we use pgwstr instead of wstr.
422  */
423 
424 #define p_iswhat(type, nonascii) \
425  \
426 static int \
427 p_is##type(TParser *prs) \
428 { \
429  Assert(prs->state); \
430  if (prs->usewide) \
431  { \
432  if (prs->pgwstr) \
433  { \
434  unsigned int c = *(prs->pgwstr + prs->state->poschar); \
435  if (c > 0x7f) \
436  return nonascii; \
437  return is##type(c); \
438  } \
439  return isw##type(*(prs->wstr + prs->state->poschar)); \
440  } \
441  return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \
442 } \
443  \
444 static int \
445 p_isnot##type(TParser *prs) \
446 { \
447  return !p_is##type(prs); \
448 }
449 
450 /*
451  * In C locale with a multibyte encoding, any non-ASCII symbol is considered
452  * an alpha character, but not a member of other char classes.
453  */
454 p_iswhat(alnum, 1)
455 p_iswhat(alpha, 1)
456 p_iswhat(digit, 0)
457 p_iswhat(lower, 0)
458 p_iswhat(print, 0)
459 p_iswhat(punct, 0)
460 p_iswhat(space, 0)
461 p_iswhat(upper, 0)
462 p_iswhat(xdigit, 0)
463 
464 /* p_iseq should be used only for ascii symbols */
465 
466 static int
467 p_iseq(TParser *prs, char c)
468 {
469  Assert(prs->state);
470  return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
471 }
472 
473 static int
475 {
476  Assert(prs->state);
477  return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
478 }
479 
480 static int
482 {
483  return p_iseq(prs, prs->c);
484 }
485 
486 static int
488 {
489  return !p_iseq(prs, prs->c);
490 }
491 
492 static int
494 {
495  return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
496 }
497 
498 static int
500 {
501  return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
502 }
503 
504 static int
506 {
507  char ch;
508 
509  /* no non-ASCII need apply */
510  if (prs->state->charlen != 1)
511  return 0;
512  ch = *(prs->str + prs->state->posbyte);
513  /* no spaces or control characters */
514  if (ch <= 0x20 || ch >= 0x7F)
515  return 0;
516  /* reject characters disallowed by RFC 3986 */
517  switch (ch)
518  {
519  case '"':
520  case '<':
521  case '>':
522  case '\\':
523  case '^':
524  case '`':
525  case '{':
526  case '|':
527  case '}':
528  return 0;
529  }
530  return 1;
531 }
532 
533 
534 /* deliberately suppress unused-function complaints for the above */
535 void _make_compiler_happy(void);
536 void
538 {
539  p_isalnum(NULL);
540  p_isnotalnum(NULL);
541  p_isalpha(NULL);
542  p_isnotalpha(NULL);
543  p_isdigit(NULL);
544  p_isnotdigit(NULL);
545  p_islower(NULL);
546  p_isnotlower(NULL);
547  p_isprint(NULL);
548  p_isnotprint(NULL);
549  p_ispunct(NULL);
550  p_isnotpunct(NULL);
551  p_isspace(NULL);
552  p_isnotspace(NULL);
553  p_isupper(NULL);
554  p_isnotupper(NULL);
555  p_isxdigit(NULL);
556  p_isnotxdigit(NULL);
557  p_isEOF(NULL);
558  p_iseqC(NULL);
559  p_isneC(NULL);
560 }
561 
562 
563 static void
565 {
566  switch (prs->state->lenchartoken)
567  {
568  case 8: /* </script */
569  if (pg_strncasecmp(prs->token, "</script", 8) == 0)
570  prs->ignore = false;
571  break;
572  case 7: /* <script || </style */
573  if (pg_strncasecmp(prs->token, "</style", 7) == 0)
574  prs->ignore = false;
575  else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
576  prs->ignore = true;
577  break;
578  case 6: /* <style */
579  if (pg_strncasecmp(prs->token, "<style", 6) == 0)
580  prs->ignore = true;
581  break;
582  default:
583  break;
584  }
585 }
586 
587 static void
589 {
590  prs->wanthost = true;
591  prs->state->posbyte -= prs->state->lenbytetoken;
592  prs->state->poschar -= prs->state->lenchartoken;
593 }
594 
595 static void
597 {
598  prs->state->posbyte -= prs->state->lenbytetoken;
599  prs->state->poschar -= prs->state->lenchartoken;
600 }
601 
602 static void
604 {
605  prs->state->posbyte -= prs->state->lenbytetoken;
606  prs->state->poschar -= prs->state->lenchartoken;
607  prs->state->lenbytetoken = 0;
608  prs->state->lenchartoken = 0;
609 }
610 
611 static int
613 {
614  if (prs->wanthost)
615  {
616  prs->wanthost = false;
617  return 1;
618  }
619  return 0;
620 }
621 
622 static int
624 {
625  return (prs->ignore) ? 1 : 0;
626 }
627 
628 static int
630 {
631  TParser *tmpprs = TParserCopyInit(prs);
632  int res = 0;
633 
634  tmpprs->wanthost = true;
635 
636  /*
637  * Check stack depth before recursing. (Since TParserGet() doesn't
638  * normally recurse, we put the cost of checking here not there.)
639  */
641 
642  if (TParserGet(tmpprs) && tmpprs->type == HOST)
643  {
644  prs->state->posbyte += tmpprs->lenbytetoken;
645  prs->state->poschar += tmpprs->lenchartoken;
646  prs->state->lenbytetoken += tmpprs->lenbytetoken;
647  prs->state->lenchartoken += tmpprs->lenchartoken;
648  prs->state->charlen = tmpprs->state->charlen;
649  res = 1;
650  }
651  TParserCopyClose(tmpprs);
652 
653  return res;
654 }
655 
656 static int
658 {
659  TParser *tmpprs = TParserCopyInit(prs);
660  int res = 0;
661 
662  tmpprs->state = newTParserPosition(tmpprs->state);
663  tmpprs->state->state = TPS_InURLPathFirst;
664 
665  /*
666  * Check stack depth before recursing. (Since TParserGet() doesn't
667  * normally recurse, we put the cost of checking here not there.)
668  */
670 
671  if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
672  {
673  prs->state->posbyte += tmpprs->lenbytetoken;
674  prs->state->poschar += tmpprs->lenchartoken;
675  prs->state->lenbytetoken += tmpprs->lenbytetoken;
676  prs->state->lenchartoken += tmpprs->lenchartoken;
677  prs->state->charlen = tmpprs->state->charlen;
678  res = 1;
679  }
680  TParserCopyClose(tmpprs);
681 
682  return res;
683 }
684 
/*
 * Returns true if the current character has zero display length or is a
 * special sign used in several languages.  Such characters do not break a
 * word even though isalpha does not accept them.  At the beginning of a
 * word they are not considered part of it.
 */
691 static int
693 {
694  /*
695  * pg_dsplen could return -1 which means error or control character
696  */
697  if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
698  return 1;
699 
700  /*
701  * Unicode Characters in the 'Mark, Spacing Combining' Category That
702  * characters are not alpha although they are not breakers of word too.
703  * Check that only in utf encoding, because other encodings aren't
704  * supported by postgres or even exists.
705  */
706  if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
707  {
708  static const pg_wchar strange_letter[] = {
709  /*
710  * use binary search, so elements should be ordered
711  */
712  0x0903, /* DEVANAGARI SIGN VISARGA */
713  0x093E, /* DEVANAGARI VOWEL SIGN AA */
714  0x093F, /* DEVANAGARI VOWEL SIGN I */
715  0x0940, /* DEVANAGARI VOWEL SIGN II */
716  0x0949, /* DEVANAGARI VOWEL SIGN CANDRA O */
717  0x094A, /* DEVANAGARI VOWEL SIGN SHORT O */
718  0x094B, /* DEVANAGARI VOWEL SIGN O */
719  0x094C, /* DEVANAGARI VOWEL SIGN AU */
720  0x0982, /* BENGALI SIGN ANUSVARA */
721  0x0983, /* BENGALI SIGN VISARGA */
722  0x09BE, /* BENGALI VOWEL SIGN AA */
723  0x09BF, /* BENGALI VOWEL SIGN I */
724  0x09C0, /* BENGALI VOWEL SIGN II */
725  0x09C7, /* BENGALI VOWEL SIGN E */
726  0x09C8, /* BENGALI VOWEL SIGN AI */
727  0x09CB, /* BENGALI VOWEL SIGN O */
728  0x09CC, /* BENGALI VOWEL SIGN AU */
729  0x09D7, /* BENGALI AU LENGTH MARK */
730  0x0A03, /* GURMUKHI SIGN VISARGA */
731  0x0A3E, /* GURMUKHI VOWEL SIGN AA */
732  0x0A3F, /* GURMUKHI VOWEL SIGN I */
733  0x0A40, /* GURMUKHI VOWEL SIGN II */
734  0x0A83, /* GUJARATI SIGN VISARGA */
735  0x0ABE, /* GUJARATI VOWEL SIGN AA */
736  0x0ABF, /* GUJARATI VOWEL SIGN I */
737  0x0AC0, /* GUJARATI VOWEL SIGN II */
738  0x0AC9, /* GUJARATI VOWEL SIGN CANDRA O */
739  0x0ACB, /* GUJARATI VOWEL SIGN O */
740  0x0ACC, /* GUJARATI VOWEL SIGN AU */
741  0x0B02, /* ORIYA SIGN ANUSVARA */
742  0x0B03, /* ORIYA SIGN VISARGA */
743  0x0B3E, /* ORIYA VOWEL SIGN AA */
744  0x0B40, /* ORIYA VOWEL SIGN II */
745  0x0B47, /* ORIYA VOWEL SIGN E */
746  0x0B48, /* ORIYA VOWEL SIGN AI */
747  0x0B4B, /* ORIYA VOWEL SIGN O */
748  0x0B4C, /* ORIYA VOWEL SIGN AU */
749  0x0B57, /* ORIYA AU LENGTH MARK */
750  0x0BBE, /* TAMIL VOWEL SIGN AA */
751  0x0BBF, /* TAMIL VOWEL SIGN I */
752  0x0BC1, /* TAMIL VOWEL SIGN U */
753  0x0BC2, /* TAMIL VOWEL SIGN UU */
754  0x0BC6, /* TAMIL VOWEL SIGN E */
755  0x0BC7, /* TAMIL VOWEL SIGN EE */
756  0x0BC8, /* TAMIL VOWEL SIGN AI */
757  0x0BCA, /* TAMIL VOWEL SIGN O */
758  0x0BCB, /* TAMIL VOWEL SIGN OO */
759  0x0BCC, /* TAMIL VOWEL SIGN AU */
760  0x0BD7, /* TAMIL AU LENGTH MARK */
761  0x0C01, /* TELUGU SIGN CANDRABINDU */
762  0x0C02, /* TELUGU SIGN ANUSVARA */
763  0x0C03, /* TELUGU SIGN VISARGA */
764  0x0C41, /* TELUGU VOWEL SIGN U */
765  0x0C42, /* TELUGU VOWEL SIGN UU */
766  0x0C43, /* TELUGU VOWEL SIGN VOCALIC R */
767  0x0C44, /* TELUGU VOWEL SIGN VOCALIC RR */
768  0x0C82, /* KANNADA SIGN ANUSVARA */
769  0x0C83, /* KANNADA SIGN VISARGA */
770  0x0CBE, /* KANNADA VOWEL SIGN AA */
771  0x0CC0, /* KANNADA VOWEL SIGN II */
772  0x0CC1, /* KANNADA VOWEL SIGN U */
773  0x0CC2, /* KANNADA VOWEL SIGN UU */
774  0x0CC3, /* KANNADA VOWEL SIGN VOCALIC R */
775  0x0CC4, /* KANNADA VOWEL SIGN VOCALIC RR */
776  0x0CC7, /* KANNADA VOWEL SIGN EE */
777  0x0CC8, /* KANNADA VOWEL SIGN AI */
778  0x0CCA, /* KANNADA VOWEL SIGN O */
779  0x0CCB, /* KANNADA VOWEL SIGN OO */
780  0x0CD5, /* KANNADA LENGTH MARK */
781  0x0CD6, /* KANNADA AI LENGTH MARK */
782  0x0D02, /* MALAYALAM SIGN ANUSVARA */
783  0x0D03, /* MALAYALAM SIGN VISARGA */
784  0x0D3E, /* MALAYALAM VOWEL SIGN AA */
785  0x0D3F, /* MALAYALAM VOWEL SIGN I */
786  0x0D40, /* MALAYALAM VOWEL SIGN II */
787  0x0D46, /* MALAYALAM VOWEL SIGN E */
788  0x0D47, /* MALAYALAM VOWEL SIGN EE */
789  0x0D48, /* MALAYALAM VOWEL SIGN AI */
790  0x0D4A, /* MALAYALAM VOWEL SIGN O */
791  0x0D4B, /* MALAYALAM VOWEL SIGN OO */
792  0x0D4C, /* MALAYALAM VOWEL SIGN AU */
793  0x0D57, /* MALAYALAM AU LENGTH MARK */
794  0x0D82, /* SINHALA SIGN ANUSVARAYA */
795  0x0D83, /* SINHALA SIGN VISARGAYA */
796  0x0DCF, /* SINHALA VOWEL SIGN AELA-PILLA */
797  0x0DD0, /* SINHALA VOWEL SIGN KETTI AEDA-PILLA */
798  0x0DD1, /* SINHALA VOWEL SIGN DIGA AEDA-PILLA */
799  0x0DD8, /* SINHALA VOWEL SIGN GAETTA-PILLA */
800  0x0DD9, /* SINHALA VOWEL SIGN KOMBUVA */
801  0x0DDA, /* SINHALA VOWEL SIGN DIGA KOMBUVA */
802  0x0DDB, /* SINHALA VOWEL SIGN KOMBU DEKA */
803  0x0DDC, /* SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
804  0x0DDD, /* SINHALA VOWEL SIGN KOMBUVA HAA DIGA
805  * AELA-PILLA */
806  0x0DDE, /* SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
807  0x0DDF, /* SINHALA VOWEL SIGN GAYANUKITTA */
808  0x0DF2, /* SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
809  0x0DF3, /* SINHALA VOWEL SIGN DIGA GAYANUKITTA */
810  0x0F3E, /* TIBETAN SIGN YAR TSHES */
811  0x0F3F, /* TIBETAN SIGN MAR TSHES */
812  0x0F7F, /* TIBETAN SIGN RNAM BCAD */
813  0x102B, /* MYANMAR VOWEL SIGN TALL AA */
814  0x102C, /* MYANMAR VOWEL SIGN AA */
815  0x1031, /* MYANMAR VOWEL SIGN E */
816  0x1038, /* MYANMAR SIGN VISARGA */
817  0x103B, /* MYANMAR CONSONANT SIGN MEDIAL YA */
818  0x103C, /* MYANMAR CONSONANT SIGN MEDIAL RA */
819  0x1056, /* MYANMAR VOWEL SIGN VOCALIC R */
820  0x1057, /* MYANMAR VOWEL SIGN VOCALIC RR */
821  0x1062, /* MYANMAR VOWEL SIGN SGAW KAREN EU */
822  0x1063, /* MYANMAR TONE MARK SGAW KAREN HATHI */
823  0x1064, /* MYANMAR TONE MARK SGAW KAREN KE PHO */
824  0x1067, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
825  0x1068, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
826  0x1069, /* MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
827  0x106A, /* MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
828  0x106B, /* MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
829  0x106C, /* MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
830  0x106D, /* MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
831  0x1083, /* MYANMAR VOWEL SIGN SHAN AA */
832  0x1084, /* MYANMAR VOWEL SIGN SHAN E */
833  0x1087, /* MYANMAR SIGN SHAN TONE-2 */
834  0x1088, /* MYANMAR SIGN SHAN TONE-3 */
835  0x1089, /* MYANMAR SIGN SHAN TONE-5 */
836  0x108A, /* MYANMAR SIGN SHAN TONE-6 */
837  0x108B, /* MYANMAR SIGN SHAN COUNCIL TONE-2 */
838  0x108C, /* MYANMAR SIGN SHAN COUNCIL TONE-3 */
839  0x108F, /* MYANMAR SIGN RUMAI PALAUNG TONE-5 */
840  0x17B6, /* KHMER VOWEL SIGN AA */
841  0x17BE, /* KHMER VOWEL SIGN OE */
842  0x17BF, /* KHMER VOWEL SIGN YA */
843  0x17C0, /* KHMER VOWEL SIGN IE */
844  0x17C1, /* KHMER VOWEL SIGN E */
845  0x17C2, /* KHMER VOWEL SIGN AE */
846  0x17C3, /* KHMER VOWEL SIGN AI */
847  0x17C4, /* KHMER VOWEL SIGN OO */
848  0x17C5, /* KHMER VOWEL SIGN AU */
849  0x17C7, /* KHMER SIGN REAHMUK */
850  0x17C8, /* KHMER SIGN YUUKALEAPINTU */
851  0x1923, /* LIMBU VOWEL SIGN EE */
852  0x1924, /* LIMBU VOWEL SIGN AI */
853  0x1925, /* LIMBU VOWEL SIGN OO */
854  0x1926, /* LIMBU VOWEL SIGN AU */
855  0x1929, /* LIMBU SUBJOINED LETTER YA */
856  0x192A, /* LIMBU SUBJOINED LETTER RA */
857  0x192B, /* LIMBU SUBJOINED LETTER WA */
858  0x1930, /* LIMBU SMALL LETTER KA */
859  0x1931, /* LIMBU SMALL LETTER NGA */
860  0x1933, /* LIMBU SMALL LETTER TA */
861  0x1934, /* LIMBU SMALL LETTER NA */
862  0x1935, /* LIMBU SMALL LETTER PA */
863  0x1936, /* LIMBU SMALL LETTER MA */
864  0x1937, /* LIMBU SMALL LETTER RA */
865  0x1938, /* LIMBU SMALL LETTER LA */
866  0x19B0, /* NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
867  0x19B1, /* NEW TAI LUE VOWEL SIGN AA */
868  0x19B2, /* NEW TAI LUE VOWEL SIGN II */
869  0x19B3, /* NEW TAI LUE VOWEL SIGN U */
870  0x19B4, /* NEW TAI LUE VOWEL SIGN UU */
871  0x19B5, /* NEW TAI LUE VOWEL SIGN E */
872  0x19B6, /* NEW TAI LUE VOWEL SIGN AE */
873  0x19B7, /* NEW TAI LUE VOWEL SIGN O */
874  0x19B8, /* NEW TAI LUE VOWEL SIGN OA */
875  0x19B9, /* NEW TAI LUE VOWEL SIGN UE */
876  0x19BA, /* NEW TAI LUE VOWEL SIGN AY */
877  0x19BB, /* NEW TAI LUE VOWEL SIGN AAY */
878  0x19BC, /* NEW TAI LUE VOWEL SIGN UY */
879  0x19BD, /* NEW TAI LUE VOWEL SIGN OY */
880  0x19BE, /* NEW TAI LUE VOWEL SIGN OAY */
881  0x19BF, /* NEW TAI LUE VOWEL SIGN UEY */
882  0x19C0, /* NEW TAI LUE VOWEL SIGN IY */
883  0x19C8, /* NEW TAI LUE TONE MARK-1 */
884  0x19C9, /* NEW TAI LUE TONE MARK-2 */
885  0x1A19, /* BUGINESE VOWEL SIGN E */
886  0x1A1A, /* BUGINESE VOWEL SIGN O */
887  0x1A1B, /* BUGINESE VOWEL SIGN AE */
888  0x1B04, /* BALINESE SIGN BISAH */
889  0x1B35, /* BALINESE VOWEL SIGN TEDUNG */
890  0x1B3B, /* BALINESE VOWEL SIGN RA REPA TEDUNG */
891  0x1B3D, /* BALINESE VOWEL SIGN LA LENGA TEDUNG */
892  0x1B3E, /* BALINESE VOWEL SIGN TALING */
893  0x1B3F, /* BALINESE VOWEL SIGN TALING REPA */
894  0x1B40, /* BALINESE VOWEL SIGN TALING TEDUNG */
895  0x1B41, /* BALINESE VOWEL SIGN TALING REPA TEDUNG */
896  0x1B43, /* BALINESE VOWEL SIGN PEPET TEDUNG */
897  0x1B44, /* BALINESE ADEG ADEG */
898  0x1B82, /* SUNDANESE SIGN PANGWISAD */
899  0x1BA1, /* SUNDANESE CONSONANT SIGN PAMINGKAL */
900  0x1BA6, /* SUNDANESE VOWEL SIGN PANAELAENG */
901  0x1BA7, /* SUNDANESE VOWEL SIGN PANOLONG */
902  0x1BAA, /* SUNDANESE SIGN PAMAAEH */
903  0x1C24, /* LEPCHA SUBJOINED LETTER YA */
904  0x1C25, /* LEPCHA SUBJOINED LETTER RA */
905  0x1C26, /* LEPCHA VOWEL SIGN AA */
906  0x1C27, /* LEPCHA VOWEL SIGN I */
907  0x1C28, /* LEPCHA VOWEL SIGN O */
908  0x1C29, /* LEPCHA VOWEL SIGN OO */
909  0x1C2A, /* LEPCHA VOWEL SIGN U */
910  0x1C2B, /* LEPCHA VOWEL SIGN UU */
911  0x1C34, /* LEPCHA CONSONANT SIGN NYIN-DO */
912  0x1C35, /* LEPCHA CONSONANT SIGN KANG */
913  0xA823, /* SYLOTI NAGRI VOWEL SIGN A */
914  0xA824, /* SYLOTI NAGRI VOWEL SIGN I */
915  0xA827, /* SYLOTI NAGRI VOWEL SIGN OO */
916  0xA880, /* SAURASHTRA SIGN ANUSVARA */
917  0xA881, /* SAURASHTRA SIGN VISARGA */
918  0xA8B4, /* SAURASHTRA CONSONANT SIGN HAARU */
919  0xA8B5, /* SAURASHTRA VOWEL SIGN AA */
920  0xA8B6, /* SAURASHTRA VOWEL SIGN I */
921  0xA8B7, /* SAURASHTRA VOWEL SIGN II */
922  0xA8B8, /* SAURASHTRA VOWEL SIGN U */
923  0xA8B9, /* SAURASHTRA VOWEL SIGN UU */
924  0xA8BA, /* SAURASHTRA VOWEL SIGN VOCALIC R */
925  0xA8BB, /* SAURASHTRA VOWEL SIGN VOCALIC RR */
926  0xA8BC, /* SAURASHTRA VOWEL SIGN VOCALIC L */
927  0xA8BD, /* SAURASHTRA VOWEL SIGN VOCALIC LL */
928  0xA8BE, /* SAURASHTRA VOWEL SIGN E */
929  0xA8BF, /* SAURASHTRA VOWEL SIGN EE */
930  0xA8C0, /* SAURASHTRA VOWEL SIGN AI */
931  0xA8C1, /* SAURASHTRA VOWEL SIGN O */
932  0xA8C2, /* SAURASHTRA VOWEL SIGN OO */
933  0xA8C3, /* SAURASHTRA VOWEL SIGN AU */
934  0xA952, /* REJANG CONSONANT SIGN H */
935  0xA953, /* REJANG VIRAMA */
936  0xAA2F, /* CHAM VOWEL SIGN O */
937  0xAA30, /* CHAM VOWEL SIGN AI */
938  0xAA33, /* CHAM CONSONANT SIGN YA */
939  0xAA34, /* CHAM CONSONANT SIGN RA */
940  0xAA4D /* CHAM CONSONANT SIGN FINAL H */
941  };
942  const pg_wchar *StopLow = strange_letter,
943  *StopHigh = strange_letter + lengthof(strange_letter),
944  *StopMiddle;
945  pg_wchar c;
946 
947  if (prs->pgwstr)
948  c = *(prs->pgwstr + prs->state->poschar);
949  else
950  c = (pg_wchar) *(prs->wstr + prs->state->poschar);
951 
952  while (StopLow < StopHigh)
953  {
954  StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
955  if (*StopMiddle == c)
956  return 1;
957  else if (*StopMiddle < c)
958  StopLow = StopMiddle + 1;
959  else
960  StopHigh = StopMiddle;
961  }
962  }
963 
964  return 0;
965 }
966 
967 /*
968  * Table of state/action of parser
969  */
970 
972  {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
973  {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
974  {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
975  {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
976  {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
977  {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
978  {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
979  {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
980  {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
981  {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
982  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
983  {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
984  {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
985 };
986 
987 
989  {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
990  {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
991  {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
992  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
993  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
994  {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
995  {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
996  {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
997 };
998 
1000  {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
1001  {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1002  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1003  {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1004  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1005  {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1006  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1007  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1008  {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
1009  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1010  {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1011  {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1012  {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
1013  {p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
1014  {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
1015 };
1016 
1018  {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
1019  {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
1020  {p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
1021  {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1022  {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1023  {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
1024 };
1025 
1027  {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
1028  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1029  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1030  {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
1031  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1032  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1033  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1034  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1035  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1036  {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
1037  {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1038  {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1039  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1040  {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
1041 };
1042 
1044  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1045  {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
1046  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1047 };
1048 
1050  {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
1051  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1052  {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
1053  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1054  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1055  {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
1056 };
1057 
1059  {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
1060  {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
1061  {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
1062  {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
1063  {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
1064  {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
1065  {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
1066  {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
1067  {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
1068 };
1069 
1071  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1072  {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
1073  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1074 };
1075 
1077  {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1078  {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
1079  {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1080  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1081  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1082  {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1083 };
1084 
1086  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1087  {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
1088  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1089 };
1090 
1092  {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1093  {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
1094  {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
1095  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1096  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1097  {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1098 };
1099 
1101  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1102  {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
1103  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1104 };
1105 
1107  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1108  {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
1109  {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1110 };
1111 
1112 
1114  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1115  {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
1116  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1117 };
1118 
1120  {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
1121  {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
1122  {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1123  {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
1124 };
1125 
1127  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1128  {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1129  {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
1130  {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
1131  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1132 };
1133 
1135  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1136  {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1137  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1138 };
1139 
1141  {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
1142  {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
1143  {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
1144 };
1145 
1147  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1148  {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
1149  {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1150  {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1151  {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1152  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1153 };
1154 
1156  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1157  {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1158  {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1159  {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1160  {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
1161  {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
1162  {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1163  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1164 };
1165 
1167  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1168  {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1169  {p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1170  {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1171  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1172 };
1173 
1175  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1176  {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1177  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1178 };
1179 
1181  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1182  {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1183  {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1184  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1185 };
1186 
1188  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1189  {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1190  {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1191  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1192 };
1193 
1195  {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
1196 };
1197 
1199  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1200  {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
1201  {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
1202  {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
1203  {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
1204  {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
1205  {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
1206  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1207 };
1208 
1210  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1211  /* <?xml ... */
1212  /* XXX do we wants states for the m and l ? Right now this accepts <?xZ */
1213  {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
1214  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1215 };
1216 
1218  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1219  {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
1220  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1221 };
1222 
1224  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1225  /* <br/> case */
1226  {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
1227  {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1228  {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
1229  {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
1230  {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1231  {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1232  {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1233  {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1234  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1235 };
1236 
1238  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1239  {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
1240  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1241 };
1242 
1244  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1245  {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1246  {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
1247  {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
1248  {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1249  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1250  {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
1251  {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1252  {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1253  {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
1254  {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
1255  {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1256  {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1257  {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
1258  {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
1259  {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
1260  {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
1261  {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
1262  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1263 };
1264 
1266  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1267  {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1268  {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
1269  {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
1270 };
1271 
1273  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1274  {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1275  {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
1276  {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
1277 };
1278 
1280  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1281  {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
1282 };
1283 
1285  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1286 };
1287 
1289  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1290  {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
1291  /* <!DOCTYPE ...> */
1292  {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
1293  {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
1294  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1295 };
1296 
1298  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1299  {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
1300  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1301 };
1302 
1304  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1305  {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
1306  {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1307 };
1308 
1310  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1311  {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
1312  {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1313 };
1314 
1316  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1317  {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1318  {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
1319  {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1320 };
1321 
1323  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1324 };
1325 
1327  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1328  {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
1329  {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1330  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1331 };
1332 
1334  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1335  {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1336  {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1337  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1338  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1339  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1340  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1341  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1342 };
1343 
1345  {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1346  {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1347  {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1348  {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
1349  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1350  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1351  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1352  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1353  {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
1355  {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1356  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1357 };
1358 
1360  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1361  {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1362  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1363 };
1364 
1366  {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1367  {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1369  {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1370  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1371 };
1372 
1374  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1375  {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1376  {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1377  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1378 };
1379 
1381  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1382  {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1383  {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1384  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1385  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1386  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1387  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1388  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1389 };
1390 
1392  {p_isstophost, 0, A_POP, TPS_Null, 0, NULL},
1393  {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
1394  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1395 };
1396 
1398  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1399  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1400  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1401  {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1402  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1403  {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1404  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1405 };
1406 
1408  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1409  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1410  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1411  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1412  {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1413  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1414 };
1415 
1417  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1418  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1419  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1420  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1421  {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1422  {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1423  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1424 };
1425 
1427  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1428  {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1429  {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1430  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1431 };
1432 
1434  {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1435  {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
1436  {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1437  {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1438  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1439 };
1440 
1442  {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
1443  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1444  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1445  {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1446  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1447  {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1448  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1449  {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
1450 };
1451 
1453  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1454  {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1455  {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1456  {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1457  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1458 };
1459 
1461  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1462  {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1463  {NULL, 0, A_POP, TPS_Null, 0, NULL},
1464 };
1465 
1467  {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
1468 };
1469 
1471  {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
1472  {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1473  {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
1474 };
1475 
1477  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1479  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1480 };
1481 
1483  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1484  {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
1485  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1486 };
1487 
1489  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1490  {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1491  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1492 };
1493 
1495  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
1496 };
1497 
1499  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1500  {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1501  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1502  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1503  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1504 };
1505 
1508  {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1509  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1510  {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1511  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1512  {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1514 };
1515 
1517  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1518  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1519  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1520  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1521 };
1522 
1525  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1526  {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1527  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1528  {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1530 };
1531 
1533  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1534  {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1535  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1536  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1537 };
1538 
1541  {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1542  {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1543  {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1545 };
1546 
1548  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1549  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1550  {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1551  {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1552  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1553 };
1554 
1556  {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
1558  {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1559  {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
1560  {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
1561  {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
1562 };
1563 
1565  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1566  {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1568  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1569 };
1570 
1572  {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
1573  {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1574  {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1575  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1576  {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
1577 };
1578 
1580  {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
1582  {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1583  {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1584  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1585  {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
1586 };
1587 
1589  {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
1590  {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1591  {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1592  {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
1593 };
1594 
1596  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1597  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1598  {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1600  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1601 };
1602 
1603 
1604 /*
1605  * main table of per-state parser actions
1606  */
1607 typedef struct
1608 {
1609  const TParserStateActionItem *action; /* the actual state info */
1610  TParserState state; /* only for Assert crosscheck */
1611 #ifdef WPARSER_TRACE
1612  const char *state_name; /* only for debug printout */
1613 #endif
1615 
1616 #ifdef WPARSER_TRACE
1617 #define TPARSERSTATEACTION(state) \
1618  { CppConcat(action,state), state, CppAsString(state) }
1619 #else
1620 #define TPARSERSTATEACTION(state) \
1621  { CppConcat(action,state), state }
1622 #endif
1623 
1624 /*
1625  * order must be the same as in typedef enum {} TParserState!!
1626  */
1627 
1628 static const TParserStateAction Actions[] = {
1706 };
1707 
1708 
/*
 * Run the parser state machine until one token has been recognised.
 *
 * Scanning starts at prs->state->posbyte.  On each iteration the action
 * list of the current state is looked up in Actions[], the first item whose
 * isclass test accepts the current character is selected (the item with a
 * NULL isclass ends the list and acts as the default), and that item's
 * flags are applied.
 *
 * Returns true when the loop ended on an item carrying A_BINGO; in that
 * case prs->token, prs->lenbytetoken, prs->lenchartoken and prs->type
 * describe the token.  Returns false when the position is already at or
 * past prs->lenstr on entry, or when the loop ends without A_BINGO.
 */
1709 static bool
1711 {
1712  const TParserStateActionItem *item = NULL;
1713 
1715 
1716  Assert(prs->state);
1717 
1718  if (prs->state->posbyte >= prs->lenstr)
1719  return false;
1720 
  /* the token, if any, begins at the current byte position */
1721  prs->token = prs->str + prs->state->posbyte;
1722  prs->state->pushedAtAction = NULL;
1723 
1724  /* look at string */
  /* "<=" so that one extra iteration runs at end of input (charlen == 0) */
1725  while (prs->state->posbyte <= prs->lenstr)
1726  {
1727  if (prs->state->posbyte == prs->lenstr)
1728  prs->state->charlen = 0;
1729  else
   /* pg_mblen() is only consulted when charmaxlen is not 1 */
1730  prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
1731  pg_mblen(prs->str + prs->state->posbyte);
1732 
1733  Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
1734  Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
1735  Assert(Actions[prs->state->state].state == prs->state->state);
1736 
1737  if (prs->state->pushedAtAction)
1738  {
1739  /* After a POP, pick up at the next test */
1740  item = prs->state->pushedAtAction + 1;
1741  prs->state->pushedAtAction = NULL;
1742  }
1743  else
1744  {
1745  item = Actions[prs->state->state].action;
1746  Assert(item != NULL);
1747  }
1748 
1749  /* find action by character class */
  /* stops at the first accepting test, or at the NULL-isclass default item */
1750  while (item->isclass)
1751  {
   /* expose the item's character argument to the test via prs->c */
1752  prs->c = item->c;
1753  if (item->isclass(prs) != 0)
1754  break;
1755  item++;
1756  }
1757 
1758 #ifdef WPARSER_TRACE
1759  {
1760  TParserPosition *ptr;
1761 
1762  fprintf(stderr, "state ");
1763  /* indent according to stack depth */
1764  for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
1765  fprintf(stderr, " ");
1766  fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
1767  if (prs->state->posbyte < prs->lenstr)
1768  fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
1769  else
1770  fprintf(stderr, "at EOF");
1771  fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1772  (int) (item - Actions[prs->state->state].action),
1773  (item->flags & A_BINGO) ? " BINGO" : "",
1774  (item->flags & A_POP) ? " POP" : "",
1775  (item->flags & A_PUSH) ? " PUSH" : "",
1776  (item->flags & A_RERUN) ? " RERUN" : "",
1777  (item->flags & A_CLEAR) ? " CLEAR" : "",
1778  (item->flags & A_MERGE) ? " MERGE" : "",
1779  (item->flags & A_CLRALL) ? " CLRALL" : "",
1780  (item->tostate != TPS_Null) ? " tostate " : "",
1781  (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
1782  (item->type > 0) ? " type " : "",
1783  tok_alias[item->type]);
1784  }
1785 #endif
1786 
1787  /* call special handler if exists */
1788  if (item->special)
1789  item->special(prs);
1790 
1791  /* BINGO, token is found */
1792  if (item->flags & A_BINGO)
1793  {
1794  Assert(item->type > 0);
   /* publish the accumulated token length, then reset the accumulators */
1795  prs->lenbytetoken = prs->state->lenbytetoken;
1796  prs->lenchartoken = prs->state->lenchartoken;
1797  prs->state->lenbytetoken = prs->state->lenchartoken = 0;
1798  prs->type = item->type;
1799  }
1800 
1801  /* do various actions by flags */
  /*
   * The tests below form one else-if chain, so at most one of POP, PUSH,
   * CLEAR, CLRALL and MERGE is acted on per item, in that priority order.
   */
1802  if (item->flags & A_POP)
1803  { /* pop stored state in stack */
1804  TParserPosition *ptr = prs->state->prev;
1805 
1806  pfree(prs->state);
1807  prs->state = ptr;
1808  Assert(prs->state);
1809  }
1810  else if (item->flags & A_PUSH)
1811  { /* push (store) state in stack */
1812  prs->state->pushedAtAction = item; /* remember where we push */
1813  prs->state = newTParserPosition(prs->state);
1814  }
1815  else if (item->flags & A_CLEAR)
1816  { /* clear previous pushed state */
1817  TParserPosition *ptr;
1818 
1819  Assert(prs->state->prev);
1820  ptr = prs->state->prev->prev;
1821  pfree(prs->state->prev);
1822  prs->state->prev = ptr;
1823  }
1824  else if (item->flags & A_CLRALL)
1825  { /* clear all previous pushed state */
1826  TParserPosition *ptr;
1827 
1828  while (prs->state->prev)
1829  {
1830  ptr = prs->state->prev->prev;
1831  pfree(prs->state->prev);
1832  prs->state->prev = ptr;
1833  }
1834  }
1835  else if (item->flags & A_MERGE)
1836  { /* merge posinfo with current and pushed state */
1837  TParserPosition *ptr = prs->state;
1838 
1839  Assert(prs->state->prev);
1840  prs->state = prs->state->prev;
1841 
   /* the pushed entry keeps its own state but adopts the position info */
1842  prs->state->posbyte = ptr->posbyte;
1843  prs->state->poschar = ptr->poschar;
1844  prs->state->charlen = ptr->charlen;
1845  prs->state->lenbytetoken = ptr->lenbytetoken;
1846  prs->state->lenchartoken = ptr->lenchartoken;
1847  pfree(ptr);
1848  }
1849 
1850  /* set new state if pointed */
1851  if (item->tostate != TPS_Null)
1852  prs->state->state = item->tostate;
1853 
1854  /* check for go away */
  /* leave on a found token, or at end of input unless the item asks for RERUN */
1855  if ((item->flags & A_BINGO) ||
1856  (prs->state->posbyte >= prs->lenstr &&
1857  (item->flags & A_RERUN) == 0))
1858  break;
1859 
1860  /* go to beginning of loop if we should rerun or we just restore state */
1861  if (item->flags & (A_RERUN | A_POP))
1862  continue;
1863 
1864  /* move forward */
  /* charlen is 0 at end of input, in which case nothing is consumed */
1865  if (prs->state->charlen)
1866  {
1867  prs->state->posbyte += prs->state->charlen;
1868  prs->state->lenbytetoken += prs->state->charlen;
1869  prs->state->poschar++;
1870  prs->state->lenchartoken++;
1871  }
1872  }
1873 
  /* item is still NULL only if the loop body never ran */
1874  return (item && (item->flags & A_BINGO));
1875 }
1876 
/*
 * Build and return a palloc'd array describing every token type.
 *
 * The array has LASTNUM + 1 entries.  Entry i - 1 describes token type i
 * (1 .. LASTNUM) with copies of tok_alias[i] and lex_descr[i]; the final
 * entry has lexid 0 and its alias/descr fields are left unset.
 */
1877 Datum
1879 {
1880  LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
1881  int i;
1882 
  /* token type numbers start at 1; slot 0 of tok_alias[]/lex_descr[] is unused here */
1883  for (i = 1; i <= LASTNUM; i++)
1884  {
1885  descr[i - 1].lexid = i;
1886  descr[i - 1].alias = pstrdup(tok_alias[i]);
1887  descr[i - 1].descr = pstrdup(lex_descr[i]);
1888  }
1889 
  /* terminating entry */
1890  descr[LASTNUM].lexid = 0;
1891 
1892  PG_RETURN_POINTER(descr);
1893 }
1894 
1895 Datum
1897 {
1899 }
1900 
/*
 * Fetch the next token from the parser passed as argument 0.
 *
 * On success the token's start pointer is stored through argument 1, its
 * length in bytes through argument 2, and the token type is returned.
 * Returns 0, leaving both output arguments untouched, when TParserGet()
 * reports that no token was found.
 */
1901 Datum
1903 {
1904  TParser *p = (TParser *) PG_GETARG_POINTER(0);
1905  char **t = (char **) PG_GETARG_POINTER(1);
1906  int *tlen = (int *) PG_GETARG_POINTER(2);
1907 
1908  if (!TParserGet(p))
1909  PG_RETURN_INT32(0);
1910 
  /* the token pointer is handed out as is, not copied */
1911  *t = p->token;
1912  *tlen = p->lenbytetoken;
1913 
1914  PG_RETURN_INT32(p->type);
1915 }
1916 
/*
 * Shut down the parser passed as argument 0 by handing it to
 * TParserClose(); returns void.
 */
1917 Datum
1919 {
1920  TParser *p = (TParser *) PG_GETARG_POINTER(0);
1921 
1922  TParserClose(p);
1923  PG_RETURN_VOID();
1924 }
1925 
1926 
1927 /*
1928  * ts_headline support begins here
1929  */
1930 
1931 /* token type classification macros */
/* tag, protocol, blank and entity tokens */
1932 #define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
/* types for which mark_fragment sets the word's "replace" flag (highlightall off) */
1933 #define HLIDREPLACE(x) ( (x)==TAG_T )
/* types for which mark_fragment sets the word's "skip" flag (highlightall off) */
1934 #define HLIDSKIP(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* same set as HLIDSKIP; tested by mark_fragment when highlightall is on */
1935 #define XMLHLIDSKIP(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* blank, or any replace/skip type; these do not count toward a fragment's word length */
1936 #define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )
/* non-words, numeric types and ignorable types; consulted by BADENDPOINT */
1937 #define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
1938 
1939 /*
1940  * Macros useful in headline selection. These rely on availability of
1941  * "HeadlineParsedText *prs" describing some text, and "int shortword"
1942  * describing the "short word" length parameter.
1943  */
1944 
1945 /* Interesting words are non-repeated search terms */
1946 #define INTERESTINGWORD(j) \
1947  (prs->words[j].item && !prs->words[j].repeated)
1948 
1949 /* Don't want to end at a non-word or a short word, unless interesting */
1950 #define BADENDPOINT(j) \
1951  ((NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword) && \
1952  !INTERESTINGWORD(j))
1953 
/*
 * Bookkeeping for one candidate headline fragment.  Entries are collected
 * in a growable array; "chosen" and "excluded" both start out false and
 * are consulted when the best fragments are picked.
 */
1954 typedef struct
1955 {
1956  /* one cover (well, really one fragment) for mark_hl_fragments */
1957  int32 startpos; /* fragment's starting word index */
1958  int32 endpos; /* ending word index (inclusive) */
1959  int32 poslen; /* number of interesting words */
1960  int32 curlen; /* total number of words */
1961  bool chosen; /* chosen? */
1962  bool excluded; /* excluded? */
1963 } CoverPos;
1964 
1965 typedef struct
1966 {
1967  /* callback data for checkcondition_HL */
1969  int len;
1970 } hlCheck;
1971 
1972 
1973 /*
1974  * TS_execute callback for matching a tsquery operand to headline words
1975  *
1976  * Note: it's tempting to report words[] indexes as pos values to save
1977  * searching in hlCover; but that would screw up phrase matching, which
1978  * expects to measure distances in lexemes not tokens.
 *
 * "opaque" is an hlCheck giving the word array and the number of words to
 * examine; "val" is the operand being looked for; "data", if not NULL,
 * receives the positions of the matching words.
 *
 * Returns TS_YES as soon as a match is seen when data is NULL, or after
 * the scan when data holds at least one position; otherwise TS_NO.
1979  */
1980 static TSTernaryValue
1982 {
1983  hlCheck *checkval = (hlCheck *) opaque;
1984  int i;
1985 
1986  /* scan words array for matching items */
1987  for (i = 0; i < checkval->len; i++)
1988  {
1989  if (checkval->words[i].item == val)
1990  {
1991  /* if data == NULL, don't need to report positions */
1992  if (!data)
1993  return TS_YES;
1994 
1995  if (!data->pos)
1996  {
   /*
    * First match: size the array for one entry per scanned word,
    * which is the most this loop can append.
    */
1997  data->pos = palloc(sizeof(WordEntryPos) * checkval->len);
1998  data->allocated = true;
1999  data->npos = 1;
2000  data->pos[0] = checkval->words[i].pos;
2001  }
   /* append only positions greater than the last one stored */
2002  else if (data->pos[data->npos - 1] < checkval->words[i].pos)
2003  {
2004  data->pos[data->npos++] = checkval->words[i].pos;
2005  }
2006  }
2007  }
2008 
2009  if (data && data->npos > 0)
2010  return TS_YES;
2011 
2012  return TS_NO;
2013 }
2014 
2015 /*
2016  * hlCover: try to find a substring of prs' word list that satisfies query
2017  *
2018  * locations is the result of TS_execute_locations() for the query.
2019  * We use this to identify plausible subranges of the query.
2020  *
2021  * *nextpos is the lexeme position (NOT word index) to start the search
2022  * at. Caller should initialize this to zero. If successful, we'll
2023  * advance it to the next place to search at.
2024  *
2025  * On success, sets *p to first word index and *q to last word index of the
2026  * cover substring, and returns true.
2027  *
2028  * The result is a minimal cover, in the sense that both *p and *q will be
2029  * words used in the query.
 *
 * Returns false, leaving *nextpos, *p and *q untouched, when locations is
 * empty or when some entry of locations has no position at or after the
 * current search position.
2030  */
2031 static bool
2032 hlCover(HeadlineParsedText *prs, TSQuery query, List *locations,
2033  int *nextpos, int *p, int *q)
2034 {
2035  int pos = *nextpos;
2036 
2037  /* This loop repeats when our selected word-range fails the query */
2038  for (;;)
2039  {
2040  int posb,
2041  pose;
2042  ListCell *lc;
2043 
2044  /*
2045  * For each AND'ed query term or phrase, find its first occurrence at
2046  * or after pos; set pose to the maximum of those positions.
2047  *
2048  * We need not consider ORs or NOTs here; see the comments for
2049  * TS_execute_locations(). Rechecking the match with TS_execute(),
2050  * below, will deal with any ensuing imprecision.
2051  */
2052  pose = -1;
2053  foreach(lc, locations)
2054  {
2055  ExecPhraseData *pdata = (ExecPhraseData *) lfirst(lc);
2056  int first = -1;
2057 
   /* NOTE(review): taking the first hit assumes pdata->pos[] is ascending -- confirm */
2058  for (int i = 0; i < pdata->npos; i++)
2059  {
2060  /* For phrase matches, use the ending lexeme */
2061  int endp = pdata->pos[i];
2062 
2063  if (endp >= pos)
2064  {
2065  first = endp;
2066  break;
2067  }
2068  }
2069  if (first < 0)
2070  return false; /* no more matches for this term */
2071  if (first > pose)
2072  pose = first;
2073  }
2074 
2075  if (pose < 0)
2076  return false; /* we only get here if empty list */
2077 
2078  /*
2079  * Now, for each AND'ed query term or phrase, find its last occurrence
2080  * at or before pose; set posb to the minimum of those positions.
2081  *
2082  * We start posb at INT_MAX - 1 to guarantee no overflow if we compute
2083  * posb + 1 below.
2084  */
2085  posb = INT_MAX - 1;
2086  foreach(lc, locations)
2087  {
2088  ExecPhraseData *pdata = (ExecPhraseData *) lfirst(lc);
2089  int last = -1;
2090 
2091  for (int i = pdata->npos - 1; i >= 0; i--)
2092  {
2093  /* For phrase matches, use the starting lexeme */
2094  int startp = pdata->pos[i] - pdata->width;
2095 
2096  if (startp <= pose)
2097  {
2098  last = startp;
2099  break;
2100  }
2101  }
   /*
    * If nothing qualified, last is still -1 and pulls posb down to -1;
    * the Max() with pos below brings it back up to pos.
    */
2102  if (last < posb)
2103  posb = last;
2104  }
2105 
2106  /*
2107  * We could end up with posb to the left of pos, in case some phrase
2108  * match crosses pos. Try the match starting at pos anyway, since the
2109  * result of TS_execute_locations is imprecise for phrase matches OR'd
2110  * with plain matches; that is, if the query is "(A <-> B) | C" then C
2111  * could match at pos even though the phrase match would have to
2112  * extend to the left of pos.
2113  */
2114  posb = Max(posb, pos);
2115 
2116  /* This test probably always succeeds, but be paranoid */
2117  if (posb <= pose)
2118  {
2119  /*
2120  * posb .. pose is now the shortest, earliest-after-pos range of
2121  * lexeme positions containing all the query terms. It will
2122  * contain all phrase matches, too, except in the corner case
2123  * described just above.
2124  *
2125  * Now convert these lexeme positions to indexes in prs->words[].
2126  */
2127  int idxb = -1;
2128  int idxe = -1;
2129 
   /*
    * Only words with an item are considered.  idxb becomes the first
    * such word at or after posb, idxe the last one at or before pose;
    * the scan stops at the first such word beyond pose.
    */
2130  for (int i = 0; i < prs->curwords; i++)
2131  {
2132  if (prs->words[i].item == NULL)
2133  continue;
2134  if (idxb < 0 && prs->words[i].pos >= posb)
2135  idxb = i;
2136  if (prs->words[i].pos <= pose)
2137  idxe = i;
2138  else
2139  break;
2140  }
2141 
2142  /* This test probably always succeeds, but be paranoid */
2143  if (idxb >= 0 && idxe >= idxb)
2144  {
2145  /*
2146  * Finally, check that the selected range satisfies the query.
2147  * This should succeed in all simple cases; but odd cases
2148  * involving non-top-level NOT conditions or phrase matches
2149  * OR'd with other things could fail, since the result of
2150  * TS_execute_locations doesn't fully represent such things.
2151  */
2152  hlCheck ch;
2153 
    /* restrict the recheck to the words idxb .. idxe */
2154  ch.words = &(prs->words[idxb]);
2155  ch.len = idxe - idxb + 1;
2156  if (TS_execute(GETQUERY(query), &ch,
2158  {
2159  /* Match! Advance *nextpos and return the word range. */
2160  *nextpos = posb + 1;
2161  *p = idxb;
2162  *q = idxe;
2163  return true;
2164  }
2165  }
2166  }
2167 
2168  /*
2169  * Advance pos and try again. Any later workable match must start
2170  * beyond posb.
2171  */
2172  pos = posb + 1;
2173  }
2174  /* Can't get here, but stupider compilers complain if we leave it off */
2175  return false;
2176 }
2177 
2178 /*
2179  * Apply suitable highlight marking to words selected by headline selector
2180  *
2181  * The words from startpos to endpos inclusive are marked per highlightall
2182  */
2183 static void
2184 mark_fragment(HeadlineParsedText *prs, bool highlightall,
2185  int startpos, int endpos)
2186 {
2187  int i;
2188 
2189  for (i = startpos; i <= endpos; i++)
2190  {
2191  if (prs->words[i].item)
2192  prs->words[i].selected = 1;
2193  if (!highlightall)
2194  {
2195  if (HLIDREPLACE(prs->words[i].type))
2196  prs->words[i].replace = 1;
2197  else if (HLIDSKIP(prs->words[i].type))
2198  prs->words[i].skip = 1;
2199  }
2200  else
2201  {
2202  if (XMLHLIDSKIP(prs->words[i].type))
2203  prs->words[i].skip = 1;
2204  }
2205 
2206  prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
2207  }
2208 }
2209 
2210 /*
2211  * split a cover substring into fragments not longer than max_words
2212  *
2213  * At entry, *startpos and *endpos are the (remaining) bounds of the cover
2214  * substring. They are updated to hold the bounds of the next fragment.
2215  *
2216  * *curlen and *poslen are set to the fragment's length, in words and
2217  * interesting words respectively.
 *
 * prs supplies the word array being examined.  Nothing is returned; all
 * results are delivered through the four pointer arguments.
2218  */
2219 static void
2221  int *curlen, int *poslen, int max_words)
2222 {
2223  int i;
2224 
2225  /*
2226  * Objective: select a fragment of words between startpos and endpos such
2227  * that it has at most max_words and both ends have query words. If the
2228  * startpos and endpos are the endpoints of the cover and the cover has
2229  * fewer words than max_words, then this function should just return the
2230  * cover
2231  */
2232  /* first move startpos to an item */
  /* if no interesting word is found, *startpos ends up equal to *endpos */
2233  for (i = *startpos; i <= *endpos; i++)
2234  {
2235  *startpos = i;
2236  if (INTERESTINGWORD(i))
2237  break;
2238  }
2239  /* cut endpos to have only max_words */
2240  *curlen = 0;
2241  *poslen = 0;
  /* NONWORDTOKEN types are walked over without adding to *curlen */
2242  for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
2243  {
2244  if (!NONWORDTOKEN(prs->words[i].type))
2245  *curlen += 1;
2246  if (INTERESTINGWORD(i))
2247  *poslen += 1;
2248  }
  /*
   * NOTE(review): when the loop above stops because *curlen reached
   * max_words, i is one past the last word that was counted.  The
   * back-scan below therefore starts at a word that contributed to
   * neither *curlen nor *poslen, yet it may decrement *curlen for that
   * word or end the fragment on it -- confirm this is intended.
   */
2249  /* if the cover was cut then move back endpos to a query item */
2250  if (*endpos > i)
2251  {
2252  *endpos = i;
2253  for (i = *endpos; i >= *startpos; i--)
2254  {
2255  *endpos = i;
2256  if (INTERESTINGWORD(i))
2257  break;
   /* a dropped trailing word no longer counts toward the length */
2258  if (!NONWORDTOKEN(prs->words[i].type))
2259  *curlen -= 1;
2260  }
2261  }
2262 }
2263 
2264 /*
2265  * Headline selector used when MaxFragments > 0
2266  *
2267  * Note: in this mode, highlightall is disregarded for phrase selection;
2268  * it only controls presentation details.
2269  */
2270 static void
2272  bool highlightall,
2273  int shortword, int min_words,
2274  int max_words, int max_fragments)
2275 {
2276  int32 poslen,
2277  curlen,
2278  i,
2279  f,
2280  num_f = 0;
2281  int32 stretch,
2282  maxstretch,
2283  posmarker;
2284 
2285  int32 startpos = 0,
2286  endpos = 0,
2287  nextpos = 0,
2288  p = 0,
2289  q = 0;
2290 
2291  int32 numcovers = 0,
2292  maxcovers = 32;
2293 
2294  int32 minI,
2295  minwords,
2296  maxitems;
2297  CoverPos *covers;
2298 
2299  covers = palloc(maxcovers * sizeof(CoverPos));
2300 
2301  /* get all covers */
2302  while (hlCover(prs, query, locations, &nextpos, &p, &q))
2303  {
2304  startpos = p;
2305  endpos = q;
2306 
2307  /*
2308  * Break the cover into smaller fragments such that each fragment has
2309  * at most max_words. Also ensure that each end of each fragment is a
2310  * query word. This will allow us to stretch the fragment in either
2311  * direction
2312  */
2313 
2314  while (startpos <= endpos)
2315  {
2316  get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
2317  if (numcovers >= maxcovers)
2318  {
2319  maxcovers *= 2;
2320  covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
2321  }
2322  covers[numcovers].startpos = startpos;
2323  covers[numcovers].endpos = endpos;
2324  covers[numcovers].curlen = curlen;
2325  covers[numcovers].poslen = poslen;
2326  covers[numcovers].chosen = false;
2327  covers[numcovers].excluded = false;
2328  numcovers++;
2329  startpos = endpos + 1;
2330  endpos = q;
2331  }
2332  }
2333 
2334  /* choose best covers */
2335  for (f = 0; f < max_fragments; f++)
2336  {
2337  maxitems = 0;
2338  minwords = PG_INT32_MAX;
2339  minI = -1;
2340 
2341  /*
2342  * Choose the cover that contains max items. In case of tie choose the
2343  * one with smaller number of words.
2344  */
2345  for (i = 0; i < numcovers; i++)
2346  {
2347  if (!covers[i].chosen && !covers[i].excluded &&
2348  (maxitems < covers[i].poslen ||
2349  (maxitems == covers[i].poslen &&
2350  minwords > covers[i].curlen)))
2351  {
2352  maxitems = covers[i].poslen;
2353  minwords = covers[i].curlen;
2354  minI = i;
2355  }
2356  }
2357  /* if a cover was found mark it */
2358  if (minI >= 0)
2359  {
2360  covers[minI].chosen = true;
2361  /* adjust the size of cover */
2362  startpos = covers[minI].startpos;
2363  endpos = covers[minI].endpos;
2364  curlen = covers[minI].curlen;
2365  /* stretch the cover if cover size is lower than max_words */
2366  if (curlen < max_words)
2367  {
2368  /* divide the stretch on both sides of cover */
2369  maxstretch = (max_words - curlen) / 2;
2370 
2371  /*
2372  * first stretch the startpos stop stretching if 1. we hit the
2373  * beginning of document 2. exceed maxstretch 3. we hit an
2374  * already marked fragment
2375  */
2376  stretch = 0;
2377  posmarker = startpos;
2378  for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
2379  {
2380  if (!NONWORDTOKEN(prs->words[i].type))
2381  {
2382  curlen++;
2383  stretch++;
2384  }
2385  posmarker = i;
2386  }
2387  /* cut back startpos till we find a good endpoint */
2388  for (i = posmarker; i < startpos && BADENDPOINT(i); i++)
2389  {
2390  if (!NONWORDTOKEN(prs->words[i].type))
2391  curlen--;
2392  }
2393  startpos = i;
2394  /* now stretch the endpos as much as possible */
2395  posmarker = endpos;
2396  for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
2397  {
2398  if (!NONWORDTOKEN(prs->words[i].type))
2399  curlen++;
2400  posmarker = i;
2401  }
2402  /* cut back endpos till we find a good endpoint */
2403  for (i = posmarker; i > endpos && BADENDPOINT(i); i--)
2404  {
2405  if (!NONWORDTOKEN(prs->words[i].type))
2406  curlen--;
2407  }
2408  endpos = i;
2409  }
2410  covers[minI].startpos = startpos;
2411  covers[minI].endpos = endpos;
2412  covers[minI].curlen = curlen;
2413  /* Mark the chosen fragments (covers) */
2414  mark_fragment(prs, highlightall, startpos, endpos);
2415  num_f++;
2416  /* Exclude covers overlapping this one from future consideration */
2417  for (i = 0; i < numcovers; i++)
2418  {
2419  if (i != minI &&
2420  ((covers[i].startpos >= startpos &&
2421  covers[i].startpos <= endpos) ||
2422  (covers[i].endpos >= startpos &&
2423  covers[i].endpos <= endpos) ||
2424  (covers[i].startpos < startpos &&
2425  covers[i].endpos > endpos)))
2426  covers[i].excluded = true;
2427  }
2428  }
2429  else
2430  break; /* no selectable covers remain */
2431  }
2432 
2433  /* show the first min_words words if we have not marked anything */
2434  if (num_f <= 0)
2435  {
2436  startpos = curlen = 0;
2437  endpos = -1;
2438  for (i = 0; i < prs->curwords && curlen < min_words; i++)
2439  {
2440  if (!NONWORDTOKEN(prs->words[i].type))
2441  curlen++;
2442  endpos = i;
2443  }
2444  mark_fragment(prs, highlightall, startpos, endpos);
2445  }
2446 
2447  pfree(covers);
2448 }
2449 
2450 /*
2451  * Headline selector used when MaxFragments == 0
2452  */
2453 static void
2455  bool highlightall,
2456  int shortword, int min_words, int max_words)
2457 {
2458  int nextpos = 0,
2459  p = 0,
2460  q = 0;
2461  int bestb = -1,
2462  beste = -1;
2463  int bestlen = -1;
2464  bool bestcover = false;
2465  int pose,
2466  posb,
2467  poslen,
2468  curlen;
2469  bool poscover;
2470  int i;
2471 
2472  if (!highlightall)
2473  {
2474  /* examine all covers, select a headline using the best one */
2475  while (hlCover(prs, query, locations, &nextpos, &p, &q))
2476  {
2477  /*
2478  * Count words (curlen) and interesting words (poslen) within
2479  * cover, but stop once we reach max_words. This step doesn't
2480  * consider whether that's a good stopping point. posb and pose
2481  * are set to the start and end indexes of the possible headline.
2482  */
2483  curlen = 0;
2484  poslen = 0;
2485  posb = pose = p;
2486  for (i = p; i <= q && curlen < max_words; i++)
2487  {
2488  if (!NONWORDTOKEN(prs->words[i].type))
2489  curlen++;
2490  if (INTERESTINGWORD(i))
2491  poslen++;
2492  pose = i;
2493  }
2494 
2495  if (curlen < max_words)
2496  {
2497  /*
2498  * We have room to lengthen the headline, so search forward
2499  * until it's full or we find a good stopping point. We'll
2500  * reconsider the word at "q", then move forward.
2501  */
2502  for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
2503  {
2504  if (i > q)
2505  {
2506  if (!NONWORDTOKEN(prs->words[i].type))
2507  curlen++;
2508  if (INTERESTINGWORD(i))
2509  poslen++;
2510  }
2511  pose = i;
2512  if (BADENDPOINT(i))
2513  continue;
2514  if (curlen >= min_words)
2515  break;
2516  }
2517  if (curlen < min_words)
2518  {
2519  /*
2520  * Reached end of text and our headline is still shorter
2521  * than min_words, so try to extend it to the left.
2522  */
2523  for (i = p - 1; i >= 0; i--)
2524  {
2525  if (!NONWORDTOKEN(prs->words[i].type))
2526  curlen++;
2527  if (INTERESTINGWORD(i))
2528  poslen++;
2529  if (curlen >= max_words)
2530  break;
2531  if (BADENDPOINT(i))
2532  continue;
2533  if (curlen >= min_words)
2534  break;
2535  }
2536  posb = (i >= 0) ? i : 0;
2537  }
2538  }
2539  else
2540  {
2541  /*
2542  * Can't make headline longer, so consider making it shorter
2543  * if needed to avoid a bad endpoint.
2544  */
2545  if (i > q)
2546  i = q;
2547  for (; curlen > min_words; i--)
2548  {
2549  if (!BADENDPOINT(i))
2550  break;
2551  if (!NONWORDTOKEN(prs->words[i].type))
2552  curlen--;
2553  if (INTERESTINGWORD(i))
2554  poslen--;
2555  pose = i - 1;
2556  }
2557  }
2558 
2559  /*
2560  * Check whether the proposed headline includes the original
2561  * cover; it might not if we trimmed it due to max_words.
2562  */
2563  poscover = (posb <= p && pose >= q);
2564 
2565  /*
2566  * Adopt this headline if it's better than the last one, giving
2567  * highest priority to headlines including the cover, then to
2568  * headlines with more interesting words, then to headlines with
2569  * good stopping points. (Since bestlen is initially -1, we will
2570  * certainly adopt the first headline.)
2571  */
2572  if (poscover > bestcover ||
2573  (poscover == bestcover && poslen > bestlen) ||
2574  (poscover == bestcover && poslen == bestlen &&
2575  !BADENDPOINT(pose) && BADENDPOINT(beste)))
2576  {
2577  bestb = posb;
2578  beste = pose;
2579  bestlen = poslen;
2580  bestcover = poscover;
2581  }
2582  }
2583 
2584  /*
2585  * If we found nothing acceptable, select min_words words starting at
2586  * the beginning.
2587  */
2588  if (bestlen < 0)
2589  {
2590  curlen = 0;
2591  pose = -1;
2592  for (i = 0; i < prs->curwords && curlen < min_words; i++)
2593  {
2594  if (!NONWORDTOKEN(prs->words[i].type))
2595  curlen++;
2596  pose = i;
2597  }
2598  bestb = 0;
2599  beste = pose;
2600  }
2601  }
2602  else
2603  {
2604  /* highlightall mode: headline is whole document */
2605  bestb = 0;
2606  beste = prs->curwords - 1;
2607  }
2608 
2609  mark_fragment(prs, highlightall, bestb, beste);
2610 }
2611 
2612 /*
2613  * Default parser's prsheadline function
2614  */
2615 Datum
2617 {
2619  List *prsoptions = (List *) PG_GETARG_POINTER(1);
2620  TSQuery query = PG_GETARG_TSQUERY(2);
2621  List *locations;
2622 
2623  /* default option values: */
2624  int min_words = 15;
2625  int max_words = 35;
2626  int shortword = 3;
2627  int max_fragments = 0;
2628  bool highlightall = false;
2629  ListCell *l;
2630 
2631  /* Extract configuration option values */
2632  prs->startsel = NULL;
2633  prs->stopsel = NULL;
2634  prs->fragdelim = NULL;
2635  foreach(l, prsoptions)
2636  {
2637  DefElem *defel = (DefElem *) lfirst(l);
2638  char *val = defGetString(defel);
2639 
2640  if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
2641  max_words = pg_strtoint32(val);
2642  else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
2643  min_words = pg_strtoint32(val);
2644  else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
2645  shortword = pg_strtoint32(val);
2646  else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
2647  max_fragments = pg_strtoint32(val);
2648  else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
2649  prs->startsel = pstrdup(val);
2650  else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
2651  prs->stopsel = pstrdup(val);
2652  else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
2653  prs->fragdelim = pstrdup(val);
2654  else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
2655  highlightall = (pg_strcasecmp(val, "1") == 0 ||
2656  pg_strcasecmp(val, "on") == 0 ||
2657  pg_strcasecmp(val, "true") == 0 ||
2658  pg_strcasecmp(val, "t") == 0 ||
2659  pg_strcasecmp(val, "y") == 0 ||
2660  pg_strcasecmp(val, "yes") == 0);
2661  else
2662  ereport(ERROR,
2663  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2664  errmsg("unrecognized headline parameter: \"%s\"",
2665  defel->defname)));
2666  }
2667 
2668  /* in HighlightAll mode these parameters are ignored */
2669  if (!highlightall)
2670  {
2671  if (min_words >= max_words)
2672  ereport(ERROR,
2673  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2674  errmsg("MinWords should be less than MaxWords")));
2675  if (min_words <= 0)
2676  ereport(ERROR,
2677  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2678  errmsg("MinWords should be positive")));
2679  if (shortword < 0)
2680  ereport(ERROR,
2681  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2682  errmsg("ShortWord should be >= 0")));
2683  if (max_fragments < 0)
2684  ereport(ERROR,
2685  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2686  errmsg("MaxFragments should be >= 0")));
2687  }
2688 
2689  /* Locate words and phrases matching the query */
2690  if (query->size > 0)
2691  {
2692  hlCheck ch;
2693 
2694  ch.words = prs->words;
2695  ch.len = prs->curwords;
2696  locations = TS_execute_locations(GETQUERY(query), &ch, TS_EXEC_EMPTY,
2698  }
2699  else
2700  locations = NIL; /* empty query matches nothing */
2701 
2702  /* Apply appropriate headline selector */
2703  if (max_fragments == 0)
2704  mark_hl_words(prs, query, locations, highlightall, shortword,
2705  min_words, max_words);
2706  else
2707  mark_hl_fragments(prs, query, locations, highlightall, shortword,
2708  min_words, max_words, max_fragments);
2709 
2710  /* Fill in default values for string options */
2711  if (!prs->startsel)
2712  prs->startsel = pstrdup("<b>");
2713  if (!prs->stopsel)
2714  prs->stopsel = pstrdup("</b>");
2715  if (!prs->fragdelim)
2716  prs->fragdelim = pstrdup(" ... ");
2717 
2718  /* Caller will need these lengths, too */
2719  prs->startsellen = strlen(prs->startsel);
2720  prs->stopsellen = strlen(prs->stopsel);
2721  prs->fragdelimlen = strlen(prs->fragdelim);
2722 
2723  PG_RETURN_POINTER(prs);
2724 }
#define GETQUERY(x)
Definition: _int.h:157
void print(const void *obj)
Definition: print.c:36
#define PG_INT32_MAX
Definition: c.h:543
#define Max(x, y)
Definition: c.h:952
#define Assert(condition)
Definition: c.h:812
int32_t int32
Definition: c.h:481
uint16_t uint16
Definition: c.h:484
#define lengthof(array)
Definition: c.h:742
char * defGetString(DefElem *def)
Definition: define.c:48
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
#define PG_RETURN_VOID()
Definition: fmgr.h:349
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276
#define PG_RETURN_INT32(x)
Definition: fmgr.h:354
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
const char * str
long val
Definition: informix.c:689
int i
Definition: isn.c:72
unsigned int pg_wchar
Definition: mbprint.c:31
int GetDatabaseEncoding(void)
Definition: mbutils.c:1261
int pg_dsplen(const char *mbstr)
Definition: mbutils.c:1030
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1546
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
Definition: mbutils.c:986
int pg_mblen(const char *mbstr)
Definition: mbutils.c:1023
char * pstrdup(const char *in)
Definition: mcxt.c:1696
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc0(Size size)
Definition: mcxt.c:1347
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1541
void * palloc(Size size)
Definition: mcxt.c:1317
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
int32 pg_strtoint32(const char *s)
Definition: numutils.c:383
Datum lower(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:49
Datum upper(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:80
const void size_t len
const void * data
#define lfirst(lc)
Definition: pg_list.h:172
#define NIL
Definition: pg_list.h:68
bool database_ctype_is_c
Definition: pg_locale.c:140
size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, pg_locale_t locale)
static XLogRecPtr endpos
Definition: pg_receivewal.c:56
static XLogRecPtr startpos
@ PG_UTF8
Definition: pg_wchar.h:232
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
#define fprintf
Definition: port.h:242
int pg_strncasecmp(const char *s1, const char *s2, size_t n)
Definition: pgstrcasecmp.c:69
void check_stack_depth(void)
Definition: postgres.c:3574
uintptr_t Datum
Definition: postgres.h:64
char * c
bool chosen
Definition: wparser_def.c:1961
int32 endpos
Definition: wparser_def.c:1958
int32 curlen
Definition: wparser_def.c:1960
int32 startpos
Definition: wparser_def.c:1957
bool excluded
Definition: wparser_def.c:1962
int32 poslen
Definition: wparser_def.c:1959
char * defname
Definition: parsenodes.h:817
WordEntryPos * pos
Definition: ts_utils.h:166
HeadlineWordEntry * words
Definition: ts_public.h:76
WordEntryPos pos
Definition: ts_public.h:68
QueryOperand * item
Definition: ts_public.h:70
char * alias
Definition: ts_public.h:28
int lexid
Definition: ts_public.h:27
char * descr
Definition: ts_public.h:29
Definition: pg_list.h:54
const TParserStateActionItem * pushedAtAction
Definition: wparser_def.c:238
struct TParserPosition * prev
Definition: wparser_def.c:237
TParserState state
Definition: wparser_def.c:236
TParserCharTest isclass
Definition: wparser_def.c:211
TParserState tostate
Definition: wparser_def.c:214
TParserSpecial special
Definition: wparser_def.c:216
const TParserStateActionItem * action
Definition: wparser_def.c:1609
TParserState state
Definition: wparser_def.c:1610
char * str
Definition: wparser_def.c:244
pg_wchar * pgwstr
Definition: wparser_def.c:247
wchar_t * wstr
Definition: wparser_def.c:246
int lenstr
Definition: wparser_def.c:245
char * token
Definition: wparser_def.c:260
int type
Definition: wparser_def.c:263
int charmaxlen
Definition: wparser_def.c:251
bool wanthost
Definition: wparser_def.c:254
int lenbytetoken
Definition: wparser_def.c:261
bool ignore
Definition: wparser_def.c:253
TParserPosition * state
Definition: wparser_def.c:252
int lenchartoken
Definition: wparser_def.c:262
char c
Definition: wparser_def.c:257
bool usewide
Definition: wparser_def.c:248
int32 size
Definition: ts_type.h:221
HeadlineWordEntry * words
Definition: wparser_def.c:1968
#define PG_GETARG_TSQUERY(n)
Definition: ts_type.h:266
uint16 WordEntryPos
Definition: ts_type.h:63
TSTernaryValue
Definition: ts_utils.h:133
@ TS_NO
Definition: ts_utils.h:134
@ TS_YES
Definition: ts_utils.h:135
#define TS_EXEC_EMPTY
Definition: ts_utils.h:188
bool TS_execute(QueryItem *curitem, void *arg, uint32 flags, TSExecuteCallback chkcond)
Definition: tsvector_op.c:1854
List * TS_execute_locations(QueryItem *curitem, void *arg, uint32 flags, TSExecuteCallback chkcond)
Definition: tsvector_op.c:2007
static const TParserStateActionItem actionTPS_InParseHyphen[]
Definition: wparser_def.c:1555
static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[]
Definition: wparser_def.c:1166
static const TParserStateActionItem actionTPS_InHyphenWordFirst[]
Definition: wparser_def.c:1516
#define NONWORDTOKEN(x)
Definition: wparser_def.c:1936
static const TParserStateActionItem actionTPS_InXMLEntityFirst[]
Definition: wparser_def.c:1146
static const TParserStateActionItem actionTPS_InHostFirstAN[]
Definition: wparser_def.c:1373
#define VERSIONNUMBER
Definition: wparser_def.c:43
static const TParserStateActionItem actionTPS_InHyphenNumWordPart[]
Definition: wparser_def.c:1588
#define BADENDPOINT(j)
Definition: wparser_def.c:1950
#define ASCIIWORD
Definition: wparser_def.c:36
#define PROTOCOL
Definition: wparser_def.c:49
static const TParserStateActionItem actionTPS_InPathSecond[]
Definition: wparser_def.c:1433
static const TParserStateActionItem actionTPS_InPathFirst[]
Definition: wparser_def.c:1416
static const TParserStateActionItem actionTPS_InHostDomainSecond[]
Definition: wparser_def.c:1333
static const TParserStateActionItem actionTPS_InCloseCommentFirst[]
Definition: wparser_def.c:1309
static void SpecialFURL(TParser *prs)
Definition: wparser_def.c:588
static const TParserStateActionItem actionTPS_InCommentEnd[]
Definition: wparser_def.c:1322
struct TParser TParser
static TSTernaryValue checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
Definition: wparser_def.c:1981
void _make_compiler_happy(void)
Definition: wparser_def.c:537
static const TParserStateActionItem actionTPS_InURLPathStart[]
Definition: wparser_def.c:1466
static const TParserStateActionItem actionTPS_InHostFirstDomain[]
Definition: wparser_def.c:1326
static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[]
Definition: wparser_def.c:1547
static const TParserStateActionItem actionTPS_InHostDomain[]
Definition: wparser_def.c:1344
static const TParserStateActionItem actionTPS_InVersion[]
Definition: wparser_def.c:1119
#define XMLHLIDSKIP(x)
Definition: wparser_def.c:1935
static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[]
Definition: wparser_def.c:1498
Datum prsd_nexttoken(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1902
static const TParserStateActionItem actionTPS_InTagName[]
Definition: wparser_def.c:1223
#define DECIMAL_T
Definition: wparser_def.c:55
static const TParserStateActionItem actionTPS_InFileNext[]
Definition: wparser_def.c:1452
static const TParserStateActionItem actionTPS_InXMLEntity[]
Definition: wparser_def.c:1155
#define ASCIIPARTHWORD
Definition: wparser_def.c:46
static const TParserStateActionItem actionTPS_InFURL[]
Definition: wparser_def.c:1476
#define p_iswhat(type, nonascii)
Definition: wparser_def.c:424
static const TParserStateActionItem actionTPS_InMantissaSign[]
Definition: wparser_def.c:1134
static void mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, List *locations, bool highlightall, int shortword, int min_words, int max_words, int max_fragments)
Definition: wparser_def.c:2271
#define WORD_T
Definition: wparser_def.c:37
TParserState
Definition: wparser_def.c:120
@ TPS_InXMLEntityHexNumFirst
Definition: wparser_def.c:144
@ TPS_InPort
Definition: wparser_def.c:167
@ TPS_InXMLEntityHexNum
Definition: wparser_def.c:145
@ TPS_InHostDomainSecond
Definition: wparser_def.c:164
@ TPS_InMantissaFirst
Definition: wparser_def.c:137
@ TPS_InTagName
Definition: wparser_def.c:150
@ TPS_InHyphenAsciiWordFirst
Definition: wparser_def.c:185
@ TPS_Null
Definition: wparser_def.c:198
@ TPS_InPathFirstFirst
Definition: wparser_def.c:174
@ TPS_InSignedIntFirst
Definition: wparser_def.c:126
@ TPS_InSignedInt
Definition: wparser_def.c:127
@ TPS_InUnsignedInt
Definition: wparser_def.c:125
@ TPS_InMantissa
Definition: wparser_def.c:139
@ TPS_InProtocolFirst
Definition: wparser_def.c:182
@ TPS_InFURL
Definition: wparser_def.c:181
@ TPS_InMantissaSign
Definition: wparser_def.c:138
@ TPS_InXMLBegin
Definition: wparser_def.c:148
@ TPS_InCommentEnd
Definition: wparser_def.c:162
@ TPS_InHyphenWordFirst
Definition: wparser_def.c:187
@ TPS_InHyphenNumWordPart
Definition: wparser_def.c:196
@ TPS_InPortFirst
Definition: wparser_def.c:166
@ TPS_InProtocolEnd
Definition: wparser_def.c:184
@ TPS_InXMLEntityFirst
Definition: wparser_def.c:140
@ TPS_InHyphenNumWordFirst
Definition: wparser_def.c:189
@ TPS_InCommentLast
Definition: wparser_def.c:158
@ TPS_InFileTwiddle
Definition: wparser_def.c:172
@ TPS_InURLPathStart
Definition: wparser_def.c:179
@ TPS_InURLPathFirst
Definition: wparser_def.c:178
@ TPS_InPathFirst
Definition: wparser_def.c:173
@ TPS_InPathSecond
Definition: wparser_def.c:175
@ TPS_InHyphenUnsignedInt
Definition: wparser_def.c:197
@ TPS_InFileFirst
Definition: wparser_def.c:171
@ TPS_InXMLEntityNumFirst
Definition: wparser_def.c:142
@ TPS_InHyphenWordPart
Definition: wparser_def.c:194
@ TPS_InNumWord
Definition: wparser_def.c:122
@ TPS_InAsciiWord
Definition: wparser_def.c:123
@ TPS_InVersion
Definition: wparser_def.c:136
@ TPS_InHost
Definition: wparser_def.c:169
@ TPS_InFile
Definition: wparser_def.c:176
@ TPS_InProtocolSecond
Definition: wparser_def.c:183
@ TPS_InCloseCommentFirst
Definition: wparser_def.c:160
@ TPS_InTagEscapeK
Definition: wparser_def.c:153
@ TPS_InParseHyphenHyphen
Definition: wparser_def.c:193
@ TPS_InTagBackSleshed
Definition: wparser_def.c:155
@ TPS_InTagFirst
Definition: wparser_def.c:147
@ TPS_InTagEnd
Definition: wparser_def.c:156
@ TPS_InComment
Definition: wparser_def.c:159
@ TPS_InHyphenWord
Definition: wparser_def.c:188
@ TPS_InHyphenAsciiWord
Definition: wparser_def.c:186
@ TPS_InWord
Definition: wparser_def.c:124
@ TPS_InXMLEntityEnd
Definition: wparser_def.c:146
@ TPS_InTagEscapeKK
Definition: wparser_def.c:154
@ TPS_InSpace
Definition: wparser_def.c:128
@ TPS_InFileNext
Definition: wparser_def.c:177
@ TPS_InURLPath
Definition: wparser_def.c:180
@ TPS_Base
Definition: wparser_def.c:121
@ TPS_InUDecimal
Definition: wparser_def.c:130
@ TPS_InParseHyphen
Definition: wparser_def.c:192
@ TPS_InHostFirstAN
Definition: wparser_def.c:168
@ TPS_InEmail
Definition: wparser_def.c:170
@ TPS_InDecimalFirst
Definition: wparser_def.c:131
@ TPS_InVersionFirst
Definition: wparser_def.c:135
@ TPS_InCloseCommentLast
Definition: wparser_def.c:161
@ TPS_InSVerVersion
Definition: wparser_def.c:134
@ TPS_InHyphenAsciiWordPart
Definition: wparser_def.c:195
@ TPS_InCommentFirst
Definition: wparser_def.c:157
@ TPS_InUDecimalFirst
Definition: wparser_def.c:129
@ TPS_InHostFirstDomain
Definition: wparser_def.c:163
@ TPS_InHostDomain
Definition: wparser_def.c:165
@ TPS_InHyphenDigitLookahead
Definition: wparser_def.c:191
@ TPS_InVerVersion
Definition: wparser_def.c:133
@ TPS_InXMLEntityNum
Definition: wparser_def.c:143
@ TPS_InTag
Definition: wparser_def.c:152
@ TPS_InDecimal
Definition: wparser_def.c:132
@ TPS_InTagCloseFirst
Definition: wparser_def.c:149
@ TPS_InXMLEntity
Definition: wparser_def.c:141
@ TPS_InHyphenNumWord
Definition: wparser_def.c:190
@ TPS_InTagBeginEnd
Definition: wparser_def.c:151
static void mark_fragment(HeadlineParsedText *prs, bool highlightall, int startpos, int endpos)
Definition: wparser_def.c:2184
static const TParserStateActionItem actionTPS_InXMLEntityEnd[]
Definition: wparser_def.c:1194
static const TParserStateActionItem actionTPS_InHyphenNumWord[]
Definition: wparser_def.c:1539
static const TParserStateActionItem actionTPS_InDecimal[]
Definition: wparser_def.c:1091
#define A_POP
Definition: wparser_def.c:222
static const TParserStateActionItem actionTPS_InSignedIntFirst[]
Definition: wparser_def.c:1043
static const TParserStateActionItem actionTPS_InTagEscapeK[]
Definition: wparser_def.c:1265
static const TParserStateActionItem actionTPS_InSpace[]
Definition: wparser_def.c:1058
static const TParserStateActionItem actionTPS_InFile[]
Definition: wparser_def.c:1441
static TParser * TParserCopyInit(const TParser *orig)
Definition: wparser_def.c:346
static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[]
Definition: wparser_def.c:1579
#define LASTNUM
Definition: wparser_def.c:60
static int p_iseqC(TParser *prs)
Definition: wparser_def.c:481
Datum prsd_headline(PG_FUNCTION_ARGS)
Definition: wparser_def.c:2616
#define NUMHWORD
Definition: wparser_def.c:50
static bool hlCover(HeadlineParsedText *prs, TSQuery query, List *locations, int *nextpos, int *p, int *q)
Definition: wparser_def.c:2032
#define SPACE
Definition: wparser_def.c:47
static const TParserStateActionItem actionTPS_InUDecimal[]
Definition: wparser_def.c:1076
int(* TParserCharTest)(struct TParser *)
Definition: wparser_def.c:204
static const TParserStateActionItem actionTPS_InSignedInt[]
Definition: wparser_def.c:1049
static int p_isurlchar(TParser *prs)
Definition: wparser_def.c:505
static const TParserStateActionItem actionTPS_InTagBeginEnd[]
Definition: wparser_def.c:1237
static const TParserStateActionItem actionTPS_InTagFirst[]
Definition: wparser_def.c:1198
struct TParserPosition TParserPosition
#define NUMWORD
Definition: wparser_def.c:38
#define FILEPATH
Definition: wparser_def.c:54
static const TParserStateActionItem actionTPS_InTagEscapeKK[]
Definition: wparser_def.c:1272
static int p_isneC(TParser *prs)
Definition: wparser_def.c:487
#define EMAIL
Definition: wparser_def.c:39
static const TParserStateActionItem actionTPS_InCommentLast[]
Definition: wparser_def.c:1297
static const TParserStateActionItem actionTPS_InHyphenWordPart[]
Definition: wparser_def.c:1571
static const TParserStateActionItem actionTPS_InMantissaFirst[]
Definition: wparser_def.c:1126
static const TParserStateActionItem actionTPS_Base[]
Definition: wparser_def.c:971
static void SpecialHyphen(TParser *prs)
Definition: wparser_def.c:596
static void mark_hl_words(HeadlineParsedText *prs, TSQuery query, List *locations, bool highlightall, int shortword, int min_words, int max_words)
Definition: wparser_def.c:2454
static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[]
Definition: wparser_def.c:1532
#define UNSIGNEDINT
Definition: wparser_def.c:57
void(* TParserSpecial)(struct TParser *)
Definition: wparser_def.c:206
static const TParserStateActionItem actionTPS_InEmail[]
Definition: wparser_def.c:1391
static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[]
Definition: wparser_def.c:1174
static const TParserStateActionItem actionTPS_InURLPath[]
Definition: wparser_def.c:1470
#define A_RERUN
Definition: wparser_def.c:224
static const TParserStateActionItem actionTPS_InSVerVersion[]
Definition: wparser_def.c:1106
static const TParserStateActionItem actionTPS_InAsciiWord[]
Definition: wparser_def.c:999
static const char *const tok_alias[]
Definition: wparser_def.c:62
static int p_isstophost(TParser *prs)
Definition: wparser_def.c:612
#define HLIDSKIP(x)
Definition: wparser_def.c:1934
static void get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos, int *curlen, int *poslen, int max_words)
Definition: wparser_def.c:2220
static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[]
Definition: wparser_def.c:1595
#define SIGNEDINT
Definition: wparser_def.c:56
static int p_isasclet(TParser *prs)
Definition: wparser_def.c:499
static const TParserStateAction Actions[]
Definition: wparser_def.c:1628
static const TParserStateActionItem actionTPS_InXMLBegin[]
Definition: wparser_def.c:1209
#define PARTHWORD
Definition: wparser_def.c:45
#define HLIDREPLACE(x)
Definition: wparser_def.c:1933
#define A_MERGE
Definition: wparser_def.c:226
static const TParserStateActionItem actionTPS_InMantissa[]
Definition: wparser_def.c:1140
static const TParserStateActionItem actionTPS_InVersionFirst[]
Definition: wparser_def.c:1113
static int p_isascii(TParser *prs)
Definition: wparser_def.c:493
static const TParserStateActionItem actionTPS_InCommentFirst[]
Definition: wparser_def.c:1288
static const TParserStateActionItem actionTPS_InHyphenWord[]
Definition: wparser_def.c:1523
static int p_isignore(TParser *prs)
Definition: wparser_def.c:623
static const TParserStateActionItem actionTPS_InParseHyphenHyphen[]
Definition: wparser_def.c:1564
static const TParserStateActionItem actionTPS_InPort[]
Definition: wparser_def.c:1365
#define TAG_T
Definition: wparser_def.c:48
static const TParserStateActionItem actionTPS_InDecimalFirst[]
Definition: wparser_def.c:1085
static TParserPosition * newTParserPosition(TParserPosition *prev)
Definition: wparser_def.c:272
#define URLPATH
Definition: wparser_def.c:53
Datum prsd_lextype(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1878
#define ASCIIHWORD
Definition: wparser_def.c:51
#define HOST
Definition: wparser_def.c:41
static const TParserStateActionItem actionTPS_InTag[]
Definition: wparser_def.c:1243
Datum prsd_start(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1896
static TParser * TParserInit(char *str, int len)
Definition: wparser_def.c:289
#define A_BINGO
Definition: wparser_def.c:221
#define TPARSERSTATEACTION(state)
Definition: wparser_def.c:1620
static bool TParserGet(TParser *prs)
Definition: wparser_def.c:1710
#define XMLENTITY
Definition: wparser_def.c:58
static int p_ishost(TParser *prs)
Definition: wparser_def.c:629
Datum prsd_end(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1918
#define A_CLRALL
Definition: wparser_def.c:227
static int p_isURLPath(TParser *prs)
Definition: wparser_def.c:657
static void SpecialVerVersion(TParser *prs)
Definition: wparser_def.c:603
static const TParserStateActionItem actionTPS_InProtocolFirst[]
Definition: wparser_def.c:1482
static const TParserStateActionItem actionTPS_InUnsignedInt[]
Definition: wparser_def.c:1026
static const TParserStateActionItem actionTPS_InUDecimalFirst[]
Definition: wparser_def.c:1070
static const TParserStateActionItem actionTPS_InTagCloseFirst[]
Definition: wparser_def.c:1217
static int p_isEOF(TParser *prs)
Definition: wparser_def.c:474
static const TParserStateActionItem actionTPS_InCloseCommentLast[]
Definition: wparser_def.c:1315
static void TParserCopyClose(TParser *prs)
Definition: wparser_def.c:397
#define A_CLEAR
Definition: wparser_def.c:225
static const TParserStateActionItem actionTPS_InFileFirst[]
Definition: wparser_def.c:1397
static const TParserStateActionItem actionTPS_InNumWord[]
Definition: wparser_def.c:988
static const TParserStateActionItem actionTPS_InFileTwiddle[]
Definition: wparser_def.c:1407
static const TParserStateActionItem actionTPS_InHost[]
Definition: wparser_def.c:1380
#define A_PUSH
Definition: wparser_def.c:223
static const TParserStateActionItem actionTPS_InTagBackSleshed[]
Definition: wparser_def.c:1279
static const TParserStateActionItem actionTPS_InProtocolSecond[]
Definition: wparser_def.c:1488
static const TParserStateActionItem actionTPS_InWord[]
Definition: wparser_def.c:1017
static int p_isspecial(TParser *prs)
Definition: wparser_def.c:692
static void TParserClose(TParser *prs)
Definition: wparser_def.c:372
#define URL_T
Definition: wparser_def.c:40
static const TParserStateActionItem actionTPS_InXMLEntityNum[]
Definition: wparser_def.c:1180
static const TParserStateActionItem actionTPS_InVerVersion[]
Definition: wparser_def.c:1100
static const TParserStateActionItem actionTPS_InHyphenAsciiWord[]
Definition: wparser_def.c:1506
static const TParserStateActionItem actionTPS_InXMLEntityHexNum[]
Definition: wparser_def.c:1187
#define A_NEXT
Definition: wparser_def.c:220
static const TParserStateActionItem actionTPS_InPortFirst[]
Definition: wparser_def.c:1359
#define HWORD
Definition: wparser_def.c:52
#define NUMPARTHWORD
Definition: wparser_def.c:44
static const char *const lex_descr[]
Definition: wparser_def.c:89
#define INTERESTINGWORD(j)
Definition: wparser_def.c:1946
#define SCIENTIFIC
Definition: wparser_def.c:42
static void SpecialTags(TParser *prs)
Definition: wparser_def.c:564
static const TParserStateActionItem actionTPS_InTagEnd[]
Definition: wparser_def.c:1284
static const TParserStateActionItem actionTPS_InComment[]
Definition: wparser_def.c:1303
static const TParserStateActionItem actionTPS_InProtocolEnd[]
Definition: wparser_def.c:1494
static const TParserStateActionItem actionTPS_InURLPathFirst[]
Definition: wparser_def.c:1460
static const TParserStateActionItem actionTPS_InPathFirstFirst[]
Definition: wparser_def.c:1426