PostgreSQL Source Code  git master
wparser_def.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * wparser_def.c
4  * Default text search parser
5  *
6  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  * src/backend/tsearch/wparser_def.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include <limits.h>
18 
19 #include "catalog/pg_collation.h"
20 #include "commands/defrem.h"
21 #include "miscadmin.h"
22 #include "tsearch/ts_locale.h"
23 #include "tsearch/ts_public.h"
24 #include "tsearch/ts_type.h"
25 #include "tsearch/ts_utils.h"
26 #include "utils/builtins.h"
27 
28 
29 /* Define me to enable tracing of parser behavior */
30 /* #define WPARSER_TRACE */
31 
32 
/* Output token categories */

#define ASCIIWORD       1
#define WORD_T          2
#define NUMWORD         3
#define EMAIL           4
#define URL_T           5
#define HOST            6
#define SCIENTIFIC      7
#define VERSIONNUMBER   8
#define NUMPARTHWORD    9
#define PARTHWORD       10
#define ASCIIPARTHWORD  11
#define SPACE           12
#define TAG_T           13
#define PROTOCOL        14
#define NUMHWORD        15
#define ASCIIHWORD      16
#define HWORD           17
#define URLPATH         18
#define FILEPATH        19
#define DECIMAL_T       20
#define SIGNEDINT       21
#define UNSIGNEDINT     22
#define XMLENTITY       23

/* Highest token category number; both tables below have LASTNUM + 1 slots */
#define LASTNUM         23

/*
 * Short alias of each token category, indexed by category number.
 * Slot 0 is not a category and holds an empty string.  Designated
 * initializers keep every string tied to its category macro.
 */
static const char *const tok_alias[] = {
    [0] = "",
    [ASCIIWORD] = "asciiword",
    [WORD_T] = "word",
    [NUMWORD] = "numword",
    [EMAIL] = "email",
    [URL_T] = "url",
    [HOST] = "host",
    [SCIENTIFIC] = "sfloat",
    [VERSIONNUMBER] = "version",
    [NUMPARTHWORD] = "hword_numpart",
    [PARTHWORD] = "hword_part",
    [ASCIIPARTHWORD] = "hword_asciipart",
    [SPACE] = "blank",
    [TAG_T] = "tag",
    [PROTOCOL] = "protocol",
    [NUMHWORD] = "numhword",
    [ASCIIHWORD] = "asciihword",
    [HWORD] = "hword",
    [URLPATH] = "url_path",
    [FILEPATH] = "file",
    [DECIMAL_T] = "float",
    [SIGNEDINT] = "int",
    [UNSIGNEDINT] = "uint",
    [XMLENTITY] = "entity"
};

/*
 * Human-readable description of each token category, indexed the same way
 * as tok_alias.  Slot 0 is an empty string.
 */
static const char *const lex_descr[] = {
    [0] = "",
    [ASCIIWORD] = "Word, all ASCII",
    [WORD_T] = "Word, all letters",
    [NUMWORD] = "Word, letters and digits",
    [EMAIL] = "Email address",
    [URL_T] = "URL",
    [HOST] = "Host",
    [SCIENTIFIC] = "Scientific notation",
    [VERSIONNUMBER] = "Version number",
    [NUMPARTHWORD] = "Hyphenated word part, letters and digits",
    [PARTHWORD] = "Hyphenated word part, all letters",
    [ASCIIPARTHWORD] = "Hyphenated word part, all ASCII",
    [SPACE] = "Space symbols",
    [TAG_T] = "XML tag",
    [PROTOCOL] = "Protocol head",
    [NUMHWORD] = "Hyphenated word, letters and digits",
    [ASCIIHWORD] = "Hyphenated word, all ASCII",
    [HWORD] = "Hyphenated word, all letters",
    [URLPATH] = "URL path",
    [FILEPATH] = "File or path name",
    [DECIMAL_T] = "Decimal notation",
    [SIGNEDINT] = "Signed integer",
    [UNSIGNEDINT] = "Unsigned integer",
    [XMLENTITY] = "XML entity"
};
114 
115 
116 /* Parser states */
117 
118 typedef enum
119 {
120  TPS_Base = 0,
197  TPS_Null /* last state (fake value) */
199 
/* forward declaration */
struct TParser;

/*
 * Character test callback: any of the p_is* functions except p_iseq
 * (which takes an extra argument).
 */
typedef int (*TParserCharTest) (struct TParser *);

/* Special handler for special cases... */
typedef void (*TParserSpecial) (struct TParser *);
207 
208 typedef struct
209 {
211  char c;
214  int type;
217 
/*
 * Flag bits in TParserStateActionItem.flags.
 * A_NEXT is the zero value (no bit set); the others are distinct single bits
 * and may be OR-ed together.
 */
#define A_NEXT      0x0000
#define A_BINGO     0x0001
#define A_POP       0x0002
#define A_PUSH      0x0004
#define A_RERUN     0x0008
#define A_CLEAR     0x0010
#define A_MERGE     0x0020
#define A_CLRALL    0x0040
227 
228 typedef struct TParserPosition
229 {
230  int posbyte; /* position of parser in bytes */
231  int poschar; /* position of parser in characters */
232  int charlen; /* length of current char */
233  int lenbytetoken; /* length of token-so-far in bytes */
234  int lenchartoken; /* and in chars */
239 
240 typedef struct TParser
241 {
242  /* string and position information */
243  char *str; /* multibyte string */
244  int lenstr; /* length of mbstring */
245  wchar_t *wstr; /* wide character string */
246  pg_wchar *pgwstr; /* wide character string for C-locale */
247  bool usewide;
248 
249  /* State of parse */
252  bool ignore;
253  bool wanthost;
254 
255  /* silly char */
256  char c;
257 
258  /* out */
259  char *token;
262  int type;
264 
265 
266 /* forward decls here */
267 static bool TParserGet(TParser *prs);
268 
269 
270 static TParserPosition *
272 {
274 
275  if (prev)
276  memcpy(res, prev, sizeof(TParserPosition));
277  else
278  memset(res, 0, sizeof(TParserPosition));
279 
280  res->prev = prev;
281 
282  res->pushedAtAction = NULL;
283 
284  return res;
285 }
286 
/*
 * Create a parser over the first len bytes of str.
 *
 * The parser struct is zero-allocated with palloc0 and keeps a pointer to
 * str (no copy of the bytes is made here).  When charmaxlen > 1 a wide
 * character version of the input is built as well, either as pg_wchar
 * (pgwstr) or as wchar_t (wstr).  The initial position is a fresh record in
 * state TPS_Base.
 */
287 static TParser *
288 TParserInit(char *str, int len)
289 {
290  TParser *prs = (TParser *) palloc0(sizeof(TParser));
291 
/*
 * NOTE(review): listing line 292 is missing from this copy.  prs->charmaxlen
 * is tested below but never assigned in the visible code (palloc0 leaves it
 * zero) -- presumably it is set here from the database encoding's maximum
 * character length; confirm against upstream.
 */
293  prs->str = str;
294  prs->lenstr = len;
295 
296  /*
297  * Use wide char code only when max encoding length > 1.
298  */
299  if (prs->charmaxlen > 1)
300  {
301  pg_locale_t mylocale = 0; /* TODO */
302 
303  prs->usewide = true;
/*
 * NOTE(review): listing line 304 is missing from this copy.  The brace block
 * and the "else" below need an "if" here -- judging by the comment inside it,
 * presumably a test for the C locale; confirm against upstream.
 */
305  {
306  /*
307  * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
308  * be different from sizeof(wchar_t)
309  */
310  prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
311  pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
312  }
313  else
314  {
315  prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
316  char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
317  mylocale);
318  }
319  }
320  else
321  prs->usewide = false;
322 
    /* start with a single position record, in the base state */
323  prs->state = newTParserPosition(NULL);
324  prs->state->state = TPS_Base;
325 
326 #ifdef WPARSER_TRACE
327  fprintf(stderr, "parsing \"%.*s\"\n", len, str);
328 #endif
329 
330  return prs;
331 }
332 
333 /*
334  * As an alternative to a full TParserInit one can create a
335  * TParserCopy which basically is a regular TParser without a private
336  * copy of the string - instead it uses the one from another TParser.
337  * This is useful because at some places TParsers are created
338  * recursively and the repeated copying around of the strings can
339  * cause major inefficiency if the source string is long.
340  * The new parser starts parsing at the original's current position.
341  *
342  * Obviously one must not close the original TParser before the copy.
343  */
344 static TParser *
346 {
347  TParser *prs = (TParser *) palloc0(sizeof(TParser));
348 
349  prs->charmaxlen = orig->charmaxlen;
350  prs->str = orig->str + orig->state->posbyte;
351  prs->lenstr = orig->lenstr - orig->state->posbyte;
352  prs->usewide = orig->usewide;
353 
354  if (orig->pgwstr)
355  prs->pgwstr = orig->pgwstr + orig->state->poschar;
356  if (orig->wstr)
357  prs->wstr = orig->wstr + orig->state->poschar;
358 
359  prs->state = newTParserPosition(NULL);
360  prs->state->state = TPS_Base;
361 
362 #ifdef WPARSER_TRACE
363  fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
364 #endif
365 
366  return prs;
367 }
368 
369 
/*
 * Release a parser: frees every position record on the prs->state chain
 * (walking ->prev), then the wide-character buffers wstr and pgwstr when
 * they are set, and finally the parser struct itself.
 *
 * NOTE(review): the declarator line (listing line 371) is missing from this
 * copy; the body operates on a parser pointer named "prs" -- presumably
 * this is TParserClose(TParser *prs); confirm against upstream.
 */
370 static void
372 {
373  while (prs->state)
374  {
375  TParserPosition *ptr = prs->state->prev;
376 
377  pfree(prs->state);
378  prs->state = ptr;
379  }
380 
381  if (prs->wstr)
382  pfree(prs->wstr);
383  if (prs->pgwstr)
384  pfree(prs->pgwstr);
385 
386 #ifdef WPARSER_TRACE
387  fprintf(stderr, "closing parser\n");
388 #endif
389  pfree(prs);
390 }
391 
392 /*
393  * Close a parser created with TParserCopyInit
394  */
395 static void
397 {
398  while (prs->state)
399  {
400  TParserPosition *ptr = prs->state->prev;
401 
402  pfree(prs->state);
403  prs->state = ptr;
404  }
405 
406 #ifdef WPARSER_TRACE
407  fprintf(stderr, "closing parser copy\n");
408 #endif
409  pfree(prs);
410 }
411 
412 
413 /*
414  * Character-type support functions, equivalent to is* macros, but
415  * working with any possible encodings and locales. Notes:
416  * - with multibyte encoding and C-locale isw* function may fail
417  * or give wrong result.
418  * - multibyte encoding and C-locale often are used for
419  * Asian languages.
420  * - if locale is C then we use pgwstr instead of wstr.
421  */
422 
/*
 * p_iswhat(type, nonascii) defines a pair of character-class tests on the
 * parser's current character: p_is<type>() and p_isnot<type>().
 *
 * p_is<type>() chooses its test by how the input is held:
 *   - usewide with pgwstr set: code points above 0x7f return "nonascii",
 *     anything else is passed to is<type>();
 *   - usewide without pgwstr: isw<type>() on the current wstr character;
 *   - otherwise: is<type>() on the current byte of str, read as
 *     unsigned char.
 * p_isnot<type>() is the logical negation of p_is<type>().
 */
#define p_iswhat(type, nonascii) \
 \
static int \
p_is##type(TParser *prs) \
{ \
    Assert(prs->state); \
    if (prs->usewide) \
    { \
        if (prs->pgwstr) \
        { \
            unsigned int c = *(prs->pgwstr + prs->state->poschar); \
            if (c > 0x7f) \
                return nonascii; \
            return is##type(c); \
        } \
        return isw##type(*(prs->wstr + prs->state->poschar)); \
    } \
    return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \
} \
 \
static int \
p_isnot##type(TParser *prs) \
{ \
    return !p_is##type(prs); \
}
448 
449 /*
450  * In C locale with a multibyte encoding, any non-ASCII symbol is considered
451  * an alpha character, but not a member of other char classes.
452  */
453 p_iswhat(alnum, 1)
454 p_iswhat(alpha, 1)
455 p_iswhat(digit, 0)
456 p_iswhat(lower, 0)
457 p_iswhat(print, 0)
458 p_iswhat(punct, 0)
459 p_iswhat(space, 0)
460 p_iswhat(upper, 0)
461 p_iswhat(xdigit, 0)
462 
463 /* p_iseq should be used only for ascii symbols */
464 
465 static int
466 p_iseq(TParser *prs, char c)
467 {
468  Assert(prs->state);
469  return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
470 }
471 
472 static int
474 {
475  Assert(prs->state);
476  return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
477 }
478 
479 static int
481 {
482  return p_iseq(prs, prs->c);
483 }
484 
485 static int
487 {
488  return !p_iseq(prs, prs->c);
489 }
490 
491 static int
493 {
494  return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
495 }
496 
497 static int
499 {
500  return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
501 }
502 
/*
 * Returns 1 if the current character may appear in a URL, else 0.
 * Accepts single-byte characters strictly between 0x20 and 0x7F, minus the
 * punctuation listed in the switch below.
 *
 * NOTE(review): the declarator line (listing line 504) is missing from this
 * copy; the body reads a parser pointer named "prs" -- presumably this is
 * p_isurlchar(TParser *prs); confirm against upstream.
 */
503 static int
505 {
506  char ch;
507 
508  /* no non-ASCII need apply */
509  if (prs->state->charlen != 1)
510  return 0;
511  ch = *(prs->str + prs->state->posbyte);
512  /* no spaces or control characters */
513  if (ch <= 0x20 || ch >= 0x7F)
514  return 0;
515  /* reject characters disallowed by RFC 3986 */
516  switch (ch)
517  {
518  case '"':
519  case '<':
520  case '>':
521  case '\\':
522  case '^':
523  case '`':
524  case '{':
525  case '|':
526  case '}':
527  return 0;
528  }
529  return 1;
530 }
531 
532 
/*
 * deliberately suppress unused-function complaints for the above
 *
 * Every generated p_is<type>/p_isnot<type> test, plus p_isEOF, p_iseqC and
 * p_isneC, is referenced once here so that none of them counts as unused.
 * The calls pass NULL, so this function is not meant to be executed.
 */
void        _make_compiler_happy(void);
void
_make_compiler_happy(void)
{
    p_isalnum(NULL);
    p_isnotalnum(NULL);
    p_isalpha(NULL);
    p_isnotalpha(NULL);
    p_isdigit(NULL);
    p_isnotdigit(NULL);
    p_islower(NULL);
    p_isnotlower(NULL);
    p_isprint(NULL);
    p_isnotprint(NULL);
    p_ispunct(NULL);
    p_isnotpunct(NULL);
    p_isspace(NULL);
    p_isnotspace(NULL);
    p_isupper(NULL);
    p_isnotupper(NULL);
    p_isxdigit(NULL);
    p_isnotxdigit(NULL);
    p_isEOF(NULL);
    p_iseqC(NULL);
    p_isneC(NULL);
}
560 
561 
/*
 * Switches prs->ignore on or off depending on the tag text in prs->token,
 * compared case-insensitively and selected by the token's length in
 * characters: "<script" (7) or "<style" (6) sets ignore, "</script" (8) or
 * "</style" (7) clears it.  Any other token leaves ignore unchanged.
 *
 * NOTE(review): the declarator line (listing line 563) is missing from this
 * copy; presumably this is the tag special handler, SpecialTags(TParser *prs);
 * confirm against upstream.
 */
562 static void
564 {
565  switch (prs->state->lenchartoken)
566  {
567  case 8: /* </script */
568  if (pg_strncasecmp(prs->token, "</script", 8) == 0)
569  prs->ignore = false;
570  break;
571  case 7: /* <script || </style */
572  if (pg_strncasecmp(prs->token, "</style", 7) == 0)
573  prs->ignore = false;
574  else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
575  prs->ignore = true;
576  break;
577  case 6: /* <style */
578  if (pg_strncasecmp(prs->token, "<style", 6) == 0)
579  prs->ignore = true;
580  break;
581  default:
582  break;
583  }
584 }
585 
/*
 * Sets prs->wanthost and moves the current position back to the start of
 * the token read so far (posbyte/poschar are reduced by the token's byte and
 * character lengths).  The token lengths themselves are left untouched.
 *
 * NOTE(review): the declarator line (listing line 587) is missing from this
 * copy; presumably SpecialFURL(TParser *prs) -- confirm against upstream.
 */
586 static void
588 {
589  prs->wanthost = true;
590  prs->state->posbyte -= prs->state->lenbytetoken;
591  prs->state->poschar -= prs->state->lenchartoken;
592 }
593 
/*
 * Moves the current position back to the start of the token read so far
 * (posbyte/poschar are reduced by the token's byte and character lengths).
 * The token lengths themselves are left untouched.
 *
 * NOTE(review): the declarator line (listing line 595) is missing from this
 * copy; presumably SpecialHyphen(TParser *prs) -- confirm against upstream.
 */
594 static void
596 {
597  prs->state->posbyte -= prs->state->lenbytetoken;
598  prs->state->poschar -= prs->state->lenchartoken;
599 }
600 
/*
 * Moves the current position back to the start of the token read so far and
 * then resets both token lengths (bytes and characters) to zero.
 *
 * NOTE(review): the declarator line (listing line 602) is missing from this
 * copy; presumably SpecialVerVersion(TParser *prs), the handler named in the
 * action table entry that targets TPS_InSVerVersion -- confirm against
 * upstream.
 */
601 static void
603 {
604  prs->state->posbyte -= prs->state->lenbytetoken;
605  prs->state->poschar -= prs->state->lenchartoken;
606  prs->state->lenbytetoken = 0;
607  prs->state->lenchartoken = 0;
608 }
609 
/*
 * One-shot test of the wanthost flag: if prs->wanthost is set, clears it and
 * returns 1; otherwise returns 0.
 *
 * NOTE(review): the declarator line (listing line 611) is missing from this
 * copy; presumably p_isstophost(TParser *prs) -- confirm against upstream.
 */
610 static int
612 {
613  if (prs->wanthost)
614  {
615  prs->wanthost = false;
616  return 1;
617  }
618  return 0;
619 }
620 
621 static int
623 {
624  return (prs->ignore) ? 1 : 0;
625 }
626 
/*
 * Tests whether a HOST token starts at the current position.
 *
 * A temporary parser copy (TParserCopyInit) with wanthost set is run through
 * TParserGet().  If it yields a token of type HOST, the caller's position
 * and token-so-far lengths are advanced by that token's byte and character
 * lengths, charlen is taken over from the copy, and 1 is returned; otherwise
 * the caller's state is left alone and 0 is returned.  The copy is closed in
 * either case.
 *
 * NOTE(review): the declarator line (listing line 628) is missing from this
 * copy; presumably p_ishost(TParser *prs) -- confirm against upstream.
 */
627 static int
629 {
630  TParser *tmpprs = TParserCopyInit(prs);
631  int res = 0;
632 
633  tmpprs->wanthost = true;
634 
635  /*
636  * Check stack depth before recursing. (Since TParserGet() doesn't
637  * normally recurse, we put the cost of checking here not there.)
638  */
/*
 * NOTE(review): listing line 639 is missing here -- presumably the actual
 * stack-depth check call announced by the comment above; confirm.
 */
640 
641  if (TParserGet(tmpprs) && tmpprs->type == HOST)
642  {
643  prs->state->posbyte += tmpprs->lenbytetoken;
644  prs->state->poschar += tmpprs->lenchartoken;
645  prs->state->lenbytetoken += tmpprs->lenbytetoken;
646  prs->state->lenchartoken += tmpprs->lenchartoken;
647  prs->state->charlen = tmpprs->state->charlen;
648  res = 1;
649  }
650  TParserCopyClose(tmpprs);
651 
652  return res;
653 }
654 
/*
 * Tests whether a URLPATH token starts at the current position.
 *
 * A temporary parser copy (TParserCopyInit) gets an extra position record
 * pushed in state TPS_InURLPathFirst and is run through TParserGet().  If it
 * yields a token of type URLPATH, the caller's position and token-so-far
 * lengths are advanced by that token's byte and character lengths, charlen
 * is taken over from the copy, and 1 is returned; otherwise the caller's
 * state is left alone and 0 is returned.  The copy is closed in either case.
 *
 * NOTE(review): the declarator line (listing line 656) is missing from this
 * copy; presumably p_isURLPath(TParser *prs) -- confirm against upstream.
 */
655 static int
657 {
658  TParser *tmpprs = TParserCopyInit(prs);
659  int res = 0;
660 
661  tmpprs->state = newTParserPosition(tmpprs->state);
662  tmpprs->state->state = TPS_InURLPathFirst;
663 
664  /*
665  * Check stack depth before recursing. (Since TParserGet() doesn't
666  * normally recurse, we put the cost of checking here not there.)
667  */
/*
 * NOTE(review): listing line 668 is missing here -- presumably the actual
 * stack-depth check call announced by the comment above; confirm.
 */
669 
670  if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
671  {
672  prs->state->posbyte += tmpprs->lenbytetoken;
673  prs->state->poschar += tmpprs->lenchartoken;
674  prs->state->lenbytetoken += tmpprs->lenbytetoken;
675  prs->state->lenchartoken += tmpprs->lenchartoken;
676  prs->state->charlen = tmpprs->state->charlen;
677  res = 1;
678  }
679  TParserCopyClose(tmpprs);
680 
681  return res;
682 }
683 
684 /*
685  * returns true if current character has zero display length or
686  * it's a special sign in several languages. Such characters
687  * aren't a word-breaker although they aren't an isalpha.
688  * In beginning of word they aren't a part of it.
689  */
690 static int
692 {
693  /*
694  * pg_dsplen could return -1 which means error or control character
695  */
696  if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
697  return 1;
698 
699  /*
700  * Unicode Characters in the 'Mark, Spacing Combining' Category That
701  * characters are not alpha although they are not breakers of word too.
702  * Check that only in utf encoding, because other encodings aren't
703  * supported by postgres or even exists.
704  */
705  if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
706  {
707  static const pg_wchar strange_letter[] = {
708  /*
709  * use binary search, so elements should be ordered
710  */
711  0x0903, /* DEVANAGARI SIGN VISARGA */
712  0x093E, /* DEVANAGARI VOWEL SIGN AA */
713  0x093F, /* DEVANAGARI VOWEL SIGN I */
714  0x0940, /* DEVANAGARI VOWEL SIGN II */
715  0x0949, /* DEVANAGARI VOWEL SIGN CANDRA O */
716  0x094A, /* DEVANAGARI VOWEL SIGN SHORT O */
717  0x094B, /* DEVANAGARI VOWEL SIGN O */
718  0x094C, /* DEVANAGARI VOWEL SIGN AU */
719  0x0982, /* BENGALI SIGN ANUSVARA */
720  0x0983, /* BENGALI SIGN VISARGA */
721  0x09BE, /* BENGALI VOWEL SIGN AA */
722  0x09BF, /* BENGALI VOWEL SIGN I */
723  0x09C0, /* BENGALI VOWEL SIGN II */
724  0x09C7, /* BENGALI VOWEL SIGN E */
725  0x09C8, /* BENGALI VOWEL SIGN AI */
726  0x09CB, /* BENGALI VOWEL SIGN O */
727  0x09CC, /* BENGALI VOWEL SIGN AU */
728  0x09D7, /* BENGALI AU LENGTH MARK */
729  0x0A03, /* GURMUKHI SIGN VISARGA */
730  0x0A3E, /* GURMUKHI VOWEL SIGN AA */
731  0x0A3F, /* GURMUKHI VOWEL SIGN I */
732  0x0A40, /* GURMUKHI VOWEL SIGN II */
733  0x0A83, /* GUJARATI SIGN VISARGA */
734  0x0ABE, /* GUJARATI VOWEL SIGN AA */
735  0x0ABF, /* GUJARATI VOWEL SIGN I */
736  0x0AC0, /* GUJARATI VOWEL SIGN II */
737  0x0AC9, /* GUJARATI VOWEL SIGN CANDRA O */
738  0x0ACB, /* GUJARATI VOWEL SIGN O */
739  0x0ACC, /* GUJARATI VOWEL SIGN AU */
740  0x0B02, /* ORIYA SIGN ANUSVARA */
741  0x0B03, /* ORIYA SIGN VISARGA */
742  0x0B3E, /* ORIYA VOWEL SIGN AA */
743  0x0B40, /* ORIYA VOWEL SIGN II */
744  0x0B47, /* ORIYA VOWEL SIGN E */
745  0x0B48, /* ORIYA VOWEL SIGN AI */
746  0x0B4B, /* ORIYA VOWEL SIGN O */
747  0x0B4C, /* ORIYA VOWEL SIGN AU */
748  0x0B57, /* ORIYA AU LENGTH MARK */
749  0x0BBE, /* TAMIL VOWEL SIGN AA */
750  0x0BBF, /* TAMIL VOWEL SIGN I */
751  0x0BC1, /* TAMIL VOWEL SIGN U */
752  0x0BC2, /* TAMIL VOWEL SIGN UU */
753  0x0BC6, /* TAMIL VOWEL SIGN E */
754  0x0BC7, /* TAMIL VOWEL SIGN EE */
755  0x0BC8, /* TAMIL VOWEL SIGN AI */
756  0x0BCA, /* TAMIL VOWEL SIGN O */
757  0x0BCB, /* TAMIL VOWEL SIGN OO */
758  0x0BCC, /* TAMIL VOWEL SIGN AU */
759  0x0BD7, /* TAMIL AU LENGTH MARK */
760  0x0C01, /* TELUGU SIGN CANDRABINDU */
761  0x0C02, /* TELUGU SIGN ANUSVARA */
762  0x0C03, /* TELUGU SIGN VISARGA */
763  0x0C41, /* TELUGU VOWEL SIGN U */
764  0x0C42, /* TELUGU VOWEL SIGN UU */
765  0x0C43, /* TELUGU VOWEL SIGN VOCALIC R */
766  0x0C44, /* TELUGU VOWEL SIGN VOCALIC RR */
767  0x0C82, /* KANNADA SIGN ANUSVARA */
768  0x0C83, /* KANNADA SIGN VISARGA */
769  0x0CBE, /* KANNADA VOWEL SIGN AA */
770  0x0CC0, /* KANNADA VOWEL SIGN II */
771  0x0CC1, /* KANNADA VOWEL SIGN U */
772  0x0CC2, /* KANNADA VOWEL SIGN UU */
773  0x0CC3, /* KANNADA VOWEL SIGN VOCALIC R */
774  0x0CC4, /* KANNADA VOWEL SIGN VOCALIC RR */
775  0x0CC7, /* KANNADA VOWEL SIGN EE */
776  0x0CC8, /* KANNADA VOWEL SIGN AI */
777  0x0CCA, /* KANNADA VOWEL SIGN O */
778  0x0CCB, /* KANNADA VOWEL SIGN OO */
779  0x0CD5, /* KANNADA LENGTH MARK */
780  0x0CD6, /* KANNADA AI LENGTH MARK */
781  0x0D02, /* MALAYALAM SIGN ANUSVARA */
782  0x0D03, /* MALAYALAM SIGN VISARGA */
783  0x0D3E, /* MALAYALAM VOWEL SIGN AA */
784  0x0D3F, /* MALAYALAM VOWEL SIGN I */
785  0x0D40, /* MALAYALAM VOWEL SIGN II */
786  0x0D46, /* MALAYALAM VOWEL SIGN E */
787  0x0D47, /* MALAYALAM VOWEL SIGN EE */
788  0x0D48, /* MALAYALAM VOWEL SIGN AI */
789  0x0D4A, /* MALAYALAM VOWEL SIGN O */
790  0x0D4B, /* MALAYALAM VOWEL SIGN OO */
791  0x0D4C, /* MALAYALAM VOWEL SIGN AU */
792  0x0D57, /* MALAYALAM AU LENGTH MARK */
793  0x0D82, /* SINHALA SIGN ANUSVARAYA */
794  0x0D83, /* SINHALA SIGN VISARGAYA */
795  0x0DCF, /* SINHALA VOWEL SIGN AELA-PILLA */
796  0x0DD0, /* SINHALA VOWEL SIGN KETTI AEDA-PILLA */
797  0x0DD1, /* SINHALA VOWEL SIGN DIGA AEDA-PILLA */
798  0x0DD8, /* SINHALA VOWEL SIGN GAETTA-PILLA */
799  0x0DD9, /* SINHALA VOWEL SIGN KOMBUVA */
800  0x0DDA, /* SINHALA VOWEL SIGN DIGA KOMBUVA */
801  0x0DDB, /* SINHALA VOWEL SIGN KOMBU DEKA */
802  0x0DDC, /* SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
803  0x0DDD, /* SINHALA VOWEL SIGN KOMBUVA HAA DIGA
804  * AELA-PILLA */
805  0x0DDE, /* SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
806  0x0DDF, /* SINHALA VOWEL SIGN GAYANUKITTA */
807  0x0DF2, /* SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
808  0x0DF3, /* SINHALA VOWEL SIGN DIGA GAYANUKITTA */
809  0x0F3E, /* TIBETAN SIGN YAR TSHES */
810  0x0F3F, /* TIBETAN SIGN MAR TSHES */
811  0x0F7F, /* TIBETAN SIGN RNAM BCAD */
812  0x102B, /* MYANMAR VOWEL SIGN TALL AA */
813  0x102C, /* MYANMAR VOWEL SIGN AA */
814  0x1031, /* MYANMAR VOWEL SIGN E */
815  0x1038, /* MYANMAR SIGN VISARGA */
816  0x103B, /* MYANMAR CONSONANT SIGN MEDIAL YA */
817  0x103C, /* MYANMAR CONSONANT SIGN MEDIAL RA */
818  0x1056, /* MYANMAR VOWEL SIGN VOCALIC R */
819  0x1057, /* MYANMAR VOWEL SIGN VOCALIC RR */
820  0x1062, /* MYANMAR VOWEL SIGN SGAW KAREN EU */
821  0x1063, /* MYANMAR TONE MARK SGAW KAREN HATHI */
822  0x1064, /* MYANMAR TONE MARK SGAW KAREN KE PHO */
823  0x1067, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
824  0x1068, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
825  0x1069, /* MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
826  0x106A, /* MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
827  0x106B, /* MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
828  0x106C, /* MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
829  0x106D, /* MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
830  0x1083, /* MYANMAR VOWEL SIGN SHAN AA */
831  0x1084, /* MYANMAR VOWEL SIGN SHAN E */
832  0x1087, /* MYANMAR SIGN SHAN TONE-2 */
833  0x1088, /* MYANMAR SIGN SHAN TONE-3 */
834  0x1089, /* MYANMAR SIGN SHAN TONE-5 */
835  0x108A, /* MYANMAR SIGN SHAN TONE-6 */
836  0x108B, /* MYANMAR SIGN SHAN COUNCIL TONE-2 */
837  0x108C, /* MYANMAR SIGN SHAN COUNCIL TONE-3 */
838  0x108F, /* MYANMAR SIGN RUMAI PALAUNG TONE-5 */
839  0x17B6, /* KHMER VOWEL SIGN AA */
840  0x17BE, /* KHMER VOWEL SIGN OE */
841  0x17BF, /* KHMER VOWEL SIGN YA */
842  0x17C0, /* KHMER VOWEL SIGN IE */
843  0x17C1, /* KHMER VOWEL SIGN E */
844  0x17C2, /* KHMER VOWEL SIGN AE */
845  0x17C3, /* KHMER VOWEL SIGN AI */
846  0x17C4, /* KHMER VOWEL SIGN OO */
847  0x17C5, /* KHMER VOWEL SIGN AU */
848  0x17C7, /* KHMER SIGN REAHMUK */
849  0x17C8, /* KHMER SIGN YUUKALEAPINTU */
850  0x1923, /* LIMBU VOWEL SIGN EE */
851  0x1924, /* LIMBU VOWEL SIGN AI */
852  0x1925, /* LIMBU VOWEL SIGN OO */
853  0x1926, /* LIMBU VOWEL SIGN AU */
854  0x1929, /* LIMBU SUBJOINED LETTER YA */
855  0x192A, /* LIMBU SUBJOINED LETTER RA */
856  0x192B, /* LIMBU SUBJOINED LETTER WA */
857  0x1930, /* LIMBU SMALL LETTER KA */
858  0x1931, /* LIMBU SMALL LETTER NGA */
859  0x1933, /* LIMBU SMALL LETTER TA */
860  0x1934, /* LIMBU SMALL LETTER NA */
861  0x1935, /* LIMBU SMALL LETTER PA */
862  0x1936, /* LIMBU SMALL LETTER MA */
863  0x1937, /* LIMBU SMALL LETTER RA */
864  0x1938, /* LIMBU SMALL LETTER LA */
865  0x19B0, /* NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
866  0x19B1, /* NEW TAI LUE VOWEL SIGN AA */
867  0x19B2, /* NEW TAI LUE VOWEL SIGN II */
868  0x19B3, /* NEW TAI LUE VOWEL SIGN U */
869  0x19B4, /* NEW TAI LUE VOWEL SIGN UU */
870  0x19B5, /* NEW TAI LUE VOWEL SIGN E */
871  0x19B6, /* NEW TAI LUE VOWEL SIGN AE */
872  0x19B7, /* NEW TAI LUE VOWEL SIGN O */
873  0x19B8, /* NEW TAI LUE VOWEL SIGN OA */
874  0x19B9, /* NEW TAI LUE VOWEL SIGN UE */
875  0x19BA, /* NEW TAI LUE VOWEL SIGN AY */
876  0x19BB, /* NEW TAI LUE VOWEL SIGN AAY */
877  0x19BC, /* NEW TAI LUE VOWEL SIGN UY */
878  0x19BD, /* NEW TAI LUE VOWEL SIGN OY */
879  0x19BE, /* NEW TAI LUE VOWEL SIGN OAY */
880  0x19BF, /* NEW TAI LUE VOWEL SIGN UEY */
881  0x19C0, /* NEW TAI LUE VOWEL SIGN IY */
882  0x19C8, /* NEW TAI LUE TONE MARK-1 */
883  0x19C9, /* NEW TAI LUE TONE MARK-2 */
884  0x1A19, /* BUGINESE VOWEL SIGN E */
885  0x1A1A, /* BUGINESE VOWEL SIGN O */
886  0x1A1B, /* BUGINESE VOWEL SIGN AE */
887  0x1B04, /* BALINESE SIGN BISAH */
888  0x1B35, /* BALINESE VOWEL SIGN TEDUNG */
889  0x1B3B, /* BALINESE VOWEL SIGN RA REPA TEDUNG */
890  0x1B3D, /* BALINESE VOWEL SIGN LA LENGA TEDUNG */
891  0x1B3E, /* BALINESE VOWEL SIGN TALING */
892  0x1B3F, /* BALINESE VOWEL SIGN TALING REPA */
893  0x1B40, /* BALINESE VOWEL SIGN TALING TEDUNG */
894  0x1B41, /* BALINESE VOWEL SIGN TALING REPA TEDUNG */
895  0x1B43, /* BALINESE VOWEL SIGN PEPET TEDUNG */
896  0x1B44, /* BALINESE ADEG ADEG */
897  0x1B82, /* SUNDANESE SIGN PANGWISAD */
898  0x1BA1, /* SUNDANESE CONSONANT SIGN PAMINGKAL */
899  0x1BA6, /* SUNDANESE VOWEL SIGN PANAELAENG */
900  0x1BA7, /* SUNDANESE VOWEL SIGN PANOLONG */
901  0x1BAA, /* SUNDANESE SIGN PAMAAEH */
902  0x1C24, /* LEPCHA SUBJOINED LETTER YA */
903  0x1C25, /* LEPCHA SUBJOINED LETTER RA */
904  0x1C26, /* LEPCHA VOWEL SIGN AA */
905  0x1C27, /* LEPCHA VOWEL SIGN I */
906  0x1C28, /* LEPCHA VOWEL SIGN O */
907  0x1C29, /* LEPCHA VOWEL SIGN OO */
908  0x1C2A, /* LEPCHA VOWEL SIGN U */
909  0x1C2B, /* LEPCHA VOWEL SIGN UU */
910  0x1C34, /* LEPCHA CONSONANT SIGN NYIN-DO */
911  0x1C35, /* LEPCHA CONSONANT SIGN KANG */
912  0xA823, /* SYLOTI NAGRI VOWEL SIGN A */
913  0xA824, /* SYLOTI NAGRI VOWEL SIGN I */
914  0xA827, /* SYLOTI NAGRI VOWEL SIGN OO */
915  0xA880, /* SAURASHTRA SIGN ANUSVARA */
916  0xA881, /* SAURASHTRA SIGN VISARGA */
917  0xA8B4, /* SAURASHTRA CONSONANT SIGN HAARU */
918  0xA8B5, /* SAURASHTRA VOWEL SIGN AA */
919  0xA8B6, /* SAURASHTRA VOWEL SIGN I */
920  0xA8B7, /* SAURASHTRA VOWEL SIGN II */
921  0xA8B8, /* SAURASHTRA VOWEL SIGN U */
922  0xA8B9, /* SAURASHTRA VOWEL SIGN UU */
923  0xA8BA, /* SAURASHTRA VOWEL SIGN VOCALIC R */
924  0xA8BB, /* SAURASHTRA VOWEL SIGN VOCALIC RR */
925  0xA8BC, /* SAURASHTRA VOWEL SIGN VOCALIC L */
926  0xA8BD, /* SAURASHTRA VOWEL SIGN VOCALIC LL */
927  0xA8BE, /* SAURASHTRA VOWEL SIGN E */
928  0xA8BF, /* SAURASHTRA VOWEL SIGN EE */
929  0xA8C0, /* SAURASHTRA VOWEL SIGN AI */
930  0xA8C1, /* SAURASHTRA VOWEL SIGN O */
931  0xA8C2, /* SAURASHTRA VOWEL SIGN OO */
932  0xA8C3, /* SAURASHTRA VOWEL SIGN AU */
933  0xA952, /* REJANG CONSONANT SIGN H */
934  0xA953, /* REJANG VIRAMA */
935  0xAA2F, /* CHAM VOWEL SIGN O */
936  0xAA30, /* CHAM VOWEL SIGN AI */
937  0xAA33, /* CHAM CONSONANT SIGN YA */
938  0xAA34, /* CHAM CONSONANT SIGN RA */
939  0xAA4D /* CHAM CONSONANT SIGN FINAL H */
940  };
941  const pg_wchar *StopLow = strange_letter,
942  *StopHigh = strange_letter + lengthof(strange_letter),
943  *StopMiddle;
944  pg_wchar c;
945 
946  if (prs->pgwstr)
947  c = *(prs->pgwstr + prs->state->poschar);
948  else
949  c = (pg_wchar) *(prs->wstr + prs->state->poschar);
950 
951  while (StopLow < StopHigh)
952  {
953  StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
954  if (*StopMiddle == c)
955  return 1;
956  else if (*StopMiddle < c)
957  StopLow = StopMiddle + 1;
958  else
959  StopHigh = StopMiddle;
960  }
961  }
962 
963  return 0;
964 }
965 
966 /*
967  * Table of state/action of parser
968  */
969 
/*
 * NOTE(review): the declaration line of this table (listing line 970) is
 * missing from this copy.  The entries dispatch on the first character of a
 * token and fall back to TPS_InSpace, so this is presumably the action list
 * of the start state TPS_Base -- confirm against upstream.
 */
971  {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
972  {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
973  {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
974  {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
975  {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
976  {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
977  {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
978  {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
979  {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
980  {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
981  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
982  {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
983  {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
984 };
985 
986 
/*
 * NOTE(review): the declaration line of this table (listing line 987) is
 * missing from this copy.  It loops on TPS_InNumWord and finishes with
 * A_BINGO/NUMWORD, so it is presumably the action list of TPS_InNumWord --
 * confirm against upstream.
 */
988  {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
989  {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
990  {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
991  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
992  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
993  {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
994  {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
995  {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
996 };
997 
/*
 * NOTE(review): the declaration line of this table (listing line 998) is
 * missing from this copy.  It finishes with A_BINGO/ASCIIWORD, so it is
 * presumably the action list of TPS_InAsciiWord -- confirm against upstream.
 * Several characters ('.', '-', digits) have two entries in a row with
 * different target states.
 */
999  {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
1000  {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1001  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1002  {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1003  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1004  {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1005  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1006  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1007  {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
1008  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1009  {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1010  {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1011  {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
1012  {p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
1013  {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
1014 };
1015 
/*
 * NOTE(review): the declaration line of this table (listing line 1016) is
 * missing from this copy.  It finishes with A_BINGO/WORD_T, so it is
 * presumably the action list of TPS_InWord -- confirm against upstream.
 */
1017  {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
1018  {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
1019  {p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
1020  {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1021  {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1022  {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
1023 };
1024 
/*
 * NOTE(review): the declaration line of this table (listing line 1025) is
 * missing from this copy.  It finishes with A_BINGO/UNSIGNEDINT, so it is
 * presumably the action list of TPS_InUnsignedInt -- confirm against
 * upstream.  '.' has two entries in a row with different target states.
 */
1026  {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
1027  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1028  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1029  {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
1030  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1031  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1032  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1033  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1034  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1035  {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
1036  {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1037  {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1038  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1039  {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
1040 };
1041 
/*
 * NOTE(review): the declaration line of this table (listing line 1042) is
 * missing from this copy.  A digit leads to TPS_InSignedInt and everything
 * else is A_POP, so it is presumably the action list of
 * TPS_InSignedIntFirst -- confirm against upstream.
 */
1043  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1044  {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
1045  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1046 };
1047 
/*
 * NOTE(review): the declaration line of this table (listing line 1048) is
 * missing from this copy.  It finishes with A_BINGO/SIGNEDINT, so it is
 * presumably the action list of TPS_InSignedInt -- confirm against upstream.
 */
1049  {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
1050  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1051  {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
1052  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1053  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1054  {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
1055 };
1056 
/*
 * NOTE(review): the declaration line of this table (listing line 1057) is
 * missing from this copy.  It loops on TPS_InSpace and finishes with
 * A_BINGO/SPACE, so it is presumably the action list of TPS_InSpace --
 * confirm against upstream.
 */
1058  {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
1059  {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
1060  {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
1061  {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
1062  {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
1063  {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
1064  {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
1065  {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
1066  {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
1067 };
1068 
/*
 * NOTE(review): the declaration line of this table (listing line 1069) is
 * missing from this copy.  A digit leads to TPS_InUDecimal and everything
 * else is A_POP, so it is presumably the action list of TPS_InUDecimalFirst
 * -- confirm against upstream.
 */
1070  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1071  {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
1072  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1073 };
1074 
/*
 * NOTE(review): the declaration line of this table (listing line 1075) is
 * missing from this copy.  It loops on TPS_InUDecimal and finishes with
 * A_BINGO/DECIMAL_T, so it is presumably the action list of TPS_InUDecimal
 * -- confirm against upstream.
 */
1076  {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1077  {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
1078  {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1079  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1080  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1081  {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1082 };
1083 
/*
 * NOTE(review): the declaration line of this table (listing line 1084) is
 * missing from this copy.  A digit leads to TPS_InDecimal and everything
 * else is A_POP, so it is presumably the action list of TPS_InDecimalFirst
 * -- confirm against upstream.
 */
1085  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1086  {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
1087  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1088 };
1089 
/*
 * NOTE(review): the declaration line of this table (listing line 1090) is
 * missing from this copy.  It loops on TPS_InDecimal and finishes with
 * A_BINGO/DECIMAL_T, so it is presumably the action list of TPS_InDecimal
 * -- confirm against upstream.
 */
1091  {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1092  {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
1093  {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
1094  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1095  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1096  {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1097 };
1098 
/*
 * NOTE(review): the declaration line of this table (listing line 1099) is
 * missing from this copy.  On a digit it reruns in TPS_InSVerVersion with the
 * SpecialVerVersion handler; presumably the action list of TPS_InVerVersion
 * -- confirm against upstream.
 */
1100  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1101  {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
1102  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1103 };
1104 
/*
 * NOTE(review): the declaration line of this table (listing line 1105) is
 * missing from this copy.  A digit gives A_BINGO | A_CLRALL with type SPACE
 * and next state TPS_InUnsignedInt; presumably the action list of
 * TPS_InSVerVersion -- confirm against upstream.
 */
1106  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1107  {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
1108  {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1109 };
1110 
1111 
/*
 * NOTE(review): the declaration line of this table (listing line 1112) is
 * missing from this copy.  A digit leads to TPS_InVersion and everything
 * else is A_POP, so it is presumably the action list of TPS_InVersionFirst
 * -- confirm against upstream.
 */
1113  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1114  {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
1115  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1116 };
1117 
/*
 * NOTE(review): the declaration line of this table (listing line 1118) is
 * missing from this copy.  It loops on TPS_InVersion and finishes with
 * A_BINGO/VERSIONNUMBER, so it is presumably the action list of
 * TPS_InVersion -- confirm against upstream.
 */
1119  {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
1120  {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
1121  {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1122  {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
1123 };
1124 
/*
 * NOTE(review): the declaration line of this table (listing line 1125) is
 * missing from this copy.  It accepts a digit (to TPS_InMantissa) or a sign
 * (to TPS_InMantissaSign), so it is presumably the action list of
 * TPS_InMantissaFirst -- confirm against upstream.
 */
1126  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1127  {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1128  {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
1129  {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
1130  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1131 };
1132 
/*
 * NOTE(review): the declaration line of this table (listing line 1133) is
 * missing from this copy.  Only a digit (to TPS_InMantissa) is accepted, so
 * it is presumably the action list of TPS_InMantissaSign -- confirm against
 * upstream.
 */
1134  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1135  {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1136  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1137 };
1138 
/*
 * NOTE(review): the declaration line of this table (listing line 1139) is
 * missing from this copy.  It loops on TPS_InMantissa and finishes with
 * A_BINGO/SCIENTIFIC, so it is presumably the action list of TPS_InMantissa
 * -- confirm against upstream.
 */
1140  {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
1141  {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
1142  {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
1143 };
1144 
/*
 * NOTE(review): the declaration line of this table (listing line 1145) is
 * missing from this copy.  It accepts '#' (numeric entity) or a name start
 * character, so it is presumably the action list of TPS_InXMLEntityFirst --
 * confirm against upstream.
 */
1146  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1147  {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
1148  {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1149  {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1150  {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1151  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1152 };
1153 
/*
 * NOTE(review): the declaration line of this table (listing line 1154) is
 * missing from this copy.  It loops on TPS_InXMLEntity until ';' leads to
 * TPS_InXMLEntityEnd, so it is presumably the action list of
 * TPS_InXMLEntity -- confirm against upstream.
 */
1155  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1156  {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1157  {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1158  {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1159  {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
1160  {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
1161  {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1162  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1163 };
1164 
1166  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1167  {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1168  {p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1169  {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1170  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1171 };
1172 
1174  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1175  {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1176  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1177 };
1178 
1180  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1181  {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1182  {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1183  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1184 };
1185 
1187  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1188  {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1189  {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1190  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1191 };
1192 
1194  {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
1195 };
1196 
1198  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1199  {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
1200  {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
1201  {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
1202  {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
1203  {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
1204  {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
1205  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1206 };
1207 
1209  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1210  /* <?xml ... */
1211  /* XXX do we wants states for the m and l ? Right now this accepts <?xZ */
1212  {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
1213  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1214 };
1215 
1217  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1218  {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
1219  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1220 };
1221 
1223  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1224  /* <br/> case */
1225  {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
1226  {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1227  {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
1228  {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
1229  {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1230  {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1231  {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1232  {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1233  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1234 };
1235 
1237  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1238  {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
1239  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1240 };
1241 
1243  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1244  {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1245  {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
1246  {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
1247  {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1248  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1249  {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
1250  {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1251  {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1252  {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
1253  {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
1254  {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1255  {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1256  {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
1257  {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
1258  {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
1259  {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
1260  {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
1261  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1262 };
1263 
1265  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1266  {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1267  {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
1268  {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
1269 };
1270 
1272  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1273  {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1274  {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
1275  {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
1276 };
1277 
1279  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1280  {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
1281 };
1282 
1284  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1285 };
1286 
1288  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1289  {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
1290  /* <!DOCTYPE ...> */
1291  {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
1292  {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
1293  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1294 };
1295 
1297  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1298  {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
1299  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1300 };
1301 
1303  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1304  {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
1305  {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1306 };
1307 
1309  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1310  {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
1311  {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1312 };
1313 
1315  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1316  {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1317  {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
1318  {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1319 };
1320 
1322  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1323 };
1324 
1326  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1327  {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
1328  {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1329  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1330 };
1331 
1333  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1334  {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1335  {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1336  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1337  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1338  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1339  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1340  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1341 };
1342 
1344  {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1345  {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1346  {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1347  {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
1348  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1349  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1350  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1351  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1352  {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
1354  {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1355  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1356 };
1357 
1359  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1360  {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1361  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1362 };
1363 
1365  {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1366  {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1368  {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1369  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1370 };
1371 
1373  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1374  {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1375  {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1376  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1377 };
1378 
1380  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1381  {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1382  {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1383  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1384  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1385  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1386  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1387  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1388 };
1389 
1391  {p_isstophost, 0, A_POP, TPS_Null, 0, NULL},
1392  {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
1393  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1394 };
1395 
1397  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1398  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1399  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1400  {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1401  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1402  {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1403  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1404 };
1405 
1407  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1408  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1409  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1410  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1411  {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1412  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1413 };
1414 
1416  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1417  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1418  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1419  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1420  {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1421  {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1422  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1423 };
1424 
1426  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1427  {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1428  {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1429  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1430 };
1431 
1433  {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1434  {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
1435  {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1436  {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1437  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1438 };
1439 
1441  {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
1442  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1443  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1444  {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1445  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1446  {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1447  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1448  {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
1449 };
1450 
1452  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1453  {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1454  {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1455  {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1456  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1457 };
1458 
1460  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1461  {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1462  {NULL, 0, A_POP, TPS_Null, 0, NULL},
1463 };
1464 
1466  {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
1467 };
1468 
1470  {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
1471  {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1472  {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
1473 };
1474 
1476  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1478  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1479 };
1480 
1482  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1483  {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
1484  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1485 };
1486 
1488  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1489  {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1490  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1491 };
1492 
1494  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
1495 };
1496 
1498  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1499  {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1500  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1501  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1502  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1503 };
1504 
1507  {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1508  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1509  {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1510  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1511  {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1513 };
1514 
1516  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1517  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1518  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1519  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1520 };
1521 
1524  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1525  {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1526  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1527  {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1529 };
1530 
1532  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1533  {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1534  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1535  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1536 };
1537 
1540  {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1541  {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1542  {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1544 };
1545 
1547  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1548  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1549  {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1550  {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1551  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1552 };
1553 
1555  {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
1557  {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1558  {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
1559  {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
1560  {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
1561 };
1562 
1564  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1565  {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1567  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1568 };
1569 
1571  {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
1572  {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1573  {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1574  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1575  {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
1576 };
1577 
1579  {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
1581  {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1582  {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1583  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1584  {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
1585 };
1586 
1588  {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
1589  {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1590  {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1591  {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
1592 };
1593 
1595  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1596  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1597  {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1599  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1600 };
1601 
1602 
1603 /*
1604  * main table of per-state parser actions
1605  */
1606 typedef struct
1607 {
1608  const TParserStateActionItem *action; /* the actual state info */
1609  TParserState state; /* only for Assert crosscheck */
1610 #ifdef WPARSER_TRACE
1611  const char *state_name; /* only for debug printout */
1612 #endif
1614 
/*
 * TPARSERSTATEACTION(state) expands to one brace-enclosed initializer for a
 * table entry: the identifier formed by CppConcat(action,state), then the
 * state value itself.  When WPARSER_TRACE is defined, a third element,
 * CppAsString(state), is appended to fill the state_name field used by the
 * debug printout.
 */
1615 #ifdef WPARSER_TRACE
1616 #define TPARSERSTATEACTION(state) \
1617  { CppConcat(action,state), state, CppAsString(state) }
1618 #else
1619 #define TPARSERSTATEACTION(state) \
1620  { CppConcat(action,state), state }
1621 #endif
1622 
1623 /*
1624  * order must be the same as in typedef enum {} TParserState!!
1625  */
1626 
1627 static const TParserStateAction Actions[] = {
1705 };
1706 
1707 
1708 static bool
1710 {
1711  const TParserStateActionItem *item = NULL;
1712 
1714 
1715  Assert(prs->state);
1716 
1717  if (prs->state->posbyte >= prs->lenstr)
1718  return false;
1719 
1720  prs->token = prs->str + prs->state->posbyte;
1721  prs->state->pushedAtAction = NULL;
1722 
1723  /* look at string */
1724  while (prs->state->posbyte <= prs->lenstr)
1725  {
1726  if (prs->state->posbyte == prs->lenstr)
1727  prs->state->charlen = 0;
1728  else
1729  prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
1730  pg_mblen(prs->str + prs->state->posbyte);
1731 
1732  Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
1733  Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
1734  Assert(Actions[prs->state->state].state == prs->state->state);
1735 
1736  if (prs->state->pushedAtAction)
1737  {
1738  /* After a POP, pick up at the next test */
1739  item = prs->state->pushedAtAction + 1;
1740  prs->state->pushedAtAction = NULL;
1741  }
1742  else
1743  {
1744  item = Actions[prs->state->state].action;
1745  Assert(item != NULL);
1746  }
1747 
1748  /* find action by character class */
1749  while (item->isclass)
1750  {
1751  prs->c = item->c;
1752  if (item->isclass(prs) != 0)
1753  break;
1754  item++;
1755  }
1756 
1757 #ifdef WPARSER_TRACE
1758  {
1759  TParserPosition *ptr;
1760 
1761  fprintf(stderr, "state ");
1762  /* indent according to stack depth */
1763  for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
1764  fprintf(stderr, " ");
1765  fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
1766  if (prs->state->posbyte < prs->lenstr)
1767  fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
1768  else
1769  fprintf(stderr, "at EOF");
1770  fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1771  (int) (item - Actions[prs->state->state].action),
1772  (item->flags & A_BINGO) ? " BINGO" : "",
1773  (item->flags & A_POP) ? " POP" : "",
1774  (item->flags & A_PUSH) ? " PUSH" : "",
1775  (item->flags & A_RERUN) ? " RERUN" : "",
1776  (item->flags & A_CLEAR) ? " CLEAR" : "",
1777  (item->flags & A_MERGE) ? " MERGE" : "",
1778  (item->flags & A_CLRALL) ? " CLRALL" : "",
1779  (item->tostate != TPS_Null) ? " tostate " : "",
1780  (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
1781  (item->type > 0) ? " type " : "",
1782  tok_alias[item->type]);
1783  }
1784 #endif
1785 
1786  /* call special handler if exists */
1787  if (item->special)
1788  item->special(prs);
1789 
1790  /* BINGO, token is found */
1791  if (item->flags & A_BINGO)
1792  {
1793  Assert(item->type > 0);
1794  prs->lenbytetoken = prs->state->lenbytetoken;
1795  prs->lenchartoken = prs->state->lenchartoken;
1796  prs->state->lenbytetoken = prs->state->lenchartoken = 0;
1797  prs->type = item->type;
1798  }
1799 
1800  /* do various actions by flags */
1801  if (item->flags & A_POP)
1802  { /* pop stored state in stack */
1803  TParserPosition *ptr = prs->state->prev;
1804 
1805  pfree(prs->state);
1806  prs->state = ptr;
1807  Assert(prs->state);
1808  }
1809  else if (item->flags & A_PUSH)
1810  { /* push (store) state in stack */
1811  prs->state->pushedAtAction = item; /* remember where we push */
1812  prs->state = newTParserPosition(prs->state);
1813  }
1814  else if (item->flags & A_CLEAR)
1815  { /* clear previous pushed state */
1816  TParserPosition *ptr;
1817 
1818  Assert(prs->state->prev);
1819  ptr = prs->state->prev->prev;
1820  pfree(prs->state->prev);
1821  prs->state->prev = ptr;
1822  }
1823  else if (item->flags & A_CLRALL)
1824  { /* clear all previous pushed state */
1825  TParserPosition *ptr;
1826 
1827  while (prs->state->prev)
1828  {
1829  ptr = prs->state->prev->prev;
1830  pfree(prs->state->prev);
1831  prs->state->prev = ptr;
1832  }
1833  }
1834  else if (item->flags & A_MERGE)
1835  { /* merge posinfo with current and pushed state */
1836  TParserPosition *ptr = prs->state;
1837 
1838  Assert(prs->state->prev);
1839  prs->state = prs->state->prev;
1840 
1841  prs->state->posbyte = ptr->posbyte;
1842  prs->state->poschar = ptr->poschar;
1843  prs->state->charlen = ptr->charlen;
1844  prs->state->lenbytetoken = ptr->lenbytetoken;
1845  prs->state->lenchartoken = ptr->lenchartoken;
1846  pfree(ptr);
1847  }
1848 
1849  /* set new state if pointed */
1850  if (item->tostate != TPS_Null)
1851  prs->state->state = item->tostate;
1852 
1853  /* check for go away */
1854  if ((item->flags & A_BINGO) ||
1855  (prs->state->posbyte >= prs->lenstr &&
1856  (item->flags & A_RERUN) == 0))
1857  break;
1858 
1859  /* go to beginning of loop if we should rerun or we just restore state */
1860  if (item->flags & (A_RERUN | A_POP))
1861  continue;
1862 
1863  /* move forward */
1864  if (prs->state->charlen)
1865  {
1866  prs->state->posbyte += prs->state->charlen;
1867  prs->state->lenbytetoken += prs->state->charlen;
1868  prs->state->poschar++;
1869  prs->state->lenchartoken++;
1870  }
1871  }
1872 
1873  return (item && (item->flags & A_BINGO));
1874 }
1875 
1876 Datum
1878 {
1879  LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
1880  int i;
1881 
1882  for (i = 1; i <= LASTNUM; i++)
1883  {
1884  descr[i - 1].lexid = i;
1885  descr[i - 1].alias = pstrdup(tok_alias[i]);
1886  descr[i - 1].descr = pstrdup(lex_descr[i]);
1887  }
1888 
1889  descr[LASTNUM].lexid = 0;
1890 
1891  PG_RETURN_POINTER(descr);
1892 }
1893 
1894 Datum
1896 {
1898 }
1899 
1900 Datum
1902 {
1903  TParser *p = (TParser *) PG_GETARG_POINTER(0);
1904  char **t = (char **) PG_GETARG_POINTER(1);
1905  int *tlen = (int *) PG_GETARG_POINTER(2);
1906 
1907  if (!TParserGet(p))
1908  PG_RETURN_INT32(0);
1909 
1910  *t = p->token;
1911  *tlen = p->lenbytetoken;
1912 
1913  PG_RETURN_INT32(p->type);
1914 }
1915 
1916 Datum
1918 {
1919  TParser *p = (TParser *) PG_GETARG_POINTER(0);
1920 
1921  TParserClose(p);
1922  PG_RETURN_VOID();
1923 }
1924 
1925 
1926 /*
1927  * ts_headline support begins here
1928  */
1929 
1930 /* token type classification macros */
/* TS_IDIGNORE: tag, protocol, blank and XML-entity tokens */
1931 #define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
/* HLIDREPLACE: token types for which mark_fragment sets the "replace" flag */
1932 #define HLIDREPLACE(x) ( (x)==TAG_T )
/* HLIDSKIP: token types for which mark_fragment sets the "skip" flag */
1933 #define HLIDSKIP(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* XMLHLIDSKIP: as HLIDSKIP, but consulted when highlightall is true */
1934 #define XMLHLIDSKIP(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* NONWORDTOKEN: tokens that get_next_fragment does not count as words */
1935 #define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )
/* NOENDTOKEN: token types BADENDPOINT refuses as an endpoint (numeric and ignorable ones included) */
1936 #define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
1937 
1938 /*
1939  * Macros useful in headline selection. These rely on availability of
1940  * "HeadlineParsedText *prs" describing some text, and "int shortword"
1941  * describing the "short word" length parameter.
1942  */
1943 
1944 /* Interesting words are non-repeated search terms */
1945 #define INTERESTINGWORD(j) \
1946  (prs->words[j].item && !prs->words[j].repeated)
1947 
1948 /* Don't want to end at a non-word or a short word, unless interesting */
1949 #define BADENDPOINT(j) \
1950  ((NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword) && \
1951  !INTERESTINGWORD(j))
1952 
1953 typedef struct
1954 {
1955  /* one cover (well, really one fragment) for mark_hl_fragments */
1956  int32 startpos; /* fragment's starting word index */
1957  int32 endpos; /* ending word index (inclusive) */
1958  int32 poslen; /* number of interesting words */
1959  int32 curlen; /* total number of words */
1960  bool chosen; /* chosen? */
1961  bool excluded; /* excluded? */
1962 } CoverPos;
1963 
1964 typedef struct
1965 {
1966  /* callback data for checkcondition_HL */
1968  int len;
1969 } hlCheck;
1970 
1971 
1972 /*
1973  * TS_execute callback for matching a tsquery operand to headline words
1974  *
1975  * Note: it's tempting to report words[] indexes as pos values to save
1976  * searching in hlCover; but that would screw up phrase matching, which
1977  * expects to measure distances in lexemes not tokens.
1978  */
1979 static TSTernaryValue
1981 {
1982  hlCheck *checkval = (hlCheck *) opaque;
1983  int i;
1984 
1985  /* scan words array for matching items */
1986  for (i = 0; i < checkval->len; i++)
1987  {
1988  if (checkval->words[i].item == val)
1989  {
1990  /* if data == NULL, don't need to report positions */
1991  if (!data)
1992  return TS_YES;
1993 
1994  if (!data->pos)
1995  {
1996  data->pos = palloc(sizeof(WordEntryPos) * checkval->len);
1997  data->allocated = true;
1998  data->npos = 1;
1999  data->pos[0] = checkval->words[i].pos;
2000  }
2001  else if (data->pos[data->npos - 1] < checkval->words[i].pos)
2002  {
2003  data->pos[data->npos++] = checkval->words[i].pos;
2004  }
2005  }
2006  }
2007 
2008  if (data && data->npos > 0)
2009  return TS_YES;
2010 
2011  return TS_NO;
2012 }
2013 
2014 /*
2015  * hlCover: try to find a substring of prs' word list that satisfies query
2016  *
2017  * locations is the result of TS_execute_locations() for the query.
2018  * We use this to identify plausible subranges of the query.
2019  *
2020  * *nextpos is the lexeme position (NOT word index) to start the search
2021  * at. Caller should initialize this to zero. If successful, we'll
2022  * advance it to the next place to search at.
2023  *
2024  * On success, sets *p to first word index and *q to last word index of the
2025  * cover substring, and returns true.
2026  *
2027  * The result is a minimal cover, in the sense that both *p and *q will be
2028  * words used in the query.
2029  */
2030 static bool
2031 hlCover(HeadlineParsedText *prs, TSQuery query, List *locations,
2032  int *nextpos, int *p, int *q)
2033 {
2034  int pos = *nextpos;
2035 
2036  /* This loop repeats when our selected word-range fails the query */
2037  for (;;)
2038  {
2039  int posb,
2040  pose;
2041  ListCell *lc;
2042 
2043  /*
2044  * For each AND'ed query term or phrase, find its first occurrence at
2045  * or after pos; set pose to the maximum of those positions.
2046  *
2047  * We need not consider ORs or NOTs here; see the comments for
2048  * TS_execute_locations(). Rechecking the match with TS_execute(),
2049  * below, will deal with any ensuing imprecision.
2050  */
2051  pose = -1;
2052  foreach(lc, locations)
2053  {
2054  ExecPhraseData *pdata = (ExecPhraseData *) lfirst(lc);
2055  int first = -1;
2056 
2057  for (int i = 0; i < pdata->npos; i++)
2058  {
2059  /* For phrase matches, use the ending lexeme */
2060  int endp = pdata->pos[i];
2061 
2062  if (endp >= pos)
2063  {
2064  first = endp;
2065  break;
2066  }
2067  }
2068  if (first < 0)
2069  return false; /* no more matches for this term */
2070  if (first > pose)
2071  pose = first;
2072  }
2073 
2074  if (pose < 0)
2075  return false; /* we only get here if empty list */
2076 
2077  /*
2078  * Now, for each AND'ed query term or phrase, find its last occurrence
2079  * at or before pose; set posb to the minimum of those positions.
2080  *
2081  * We start posb at INT_MAX - 1 to guarantee no overflow if we compute
2082  * posb + 1 below.
2083  */
2084  posb = INT_MAX - 1;
2085  foreach(lc, locations)
2086  {
2087  ExecPhraseData *pdata = (ExecPhraseData *) lfirst(lc);
2088  int last = -1;
2089 
2090  for (int i = pdata->npos - 1; i >= 0; i--)
2091  {
2092  /* For phrase matches, use the starting lexeme */
2093  int startp = pdata->pos[i] - pdata->width;
2094 
2095  if (startp <= pose)
2096  {
2097  last = startp;
2098  break;
2099  }
2100  }
2101  if (last < posb)
2102  posb = last;
2103  }
2104 
2105  /*
2106  * We could end up with posb to the left of pos, in case some phrase
2107  * match crosses pos. Try the match starting at pos anyway, since the
2108  * result of TS_execute_locations is imprecise for phrase matches OR'd
2109  * with plain matches; that is, if the query is "(A <-> B) | C" then C
2110  * could match at pos even though the phrase match would have to
2111  * extend to the left of pos.
2112  */
2113  posb = Max(posb, pos);
2114 
2115  /* This test probably always succeeds, but be paranoid */
2116  if (posb <= pose)
2117  {
2118  /*
2119  * posb .. pose is now the shortest, earliest-after-pos range of
2120  * lexeme positions containing all the query terms. It will
2121  * contain all phrase matches, too, except in the corner case
2122  * described just above.
2123  *
2124  * Now convert these lexeme positions to indexes in prs->words[].
2125  */
2126  int idxb = -1;
2127  int idxe = -1;
2128 
2129  for (int i = 0; i < prs->curwords; i++)
2130  {
2131  if (prs->words[i].item == NULL)
2132  continue;
2133  if (idxb < 0 && prs->words[i].pos >= posb)
2134  idxb = i;
2135  if (prs->words[i].pos <= pose)
2136  idxe = i;
2137  else
2138  break;
2139  }
2140 
2141  /* This test probably always succeeds, but be paranoid */
2142  if (idxb >= 0 && idxe >= idxb)
2143  {
2144  /*
2145  * Finally, check that the selected range satisfies the query.
2146  * This should succeed in all simple cases; but odd cases
2147  * involving non-top-level NOT conditions or phrase matches
2148  * OR'd with other things could fail, since the result of
2149  * TS_execute_locations doesn't fully represent such things.
2150  */
2151  hlCheck ch;
2152 
2153  ch.words = &(prs->words[idxb]);
2154  ch.len = idxe - idxb + 1;
2155  if (TS_execute(GETQUERY(query), &ch,
2157  {
2158  /* Match! Advance *nextpos and return the word range. */
2159  *nextpos = posb + 1;
2160  *p = idxb;
2161  *q = idxe;
2162  return true;
2163  }
2164  }
2165  }
2166 
2167  /*
2168  * Advance pos and try again. Any later workable match must start
2169  * beyond posb.
2170  */
2171  pos = posb + 1;
2172  }
2173  /* Can't get here, but stupider compilers complain if we leave it off */
2174  return false;
2175 }
2176 
2177 /*
2178  * Apply suitable highlight marking to words selected by headline selector
2179  *
2180  * The words from startpos to endpos inclusive are marked per highlightall
2181  */
2182 static void
2183 mark_fragment(HeadlineParsedText *prs, bool highlightall,
2184  int startpos, int endpos)
2185 {
2186  int i;
2187 
2188  for (i = startpos; i <= endpos; i++)
2189  {
2190  if (prs->words[i].item)
2191  prs->words[i].selected = 1;
2192  if (!highlightall)
2193  {
2194  if (HLIDREPLACE(prs->words[i].type))
2195  prs->words[i].replace = 1;
2196  else if (HLIDSKIP(prs->words[i].type))
2197  prs->words[i].skip = 1;
2198  }
2199  else
2200  {
2201  if (XMLHLIDSKIP(prs->words[i].type))
2202  prs->words[i].skip = 1;
2203  }
2204 
2205  prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
2206  }
2207 }
2208 
2209 /*
2210  * split a cover substring into fragments not longer than max_words
2211  *
2212  * At entry, *startpos and *endpos are the (remaining) bounds of the cover
2213  * substring. They are updated to hold the bounds of the next fragment.
2214  *
2215  * *curlen and *poslen are set to the fragment's length, in words and
2216  * interesting words respectively.
2217  */
2218 static void
2220  int *curlen, int *poslen, int max_words)
2221 {
2222  int i;
2223 
2224  /*
2225  * Objective: select a fragment of words between startpos and endpos such
2226  * that it has at most max_words and both ends have query words. If the
2227  * startpos and endpos are the endpoints of the cover and the cover has
2228  * fewer words than max_words, then this function should just return the
2229  * cover
2230  */
2231  /* first move startpos to an item */
2232  for (i = *startpos; i <= *endpos; i++)
2233  {
2234  *startpos = i;
2235  if (INTERESTINGWORD(i))
2236  break;
2237  }
2238  /* cut endpos to have only max_words */
2239  *curlen = 0;
2240  *poslen = 0;
2241  for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
2242  {
2243  if (!NONWORDTOKEN(prs->words[i].type))
2244  *curlen += 1;
2245  if (INTERESTINGWORD(i))
2246  *poslen += 1;
2247  }
2248  /* if the cover was cut then move back endpos to a query item */
2249  if (*endpos > i)
2250  {
2251  *endpos = i;
2252  for (i = *endpos; i >= *startpos; i--)
2253  {
2254  *endpos = i;
2255  if (INTERESTINGWORD(i))
2256  break;
2257  if (!NONWORDTOKEN(prs->words[i].type))
2258  *curlen -= 1;
2259  }
2260  }
2261 }
2262 
2263 /*
2264  * Headline selector used when MaxFragments > 0
2265  *
2266  * Note: in this mode, highlightall is disregarded for phrase selection;
2267  * it only controls presentation details.
2268  */
2269 static void
2271  bool highlightall,
2272  int shortword, int min_words,
2273  int max_words, int max_fragments)
2274 {
2275  int32 poslen,
2276  curlen,
2277  i,
2278  f,
2279  num_f = 0;
2280  int32 stretch,
2281  maxstretch,
2282  posmarker;
2283 
2284  int32 startpos = 0,
2285  endpos = 0,
2286  nextpos = 0,
2287  p = 0,
2288  q = 0;
2289 
2290  int32 numcovers = 0,
2291  maxcovers = 32;
2292 
2293  int32 minI,
2294  minwords,
2295  maxitems;
2296  CoverPos *covers;
2297 
2298  covers = palloc(maxcovers * sizeof(CoverPos));
2299 
2300  /* get all covers */
2301  while (hlCover(prs, query, locations, &nextpos, &p, &q))
2302  {
2303  startpos = p;
2304  endpos = q;
2305 
2306  /*
2307  * Break the cover into smaller fragments such that each fragment has
2308  * at most max_words. Also ensure that each end of each fragment is a
2309  * query word. This will allow us to stretch the fragment in either
2310  * direction
2311  */
2312 
2313  while (startpos <= endpos)
2314  {
2315  get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
2316  if (numcovers >= maxcovers)
2317  {
2318  maxcovers *= 2;
2319  covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
2320  }
2321  covers[numcovers].startpos = startpos;
2322  covers[numcovers].endpos = endpos;
2323  covers[numcovers].curlen = curlen;
2324  covers[numcovers].poslen = poslen;
2325  covers[numcovers].chosen = false;
2326  covers[numcovers].excluded = false;
2327  numcovers++;
2328  startpos = endpos + 1;
2329  endpos = q;
2330  }
2331  }
2332 
2333  /* choose best covers */
2334  for (f = 0; f < max_fragments; f++)
2335  {
2336  maxitems = 0;
2337  minwords = PG_INT32_MAX;
2338  minI = -1;
2339 
2340  /*
2341  * Choose the cover that contains max items. In case of tie choose the
2342  * one with smaller number of words.
2343  */
2344  for (i = 0; i < numcovers; i++)
2345  {
2346  if (!covers[i].chosen && !covers[i].excluded &&
2347  (maxitems < covers[i].poslen ||
2348  (maxitems == covers[i].poslen &&
2349  minwords > covers[i].curlen)))
2350  {
2351  maxitems = covers[i].poslen;
2352  minwords = covers[i].curlen;
2353  minI = i;
2354  }
2355  }
2356  /* if a cover was found mark it */
2357  if (minI >= 0)
2358  {
2359  covers[minI].chosen = true;
2360  /* adjust the size of cover */
2361  startpos = covers[minI].startpos;
2362  endpos = covers[minI].endpos;
2363  curlen = covers[minI].curlen;
2364  /* stretch the cover if cover size is lower than max_words */
2365  if (curlen < max_words)
2366  {
2367  /* divide the stretch on both sides of cover */
2368  maxstretch = (max_words - curlen) / 2;
2369 
2370  /*
2371  * first stretch the startpos stop stretching if 1. we hit the
2372  * beginning of document 2. exceed maxstretch 3. we hit an
2373  * already marked fragment
2374  */
2375  stretch = 0;
2376  posmarker = startpos;
2377  for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
2378  {
2379  if (!NONWORDTOKEN(prs->words[i].type))
2380  {
2381  curlen++;
2382  stretch++;
2383  }
2384  posmarker = i;
2385  }
2386  /* cut back startpos till we find a good endpoint */
2387  for (i = posmarker; i < startpos && BADENDPOINT(i); i++)
2388  {
2389  if (!NONWORDTOKEN(prs->words[i].type))
2390  curlen--;
2391  }
2392  startpos = i;
2393  /* now stretch the endpos as much as possible */
2394  posmarker = endpos;
2395  for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
2396  {
2397  if (!NONWORDTOKEN(prs->words[i].type))
2398  curlen++;
2399  posmarker = i;
2400  }
2401  /* cut back endpos till we find a good endpoint */
2402  for (i = posmarker; i > endpos && BADENDPOINT(i); i--)
2403  {
2404  if (!NONWORDTOKEN(prs->words[i].type))
2405  curlen--;
2406  }
2407  endpos = i;
2408  }
2409  covers[minI].startpos = startpos;
2410  covers[minI].endpos = endpos;
2411  covers[minI].curlen = curlen;
2412  /* Mark the chosen fragments (covers) */
2413  mark_fragment(prs, highlightall, startpos, endpos);
2414  num_f++;
2415  /* Exclude covers overlapping this one from future consideration */
2416  for (i = 0; i < numcovers; i++)
2417  {
2418  if (i != minI &&
2419  ((covers[i].startpos >= startpos &&
2420  covers[i].startpos <= endpos) ||
2421  (covers[i].endpos >= startpos &&
2422  covers[i].endpos <= endpos) ||
2423  (covers[i].startpos < startpos &&
2424  covers[i].endpos > endpos)))
2425  covers[i].excluded = true;
2426  }
2427  }
2428  else
2429  break; /* no selectable covers remain */
2430  }
2431 
2432  /* show the first min_words words if we have not marked anything */
2433  if (num_f <= 0)
2434  {
2435  startpos = curlen = 0;
2436  endpos = -1;
2437  for (i = 0; i < prs->curwords && curlen < min_words; i++)
2438  {
2439  if (!NONWORDTOKEN(prs->words[i].type))
2440  curlen++;
2441  endpos = i;
2442  }
2443  mark_fragment(prs, highlightall, startpos, endpos);
2444  }
2445 
2446  pfree(covers);
2447 }
2448 
2449 /*
2450  * Headline selector used when MaxFragments == 0
2451  */
2452 static void
2454  bool highlightall,
2455  int shortword, int min_words, int max_words)
2456 {
2457  int nextpos = 0,
2458  p = 0,
2459  q = 0;
2460  int bestb = -1,
2461  beste = -1;
2462  int bestlen = -1;
2463  bool bestcover = false;
2464  int pose,
2465  posb,
2466  poslen,
2467  curlen;
2468  bool poscover;
2469  int i;
2470 
2471  if (!highlightall)
2472  {
2473  /* examine all covers, select a headline using the best one */
2474  while (hlCover(prs, query, locations, &nextpos, &p, &q))
2475  {
2476  /*
2477  * Count words (curlen) and interesting words (poslen) within
2478  * cover, but stop once we reach max_words. This step doesn't
2479  * consider whether that's a good stopping point. posb and pose
2480  * are set to the start and end indexes of the possible headline.
2481  */
2482  curlen = 0;
2483  poslen = 0;
2484  posb = pose = p;
2485  for (i = p; i <= q && curlen < max_words; i++)
2486  {
2487  if (!NONWORDTOKEN(prs->words[i].type))
2488  curlen++;
2489  if (INTERESTINGWORD(i))
2490  poslen++;
2491  pose = i;
2492  }
2493 
2494  if (curlen < max_words)
2495  {
2496  /*
2497  * We have room to lengthen the headline, so search forward
2498  * until it's full or we find a good stopping point. We'll
2499  * reconsider the word at "q", then move forward.
2500  */
2501  for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
2502  {
2503  if (i > q)
2504  {
2505  if (!NONWORDTOKEN(prs->words[i].type))
2506  curlen++;
2507  if (INTERESTINGWORD(i))
2508  poslen++;
2509  }
2510  pose = i;
2511  if (BADENDPOINT(i))
2512  continue;
2513  if (curlen >= min_words)
2514  break;
2515  }
2516  if (curlen < min_words)
2517  {
2518  /*
2519  * Reached end of text and our headline is still shorter
2520  * than min_words, so try to extend it to the left.
2521  */
2522  for (i = p - 1; i >= 0; i--)
2523  {
2524  if (!NONWORDTOKEN(prs->words[i].type))
2525  curlen++;
2526  if (INTERESTINGWORD(i))
2527  poslen++;
2528  if (curlen >= max_words)
2529  break;
2530  if (BADENDPOINT(i))
2531  continue;
2532  if (curlen >= min_words)
2533  break;
2534  }
2535  posb = (i >= 0) ? i : 0;
2536  }
2537  }
2538  else
2539  {
2540  /*
2541  * Can't make headline longer, so consider making it shorter
2542  * if needed to avoid a bad endpoint.
2543  */
2544  if (i > q)
2545  i = q;
2546  for (; curlen > min_words; i--)
2547  {
2548  if (!BADENDPOINT(i))
2549  break;
2550  if (!NONWORDTOKEN(prs->words[i].type))
2551  curlen--;
2552  if (INTERESTINGWORD(i))
2553  poslen--;
2554  pose = i - 1;
2555  }
2556  }
2557 
2558  /*
2559  * Check whether the proposed headline includes the original
2560  * cover; it might not if we trimmed it due to max_words.
2561  */
2562  poscover = (posb <= p && pose >= q);
2563 
2564  /*
2565  * Adopt this headline if it's better than the last one, giving
2566  * highest priority to headlines including the cover, then to
2567  * headlines with more interesting words, then to headlines with
2568  * good stopping points. (Since bestlen is initially -1, we will
2569  * certainly adopt the first headline.)
2570  */
2571  if (poscover > bestcover ||
2572  (poscover == bestcover && poslen > bestlen) ||
2573  (poscover == bestcover && poslen == bestlen &&
2574  !BADENDPOINT(pose) && BADENDPOINT(beste)))
2575  {
2576  bestb = posb;
2577  beste = pose;
2578  bestlen = poslen;
2579  bestcover = poscover;
2580  }
2581  }
2582 
2583  /*
2584  * If we found nothing acceptable, select min_words words starting at
2585  * the beginning.
2586  */
2587  if (bestlen < 0)
2588  {
2589  curlen = 0;
2590  pose = -1;
2591  for (i = 0; i < prs->curwords && curlen < min_words; i++)
2592  {
2593  if (!NONWORDTOKEN(prs->words[i].type))
2594  curlen++;
2595  pose = i;
2596  }
2597  bestb = 0;
2598  beste = pose;
2599  }
2600  }
2601  else
2602  {
2603  /* highlightall mode: headline is whole document */
2604  bestb = 0;
2605  beste = prs->curwords - 1;
2606  }
2607 
2608  mark_fragment(prs, highlightall, bestb, beste);
2609 }
2610 
2611 /*
2612  * Default parser's prsheadline function
2613  */
2614 Datum
2616 {
2618  List *prsoptions = (List *) PG_GETARG_POINTER(1);
2619  TSQuery query = PG_GETARG_TSQUERY(2);
2620  List *locations;
2621 
2622  /* default option values: */
2623  int min_words = 15;
2624  int max_words = 35;
2625  int shortword = 3;
2626  int max_fragments = 0;
2627  bool highlightall = false;
2628  ListCell *l;
2629 
2630  /* Extract configuration option values */
2631  prs->startsel = NULL;
2632  prs->stopsel = NULL;
2633  prs->fragdelim = NULL;
2634  foreach(l, prsoptions)
2635  {
2636  DefElem *defel = (DefElem *) lfirst(l);
2637  char *val = defGetString(defel);
2638 
2639  if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
2640  max_words = pg_strtoint32(val);
2641  else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
2642  min_words = pg_strtoint32(val);
2643  else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
2644  shortword = pg_strtoint32(val);
2645  else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
2646  max_fragments = pg_strtoint32(val);
2647  else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
2648  prs->startsel = pstrdup(val);
2649  else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
2650  prs->stopsel = pstrdup(val);
2651  else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
2652  prs->fragdelim = pstrdup(val);
2653  else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
2654  highlightall = (pg_strcasecmp(val, "1") == 0 ||
2655  pg_strcasecmp(val, "on") == 0 ||
2656  pg_strcasecmp(val, "true") == 0 ||
2657  pg_strcasecmp(val, "t") == 0 ||
2658  pg_strcasecmp(val, "y") == 0 ||
2659  pg_strcasecmp(val, "yes") == 0);
2660  else
2661  ereport(ERROR,
2662  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2663  errmsg("unrecognized headline parameter: \"%s\"",
2664  defel->defname)));
2665  }
2666 
2667  /* in HighlightAll mode these parameters are ignored */
2668  if (!highlightall)
2669  {
2670  if (min_words >= max_words)
2671  ereport(ERROR,
2672  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2673  errmsg("MinWords should be less than MaxWords")));
2674  if (min_words <= 0)
2675  ereport(ERROR,
2676  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2677  errmsg("MinWords should be positive")));
2678  if (shortword < 0)
2679  ereport(ERROR,
2680  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2681  errmsg("ShortWord should be >= 0")));
2682  if (max_fragments < 0)
2683  ereport(ERROR,
2684  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2685  errmsg("MaxFragments should be >= 0")));
2686  }
2687 
2688  /* Locate words and phrases matching the query */
2689  if (query->size > 0)
2690  {
2691  hlCheck ch;
2692 
2693  ch.words = prs->words;
2694  ch.len = prs->curwords;
2695  locations = TS_execute_locations(GETQUERY(query), &ch, TS_EXEC_EMPTY,
2697  }
2698  else
2699  locations = NIL; /* empty query matches nothing */
2700 
2701  /* Apply appropriate headline selector */
2702  if (max_fragments == 0)
2703  mark_hl_words(prs, query, locations, highlightall, shortword,
2704  min_words, max_words);
2705  else
2706  mark_hl_fragments(prs, query, locations, highlightall, shortword,
2707  min_words, max_words, max_fragments);
2708 
2709  /* Fill in default values for string options */
2710  if (!prs->startsel)
2711  prs->startsel = pstrdup("<b>");
2712  if (!prs->stopsel)
2713  prs->stopsel = pstrdup("</b>");
2714  if (!prs->fragdelim)
2715  prs->fragdelim = pstrdup(" ... ");
2716 
2717  /* Caller will need these lengths, too */
2718  prs->startsellen = strlen(prs->startsel);
2719  prs->stopsellen = strlen(prs->stopsel);
2720  prs->fragdelimlen = strlen(prs->fragdelim);
2721 
2722  PG_RETURN_POINTER(prs);
2723 }
#define GETQUERY(x)
Definition: _int.h:157
void print(const void *obj)
Definition: print.c:36
unsigned short uint16
Definition: c.h:494
#define PG_INT32_MAX
Definition: c.h:578
signed int int32
Definition: c.h:483
#define Max(x, y)
Definition: c.h:987
#define lengthof(array)
Definition: c.h:777
char * defGetString(DefElem *def)
Definition: define.c:49
int errcode(int sqlerrcode)
Definition: elog.c:858
int errmsg(const char *fmt,...)
Definition: elog.c:1069
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
#define PG_RETURN_VOID()
Definition: fmgr.h:349
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276
#define PG_RETURN_INT32(x)
Definition: fmgr.h:354
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
long val
Definition: informix.c:664
int i
Definition: isn.c:73
Assert(fmt[strlen(fmt) - 1] !='\n')
unsigned int pg_wchar
Definition: mbprint.c:31
int GetDatabaseEncoding(void)
Definition: mbutils.c:1268
int pg_dsplen(const char *mbstr)
Definition: mbutils.c:1031
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1553
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
Definition: mbutils.c:987
int pg_mblen(const char *mbstr)
Definition: mbutils.c:1024
char * pstrdup(const char *in)
Definition: mcxt.c:1644
void pfree(void *pointer)
Definition: mcxt.c:1456
void * palloc0(Size size)
Definition: mcxt.c:1257
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1476
void * palloc(Size size)
Definition: mcxt.c:1226
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:121
int32 pg_strtoint32(const char *s)
Definition: numutils.c:384
Datum lower(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:49
Datum upper(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:80
const void size_t len
const void * data
#define lfirst(lc)
Definition: pg_list.h:172
#define NIL
Definition: pg_list.h:68
bool database_ctype_is_c
Definition: pg_locale.c:118
size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, pg_locale_t locale)
Definition: pg_locale.c:2990
static XLogRecPtr endpos
Definition: pg_receivewal.c:56
static XLogRecPtr startpos
@ PG_UTF8
Definition: pg_wchar.h:235
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
#define fprintf
Definition: port.h:242
int pg_strncasecmp(const char *s1, const char *s2, size_t n)
Definition: pgstrcasecmp.c:69
void check_stack_depth(void)
Definition: postgres.c:3520
uintptr_t Datum
Definition: postgres.h:64
char * c
bool chosen
Definition: wparser_def.c:1960
int32 endpos
Definition: wparser_def.c:1957
int32 curlen
Definition: wparser_def.c:1959
int32 startpos
Definition: wparser_def.c:1956
bool excluded
Definition: wparser_def.c:1961
int32 poslen
Definition: wparser_def.c:1958
char * defname
Definition: parsenodes.h:802
WordEntryPos * pos
Definition: ts_utils.h:166
HeadlineWordEntry * words
Definition: ts_public.h:76
WordEntryPos pos
Definition: ts_public.h:68
QueryOperand * item
Definition: ts_public.h:70
char * alias
Definition: ts_public.h:28
int lexid
Definition: ts_public.h:27
char * descr
Definition: ts_public.h:29
Definition: pg_list.h:54
const TParserStateActionItem * pushedAtAction
Definition: wparser_def.c:237
struct TParserPosition * prev
Definition: wparser_def.c:236
TParserState state
Definition: wparser_def.c:235
TParserCharTest isclass
Definition: wparser_def.c:210
TParserState tostate
Definition: wparser_def.c:213
TParserSpecial special
Definition: wparser_def.c:215
const TParserStateActionItem * action
Definition: wparser_def.c:1608
TParserState state
Definition: wparser_def.c:1609
char * str
Definition: wparser_def.c:243
pg_wchar * pgwstr
Definition: wparser_def.c:246
wchar_t * wstr
Definition: wparser_def.c:245
int lenstr
Definition: wparser_def.c:244
char * token
Definition: wparser_def.c:259
int type
Definition: wparser_def.c:262
int charmaxlen
Definition: wparser_def.c:250
bool wanthost
Definition: wparser_def.c:253
int lenbytetoken
Definition: wparser_def.c:260
bool ignore
Definition: wparser_def.c:252
TParserPosition * state
Definition: wparser_def.c:251
int lenchartoken
Definition: wparser_def.c:261
char c
Definition: wparser_def.c:256
bool usewide
Definition: wparser_def.c:247
int32 size
Definition: ts_type.h:221
HeadlineWordEntry * words
Definition: wparser_def.c:1967
#define PG_GETARG_TSQUERY(n)
Definition: ts_type.h:266
uint16 WordEntryPos
Definition: ts_type.h:63
TSTernaryValue
Definition: ts_utils.h:133
@ TS_NO
Definition: ts_utils.h:134
@ TS_YES
Definition: ts_utils.h:135
#define TS_EXEC_EMPTY
Definition: ts_utils.h:188
bool TS_execute(QueryItem *curitem, void *arg, uint32 flags, TSExecuteCallback chkcond)
Definition: tsvector_op.c:1856
List * TS_execute_locations(QueryItem *curitem, void *arg, uint32 flags, TSExecuteCallback chkcond)
Definition: tsvector_op.c:2009
static const TParserStateActionItem actionTPS_InParseHyphen[]
Definition: wparser_def.c:1554
static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[]
Definition: wparser_def.c:1165
static const TParserStateActionItem actionTPS_InHyphenWordFirst[]
Definition: wparser_def.c:1515
#define NONWORDTOKEN(x)
Definition: wparser_def.c:1935
static const TParserStateActionItem actionTPS_InXMLEntityFirst[]
Definition: wparser_def.c:1145
static const TParserStateActionItem actionTPS_InHostFirstAN[]
Definition: wparser_def.c:1372
#define VERSIONNUMBER
Definition: wparser_def.c:42
static const TParserStateActionItem actionTPS_InHyphenNumWordPart[]
Definition: wparser_def.c:1587
#define BADENDPOINT(j)
Definition: wparser_def.c:1949
#define ASCIIWORD
Definition: wparser_def.c:35
#define PROTOCOL
Definition: wparser_def.c:48
static const TParserStateActionItem actionTPS_InPathSecond[]
Definition: wparser_def.c:1432
static const TParserStateActionItem actionTPS_InPathFirst[]
Definition: wparser_def.c:1415
static const TParserStateActionItem actionTPS_InHostDomainSecond[]
Definition: wparser_def.c:1332
static const TParserStateActionItem actionTPS_InCloseCommentFirst[]
Definition: wparser_def.c:1308
static void SpecialFURL(TParser *prs)
Definition: wparser_def.c:587
static const TParserStateActionItem actionTPS_InCommentEnd[]
Definition: wparser_def.c:1321
struct TParser TParser
static TSTernaryValue checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
Definition: wparser_def.c:1980
void _make_compiler_happy(void)
Definition: wparser_def.c:536
static const TParserStateActionItem actionTPS_InURLPathStart[]
Definition: wparser_def.c:1465
static const TParserStateActionItem actionTPS_InHostFirstDomain[]
Definition: wparser_def.c:1325
static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[]
Definition: wparser_def.c:1546
static const TParserStateActionItem actionTPS_InHostDomain[]
Definition: wparser_def.c:1343
static const TParserStateActionItem actionTPS_InVersion[]
Definition: wparser_def.c:1118
#define XMLHLIDSKIP(x)
Definition: wparser_def.c:1934
static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[]
Definition: wparser_def.c:1497
Datum prsd_nexttoken(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1901
static const TParserStateActionItem actionTPS_InTagName[]
Definition: wparser_def.c:1222
#define DECIMAL_T
Definition: wparser_def.c:54
static const TParserStateActionItem actionTPS_InFileNext[]
Definition: wparser_def.c:1451
static const TParserStateActionItem actionTPS_InXMLEntity[]
Definition: wparser_def.c:1154
#define ASCIIPARTHWORD
Definition: wparser_def.c:45
static const TParserStateActionItem actionTPS_InFURL[]
Definition: wparser_def.c:1475
#define p_iswhat(type, nonascii)
Definition: wparser_def.c:423
static const TParserStateActionItem actionTPS_InMantissaSign[]
Definition: wparser_def.c:1133
static void mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, List *locations, bool highlightall, int shortword, int min_words, int max_words, int max_fragments)
Definition: wparser_def.c:2270
#define WORD_T
Definition: wparser_def.c:36
TParserState
Definition: wparser_def.c:119
@ TPS_InXMLEntityHexNumFirst
Definition: wparser_def.c:143
@ TPS_InPort
Definition: wparser_def.c:166
@ TPS_InXMLEntityHexNum
Definition: wparser_def.c:144
@ TPS_InHostDomainSecond
Definition: wparser_def.c:163
@ TPS_InMantissaFirst
Definition: wparser_def.c:136
@ TPS_InTagName
Definition: wparser_def.c:149
@ TPS_InHyphenAsciiWordFirst
Definition: wparser_def.c:184
@ TPS_Null
Definition: wparser_def.c:197
@ TPS_InPathFirstFirst
Definition: wparser_def.c:173
@ TPS_InSignedIntFirst
Definition: wparser_def.c:125
@ TPS_InSignedInt
Definition: wparser_def.c:126
@ TPS_InUnsignedInt
Definition: wparser_def.c:124
@ TPS_InMantissa
Definition: wparser_def.c:138
@ TPS_InProtocolFirst
Definition: wparser_def.c:181
@ TPS_InFURL
Definition: wparser_def.c:180
@ TPS_InMantissaSign
Definition: wparser_def.c:137
@ TPS_InXMLBegin
Definition: wparser_def.c:147
@ TPS_InCommentEnd
Definition: wparser_def.c:161
@ TPS_InHyphenWordFirst
Definition: wparser_def.c:186
@ TPS_InHyphenNumWordPart
Definition: wparser_def.c:195
@ TPS_InPortFirst
Definition: wparser_def.c:165
@ TPS_InProtocolEnd
Definition: wparser_def.c:183
@ TPS_InXMLEntityFirst
Definition: wparser_def.c:139
@ TPS_InHyphenNumWordFirst
Definition: wparser_def.c:188
@ TPS_InCommentLast
Definition: wparser_def.c:157
@ TPS_InFileTwiddle
Definition: wparser_def.c:171
@ TPS_InURLPathStart
Definition: wparser_def.c:178
@ TPS_InURLPathFirst
Definition: wparser_def.c:177
@ TPS_InPathFirst
Definition: wparser_def.c:172
@ TPS_InPathSecond
Definition: wparser_def.c:174
@ TPS_InHyphenUnsignedInt
Definition: wparser_def.c:196
@ TPS_InFileFirst
Definition: wparser_def.c:170
@ TPS_InXMLEntityNumFirst
Definition: wparser_def.c:141
@ TPS_InHyphenWordPart
Definition: wparser_def.c:193
@ TPS_InNumWord
Definition: wparser_def.c:121
@ TPS_InAsciiWord
Definition: wparser_def.c:122
@ TPS_InVersion
Definition: wparser_def.c:135
@ TPS_InHost
Definition: wparser_def.c:168
@ TPS_InFile
Definition: wparser_def.c:175
@ TPS_InProtocolSecond
Definition: wparser_def.c:182
@ TPS_InCloseCommentFirst
Definition: wparser_def.c:159
@ TPS_InTagEscapeK
Definition: wparser_def.c:152
@ TPS_InParseHyphenHyphen
Definition: wparser_def.c:192
@ TPS_InTagBackSleshed
Definition: wparser_def.c:154
@ TPS_InTagFirst
Definition: wparser_def.c:146
@ TPS_InTagEnd
Definition: wparser_def.c:155
@ TPS_InComment
Definition: wparser_def.c:158
@ TPS_InHyphenWord
Definition: wparser_def.c:187
@ TPS_InHyphenAsciiWord
Definition: wparser_def.c:185
@ TPS_InWord
Definition: wparser_def.c:123
@ TPS_InXMLEntityEnd
Definition: wparser_def.c:145
@ TPS_InTagEscapeKK
Definition: wparser_def.c:153
@ TPS_InSpace
Definition: wparser_def.c:127
@ TPS_InFileNext
Definition: wparser_def.c:176
@ TPS_InURLPath
Definition: wparser_def.c:179
@ TPS_Base
Definition: wparser_def.c:120
@ TPS_InUDecimal
Definition: wparser_def.c:129
@ TPS_InParseHyphen
Definition: wparser_def.c:191
@ TPS_InHostFirstAN
Definition: wparser_def.c:167
@ TPS_InEmail
Definition: wparser_def.c:169
@ TPS_InDecimalFirst
Definition: wparser_def.c:130
@ TPS_InVersionFirst
Definition: wparser_def.c:134
@ TPS_InCloseCommentLast
Definition: wparser_def.c:160
@ TPS_InSVerVersion
Definition: wparser_def.c:133
@ TPS_InHyphenAsciiWordPart
Definition: wparser_def.c:194
@ TPS_InCommentFirst
Definition: wparser_def.c:156
@ TPS_InUDecimalFirst
Definition: wparser_def.c:128
@ TPS_InHostFirstDomain
Definition: wparser_def.c:162
@ TPS_InHostDomain
Definition: wparser_def.c:164
@ TPS_InHyphenDigitLookahead
Definition: wparser_def.c:190
@ TPS_InVerVersion
Definition: wparser_def.c:132
@ TPS_InXMLEntityNum
Definition: wparser_def.c:142
@ TPS_InTag
Definition: wparser_def.c:151
@ TPS_InDecimal
Definition: wparser_def.c:131
@ TPS_InTagCloseFirst
Definition: wparser_def.c:148
@ TPS_InXMLEntity
Definition: wparser_def.c:140
@ TPS_InHyphenNumWord
Definition: wparser_def.c:189
@ TPS_InTagBeginEnd
Definition: wparser_def.c:150
static void mark_fragment(HeadlineParsedText *prs, bool highlightall, int startpos, int endpos)
Definition: wparser_def.c:2183
static const TParserStateActionItem actionTPS_InXMLEntityEnd[]
Definition: wparser_def.c:1193
static const TParserStateActionItem actionTPS_InHyphenNumWord[]
Definition: wparser_def.c:1538
static const TParserStateActionItem actionTPS_InDecimal[]
Definition: wparser_def.c:1090
#define A_POP
Definition: wparser_def.c:221
static const TParserStateActionItem actionTPS_InSignedIntFirst[]
Definition: wparser_def.c:1042
static const TParserStateActionItem actionTPS_InTagEscapeK[]
Definition: wparser_def.c:1264
static const TParserStateActionItem actionTPS_InSpace[]
Definition: wparser_def.c:1057
static const TParserStateActionItem actionTPS_InFile[]
Definition: wparser_def.c:1440
static TParser * TParserCopyInit(const TParser *orig)
Definition: wparser_def.c:345
static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[]
Definition: wparser_def.c:1578
#define LASTNUM
Definition: wparser_def.c:59
static int p_iseqC(TParser *prs)
Definition: wparser_def.c:480
Datum prsd_headline(PG_FUNCTION_ARGS)
Definition: wparser_def.c:2615
#define NUMHWORD
Definition: wparser_def.c:49
static bool hlCover(HeadlineParsedText *prs, TSQuery query, List *locations, int *nextpos, int *p, int *q)
Definition: wparser_def.c:2031
#define SPACE
Definition: wparser_def.c:46
static const TParserStateActionItem actionTPS_InUDecimal[]
Definition: wparser_def.c:1075
int(* TParserCharTest)(struct TParser *)
Definition: wparser_def.c:203
static const TParserStateActionItem actionTPS_InSignedInt[]
Definition: wparser_def.c:1048
static int p_isurlchar(TParser *prs)
Definition: wparser_def.c:504
static const TParserStateActionItem actionTPS_InTagBeginEnd[]
Definition: wparser_def.c:1236
static const TParserStateActionItem actionTPS_InTagFirst[]
Definition: wparser_def.c:1197
struct TParserPosition TParserPosition
#define NUMWORD
Definition: wparser_def.c:37
#define FILEPATH
Definition: wparser_def.c:53
static const TParserStateActionItem actionTPS_InTagEscapeKK[]
Definition: wparser_def.c:1271
static int p_isneC(TParser *prs)
Definition: wparser_def.c:486
#define EMAIL
Definition: wparser_def.c:38
static const TParserStateActionItem actionTPS_InCommentLast[]
Definition: wparser_def.c:1296
static const TParserStateActionItem actionTPS_InHyphenWordPart[]
Definition: wparser_def.c:1570
static const TParserStateActionItem actionTPS_InMantissaFirst[]
Definition: wparser_def.c:1125
static const TParserStateActionItem actionTPS_Base[]
Definition: wparser_def.c:970
static void SpecialHyphen(TParser *prs)
Definition: wparser_def.c:595
static void mark_hl_words(HeadlineParsedText *prs, TSQuery query, List *locations, bool highlightall, int shortword, int min_words, int max_words)
Definition: wparser_def.c:2453
static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[]
Definition: wparser_def.c:1531
#define UNSIGNEDINT
Definition: wparser_def.c:56
void(* TParserSpecial)(struct TParser *)
Definition: wparser_def.c:205
static const TParserStateActionItem actionTPS_InEmail[]
Definition: wparser_def.c:1390
static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[]
Definition: wparser_def.c:1173
static const TParserStateActionItem actionTPS_InURLPath[]
Definition: wparser_def.c:1469
#define A_RERUN
Definition: wparser_def.c:223
static const TParserStateActionItem actionTPS_InSVerVersion[]
Definition: wparser_def.c:1105
static const TParserStateActionItem actionTPS_InAsciiWord[]
Definition: wparser_def.c:998
static const char *const tok_alias[]
Definition: wparser_def.c:61
static int p_isstophost(TParser *prs)
Definition: wparser_def.c:611
#define HLIDSKIP(x)
Definition: wparser_def.c:1933
static void get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos, int *curlen, int *poslen, int max_words)
Definition: wparser_def.c:2219
static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[]
Definition: wparser_def.c:1594
#define SIGNEDINT
Definition: wparser_def.c:55
static int p_isasclet(TParser *prs)
Definition: wparser_def.c:498
static const TParserStateAction Actions[]
Definition: wparser_def.c:1627
static const TParserStateActionItem actionTPS_InXMLBegin[]
Definition: wparser_def.c:1208
#define PARTHWORD
Definition: wparser_def.c:44
#define HLIDREPLACE(x)
Definition: wparser_def.c:1932
#define A_MERGE
Definition: wparser_def.c:225
static const TParserStateActionItem actionTPS_InMantissa[]
Definition: wparser_def.c:1139
static const TParserStateActionItem actionTPS_InVersionFirst[]
Definition: wparser_def.c:1112
static int p_isascii(TParser *prs)
Definition: wparser_def.c:492
static const TParserStateActionItem actionTPS_InCommentFirst[]
Definition: wparser_def.c:1287
static const TParserStateActionItem actionTPS_InHyphenWord[]
Definition: wparser_def.c:1522
static int p_isignore(TParser *prs)
Definition: wparser_def.c:622
static const TParserStateActionItem actionTPS_InParseHyphenHyphen[]
Definition: wparser_def.c:1563
static const TParserStateActionItem actionTPS_InPort[]
Definition: wparser_def.c:1364
#define TAG_T
Definition: wparser_def.c:47
static const TParserStateActionItem actionTPS_InDecimalFirst[]
Definition: wparser_def.c:1084
static TParserPosition * newTParserPosition(TParserPosition *prev)
Definition: wparser_def.c:271
#define URLPATH
Definition: wparser_def.c:52
Datum prsd_lextype(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1877
#define ASCIIHWORD
Definition: wparser_def.c:50
#define HOST
Definition: wparser_def.c:40
static const TParserStateActionItem actionTPS_InTag[]
Definition: wparser_def.c:1242
Datum prsd_start(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1895
static TParser * TParserInit(char *str, int len)
Definition: wparser_def.c:288
#define A_BINGO
Definition: wparser_def.c:220
#define TPARSERSTATEACTION(state)
Definition: wparser_def.c:1619
static bool TParserGet(TParser *prs)
Definition: wparser_def.c:1709
#define XMLENTITY
Definition: wparser_def.c:57
static int p_ishost(TParser *prs)
Definition: wparser_def.c:628
Datum prsd_end(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1917
#define A_CLRALL
Definition: wparser_def.c:226
static int p_isURLPath(TParser *prs)
Definition: wparser_def.c:656
static void SpecialVerVersion(TParser *prs)
Definition: wparser_def.c:602
static const TParserStateActionItem actionTPS_InProtocolFirst[]
Definition: wparser_def.c:1481
static const TParserStateActionItem actionTPS_InUnsignedInt[]
Definition: wparser_def.c:1025
static const TParserStateActionItem actionTPS_InUDecimalFirst[]
Definition: wparser_def.c:1069
static const TParserStateActionItem actionTPS_InTagCloseFirst[]
Definition: wparser_def.c:1216
static int p_isEOF(TParser *prs)
Definition: wparser_def.c:473
static const TParserStateActionItem actionTPS_InCloseCommentLast[]
Definition: wparser_def.c:1314
static void TParserCopyClose(TParser *prs)
Definition: wparser_def.c:396
#define A_CLEAR
Definition: wparser_def.c:224
static const TParserStateActionItem actionTPS_InFileFirst[]
Definition: wparser_def.c:1396
static const TParserStateActionItem actionTPS_InNumWord[]
Definition: wparser_def.c:987
static const TParserStateActionItem actionTPS_InFileTwiddle[]
Definition: wparser_def.c:1406
static const TParserStateActionItem actionTPS_InHost[]
Definition: wparser_def.c:1379
#define A_PUSH
Definition: wparser_def.c:222
static const TParserStateActionItem actionTPS_InTagBackSleshed[]
Definition: wparser_def.c:1278
static const TParserStateActionItem actionTPS_InProtocolSecond[]
Definition: wparser_def.c:1487
static const TParserStateActionItem actionTPS_InWord[]
Definition: wparser_def.c:1016
static int p_isspecial(TParser *prs)
Definition: wparser_def.c:691
static void TParserClose(TParser *prs)
Definition: wparser_def.c:371
#define URL_T
Definition: wparser_def.c:39
static const TParserStateActionItem actionTPS_InXMLEntityNum[]
Definition: wparser_def.c:1179
static const TParserStateActionItem actionTPS_InVerVersion[]
Definition: wparser_def.c:1099
static const TParserStateActionItem actionTPS_InHyphenAsciiWord[]
Definition: wparser_def.c:1505
static const TParserStateActionItem actionTPS_InXMLEntityHexNum[]
Definition: wparser_def.c:1186
#define A_NEXT
Definition: wparser_def.c:219
static const TParserStateActionItem actionTPS_InPortFirst[]
Definition: wparser_def.c:1358
#define HWORD
Definition: wparser_def.c:51
#define NUMPARTHWORD
Definition: wparser_def.c:43
static const char *const lex_descr[]
Definition: wparser_def.c:88
#define INTERESTINGWORD(j)
Definition: wparser_def.c:1945
#define SCIENTIFIC
Definition: wparser_def.c:41
static void SpecialTags(TParser *prs)
Definition: wparser_def.c:563
static const TParserStateActionItem actionTPS_InTagEnd[]
Definition: wparser_def.c:1283
static const TParserStateActionItem actionTPS_InComment[]
Definition: wparser_def.c:1302
static const TParserStateActionItem actionTPS_InProtocolEnd[]
Definition: wparser_def.c:1493
static const TParserStateActionItem actionTPS_InURLPathFirst[]
Definition: wparser_def.c:1459
static const TParserStateActionItem actionTPS_InPathFirstFirst[]
Definition: wparser_def.c:1425