PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
wparser_def.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * wparser_def.c
4  * Default text search parser
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  * src/backend/tsearch/wparser_def.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include <limits.h>
18 
19 #include "catalog/pg_collation.h"
20 #include "commands/defrem.h"
21 #include "tsearch/ts_locale.h"
22 #include "tsearch/ts_public.h"
23 #include "tsearch/ts_type.h"
24 #include "tsearch/ts_utils.h"
25 #include "utils/builtins.h"
26 
27 
28 /* Define me to enable tracing of parser behavior */
29 /* #define WPARSER_TRACE */
30 
31 
32 /* Output token categories */
33 
/*
 * Token category codes.  The values run consecutively from 1 (ASCIIWORD)
 * to 23 (XMLENTITY); 0 is not assigned to any category.  tok_alias[] and
 * lex_descr[] below list their entries in this same order, with an empty
 * placeholder string at index 0.
 */
34 #define ASCIIWORD 1
35 #define WORD_T 2
36 #define NUMWORD 3
37 #define EMAIL 4
38 #define URL_T 5
39 #define HOST 6
40 #define SCIENTIFIC 7
41 #define VERSIONNUMBER 8
42 #define NUMPARTHWORD 9
43 #define PARTHWORD 10
44 #define ASCIIPARTHWORD 11
45 #define SPACE 12
46 #define TAG_T 13
47 #define PROTOCOL 14
48 #define NUMHWORD 15
49 #define ASCIIHWORD 16
50 #define HWORD 17
51 #define URLPATH 18
52 #define FILEPATH 19
53 #define DECIMAL_T 20
54 #define SIGNEDINT 21
55 #define UNSIGNEDINT 22
56 #define XMLENTITY 23
57 
/* Highest category code defined above (same value as XMLENTITY). */
58 #define LASTNUM 23
59 
/*
 * Short alias for each token category.  Entries appear in the order of the
 * category codes defined above (index 1 is ASCIIWORD ... index 23 is
 * XMLENTITY); index 0 holds an empty string because no category uses code 0.
 * NOTE(review): presumably indexed directly by category code -- the lookup
 * code is not visible here, confirm before reordering.
 */
60 static const char *const tok_alias[] = {
61  "",
62  "asciiword",
63  "word",
64  "numword",
65  "email",
66  "url",
67  "host",
68  "sfloat",
69  "version",
70  "hword_numpart",
71  "hword_part",
72  "hword_asciipart",
73  "blank",
74  "tag",
75  "protocol",
76  "numhword",
77  "asciihword",
78  "hword",
79  "url_path",
80  "file",
81  "float",
82  "int",
83  "uint",
84  "entity"
85 };
86 
/*
 * Human-readable description for each token category.  Entries are listed
 * in the same order as tok_alias[] and the category codes (index 1 is
 * ASCIIWORD ... index 23 is XMLENTITY); index 0 is an empty placeholder.
 * NOTE(review): presumably indexed directly by category code -- confirm
 * against the code that reads this table.
 */
87 static const char *const lex_descr[] = {
88  "",
89  "Word, all ASCII",
90  "Word, all letters",
91  "Word, letters and digits",
92  "Email address",
93  "URL",
94  "Host",
95  "Scientific notation",
96  "Version number",
97  "Hyphenated word part, letters and digits",
98  "Hyphenated word part, all letters",
99  "Hyphenated word part, all ASCII",
100  "Space symbols",
101  "XML tag",
102  "Protocol head",
103  "Hyphenated word, letters and digits",
104  "Hyphenated word, all ASCII",
105  "Hyphenated word, all letters",
106  "URL path",
107  "File or path name",
108  "Decimal notation",
109  "Signed integer",
110  "Unsigned integer",
111  "XML entity"
112 };
113 
114 
115 /* Parser states */
116 
117 typedef enum
118 {
119  TPS_Base = 0,
196  TPS_Null /* last state (fake value) */
197 } TParserState;
198 
199 /* forward declaration */
200 struct TParser;
201 
/*
 * Callback types taking the parser as their only argument.
 * TParserCharTest returns an int result; TParserSpecial returns nothing.
 * p_iseq is excluded from the first type because it takes an extra
 * character argument.
 * NOTE(review): presumably these are the types of the test and handler
 * fields in each state/action table row -- the struct declaring those
 * fields is only partly visible here.
 */
202 typedef int (*TParserCharTest) (struct TParser *); /* any p_is* functions
203  * except p_iseq */
204 typedef void (*TParserSpecial) (struct TParser *); /* special handler for
205  * special cases... */
206 
207 typedef struct
208 {
210  char c;
213  int type;
216 
217 /* Flag bits in TParserStateActionItem.flags */
/*
 * A_NEXT is zero, i.e. "no flag bit set"; every other flag is a distinct
 * single bit, so flags can be OR-ed together (the action tables use
 * combinations such as A_NEXT | A_CLEAR and A_BINGO | A_CLRALL).
 */
218 #define A_NEXT 0x0000
219 #define A_BINGO 0x0001
220 #define A_POP 0x0002
221 #define A_PUSH 0x0004
222 #define A_RERUN 0x0008
223 #define A_CLEAR 0x0010
224 #define A_MERGE 0x0020
225 #define A_CLRALL 0x0040
226 
227 typedef struct TParserPosition
228 {
229  int posbyte; /* position of parser in bytes */
230  int poschar; /* position of parser in characters */
231  int charlen; /* length of current char */
232  int lenbytetoken; /* length of token-so-far in bytes */
233  int lenchartoken; /* and in chars */
238 
239 typedef struct TParser
240 {
241  /* string and position information */
242  char *str; /* multibyte string */
243  int lenstr; /* length of mbstring */
244  wchar_t *wstr; /* wide character string */
245  pg_wchar *pgwstr; /* wide character string for C-locale */
246  bool usewide;
247 
248  /* State of parse */
251  bool ignore;
252  bool wanthost;
253 
254  /* silly char */
255  char c;
256 
257  /* out */
258  char *token;
261  int type;
262 } TParser;
263 
264 
265 /* forward decls here */
266 static bool TParserGet(TParser *prs);
267 
268 
269 static TParserPosition *
271 {
273 
274  if (prev)
275  memcpy(res, prev, sizeof(TParserPosition));
276  else
277  memset(res, 0, sizeof(TParserPosition));
278 
279  res->prev = prev;
280 
281  res->pushedAtAction = NULL;
282 
283  return res;
284 }
285 
/*
 * Create a parser for the len-byte string str.
 *
 * The returned TParser records str and len as given; the pointer itself is
 * stored, the bytes are not copied.  When charmaxlen > 1, a wide-character
 * version of the input is built as well: into pgwstr when lc_ctype_is_c()
 * reports a C ctype locale, otherwise into wstr via char2wchar().  Both
 * buffers are sized for lenstr + 1 elements.  The initial position comes
 * from newTParserPosition(NULL) and its state is set to TPS_Base.
 */
286 static TParser *
287 TParserInit(char *str, int len)
288 {
289  TParser *prs = (TParser *) palloc0(sizeof(TParser));
290 
/*
 * NOTE(review): listing line 291 is absent here.  prs->charmaxlen, which is
 * tested at line 298 below, is not assigned by any line visible in this
 * function -- confirm against the real source file.
 */
292  prs->str = str;
293  prs->lenstr = len;
294 
295  /*
296  * Use wide char code only when max encoding length > 1.
297  */
298  if (prs->charmaxlen > 1)
299  {
300  Oid collation = DEFAULT_COLLATION_OID; /* TODO */
301  pg_locale_t mylocale = 0; /* TODO */
302 
303  prs->usewide = true;
304  if (lc_ctype_is_c(collation))
305  {
306  /*
307  * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
308  * be different from sizeof(wchar_t)
309  */
310  prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
311  pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
312  }
313  else
314  {
315  prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
316  char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
317  mylocale);
318  }
319  }
320  else
321  prs->usewide = false;
322 
/* Start with a fresh position record (no predecessor) in the base state. */
323  prs->state = newTParserPosition(NULL);
324  prs->state->state = TPS_Base;
325 
326 #ifdef WPARSER_TRACE
327 
328  /*
329  * Use of %.*s here is a bit risky since it can misbehave if the data is
330  * not in what libc thinks is the prevailing encoding. However, since
331  * this is just a debugging aid, we choose to live with that.
332  */
333  fprintf(stderr, "parsing \"%.*s\"\n", len, str);
334 #endif
335 
336  return prs;
337 }
338 
339 /*
340  * As an alternative to a full TParserInit one can create a
341  * TParserCopy which basically is a regular TParser without a private
342  * copy of the string - instead it uses the one from another TParser.
343  * This is useful because at some places TParsers are created
344  * recursively and the repeated copying around of the strings can
345  * cause major inefficiency if the source string is long.
346  * The new parser starts parsing at the original's current position.
347  *
348  * Obviously one must not close the original TParser before the copy.
349  */
350 static TParser *
352 {
353  TParser *prs = (TParser *) palloc0(sizeof(TParser));
354 
355  prs->charmaxlen = orig->charmaxlen;
356  prs->str = orig->str + orig->state->posbyte;
357  prs->lenstr = orig->lenstr - orig->state->posbyte;
358  prs->usewide = orig->usewide;
359 
360  if (orig->pgwstr)
361  prs->pgwstr = orig->pgwstr + orig->state->poschar;
362  if (orig->wstr)
363  prs->wstr = orig->wstr + orig->state->poschar;
364 
365  prs->state = newTParserPosition(NULL);
366  prs->state->state = TPS_Base;
367 
368 #ifdef WPARSER_TRACE
369  /* See note above about %.*s */
370  fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
371 #endif
372 
373  return prs;
374 }
375 
376 
377 static void
379 {
380  while (prs->state)
381  {
382  TParserPosition *ptr = prs->state->prev;
383 
384  pfree(prs->state);
385  prs->state = ptr;
386  }
387 
388  if (prs->wstr)
389  pfree(prs->wstr);
390  if (prs->pgwstr)
391  pfree(prs->pgwstr);
392 
393 #ifdef WPARSER_TRACE
394  fprintf(stderr, "closing parser\n");
395 #endif
396  pfree(prs);
397 }
398 
399 /*
400  * Close a parser created with TParserCopyInit
401  */
402 static void
404 {
405  while (prs->state)
406  {
407  TParserPosition *ptr = prs->state->prev;
408 
409  pfree(prs->state);
410  prs->state = ptr;
411  }
412 
413 #ifdef WPARSER_TRACE
414  fprintf(stderr, "closing parser copy\n");
415 #endif
416  pfree(prs);
417 }
418 
419 
420 /*
421  * Character-type support functions, equivalent to the is* macros, but
422  * working with any possible encoding and locale. Notes:
423  * - with a multibyte encoding and C locale, the isw* functions may fail
424  * or give wrong results.
425  * - a multibyte encoding with C locale is often used for
426  * Asian languages.
427  * - if the locale is C, we use pgwstr instead of wstr.
428  */
429 
/*
 * p_iswhat(type, nonascii) defines two static predicates, p_is<type>() and
 * p_isnot<type>(), that classify the character at the parser's current
 * position:
 *
 * - usewide set and pgwstr present: the code point at pgwstr[poschar] is
 *   examined; values above 0x7f yield the `nonascii` argument unchanged,
 *   anything else goes through is<type>().
 * - usewide set without pgwstr: isw<type>() is applied to wstr[poschar].
 * - usewide not set: is<type>() is applied to the byte at str[posbyte],
 *   read as unsigned char.
 *
 * p_isnot<type>() is the logical negation of p_is<type>().  Both assert
 * that prs->state is set.
 */
430 #define p_iswhat(type, nonascii) \
431  \
432 static int \
433 p_is##type(TParser *prs) \
434 { \
435  Assert(prs->state); \
436  if (prs->usewide) \
437  { \
438  if (prs->pgwstr) \
439  { \
440  unsigned int c = *(prs->pgwstr + prs->state->poschar); \
441  if (c > 0x7f) \
442  return nonascii; \
443  return is##type(c); \
444  } \
445  return isw##type(*(prs->wstr + prs->state->poschar)); \
446  } \
447  return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \
448 } \
449  \
450 static int \
451 p_isnot##type(TParser *prs) \
452 { \
453  return !p_is##type(prs); \
454 }
455 
456 /*
457  * In C locale with a multibyte encoding, any non-ASCII symbol is considered
458  * an alpha character, but not a member of other char classes.
459  */
/*
 * Each line expands to a p_is<class>() / p_isnot<class>() pair.  The second
 * argument is the value p_is<class>() returns for code points above 0x7f on
 * the pgwstr path: 1 for alnum and alpha, 0 for every other class.
 */
460 p_iswhat(alnum, 1)
461 p_iswhat(alpha, 1)
462 p_iswhat(digit, 0)
463 p_iswhat(lower, 0)
464 p_iswhat(print, 0)
465 p_iswhat(punct, 0)
466 p_iswhat(space, 0)
467 p_iswhat(upper, 0)
468 p_iswhat(xdigit, 0)
469 
470 /* p_iseq should be used only for ascii symbols */
471 
/*
 * Return 1 if the current character occupies exactly one byte
 * (charlen == 1) and that byte equals c; otherwise return 0.
 * Asserts that prs->state is set.
 */
472 static int
473 p_iseq(TParser *prs, char c)
474 {
475  Assert(prs->state);
476  return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
477 }
478 
479 static int
481 {
482  Assert(prs->state);
483  return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
484 }
485 
486 static int
488 {
489  return p_iseq(prs, prs->c);
490 }
491 
492 static int
494 {
495  return !p_iseq(prs, prs->c);
496 }
497 
498 static int
500 {
501  return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
502 }
503 
504 static int
506 {
507  return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
508 }
509 
510 static int
512 {
513  char ch;
514 
515  /* no non-ASCII need apply */
516  if (prs->state->charlen != 1)
517  return 0;
518  ch = *(prs->str + prs->state->posbyte);
519  /* no spaces or control characters */
520  if (ch <= 0x20 || ch >= 0x7F)
521  return 0;
522  /* reject characters disallowed by RFC 3986 */
523  switch (ch)
524  {
525  case '"':
526  case '<':
527  case '>':
528  case '\\':
529  case '^':
530  case '`':
531  case '{':
532  case '|':
533  case '}':
534  return 0;
535  }
536  return 1;
537 }
538 
539 
540 /* deliberately suppress unused-function complaints for the above */
/*
 * Mentions every generated p_is<class>() / p_isnot<class>() predicate, plus
 * p_isEOF, p_iseqC and p_isneC, once each so that all of them have at least
 * one caller.  The generated predicates dereference their argument, so with
 * the NULL arguments used here this function is not meant to be executed.
 *
 * NOTE(review): listing line 543, which carries the name of the definition
 * between "542 void" and "544 {", is absent from this listing; the
 * prototype on line 541 gives it as _make_compiler_happy(void).
 */
541 void _make_compiler_happy(void);
542 void
544 {
545  p_isalnum(NULL);
546  p_isnotalnum(NULL);
547  p_isalpha(NULL);
548  p_isnotalpha(NULL);
549  p_isdigit(NULL);
550  p_isnotdigit(NULL);
551  p_islower(NULL);
552  p_isnotlower(NULL);
553  p_isprint(NULL);
554  p_isnotprint(NULL);
555  p_ispunct(NULL);
556  p_isnotpunct(NULL);
557  p_isspace(NULL);
558  p_isnotspace(NULL);
559  p_isupper(NULL);
560  p_isnotupper(NULL);
561  p_isxdigit(NULL);
562  p_isnotxdigit(NULL);
563  p_isEOF(NULL);
564  p_iseqC(NULL);
565  p_isneC(NULL);
566 }
567 
568 
569 static void
571 {
572  switch (prs->state->lenchartoken)
573  {
574  case 8: /* </script */
575  if (pg_strncasecmp(prs->token, "</script", 8) == 0)
576  prs->ignore = false;
577  break;
578  case 7: /* <script || </style */
579  if (pg_strncasecmp(prs->token, "</style", 7) == 0)
580  prs->ignore = false;
581  else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
582  prs->ignore = true;
583  break;
584  case 6: /* <style */
585  if (pg_strncasecmp(prs->token, "<style", 6) == 0)
586  prs->ignore = true;
587  break;
588  default:
589  break;
590  }
591 }
592 
593 static void
595 {
596  prs->wanthost = true;
597  prs->state->posbyte -= prs->state->lenbytetoken;
598  prs->state->poschar -= prs->state->lenchartoken;
599 }
600 
601 static void
603 {
604  prs->state->posbyte -= prs->state->lenbytetoken;
605  prs->state->poschar -= prs->state->lenchartoken;
606 }
607 
608 static void
610 {
611  prs->state->posbyte -= prs->state->lenbytetoken;
612  prs->state->poschar -= prs->state->lenchartoken;
613  prs->state->lenbytetoken = 0;
614  prs->state->lenchartoken = 0;
615 }
616 
617 static int
619 {
620  if (prs->wanthost)
621  {
622  prs->wanthost = false;
623  return 1;
624  }
625  return 0;
626 }
627 
628 static int
630 {
631  return (prs->ignore) ? 1 : 0;
632 }
633 
634 static int
636 {
637  TParser *tmpprs = TParserCopyInit(prs);
638  int res = 0;
639 
640  tmpprs->wanthost = true;
641 
642  if (TParserGet(tmpprs) && tmpprs->type == HOST)
643  {
644  prs->state->posbyte += tmpprs->lenbytetoken;
645  prs->state->poschar += tmpprs->lenchartoken;
646  prs->state->lenbytetoken += tmpprs->lenbytetoken;
647  prs->state->lenchartoken += tmpprs->lenchartoken;
648  prs->state->charlen = tmpprs->state->charlen;
649  res = 1;
650  }
651  TParserCopyClose(tmpprs);
652 
653  return res;
654 }
655 
656 static int
658 {
659  TParser *tmpprs = TParserCopyInit(prs);
660  int res = 0;
661 
662  tmpprs->state = newTParserPosition(tmpprs->state);
663  tmpprs->state->state = TPS_InURLPathFirst;
664 
665  if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
666  {
667  prs->state->posbyte += tmpprs->lenbytetoken;
668  prs->state->poschar += tmpprs->lenchartoken;
669  prs->state->lenbytetoken += tmpprs->lenbytetoken;
670  prs->state->lenchartoken += tmpprs->lenchartoken;
671  prs->state->charlen = tmpprs->state->charlen;
672  res = 1;
673  }
674  TParserCopyClose(tmpprs);
675 
676  return res;
677 }
678 
679 /*
680  * Returns true if the current character has zero display length or
681  * is a special sign used in several languages. Such characters
682  * are not word breakers even though they are not isalpha.
683  * At the beginning of a word they are not part of it.
684  */
685 static int
687 {
688  /*
689  * pg_dsplen could return -1 which means error or control character
690  */
691  if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
692  return 1;
693 
694  /*
695  * Unicode Characters in the 'Mark, Spacing Combining' Category That
696  * characters are not alpha although they are not breakers of word too.
697  * Check that only in utf encoding, because other encodings aren't
698  * supported by postgres or even exists.
699  */
700  if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
701  {
702  static const pg_wchar strange_letter[] = {
703  /*
704  * use binary search, so elements should be ordered
705  */
706  0x0903, /* DEVANAGARI SIGN VISARGA */
707  0x093E, /* DEVANAGARI VOWEL SIGN AA */
708  0x093F, /* DEVANAGARI VOWEL SIGN I */
709  0x0940, /* DEVANAGARI VOWEL SIGN II */
710  0x0949, /* DEVANAGARI VOWEL SIGN CANDRA O */
711  0x094A, /* DEVANAGARI VOWEL SIGN SHORT O */
712  0x094B, /* DEVANAGARI VOWEL SIGN O */
713  0x094C, /* DEVANAGARI VOWEL SIGN AU */
714  0x0982, /* BENGALI SIGN ANUSVARA */
715  0x0983, /* BENGALI SIGN VISARGA */
716  0x09BE, /* BENGALI VOWEL SIGN AA */
717  0x09BF, /* BENGALI VOWEL SIGN I */
718  0x09C0, /* BENGALI VOWEL SIGN II */
719  0x09C7, /* BENGALI VOWEL SIGN E */
720  0x09C8, /* BENGALI VOWEL SIGN AI */
721  0x09CB, /* BENGALI VOWEL SIGN O */
722  0x09CC, /* BENGALI VOWEL SIGN AU */
723  0x09D7, /* BENGALI AU LENGTH MARK */
724  0x0A03, /* GURMUKHI SIGN VISARGA */
725  0x0A3E, /* GURMUKHI VOWEL SIGN AA */
726  0x0A3F, /* GURMUKHI VOWEL SIGN I */
727  0x0A40, /* GURMUKHI VOWEL SIGN II */
728  0x0A83, /* GUJARATI SIGN VISARGA */
729  0x0ABE, /* GUJARATI VOWEL SIGN AA */
730  0x0ABF, /* GUJARATI VOWEL SIGN I */
731  0x0AC0, /* GUJARATI VOWEL SIGN II */
732  0x0AC9, /* GUJARATI VOWEL SIGN CANDRA O */
733  0x0ACB, /* GUJARATI VOWEL SIGN O */
734  0x0ACC, /* GUJARATI VOWEL SIGN AU */
735  0x0B02, /* ORIYA SIGN ANUSVARA */
736  0x0B03, /* ORIYA SIGN VISARGA */
737  0x0B3E, /* ORIYA VOWEL SIGN AA */
738  0x0B40, /* ORIYA VOWEL SIGN II */
739  0x0B47, /* ORIYA VOWEL SIGN E */
740  0x0B48, /* ORIYA VOWEL SIGN AI */
741  0x0B4B, /* ORIYA VOWEL SIGN O */
742  0x0B4C, /* ORIYA VOWEL SIGN AU */
743  0x0B57, /* ORIYA AU LENGTH MARK */
744  0x0BBE, /* TAMIL VOWEL SIGN AA */
745  0x0BBF, /* TAMIL VOWEL SIGN I */
746  0x0BC1, /* TAMIL VOWEL SIGN U */
747  0x0BC2, /* TAMIL VOWEL SIGN UU */
748  0x0BC6, /* TAMIL VOWEL SIGN E */
749  0x0BC7, /* TAMIL VOWEL SIGN EE */
750  0x0BC8, /* TAMIL VOWEL SIGN AI */
751  0x0BCA, /* TAMIL VOWEL SIGN O */
752  0x0BCB, /* TAMIL VOWEL SIGN OO */
753  0x0BCC, /* TAMIL VOWEL SIGN AU */
754  0x0BD7, /* TAMIL AU LENGTH MARK */
755  0x0C01, /* TELUGU SIGN CANDRABINDU */
756  0x0C02, /* TELUGU SIGN ANUSVARA */
757  0x0C03, /* TELUGU SIGN VISARGA */
758  0x0C41, /* TELUGU VOWEL SIGN U */
759  0x0C42, /* TELUGU VOWEL SIGN UU */
760  0x0C43, /* TELUGU VOWEL SIGN VOCALIC R */
761  0x0C44, /* TELUGU VOWEL SIGN VOCALIC RR */
762  0x0C82, /* KANNADA SIGN ANUSVARA */
763  0x0C83, /* KANNADA SIGN VISARGA */
764  0x0CBE, /* KANNADA VOWEL SIGN AA */
765  0x0CC0, /* KANNADA VOWEL SIGN II */
766  0x0CC1, /* KANNADA VOWEL SIGN U */
767  0x0CC2, /* KANNADA VOWEL SIGN UU */
768  0x0CC3, /* KANNADA VOWEL SIGN VOCALIC R */
769  0x0CC4, /* KANNADA VOWEL SIGN VOCALIC RR */
770  0x0CC7, /* KANNADA VOWEL SIGN EE */
771  0x0CC8, /* KANNADA VOWEL SIGN AI */
772  0x0CCA, /* KANNADA VOWEL SIGN O */
773  0x0CCB, /* KANNADA VOWEL SIGN OO */
774  0x0CD5, /* KANNADA LENGTH MARK */
775  0x0CD6, /* KANNADA AI LENGTH MARK */
776  0x0D02, /* MALAYALAM SIGN ANUSVARA */
777  0x0D03, /* MALAYALAM SIGN VISARGA */
778  0x0D3E, /* MALAYALAM VOWEL SIGN AA */
779  0x0D3F, /* MALAYALAM VOWEL SIGN I */
780  0x0D40, /* MALAYALAM VOWEL SIGN II */
781  0x0D46, /* MALAYALAM VOWEL SIGN E */
782  0x0D47, /* MALAYALAM VOWEL SIGN EE */
783  0x0D48, /* MALAYALAM VOWEL SIGN AI */
784  0x0D4A, /* MALAYALAM VOWEL SIGN O */
785  0x0D4B, /* MALAYALAM VOWEL SIGN OO */
786  0x0D4C, /* MALAYALAM VOWEL SIGN AU */
787  0x0D57, /* MALAYALAM AU LENGTH MARK */
788  0x0D82, /* SINHALA SIGN ANUSVARAYA */
789  0x0D83, /* SINHALA SIGN VISARGAYA */
790  0x0DCF, /* SINHALA VOWEL SIGN AELA-PILLA */
791  0x0DD0, /* SINHALA VOWEL SIGN KETTI AEDA-PILLA */
792  0x0DD1, /* SINHALA VOWEL SIGN DIGA AEDA-PILLA */
793  0x0DD8, /* SINHALA VOWEL SIGN GAETTA-PILLA */
794  0x0DD9, /* SINHALA VOWEL SIGN KOMBUVA */
795  0x0DDA, /* SINHALA VOWEL SIGN DIGA KOMBUVA */
796  0x0DDB, /* SINHALA VOWEL SIGN KOMBU DEKA */
797  0x0DDC, /* SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
798  0x0DDD, /* SINHALA VOWEL SIGN KOMBUVA HAA DIGA
799  * AELA-PILLA */
800  0x0DDE, /* SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
801  0x0DDF, /* SINHALA VOWEL SIGN GAYANUKITTA */
802  0x0DF2, /* SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
803  0x0DF3, /* SINHALA VOWEL SIGN DIGA GAYANUKITTA */
804  0x0F3E, /* TIBETAN SIGN YAR TSHES */
805  0x0F3F, /* TIBETAN SIGN MAR TSHES */
806  0x0F7F, /* TIBETAN SIGN RNAM BCAD */
807  0x102B, /* MYANMAR VOWEL SIGN TALL AA */
808  0x102C, /* MYANMAR VOWEL SIGN AA */
809  0x1031, /* MYANMAR VOWEL SIGN E */
810  0x1038, /* MYANMAR SIGN VISARGA */
811  0x103B, /* MYANMAR CONSONANT SIGN MEDIAL YA */
812  0x103C, /* MYANMAR CONSONANT SIGN MEDIAL RA */
813  0x1056, /* MYANMAR VOWEL SIGN VOCALIC R */
814  0x1057, /* MYANMAR VOWEL SIGN VOCALIC RR */
815  0x1062, /* MYANMAR VOWEL SIGN SGAW KAREN EU */
816  0x1063, /* MYANMAR TONE MARK SGAW KAREN HATHI */
817  0x1064, /* MYANMAR TONE MARK SGAW KAREN KE PHO */
818  0x1067, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
819  0x1068, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
820  0x1069, /* MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
821  0x106A, /* MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
822  0x106B, /* MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
823  0x106C, /* MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
824  0x106D, /* MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
825  0x1083, /* MYANMAR VOWEL SIGN SHAN AA */
826  0x1084, /* MYANMAR VOWEL SIGN SHAN E */
827  0x1087, /* MYANMAR SIGN SHAN TONE-2 */
828  0x1088, /* MYANMAR SIGN SHAN TONE-3 */
829  0x1089, /* MYANMAR SIGN SHAN TONE-5 */
830  0x108A, /* MYANMAR SIGN SHAN TONE-6 */
831  0x108B, /* MYANMAR SIGN SHAN COUNCIL TONE-2 */
832  0x108C, /* MYANMAR SIGN SHAN COUNCIL TONE-3 */
833  0x108F, /* MYANMAR SIGN RUMAI PALAUNG TONE-5 */
834  0x17B6, /* KHMER VOWEL SIGN AA */
835  0x17BE, /* KHMER VOWEL SIGN OE */
836  0x17BF, /* KHMER VOWEL SIGN YA */
837  0x17C0, /* KHMER VOWEL SIGN IE */
838  0x17C1, /* KHMER VOWEL SIGN E */
839  0x17C2, /* KHMER VOWEL SIGN AE */
840  0x17C3, /* KHMER VOWEL SIGN AI */
841  0x17C4, /* KHMER VOWEL SIGN OO */
842  0x17C5, /* KHMER VOWEL SIGN AU */
843  0x17C7, /* KHMER SIGN REAHMUK */
844  0x17C8, /* KHMER SIGN YUUKALEAPINTU */
845  0x1923, /* LIMBU VOWEL SIGN EE */
846  0x1924, /* LIMBU VOWEL SIGN AI */
847  0x1925, /* LIMBU VOWEL SIGN OO */
848  0x1926, /* LIMBU VOWEL SIGN AU */
849  0x1929, /* LIMBU SUBJOINED LETTER YA */
850  0x192A, /* LIMBU SUBJOINED LETTER RA */
851  0x192B, /* LIMBU SUBJOINED LETTER WA */
852  0x1930, /* LIMBU SMALL LETTER KA */
853  0x1931, /* LIMBU SMALL LETTER NGA */
854  0x1933, /* LIMBU SMALL LETTER TA */
855  0x1934, /* LIMBU SMALL LETTER NA */
856  0x1935, /* LIMBU SMALL LETTER PA */
857  0x1936, /* LIMBU SMALL LETTER MA */
858  0x1937, /* LIMBU SMALL LETTER RA */
859  0x1938, /* LIMBU SMALL LETTER LA */
860  0x19B0, /* NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
861  0x19B1, /* NEW TAI LUE VOWEL SIGN AA */
862  0x19B2, /* NEW TAI LUE VOWEL SIGN II */
863  0x19B3, /* NEW TAI LUE VOWEL SIGN U */
864  0x19B4, /* NEW TAI LUE VOWEL SIGN UU */
865  0x19B5, /* NEW TAI LUE VOWEL SIGN E */
866  0x19B6, /* NEW TAI LUE VOWEL SIGN AE */
867  0x19B7, /* NEW TAI LUE VOWEL SIGN O */
868  0x19B8, /* NEW TAI LUE VOWEL SIGN OA */
869  0x19B9, /* NEW TAI LUE VOWEL SIGN UE */
870  0x19BA, /* NEW TAI LUE VOWEL SIGN AY */
871  0x19BB, /* NEW TAI LUE VOWEL SIGN AAY */
872  0x19BC, /* NEW TAI LUE VOWEL SIGN UY */
873  0x19BD, /* NEW TAI LUE VOWEL SIGN OY */
874  0x19BE, /* NEW TAI LUE VOWEL SIGN OAY */
875  0x19BF, /* NEW TAI LUE VOWEL SIGN UEY */
876  0x19C0, /* NEW TAI LUE VOWEL SIGN IY */
877  0x19C8, /* NEW TAI LUE TONE MARK-1 */
878  0x19C9, /* NEW TAI LUE TONE MARK-2 */
879  0x1A19, /* BUGINESE VOWEL SIGN E */
880  0x1A1A, /* BUGINESE VOWEL SIGN O */
881  0x1A1B, /* BUGINESE VOWEL SIGN AE */
882  0x1B04, /* BALINESE SIGN BISAH */
883  0x1B35, /* BALINESE VOWEL SIGN TEDUNG */
884  0x1B3B, /* BALINESE VOWEL SIGN RA REPA TEDUNG */
885  0x1B3D, /* BALINESE VOWEL SIGN LA LENGA TEDUNG */
886  0x1B3E, /* BALINESE VOWEL SIGN TALING */
887  0x1B3F, /* BALINESE VOWEL SIGN TALING REPA */
888  0x1B40, /* BALINESE VOWEL SIGN TALING TEDUNG */
889  0x1B41, /* BALINESE VOWEL SIGN TALING REPA TEDUNG */
890  0x1B43, /* BALINESE VOWEL SIGN PEPET TEDUNG */
891  0x1B44, /* BALINESE ADEG ADEG */
892  0x1B82, /* SUNDANESE SIGN PANGWISAD */
893  0x1BA1, /* SUNDANESE CONSONANT SIGN PAMINGKAL */
894  0x1BA6, /* SUNDANESE VOWEL SIGN PANAELAENG */
895  0x1BA7, /* SUNDANESE VOWEL SIGN PANOLONG */
896  0x1BAA, /* SUNDANESE SIGN PAMAAEH */
897  0x1C24, /* LEPCHA SUBJOINED LETTER YA */
898  0x1C25, /* LEPCHA SUBJOINED LETTER RA */
899  0x1C26, /* LEPCHA VOWEL SIGN AA */
900  0x1C27, /* LEPCHA VOWEL SIGN I */
901  0x1C28, /* LEPCHA VOWEL SIGN O */
902  0x1C29, /* LEPCHA VOWEL SIGN OO */
903  0x1C2A, /* LEPCHA VOWEL SIGN U */
904  0x1C2B, /* LEPCHA VOWEL SIGN UU */
905  0x1C34, /* LEPCHA CONSONANT SIGN NYIN-DO */
906  0x1C35, /* LEPCHA CONSONANT SIGN KANG */
907  0xA823, /* SYLOTI NAGRI VOWEL SIGN A */
908  0xA824, /* SYLOTI NAGRI VOWEL SIGN I */
909  0xA827, /* SYLOTI NAGRI VOWEL SIGN OO */
910  0xA880, /* SAURASHTRA SIGN ANUSVARA */
911  0xA881, /* SAURASHTRA SIGN VISARGA */
912  0xA8B4, /* SAURASHTRA CONSONANT SIGN HAARU */
913  0xA8B5, /* SAURASHTRA VOWEL SIGN AA */
914  0xA8B6, /* SAURASHTRA VOWEL SIGN I */
915  0xA8B7, /* SAURASHTRA VOWEL SIGN II */
916  0xA8B8, /* SAURASHTRA VOWEL SIGN U */
917  0xA8B9, /* SAURASHTRA VOWEL SIGN UU */
918  0xA8BA, /* SAURASHTRA VOWEL SIGN VOCALIC R */
919  0xA8BB, /* SAURASHTRA VOWEL SIGN VOCALIC RR */
920  0xA8BC, /* SAURASHTRA VOWEL SIGN VOCALIC L */
921  0xA8BD, /* SAURASHTRA VOWEL SIGN VOCALIC LL */
922  0xA8BE, /* SAURASHTRA VOWEL SIGN E */
923  0xA8BF, /* SAURASHTRA VOWEL SIGN EE */
924  0xA8C0, /* SAURASHTRA VOWEL SIGN AI */
925  0xA8C1, /* SAURASHTRA VOWEL SIGN O */
926  0xA8C2, /* SAURASHTRA VOWEL SIGN OO */
927  0xA8C3, /* SAURASHTRA VOWEL SIGN AU */
928  0xA952, /* REJANG CONSONANT SIGN H */
929  0xA953, /* REJANG VIRAMA */
930  0xAA2F, /* CHAM VOWEL SIGN O */
931  0xAA30, /* CHAM VOWEL SIGN AI */
932  0xAA33, /* CHAM CONSONANT SIGN YA */
933  0xAA34, /* CHAM CONSONANT SIGN RA */
934  0xAA4D /* CHAM CONSONANT SIGN FINAL H */
935  };
936  const pg_wchar *StopLow = strange_letter,
937  *StopHigh = strange_letter + lengthof(strange_letter),
938  *StopMiddle;
939  pg_wchar c;
940 
941  if (prs->pgwstr)
942  c = *(prs->pgwstr + prs->state->poschar);
943  else
944  c = (pg_wchar) *(prs->wstr + prs->state->poschar);
945 
946  while (StopLow < StopHigh)
947  {
948  StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
949  if (*StopMiddle == c)
950  return 1;
951  else if (*StopMiddle < c)
952  StopLow = StopMiddle + 1;
953  else
954  StopHigh = StopMiddle;
955  }
956  }
957 
958  return 0;
959 }
960 
961 /*
962  * Table of state/action of parser
963  */
964 
966  {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
967  {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
968  {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
969  {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
970  {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
971  {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
972  {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
973  {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
974  {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
975  {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
976  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
977  {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
978  {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
979 };
980 
981 
983  {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
984  {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
985  {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
986  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
987  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
988  {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
989  {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
990  {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
991 };
992 
994  {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
995  {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
996  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
997  {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
998  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
999  {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1000  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1001  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1002  {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
1003  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1004  {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1005  {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1006  {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
1007  {p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
1008  {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
1009 };
1010 
1012  {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
1013  {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
1014  {p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
1015  {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1016  {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1017  {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
1018 };
1019 
1021  {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
1022  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1023  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1024  {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
1025  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1026  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1027  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1028  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1029  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1030  {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
1031  {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1032  {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1033  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1034  {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
1035 };
1036 
1038  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1039  {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
1040  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1041 };
1042 
1044  {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
1045  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1046  {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
1047  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1048  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1049  {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
1050 };
1051 
1053  {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
1054  {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
1055  {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
1056  {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
1057  {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
1058  {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
1059  {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
1060  {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
1061  {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
1062 };
1063 
1065  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1066  {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
1067  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1068 };
1069 
1071  {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1072  {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
1073  {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1074  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1075  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1076  {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1077 };
1078 
1080  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1081  {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
1082  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1083 };
1084 
1086  {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1087  {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
1088  {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
1089  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1090  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1091  {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1092 };
1093 
1095  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1096  {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
1097  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1098 };
1099 
1101  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1102  {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
1103  {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1104 };
1105 
1106 
1108  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1109  {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
1110  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1111 };
1112 
1114  {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
1115  {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
1116  {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1117  {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
1118 };
1119 
1121  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1122  {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1123  {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
1124  {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
1125  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1126 };
1127 
1129  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1130  {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1131  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1132 };
1133 
1135  {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
1136  {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
1137  {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
1138 };
1139 
1141  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1142  {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
1143  {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1144  {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1145  {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1146  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1147 };
1148 
1150  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1151  {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1152  {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1153  {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1154  {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
1155  {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
1156  {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1157  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1158 };
1159 
1161  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1162  {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1163  {p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1164  {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1165  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1166 };
1167 
1169  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1170  {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1171  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1172 };
1173 
1175  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1176  {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1177  {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1178  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1179 };
1180 
1182  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1183  {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1184  {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1185  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1186 };
1187 
1189  {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
1190 };
1191 
1193  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1194  {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
1195  {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
1196  {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
1197  {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
1198  {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
1199  {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
1200  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1201 };
1202 
1204  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1205  /* <?xml ... */
1206  /* XXX do we wants states for the m and l ? Right now this accepts <?xZ */
1207  {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
1208  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1209 };
1210 
1212  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1213  {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
1214  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1215 };
1216 
1218  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1219  /* <br/> case */
1220  {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
1221  {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1222  {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
1223  {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
1224  {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1225  {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1226  {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1227  {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1228  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1229 };
1230 
1232  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1233  {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
1234  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1235 };
1236 
1238  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1239  {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1240  {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
1241  {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
1242  {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1243  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1244  {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
1245  {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1246  {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1247  {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
1248  {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
1249  {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1250  {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1251  {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
1252  {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
1253  {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
1254  {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
1255  {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
1256  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1257 };
1258 
1260  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1261  {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1262  {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
1263  {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
1264 };
1265 
1267  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1268  {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1269  {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
1270  {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
1271 };
1272 
1274  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1275  {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
1276 };
1277 
1279  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1280 };
1281 
1283  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1284  {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
1285  /* <!DOCTYPE ...> */
1286  {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
1287  {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
1288  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1289 };
1290 
1292  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1293  {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
1294  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1295 };
1296 
1298  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1299  {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
1300  {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1301 };
1302 
1304  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1305  {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
1306  {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1307 };
1308 
1310  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1311  {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1312  {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
1313  {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1314 };
1315 
1317  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1318 };
1319 
1321  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1322  {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
1323  {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1324  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1325 };
1326 
1328  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1329  {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1330  {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1331  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1332  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1333  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1334  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1335  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1336 };
1337 
1339  {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1340  {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1341  {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1342  {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
1343  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1344  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1345  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1346  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1347  {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
1349  {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1350  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1351 };
1352 
1354  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1355  {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1356  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1357 };
1358 
1360  {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1361  {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1363  {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1364  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1365 };
1366 
1368  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1369  {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1370  {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1371  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1372 };
1373 
1375  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1376  {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1377  {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1378  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1379  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1380  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1381  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1382  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1383 };
1384 
1386  {p_isstophost, 0, A_POP, TPS_Null, 0, NULL},
1387  {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
1388  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1389 };
1390 
1392  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1393  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1394  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1395  {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1396  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1397  {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1398  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1399 };
1400 
1402  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1403  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1404  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1405  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1406  {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1407  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1408 };
1409 
1411  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1412  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1413  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1414  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1415  {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1416  {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1417  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1418 };
1419 
1421  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1422  {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1423  {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1424  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1425 };
1426 
1428  {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1429  {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
1430  {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1431  {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1432  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1433 };
1434 
1436  {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
1437  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1438  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1439  {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1440  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1441  {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1442  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1443  {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
1444 };
1445 
1447  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1448  {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1449  {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1450  {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1451  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1452 };
1453 
1455  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1456  {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1457  {NULL, 0, A_POP, TPS_Null, 0, NULL},
1458 };
1459 
1461  {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
1462 };
1463 
1465  {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
1466  {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1467  {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
1468 };
1469 
1471  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1473  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1474 };
1475 
1477  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1478  {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
1479  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1480 };
1481 
1483  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1484  {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1485  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1486 };
1487 
1489  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
1490 };
1491 
1493  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1494  {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1495  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1496  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1497  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1498 };
1499 
1502  {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1503  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1504  {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1505  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1506  {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1508 };
1509 
1511  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1512  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1513  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1514  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1515 };
1516 
1519  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1520  {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1521  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1522  {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1524 };
1525 
1527  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1528  {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1529  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1530  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1531 };
1532 
1535  {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1536  {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1537  {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1539 };
1540 
1542  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1543  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1544  {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1545  {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1546  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1547 };
1548 
1550  {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
1552  {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1553  {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
1554  {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
1555  {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
1556 };
1557 
1559  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1560  {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1562  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1563 };
1564 
1566  {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
1567  {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1568  {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1569  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1570  {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
1571 };
1572 
1574  {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
1576  {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1577  {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1578  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1579  {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
1580 };
1581 
1583  {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
1584  {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1585  {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1586  {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
1587 };
1588 
1590  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1591  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1592  {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1594  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1595 };
1596 
1597 
1598 /*
1599  * main table of per-state parser actions
1600  */
1601 typedef struct
1602 {
1603  const TParserStateActionItem *action; /* the actual state info */
1604  TParserState state; /* only for Assert crosscheck */
1605 #ifdef WPARSER_TRACE
1606  const char *state_name; /* only for debug printout */
1607 #endif
1609 
/*
 * TPARSERSTATEACTION(state) expands to one brace-enclosed initializer with
 * two members (three when WPARSER_TRACE is defined): the result of
 * CppConcat(action,state), the state value itself and, in trace builds,
 * CppAsString(state).
 *
 * NOTE(review): CppConcat and CppAsString are defined outside this chunk;
 * presumably they paste and stringify their arguments, so that the first
 * member names the rule array "action<state>" -- confirm.
 */
1610 #ifdef WPARSER_TRACE
1611 #define TPARSERSTATEACTION(state) \
1612  { CppConcat(action,state), state, CppAsString(state) }
1613 #else
1614 #define TPARSERSTATEACTION(state) \
1615  { CppConcat(action,state), state }
1616 #endif
1617 
1618 /*
1619  * order must be the same as in typedef enum {} TParserState!!
1620  */
1621 
/*
 * Dispatch table indexed by state value: the scanning loop below reads
 * Actions[prs->state->state].action and asserts that
 * Actions[prs->state->state].state equals the index used.
 *
 * NOTE(review): the initializer entries (listing lines 1623-1699) are not
 * present in this copy of the file; they have to be restored from the
 * original source before this array is usable.
 */
1622 static const TParserStateAction Actions[] = {
1700 };
1701 
1702 
1703 static bool
1705 {
1706  const TParserStateActionItem *item = NULL;
1707 
1708  Assert(prs->state);
1709 
1710  if (prs->state->posbyte >= prs->lenstr)
1711  return false;
1712 
1713  prs->token = prs->str + prs->state->posbyte;
1714  prs->state->pushedAtAction = NULL;
1715 
1716  /* look at string */
1717  while (prs->state->posbyte <= prs->lenstr)
1718  {
1719  if (prs->state->posbyte == prs->lenstr)
1720  prs->state->charlen = 0;
1721  else
1722  prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
1723  pg_mblen(prs->str + prs->state->posbyte);
1724 
1725  Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
1726  Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
1727  Assert(Actions[prs->state->state].state == prs->state->state);
1728 
1729  if (prs->state->pushedAtAction)
1730  {
1731  /* After a POP, pick up at the next test */
1732  item = prs->state->pushedAtAction + 1;
1733  prs->state->pushedAtAction = NULL;
1734  }
1735  else
1736  {
1737  item = Actions[prs->state->state].action;
1738  Assert(item != NULL);
1739  }
1740 
1741  /* find action by character class */
1742  while (item->isclass)
1743  {
1744  prs->c = item->c;
1745  if (item->isclass(prs) != 0)
1746  break;
1747  item++;
1748  }
1749 
1750 #ifdef WPARSER_TRACE
1751  {
1752  TParserPosition *ptr;
1753 
1754  fprintf(stderr, "state ");
1755  /* indent according to stack depth */
1756  for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
1757  fprintf(stderr, " ");
1758  fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
1759  if (prs->state->posbyte < prs->lenstr)
1760  fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
1761  else
1762  fprintf(stderr, "at EOF");
1763  fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1764  (int) (item - Actions[prs->state->state].action),
1765  (item->flags & A_BINGO) ? " BINGO" : "",
1766  (item->flags & A_POP) ? " POP" : "",
1767  (item->flags & A_PUSH) ? " PUSH" : "",
1768  (item->flags & A_RERUN) ? " RERUN" : "",
1769  (item->flags & A_CLEAR) ? " CLEAR" : "",
1770  (item->flags & A_MERGE) ? " MERGE" : "",
1771  (item->flags & A_CLRALL) ? " CLRALL" : "",
1772  (item->tostate != TPS_Null) ? " tostate " : "",
1773  (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
1774  (item->type > 0) ? " type " : "",
1775  tok_alias[item->type]);
1776  }
1777 #endif
1778 
1779  /* call special handler if exists */
1780  if (item->special)
1781  item->special(prs);
1782 
1783  /* BINGO, token is found */
1784  if (item->flags & A_BINGO)
1785  {
1786  Assert(item->type > 0);
1787  prs->lenbytetoken = prs->state->lenbytetoken;
1788  prs->lenchartoken = prs->state->lenchartoken;
1789  prs->state->lenbytetoken = prs->state->lenchartoken = 0;
1790  prs->type = item->type;
1791  }
1792 
1793  /* do various actions by flags */
1794  if (item->flags & A_POP)
1795  { /* pop stored state in stack */
1796  TParserPosition *ptr = prs->state->prev;
1797 
1798  pfree(prs->state);
1799  prs->state = ptr;
1800  Assert(prs->state);
1801  }
1802  else if (item->flags & A_PUSH)
1803  { /* push (store) state in stack */
1804  prs->state->pushedAtAction = item; /* remember where we push */
1805  prs->state = newTParserPosition(prs->state);
1806  }
1807  else if (item->flags & A_CLEAR)
1808  { /* clear previous pushed state */
1809  TParserPosition *ptr;
1810 
1811  Assert(prs->state->prev);
1812  ptr = prs->state->prev->prev;
1813  pfree(prs->state->prev);
1814  prs->state->prev = ptr;
1815  }
1816  else if (item->flags & A_CLRALL)
1817  { /* clear all previous pushed state */
1818  TParserPosition *ptr;
1819 
1820  while (prs->state->prev)
1821  {
1822  ptr = prs->state->prev->prev;
1823  pfree(prs->state->prev);
1824  prs->state->prev = ptr;
1825  }
1826  }
1827  else if (item->flags & A_MERGE)
1828  { /* merge posinfo with current and pushed state */
1829  TParserPosition *ptr = prs->state;
1830 
1831  Assert(prs->state->prev);
1832  prs->state = prs->state->prev;
1833 
1834  prs->state->posbyte = ptr->posbyte;
1835  prs->state->poschar = ptr->poschar;
1836  prs->state->charlen = ptr->charlen;
1837  prs->state->lenbytetoken = ptr->lenbytetoken;
1838  prs->state->lenchartoken = ptr->lenchartoken;
1839  pfree(ptr);
1840  }
1841 
1842  /* set new state if pointed */
1843  if (item->tostate != TPS_Null)
1844  prs->state->state = item->tostate;
1845 
1846  /* check for go away */
1847  if ((item->flags & A_BINGO) ||
1848  (prs->state->posbyte >= prs->lenstr &&
1849  (item->flags & A_RERUN) == 0))
1850  break;
1851 
1852  /* go to beginning of loop if we should rerun or we just restore state */
1853  if (item->flags & (A_RERUN | A_POP))
1854  continue;
1855 
1856  /* move forward */
1857  if (prs->state->charlen)
1858  {
1859  prs->state->posbyte += prs->state->charlen;
1860  prs->state->lenbytetoken += prs->state->charlen;
1861  prs->state->poschar++;
1862  prs->state->lenchartoken++;
1863  }
1864  }
1865 
1866  return (item && (item->flags & A_BINGO)) ? true : false;
1867 }
1868 
/*
 * Build the table that describes every token type of this parser.
 *
 * Allocates LASTNUM + 1 LexDescr entries.  Entry i - 1 receives lexid i plus
 * pstrdup'ed copies of tok_alias[i] and lex_descr[i]; the one extra entry at
 * the end gets lexid 0 (presumably the end-of-list marker for the caller --
 * confirm).  The array is returned as a pointer Datum.
 *
 * NOTE(review): the declarator line (listing line 1870) is missing from this
 * copy, so the function's name and parameter list are not visible here.
 */
1869 Datum
1871 {
1872  LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
1873  int i;
1874 
/* token ids start at 1; slot 0 of tok_alias/lex_descr is an empty placeholder */
1875  for (i = 1; i <= LASTNUM; i++)
1876  {
1877  descr[i - 1].lexid = i;
1878  descr[i - 1].alias = pstrdup(tok_alias[i]);
1879  descr[i - 1].descr = pstrdup(lex_descr[i]);
1880  }
1881 
/* only lexid is set in the final entry; alias and descr stay uninitialized */
1882  descr[LASTNUM].lexid = 0;
1883 
1884  PG_RETURN_POINTER(descr);
1885 }
1886 
/*
 * NOTE(review): both the declarator (listing line 1888) and the only body
 * line (listing line 1890) of this Datum-returning function are absent from
 * this copy; just the return type and the braces survive.  Restore them from
 * the original source before compiling.
 */
1887 Datum
1889 {
1891 }
1892 
/*
 * Fetch the next token from a parser.
 *
 * Argument 0 is the TParser, argument 1 a char ** that receives the start of
 * the token, argument 2 an int * that receives its length in bytes.  Returns
 * the token type as an int32, or 0 when TParserGet() finds no further token
 * (in which case the two output arguments are left untouched).
 *
 * NOTE(review): the declarator line (listing line 1894) is missing from this
 * copy, so the function's name and parameter list are not visible here.
 */
1893 Datum
1895 {
1896  TParser *p = (TParser *) PG_GETARG_POINTER(0);
1897  char **t = (char **) PG_GETARG_POINTER(1);
1898  int *tlen = (int *) PG_GETARG_POINTER(2);
1899 
1900  if (!TParserGet(p))
1901  PG_RETURN_INT32(0);
1902 
/* the token pointer refers into the parser's own buffer, it is not a copy */
1903  *t = p->token;
1904  *tlen = p->lenbytetoken;
1905 
1906  PG_RETURN_INT32(p->type);
1907 }
1908 
/*
 * Shut down a parser: passes the TParser given as argument 0 to
 * TParserClose() and returns void.
 *
 * NOTE(review): the declarator line (listing line 1910) is missing from this
 * copy, so the function's name and parameter list are not visible here.
 */
1909 Datum
1911 {
1912  TParser *p = (TParser *) PG_GETARG_POINTER(0);
1913 
1914  TParserClose(p);
1915  PG_RETURN_VOID();
1916 }
1917 
/*
 * Token-type classification helpers for the headline code.  Each macro takes
 * a token type number and may evaluate its argument more than once, so pass
 * side-effect-free expressions only.
 */
/* true for SPACE tokens only (same expansion as ENDPUNCTOKEN) */
1918 #define LEAVETOKEN(x) ( (x)==SPACE )
/* true for URL and whole-hyphenated-word tokens (same expansion as HLIDSKIP) */
1919 #define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* true for SPACE tokens only (same expansion as LEAVETOKEN) */
1920 #define ENDPUNCTOKEN(x) ( (x)==SPACE )
1921 
/* true for tags, protocol prefixes, blanks and XML entities */
1922 #define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
/* tokens that mark_fragment() flags with "replace" when highlight == 0 */
1923 #define HLIDREPLACE(x) ( (x)==TAG_T )
/* tokens that mark_fragment() flags with "skip" when highlight == 0 */
1924 #define HLIDSKIP(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* tokens that mark_fragment() flags with "skip" when highlight != 0; currently the same set as HLIDSKIP */
1925 #define XMLHLIDSKIP(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* tokens that are not counted as words when measuring fragment length */
1926 #define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )
/* tokens a fragment should not begin or end with: non-words, numbers and ignored tokens */
1927 #define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
1928 
/*
 * Window of headline words handed to the query-evaluation callback.
 *
 * NOTE(review): the first member (listing line 1931) is missing from this
 * copy.  Code in this file reads and writes a member named "words" (assigned
 * from &prs->words[n] and indexed as words[i].item / words[i].pos), so that
 * is presumably the lost declaration -- restore it from the original source.
 */
1929 typedef struct
1930 {
1932  int len; /* number of entries in the window */
1933 } hlCheck;
1934 
/*
 * Callback that tests whether a query operand occurs in a window of headline
 * words.
 *
 * "opaque" is cast to hlCheck *, "val" is compared against the item pointer
 * of each word in the window, and "data", when non-NULL, collects the
 * positions of the matching words.  With data == NULL the function returns
 * true on the first match.  Otherwise it returns true when data->npos is
 * greater than zero after the scan, and false in all remaining cases.
 *
 * NOTE(review): the declarator line (listing line 1936) is missing from this
 * copy, so the function's name and exact parameter types are not visible
 * here.
 */
1935 static bool
1937 {
1938  int i;
1939  hlCheck *checkval = (hlCheck *) opaque;
1940 
1941  for (i = 0; i < checkval->len; i++)
1942  {
1943  if (checkval->words[i].item == val)
1944  {
1945  /* don't need to find all positions */
1946  if (!data)
1947  return true;
1948 
1949  if (!data->pos)
1950  {
/* first match: room for one position per word in the window is enough */
1951  data->pos = palloc(sizeof(WordEntryPos) * checkval->len);
1952  data->allocated = true;
1953  data->npos = 1;
1954  data->pos[0] = checkval->words[i].pos;
1955  }
/* later matches are stored only if larger than the last one, keeping the list strictly increasing */
1956  else if (data->pos[data->npos - 1] < checkval->words[i].pos)
1957  {
1958  data->pos[data->npos++] = checkval->words[i].pos;
1959  }
1960  }
1961  }
1962 
1963  if (data && data->npos > 0)
1964  return true;
1965 
1966  return false;
1967 }
1968 
1969 
/*
 * hlCover
 *      Find the next span of headline words ("cover") at or after *p.
 *
 * On entry *p is the word index at which the search starts.  On a true
 * result *p and *q hold the first and last word index of the cover.  A false
 * result means no cover exists; *p and *q are clobbered in that case too.
 *
 * The search runs in two passes over the QI_VAL items of the query:
 *  1. for every operand find its first occurrence at or after the start
 *     index, and set *q to the largest such index;
 *  2. for every operand find its last occurrence between *q and the start
 *     index, and set *p to the smallest such index.
 * The resulting window is then handed over in an hlCheck for a final test.
 * If that test fails, the search is retried one word further on.
 *
 * NOTE(review): the line holding the "if (...)" condition of that final test
 * (listing line 2029) is missing from this copy, which leaves the
 * "return true; else" below without its "if".  Restore it from the original
 * source.
 */
1970 static bool
1971 hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
1972 {
1973  int i,
1974  j;
1975  QueryItem *item = GETQUERY(query);
1976  int pos = *p;
1977 
1978  *q = -1;
1979  *p = INT_MAX;
1980 
/* pass 1: *q becomes the furthest "first occurrence" of any operand */
1981  for (j = 0; j < query->size; j++)
1982  {
1983  if (item->type != QI_VAL)
1984  {
1985  item++;
1986  continue;
1987  }
1988  for (i = pos; i < prs->curwords; i++)
1989  {
1990  if (prs->words[i].item == &item->qoperand)
1991  {
1992  if (i > *q)
1993  *q = i;
1994  break;
1995  }
1996  }
1997  item++;
1998  }
1999 
/* no operand occurs at or after the start index */
2000  if (*q < 0)
2001  return false;
2002 
/* pass 2: *p becomes the nearest-to-start "last occurrence" of any operand */
2003  item = GETQUERY(query);
2004  for (j = 0; j < query->size; j++)
2005  {
2006  if (item->type != QI_VAL)
2007  {
2008  item++;
2009  continue;
2010  }
2011  for (i = *q; i >= pos; i--)
2012  {
2013  if (prs->words[i].item == &item->qoperand)
2014  {
2015  if (i < *p)
2016  *p = i;
2017  break;
2018  }
2019  }
2020  item++;
2021  }
2022 
2023  if (*p <= *q)
2024  {
2025  hlCheck ch;
2026 
2027  ch.words = &(prs->words[*p]);
2028  ch.len = *q - *p + 1;
2030  return true;
2031  else
2032  {
/* window rejected: retry starting one word later */
2033  (*p)++;
2034  return hlCover(prs, query, p, q);
2035  }
2036  }
2037 
2038  return false;
2039 }
2040 
2041 static void
2042 mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
2043 {
2044  int i;
2045 
2046  for (i = startpos; i <= endpos; i++)
2047  {
2048  if (prs->words[i].item)
2049  prs->words[i].selected = 1;
2050  if (highlight == 0)
2051  {
2052  if (HLIDREPLACE(prs->words[i].type))
2053  prs->words[i].replace = 1;
2054  else if (HLIDSKIP(prs->words[i].type))
2055  prs->words[i].skip = 1;
2056  }
2057  else
2058  {
2059  if (XMLHLIDSKIP(prs->words[i].type))
2060  prs->words[i].skip = 1;
2061  }
2062 
2063  prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
2064  }
2065 }
2066 
/*
 * Bookkeeping record for one candidate headline fragment.
 *
 * NOTE(review): all member declarations (listing lines 2069-2074) are
 * missing from this copy.  mark_hl_fragments() accesses the members
 * startpos, endpos, poslen, curlen, in and excluded; their types cannot be
 * determined from here and must be restored from the original source.
 */
2067 typedef struct
2068 {
2075 } CoverPos;
2076 
2077 static void
2079  int *curlen, int *poslen, int max_words)
2080 {
2081  int i;
2082 
2083  /*
2084  * Objective: Generate a fragment of words between startpos and endpos
2085  * such that it has at most max_words and both ends has query words. If
2086  * the startpos and endpos are the endpoints of the cover and the cover
2087  * has fewer words than max_words, then this function should just return
2088  * the cover
2089  */
2090  /* first move startpos to an item */
2091  for (i = *startpos; i <= *endpos; i++)
2092  {
2093  *startpos = i;
2094  if (prs->words[i].item && !prs->words[i].repeated)
2095  break;
2096  }
2097  /* cut endpos to have only max_words */
2098  *curlen = 0;
2099  *poslen = 0;
2100  for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
2101  {
2102  if (!NONWORDTOKEN(prs->words[i].type))
2103  *curlen += 1;
2104  if (prs->words[i].item && !prs->words[i].repeated)
2105  *poslen += 1;
2106  }
2107  /* if the cover was cut then move back endpos to a query item */
2108  if (*endpos > i)
2109  {
2110  *endpos = i;
2111  for (i = *endpos; i >= *startpos; i--)
2112  {
2113  *endpos = i;
2114  if (prs->words[i].item && !prs->words[i].repeated)
2115  break;
2116  if (!NONWORDTOKEN(prs->words[i].type))
2117  *curlen -= 1;
2118  }
2119  }
2120 }
2121 
2122 static void
2123 mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
2124  int shortword, int min_words,
2125  int max_words, int max_fragments)
2126 {
2127  int32 poslen,
2128  curlen,
2129  i,
2130  f,
2131  num_f = 0;
2132  int32 stretch,
2133  maxstretch,
2134  posmarker;
2135 
2136  int32 startpos = 0,
2137  endpos = 0,
2138  p = 0,
2139  q = 0;
2140 
2141  int32 numcovers = 0,
2142  maxcovers = 32;
2143 
2144  int32 minI,
2145  minwords,
2146  maxitems;
2147  CoverPos *covers;
2148 
2149  covers = palloc(maxcovers * sizeof(CoverPos));
2150 
2151  /* get all covers */
2152  while (hlCover(prs, query, &p, &q))
2153  {
2154  startpos = p;
2155  endpos = q;
2156 
2157  /*
2158  * Break the cover into smaller fragments such that each fragment has
2159  * at most max_words. Also ensure that each end of the fragment is a
2160  * query word. This will allow us to stretch the fragment in either
2161  * direction
2162  */
2163 
2164  while (startpos <= endpos)
2165  {
2166  get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
2167  if (numcovers >= maxcovers)
2168  {
2169  maxcovers *= 2;
2170  covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
2171  }
2172  covers[numcovers].startpos = startpos;
2173  covers[numcovers].endpos = endpos;
2174  covers[numcovers].curlen = curlen;
2175  covers[numcovers].poslen = poslen;
2176  covers[numcovers].in = 0;
2177  covers[numcovers].excluded = 0;
2178  numcovers++;
2179  startpos = endpos + 1;
2180  endpos = q;
2181  }
2182  /* move p to generate the next cover */
2183  p++;
2184  }
2185 
2186  /* choose best covers */
2187  for (f = 0; f < max_fragments; f++)
2188  {
2189  maxitems = 0;
2190  minwords = PG_INT32_MAX;
2191  minI = -1;
2192 
2193  /*
2194  * Choose the cover that contains max items. In case of tie choose the
2195  * one with smaller number of words.
2196  */
2197  for (i = 0; i < numcovers; i++)
2198  {
2199  if (!covers[i].in && !covers[i].excluded &&
2200  (maxitems < covers[i].poslen || (maxitems == covers[i].poslen
2201  && minwords > covers[i].curlen)))
2202  {
2203  maxitems = covers[i].poslen;
2204  minwords = covers[i].curlen;
2205  minI = i;
2206  }
2207  }
2208  /* if a cover was found mark it */
2209  if (minI >= 0)
2210  {
2211  covers[minI].in = 1;
2212  /* adjust the size of cover */
2213  startpos = covers[minI].startpos;
2214  endpos = covers[minI].endpos;
2215  curlen = covers[minI].curlen;
2216  /* stretch the cover if cover size is lower than max_words */
2217  if (curlen < max_words)
2218  {
2219  /* divide the stretch on both sides of cover */
2220  maxstretch = (max_words - curlen) / 2;
2221 
2222  /*
2223  * first stretch the startpos stop stretching if 1. we hit the
2224  * beginning of document 2. exceed maxstretch 3. we hit an
2225  * already marked fragment
2226  */
2227  stretch = 0;
2228  posmarker = startpos;
2229  for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
2230  {
2231  if (!NONWORDTOKEN(prs->words[i].type))
2232  {
2233  curlen++;
2234  stretch++;
2235  }
2236  posmarker = i;
2237  }
2238  /* cut back startpos till we find a non short token */
2239  for (i = posmarker; i < startpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i++)
2240  {
2241  if (!NONWORDTOKEN(prs->words[i].type))
2242  curlen--;
2243  }
2244  startpos = i;
2245  /* now stretch the endpos as much as possible */
2246  posmarker = endpos;
2247  for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
2248  {
2249  if (!NONWORDTOKEN(prs->words[i].type))
2250  curlen++;
2251  posmarker = i;
2252  }
2253  /* cut back endpos till we find a non-short token */
2254  for (i = posmarker; i > endpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i--)
2255  {
2256  if (!NONWORDTOKEN(prs->words[i].type))
2257  curlen--;
2258  }
2259  endpos = i;
2260  }
2261  covers[minI].startpos = startpos;
2262  covers[minI].endpos = endpos;
2263  covers[minI].curlen = curlen;
2264  /* Mark the chosen fragments (covers) */
2265  mark_fragment(prs, highlight, startpos, endpos);
2266  num_f++;
2267  /* exclude overlapping covers */
2268  for (i = 0; i < numcovers; i++)
2269  {
2270  if (i != minI && ((covers[i].startpos >= covers[minI].startpos && covers[i].startpos <= covers[minI].endpos) || (covers[i].endpos >= covers[minI].startpos && covers[i].endpos <= covers[minI].endpos)))
2271  covers[i].excluded = 1;
2272  }
2273  }
2274  else
2275  break;
2276  }
2277 
2278  /* show at least min_words we have not marked anything */
2279  if (num_f <= 0)
2280  {
2281  startpos = endpos = curlen = 0;
2282  for (i = 0; i < prs->curwords && curlen < min_words; i++)
2283  {
2284  if (!NONWORDTOKEN(prs->words[i].type))
2285  curlen++;
2286  endpos = i;
2287  }
2288  mark_fragment(prs, highlight, startpos, endpos);
2289  }
2290  pfree(covers);
2291 }
2292 
/*
 * mark_hl_words
 *
 * Pick a single contiguous range [bestb, beste] of prs->words and set the
 * per-word output flags (selected, replace, skip, in) on every entry in it.
 *
 * highlight != 0: the range is simply the whole array, 0 .. curwords - 1.
 *
 * highlight == 0: every cover [p, q] that hlCover() reports for the query is
 * turned into a candidate range, which is lengthened or shortened using
 * min_words and max_words as bounds on its count of word tokens, and the
 * best candidate is kept.  If hlCover() never reports a cover, the range
 * starts at word 0 and runs until min_words word tokens have been counted
 * or the array ends.
 *
 * "Word tokens" are entries whose type is not NONWORDTOKEN.  An entry whose
 * type is NOENDTOKEN, or whose len is <= shortword, is treated as a poor
 * place to end a range: the end-seeking loops step past such entries and
 * the candidate comparison prefers ranges that do not end on one.
 *
 * Nothing is returned; the result is the set of flags written to prs->words.
 */
2293 static void
2294 mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
2295  int shortword, int min_words, int max_words)
2296 {
2297  int p = 0,
2298  q = 0;
2299  int bestb = -1,
2300  beste = -1;
 /* bestlen < 0 means no candidate has been stored yet */
2301  int bestlen = -1;
2302  int pose = 0,
2303  posb,
2304  poslen,
2305  curlen;
2306 
2307  int i;
2308 
2309  if (highlight == 0)
2310  {
 /*
  * p and q are handed to hlCover() by address; each true result is
  * treated below as a cover spanning words p .. q.
  */
2311  while (hlCover(prs, query, &p, &q))
2312  {
2313  /* find cover len in words */
2314  curlen = 0;
2315  poslen = 0;
 /*
  * curlen counts word tokens; poslen counts entries that carry a query
  * item and are not flagged repeated.  The scan stops at q or as soon
  * as max_words is reached; pose follows the last entry visited.
  */
2316  for (i = p; i <= q && curlen < max_words; i++)
2317  {
2318  if (!NONWORDTOKEN(prs->words[i].type))
2319  curlen++;
2320  if (prs->words[i].item && !prs->words[i].repeated)
2321  poslen++;
2322  pose = i;
2323  }
2324 
 /*
  * The stored candidate has more query items and ends on an acceptable
  * entry, so this cover is not considered further.  poslen is never
  * negative, so this test cannot pass (and beste is not used as an
  * index) while bestlen is still -1.
  */
2325  if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
2326  {
2327  /* best already found, so try one more cover */
2328  p++;
2329  continue;
2330  }
2331 
2332  posb = p;
2333  if (curlen < max_words)
2334  { /* find good end */
 /*
  * The cover holds fewer than max_words word tokens, so extend it to
  * the right.  The scan above ended with i just past q (assuming
  * p <= q -- hlCover() is not visible here, TODO confirm), so i - 1
  * restarts at q; the i != q test keeps q from being counted twice.
  * Entries that make a poor end are stepped over; otherwise the scan
  * stops once min_words is reached.
  */
2335  for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
2336  {
2337  if (i != q)
2338  {
2339  if (!NONWORDTOKEN(prs->words[i].type))
2340  curlen++;
2341  if (prs->words[i].item && !prs->words[i].repeated)
2342  poslen++;
2343  }
2344  pose = i;
2345  if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
2346  continue;
2347  if (curlen >= min_words)
2348  break;
2349  }
2350  if (curlen < min_words && i >= prs->curwords)
2351  { /* got end of text and our cover is shorter
2352  * than min_words */
 /*
  * Walk left from p - 1, counting the same way, until max_words is
  * reached or min_words is reached on an entry that is not a poor
  * end.  posb becomes the new start, clamped to 0 if the walk ran
  * off the front of the array.
  */
2353  for (i = p - 1; i >= 0; i--)
2354  {
2355  if (!NONWORDTOKEN(prs->words[i].type))
2356  curlen++;
2357  if (prs->words[i].item && !prs->words[i].repeated)
2358  poslen++;
2359  if (curlen >= max_words)
2360  break;
2361  if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
2362  continue;
2363  if (curlen >= min_words)
2364  break;
2365  }
2366  posb = (i >= 0) ? i : 0;
2367  }
2368  }
2369  else
2370  { /* shorter cover :((( */
 /*
  * The first scan already reached max_words.  Step back from i
  * (clamped to q) while more than min_words word tokens remain: each
  * visited entry is subtracted from the counts and becomes pose, and
  * the walk ends at the first entry that is not a poor end.
  */
2371  if (i > q)
2372  i = q;
2373  for (; curlen > min_words; i--)
2374  {
2375  if (!NONWORDTOKEN(prs->words[i].type))
2376  curlen--;
2377  if (prs->words[i].item && !prs->words[i].repeated)
2378  poslen--;
2379  pose = i;
2380  if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
2381  continue;
2382  break;
2383  }
2384  }
2385 
 /*
  * Store this candidate if it is the first one, or if it has more query
  * items than the stored one and ends on an acceptable entry, or if it
  * ends on an acceptable entry while the stored one does not.
  */
2386  if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
2387  (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
2388  (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
2389  {
2390  bestb = posb;
2391  beste = pose;
2392  bestlen = poslen;
2393  }
2394 
 /* move the start position on before asking hlCover() again */
2395  p++;
2396  }
2397 
 /*
  * No candidate was stored: take words from the start of the array until
  * min_words word tokens have been counted or the array ends.
  */
2398  if (bestlen < 0)
2399  {
2400  curlen = 0;
2401  for (i = 0; i < prs->curwords && curlen < min_words; i++)
2402  {
2403  if (!NONWORDTOKEN(prs->words[i].type))
2404  curlen++;
2405  pose = i;
2406  }
2407  bestb = 0;
2408  beste = pose;
2409  }
2410  }
2411  else
2412  {
 /* highlight != 0: the range is every word in the array */
2413  bestb = 0;
2414  beste = prs->curwords - 1;
2415  }
2416 
 /*
  * Flag every entry in the chosen range.  Entries carrying a query item
  * are marked selected.  With highlight == 0, HLIDREPLACE types get
  * replace and HLIDSKIP types get skip; otherwise only XMLHLIDSKIP types
  * get skip.  "in" is 0 for entries flagged repeated and 1 for the rest.
  */
2417  for (i = bestb; i <= beste; i++)
2418  {
2419  if (prs->words[i].item)
2420  prs->words[i].selected = 1;
2421  if (highlight == 0)
2422  {
2423  if (HLIDREPLACE(prs->words[i].type))
2424  prs->words[i].replace = 1;
2425  else if (HLIDSKIP(prs->words[i].type))
2426  prs->words[i].skip = 1;
2427  }
2428  else
2429  {
2430  if (XMLHLIDSKIP(prs->words[i].type))
2431  prs->words[i].skip = 1;
2432  }
2433 
2434  prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
2435  }
2436 
2437 }
2438 
2439 Datum
2441 {
2443  List *prsoptions = (List *) PG_GETARG_POINTER(1);
2444  TSQuery query = PG_GETARG_TSQUERY(2);
2445 
2446  /* from opt + start and end tag */
2447  int min_words = 15;
2448  int max_words = 35;
2449  int shortword = 3;
2450  int max_fragments = 0;
2451  int highlight = 0;
2452  ListCell *l;
2453 
2454  /* config */
2455  prs->startsel = NULL;
2456  prs->stopsel = NULL;
2457  foreach(l, prsoptions)
2458  {
2459  DefElem *defel = (DefElem *) lfirst(l);
2460  char *val = defGetString(defel);
2461 
2462  if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
2463  max_words = pg_atoi(val, sizeof(int32), 0);
2464  else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
2465  min_words = pg_atoi(val, sizeof(int32), 0);
2466  else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
2467  shortword = pg_atoi(val, sizeof(int32), 0);
2468  else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
2469  max_fragments = pg_atoi(val, sizeof(int32), 0);
2470  else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
2471  prs->startsel = pstrdup(val);
2472  else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
2473  prs->stopsel = pstrdup(val);
2474  else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
2475  prs->fragdelim = pstrdup(val);
2476  else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
2477  highlight = (pg_strcasecmp(val, "1") == 0 ||
2478  pg_strcasecmp(val, "on") == 0 ||
2479  pg_strcasecmp(val, "true") == 0 ||
2480  pg_strcasecmp(val, "t") == 0 ||
2481  pg_strcasecmp(val, "y") == 0 ||
2482  pg_strcasecmp(val, "yes") == 0);
2483  else
2484  ereport(ERROR,
2485  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2486  errmsg("unrecognized headline parameter: \"%s\"",
2487  defel->defname)));
2488  }
2489 
2490  if (highlight == 0)
2491  {
2492  if (min_words >= max_words)
2493  ereport(ERROR,
2494  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2495  errmsg("MinWords should be less than MaxWords")));
2496  if (min_words <= 0)
2497  ereport(ERROR,
2498  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2499  errmsg("MinWords should be positive")));
2500  if (shortword < 0)
2501  ereport(ERROR,
2502  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2503  errmsg("ShortWord should be >= 0")));
2504  if (max_fragments < 0)
2505  ereport(ERROR,
2506  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2507  errmsg("MaxFragments should be >= 0")));
2508  }
2509 
2510  if (max_fragments == 0)
2511  /* call the default headline generator */
2512  mark_hl_words(prs, query, highlight, shortword, min_words, max_words);
2513  else
2514  mark_hl_fragments(prs, query, highlight, shortword, min_words, max_words, max_fragments);
2515 
2516  if (!prs->startsel)
2517  prs->startsel = pstrdup("<b>");
2518  if (!prs->stopsel)
2519  prs->stopsel = pstrdup("</b>");
2520  if (!prs->fragdelim)
2521  prs->fragdelim = pstrdup(" ... ");
2522  prs->startsellen = strlen(prs->startsel);
2523  prs->stopsellen = strlen(prs->stopsel);
2524  prs->fragdelimlen = strlen(prs->fragdelim);
2525 
2526  PG_RETURN_POINTER(prs);
2527 }
uint16 WordEntryPos
Definition: ts_type.h:63
signed short int16
Definition: c.h:245
bool TS_execute(QueryItem *curitem, void *arg, uint32 flags, TSExecuteCallback chkcond)
Definition: tsvector_op.c:1815
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:321
static bool TParserGet(TParser *prs)
Definition: wparser_def.c:1704
static int p_isstophost(TParser *prs)
Definition: wparser_def.c:618
static const TParserStateActionItem actionTPS_InPortFirst[]
Definition: wparser_def.c:1353
#define A_POP
Definition: wparser_def.c:220
#define PG_GETARG_INT32(n)
Definition: fmgr.h:234
static const TParserStateActionItem actionTPS_InWord[]
Definition: wparser_def.c:1011
static const TParserStateActionItem actionTPS_InParseHyphenHyphen[]
Definition: wparser_def.c:1558
static const TParserStateActionItem actionTPS_InXMLEntityFirst[]
Definition: wparser_def.c:1140
static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[]
Definition: wparser_def.c:1526
static const TParserStateActionItem actionTPS_InHyphenAsciiWord[]
Definition: wparser_def.c:1500
bool wanthost
Definition: wparser_def.c:252
Datum prsd_headline(PG_FUNCTION_ARGS)
Definition: wparser_def.c:2440
struct TParser TParser
static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[]
Definition: wparser_def.c:1160
TParserState state
Definition: wparser_def.c:234
static const TParserStateActionItem actionTPS_InHostDomain[]
Definition: wparser_def.c:1338
static void TParserCopyClose(TParser *prs)
Definition: wparser_def.c:403
static int p_isascii(TParser *prs)
Definition: wparser_def.c:499
void print(const void *obj)
Definition: print.c:35
TParserState
Definition: wparser_def.c:117
#define NUMHWORD
Definition: wparser_def.c:48
static const TParserStateActionItem actionTPS_InUnsignedInt[]
Definition: wparser_def.c:1020
static void SpecialTags(TParser *prs)
Definition: wparser_def.c:570
static const TParserStateActionItem actionTPS_InNumWord[]
Definition: wparser_def.c:982
static const TParserStateActionItem actionTPS_InEmail[]
Definition: wparser_def.c:1385
char * alias
Definition: ts_public.h:28
static const TParserStateActionItem actionTPS_InHostFirstDomain[]
Definition: wparser_def.c:1320
#define XMLENTITY
Definition: wparser_def.c:56
static const TParserStateActionItem actionTPS_InDecimalFirst[]
Definition: wparser_def.c:1079
#define HLIDSKIP(x)
Definition: wparser_def.c:1924
Datum lower(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:43
int32 curlen
Definition: wparser_def.c:2072
#define UNSIGNEDINT
Definition: wparser_def.c:55
#define p_iswhat(type, nonascii)
Definition: wparser_def.c:430
static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[]
Definition: wparser_def.c:1492
Datum prsd_lextype(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1870
#define URLPATH
Definition: wparser_def.c:51
static const TParserStateActionItem actionTPS_InSignedIntFirst[]
Definition: wparser_def.c:1037
bool allocated
Definition: ts_utils.h:136
struct TParserPosition * prev
Definition: wparser_def.c:235
#define PG_GETARG_TSQUERY(n)
Definition: ts_type.h:238
char * pstrdup(const char *in)
Definition: mcxt.c:1076
static const TParserStateActionItem actionTPS_InURLPathStart[]
Definition: wparser_def.c:1460
#define NONWORDTOKEN(x)
Definition: wparser_def.c:1926
static int p_isignore(TParser *prs)
Definition: wparser_def.c:629
static const TParserStateActionItem actionTPS_InCommentEnd[]
Definition: wparser_def.c:1316
static void get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos, int *curlen, int *poslen, int max_words)
Definition: wparser_def.c:2078
bool usewide
Definition: wparser_def.c:246
static const TParserStateActionItem actionTPS_InTagBackSleshed[]
Definition: wparser_def.c:1273
#define PG_RETURN_INT32(x)
Definition: fmgr.h:314
static const TParserStateActionItem actionTPS_InCommentFirst[]
Definition: wparser_def.c:1282
int errcode(int sqlerrcode)
Definition: elog.c:575
QueryOperand * item
Definition: ts_public.h:47
static const TParserStateActionItem actionTPS_InXMLEntityEnd[]
Definition: wparser_def.c:1188
#define NOENDTOKEN(x)
Definition: wparser_def.c:1927
static int p_isEOF(TParser *prs)
Definition: wparser_def.c:480
#define QI_VAL
Definition: ts_type.h:134
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:241
static const TParserStateActionItem actionTPS_InComment[]
Definition: wparser_def.c:1297
static const TParserStateActionItem actionTPS_InCommentLast[]
Definition: wparser_def.c:1291
char * str
Definition: wparser_def.c:242
static const TParserStateActionItem actionTPS_InTag[]
Definition: wparser_def.c:1237
static const char *const lex_descr[]
Definition: wparser_def.c:87
#define A_CLRALL
Definition: wparser_def.c:225
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
Datum upper(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:74
#define lengthof(array)
Definition: c.h:556
#define VERSIONNUMBER
Definition: wparser_def.c:41
unsigned int Oid
Definition: postgres_ext.h:31
static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[]
Definition: wparser_def.c:1589
static void mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight, int shortword, int min_words, int max_words)
Definition: wparser_def.c:2294
static const TParserStateActionItem actionTPS_InHyphenWordFirst[]
Definition: wparser_def.c:1510
static void TParserClose(TParser *prs)
Definition: wparser_def.c:378
static void mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight, int shortword, int min_words, int max_words, int max_fragments)
Definition: wparser_def.c:2123
int32 startpos
Definition: wparser_def.c:2069
TParserState state
Definition: wparser_def.c:1604
#define GETQUERY(x)
Definition: _int.h:142
WordEntryPos pos
Definition: ts_public.h:45
signed int int32
Definition: c.h:246
static const char *const tok_alias[]
Definition: wparser_def.c:60
Datum prsd_end(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1910
void(* TParserSpecial)(struct TParser *)
Definition: wparser_def.c:204
static const TParserStateActionItem actionTPS_InHostFirstAN[]
Definition: wparser_def.c:1367
static const TParserStateActionItem actionTPS_InXMLEntityHexNum[]
Definition: wparser_def.c:1181
#define ASCIIWORD
Definition: wparser_def.c:34
Datum prsd_start(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1888
static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[]
Definition: wparser_def.c:1573
#define PROTOCOL
Definition: wparser_def.c:47
int pg_strncasecmp(const char *s1, const char *s2, size_t n)
Definition: pgstrcasecmp.c:69
#define TAG_T
Definition: wparser_def.c:46
static void SpecialHyphen(TParser *prs)
Definition: wparser_def.c:602
static const TParserStateActionItem actionTPS_InURLPath[]
Definition: wparser_def.c:1464
unsigned short uint16
Definition: c.h:257
void pfree(void *pointer)
Definition: mcxt.c:949
#define NUMPARTHWORD
Definition: wparser_def.c:42
static int p_isurlchar(TParser *prs)
Definition: wparser_def.c:511
static const TParserStateActionItem actionTPS_InAsciiWord[]
Definition: wparser_def.c:993
TParserSpecial special
Definition: wparser_def.c:214
TParserPosition * state
Definition: wparser_def.c:250
#define ERROR
Definition: elog.h:43
static const TParserStateActionItem actionTPS_InPathFirst[]
Definition: wparser_def.c:1410
const TParserStateActionItem * pushedAtAction
Definition: wparser_def.c:236
char * defGetString(DefElem *def)
Definition: define.c:49
static void mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
Definition: wparser_def.c:2042
#define SPACE
Definition: wparser_def.c:45
int32 endpos
Definition: wparser_def.c:2070
static XLogRecPtr endpos
Definition: pg_receivewal.c:45
const TParserStateActionItem * action
Definition: wparser_def.c:1603
static const TParserStateActionItem actionTPS_InXMLEntityNum[]
Definition: wparser_def.c:1174
#define HLIDREPLACE(x)
Definition: wparser_def.c:1923
static TParser * TParserCopyInit(const TParser *orig)
Definition: wparser_def.c:351
static const TParserStateActionItem actionTPS_InUDecimalFirst[]
Definition: wparser_def.c:1064
int16 in
Definition: wparser_def.c:2073
char * c
struct TParserPosition TParserPosition
HeadlineWordEntry * words
Definition: ts_public.h:52
#define A_BINGO
Definition: wparser_def.c:219
#define DEFAULT_COLLATION_OID
Definition: pg_collation.h:75
int pg_database_encoding_max_length(void)
Definition: wchar.c:1833
#define NUMWORD
Definition: wparser_def.c:36
#define XMLHLIDSKIP(x)
Definition: wparser_def.c:1925
WordEntryPos * pos
Definition: ts_utils.h:138
int lenbytetoken
Definition: wparser_def.c:259
static const TParserStateActionItem actionTPS_InHost[]
Definition: wparser_def.c:1374
static const TParserStateActionItem actionTPS_Base[]
Definition: wparser_def.c:965
static const TParserStateActionItem actionTPS_InUDecimal[]
Definition: wparser_def.c:1070
void _make_compiler_happy(void)
Definition: wparser_def.c:543
static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[]
Definition: wparser_def.c:1168
#define LASTNUM
Definition: wparser_def.c:58
#define ereport(elevel, rest)
Definition: elog.h:122
int type
Definition: wparser_def.c:261
static const TParserStateActionItem actionTPS_InTagEscapeKK[]
Definition: wparser_def.c:1266
unsigned int pg_wchar
Definition: mbprint.c:31
char * token
Definition: wparser_def.c:258
#define A_CLEAR
Definition: wparser_def.c:223
static const TParserStateActionItem actionTPS_InDecimal[]
Definition: wparser_def.c:1085
size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, pg_locale_t locale)
Definition: pg_locale.c:1677
static int p_isspecial(TParser *prs)
Definition: wparser_def.c:686
static const TParserStateActionItem actionTPS_InParseHyphen[]
Definition: wparser_def.c:1549
char * descr
Definition: ts_public.h:29
#define PARTHWORD
Definition: wparser_def.c:43
#define WORD_T
Definition: wparser_def.c:35
static const TParserStateActionItem actionTPS_InFURL[]
Definition: wparser_def.c:1470
int lexid
Definition: ts_public.h:27
static const TParserStateActionItem actionTPS_InFile[]
Definition: wparser_def.c:1435
static const TParserStateActionItem actionTPS_InPort[]
Definition: wparser_def.c:1359
int lenchartoken
Definition: wparser_def.c:260
QueryItemType type
Definition: ts_type.h:195
static const TParserStateActionItem actionTPS_InFileTwiddle[]
Definition: wparser_def.c:1401
static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[]
Definition: wparser_def.c:1541
static const TParserStateActionItem actionTPS_InHostDomainSecond[]
Definition: wparser_def.c:1327
void * palloc0(Size size)
Definition: mcxt.c:877
static const TParserStateActionItem actionTPS_InMantissaFirst[]
Definition: wparser_def.c:1120
static int p_isasclet(TParser *prs)
Definition: wparser_def.c:505
uintptr_t Datum
Definition: postgres.h:372
int16 excluded
Definition: wparser_def.c:2074
int GetDatabaseEncoding(void)
Definition: mbutils.c:1004
static const TParserStateActionItem actionTPS_InSignedInt[]
Definition: wparser_def.c:1043
static const TParserStateActionItem actionTPS_InHyphenWordPart[]
Definition: wparser_def.c:1565
int32 poslen
Definition: wparser_def.c:2071
static const TParserStateActionItem actionTPS_InVerVersion[]
Definition: wparser_def.c:1094
static const TParserStateAction Actions[]
Definition: wparser_def.c:1622
#define URL_T
Definition: wparser_def.c:38
#define TPARSERSTATEACTION(state)
Definition: wparser_def.c:1614
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
Definition: mbutils.c:723
static bool hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
Definition: wparser_def.c:1971
char c
Definition: wparser_def.c:255
int pg_dsplen(const char *mbstr)
Definition: mbutils.c:767
#define PG_RETURN_VOID()
Definition: fmgr.h:309
static TParser * TParserInit(char *str, int len)
Definition: wparser_def.c:287
#define ASCIIPARTHWORD
Definition: wparser_def.c:44
static int p_ishost(TParser *prs)
Definition: wparser_def.c:635
TParserCharTest isclass
Definition: wparser_def.c:209
static const TParserStateActionItem actionTPS_InTagFirst[]
Definition: wparser_def.c:1192
#define Assert(condition)
Definition: c.h:681
#define lfirst(lc)
Definition: pg_list.h:106
static const TParserStateActionItem actionTPS_InFileNext[]
Definition: wparser_def.c:1446
static const TParserStateActionItem actionTPS_InVersion[]
Definition: wparser_def.c:1113
#define FILEPATH
Definition: wparser_def.c:52
static const TParserStateActionItem actionTPS_InPathFirstFirst[]
Definition: wparser_def.c:1420
static const TParserStateActionItem actionTPS_InMantissaSign[]
Definition: wparser_def.c:1128
pg_wchar * pgwstr
Definition: wparser_def.c:245
static const TParserStateActionItem actionTPS_InTagEscapeK[]
Definition: wparser_def.c:1259
static int p_isURLPath(TParser *prs)
Definition: wparser_def.c:657
bool ignore
Definition: wparser_def.c:251
int lenstr
Definition: wparser_def.c:243
static XLogRecPtr startpos
#define A_MERGE
Definition: wparser_def.c:224
int pg_mblen(const char *mbstr)
Definition: mbutils.c:760
static void SpecialVerVersion(TParser *prs)
Definition: wparser_def.c:609
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:962
int charmaxlen
Definition: wparser_def.c:249
static const TParserStateActionItem actionTPS_InHyphenNumWord[]
Definition: wparser_def.c:1533
static const TParserStateActionItem actionTPS_InTagName[]
Definition: wparser_def.c:1217
static const TParserStateActionItem actionTPS_InMantissa[]
Definition: wparser_def.c:1134
wchar_t * wstr
Definition: wparser_def.c:244
static TParserPosition * newTParserPosition(TParserPosition *prev)
Definition: wparser_def.c:270
static int p_iseqC(TParser *prs)
Definition: wparser_def.c:487
int(* TParserCharTest)(struct TParser *)
Definition: wparser_def.c:202
void * palloc(Size size)
Definition: mcxt.c:848
int errmsg(const char *fmt,...)
Definition: elog.c:797
#define SCIENTIFIC
Definition: wparser_def.c:40
static const TParserStateActionItem actionTPS_InProtocolFirst[]
Definition: wparser_def.c:1476
#define PG_INT32_MAX
Definition: c.h:325
static const TParserStateActionItem actionTPS_InCloseCommentFirst[]
Definition: wparser_def.c:1303
static int p_isneC(TParser *prs)
Definition: wparser_def.c:493
int32 size
Definition: ts_type.h:208
static void SpecialFURL(TParser *prs)
Definition: wparser_def.c:594
int i
static const TParserStateActionItem actionTPS_InSVerVersion[]
Definition: wparser_def.c:1100
static const TParserStateActionItem actionTPS_InTagCloseFirst[]
Definition: wparser_def.c:1211
#define PG_FUNCTION_ARGS
Definition: fmgr.h:158
char * defname
Definition: parsenodes.h:719
static const TParserStateActionItem actionTPS_InHyphenWord[]
Definition: wparser_def.c:1517
static const TParserStateActionItem actionTPS_InSpace[]
Definition: wparser_def.c:1052
#define A_PUSH
Definition: wparser_def.c:221
bool lc_ctype_is_c(Oid collation)
Definition: pg_locale.c:1178
#define SIGNEDINT
Definition: wparser_def.c:54
#define A_NEXT
Definition: wparser_def.c:218
QueryOperand qoperand
Definition: ts_type.h:197
#define A_RERUN
Definition: wparser_def.c:222
#define EMAIL
Definition: wparser_def.c:37
static const TParserStateActionItem actionTPS_InTagEnd[]
Definition: wparser_def.c:1278
#define DECIMAL_T
Definition: wparser_def.c:53
static const TParserStateActionItem actionTPS_InHyphenNumWordPart[]
Definition: wparser_def.c:1582
static const TParserStateActionItem actionTPS_InProtocolEnd[]
Definition: wparser_def.c:1488
Definition: pg_list.h:45
static const TParserStateActionItem actionTPS_InProtocolSecond[]
Definition: wparser_def.c:1482
TParserState tostate
Definition: wparser_def.c:212
#define HWORD
Definition: wparser_def.c:50
#define TS_EXEC_EMPTY
Definition: ts_utils.h:159
long val
Definition: informix.c:689
#define HOST
Definition: wparser_def.c:39
static const TParserStateActionItem actionTPS_InVersionFirst[]
Definition: wparser_def.c:1107
#define ASCIIHWORD
Definition: wparser_def.c:49
static const TParserStateActionItem actionTPS_InXMLBegin[]
Definition: wparser_def.c:1203
static bool checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
Definition: wparser_def.c:1936
int32 pg_atoi(const char *s, int size, int c)
Definition: numutils.c:37
static const TParserStateActionItem actionTPS_InCloseCommentLast[]
Definition: wparser_def.c:1309
static const TParserStateActionItem actionTPS_InPathSecond[]
Definition: wparser_def.c:1427
Datum prsd_nexttoken(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1894
static const TParserStateActionItem actionTPS_InFileFirst[]
Definition: wparser_def.c:1391
HeadlineWordEntry * words
Definition: wparser_def.c:1931
static const TParserStateActionItem actionTPS_InURLPathFirst[]
Definition: wparser_def.c:1454
static const TParserStateActionItem actionTPS_InTagBeginEnd[]
Definition: wparser_def.c:1231
static const TParserStateActionItem actionTPS_InXMLEntity[]
Definition: wparser_def.c:1149