PostgreSQL Source Code  git master
wparser_def.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * wparser_def.c
4  * Default text search parser
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  * src/backend/tsearch/wparser_def.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include <limits.h>
18 
19 #include "catalog/pg_collation.h"
20 #include "commands/defrem.h"
21 #include "tsearch/ts_locale.h"
22 #include "tsearch/ts_public.h"
23 #include "tsearch/ts_type.h"
24 #include "tsearch/ts_utils.h"
25 #include "utils/builtins.h"
26 
27 
28 /* Define me to enable tracing of parser behavior */
29 /* #define WPARSER_TRACE */
30 
31 
/*
 * Output token categories.
 *
 * tok_alias[] and lex_descr[] list their entries in this same order, with
 * slot 0 unused, so all three must be kept in step when a category is added.
 * LASTNUM is the highest category number.
 */

#define ASCIIWORD		1
#define WORD_T			2
#define NUMWORD			3
#define EMAIL			4
#define URL_T			5
#define HOST			6
#define SCIENTIFIC		7
#define VERSIONNUMBER	8
#define NUMPARTHWORD	9
#define PARTHWORD		10
#define ASCIIPARTHWORD	11
#define SPACE			12
#define TAG_T			13
#define PROTOCOL		14
#define NUMHWORD		15
#define ASCIIHWORD		16
#define HWORD			17
#define URLPATH			18
#define FILEPATH		19
#define DECIMAL_T		20
#define SIGNEDINT		21
#define UNSIGNEDINT		22
#define XMLENTITY		23

#define LASTNUM			23
59 
/*
 * Short alias of each token category, listed in token-category order.
 * Slot 0 is an unused placeholder because category numbers start at 1.
 */
static const char *const tok_alias[] = {
	"",
	"asciiword",
	"word",
	"numword",
	"email",
	"url",
	"host",
	"sfloat",
	"version",
	"hword_numpart",
	"hword_part",
	"hword_asciipart",
	"blank",
	"tag",
	"protocol",
	"numhword",
	"asciihword",
	"hword",
	"url_path",
	"file",
	"float",
	"int",
	"uint",
	"entity"
};
86 
/*
 * Human-readable description of each token category, listed in
 * token-category order.  Slot 0 is an unused placeholder because category
 * numbers start at 1.
 */
static const char *const lex_descr[] = {
	"",
	"Word, all ASCII",
	"Word, all letters",
	"Word, letters and digits",
	"Email address",
	"URL",
	"Host",
	"Scientific notation",
	"Version number",
	"Hyphenated word part, letters and digits",
	"Hyphenated word part, all letters",
	"Hyphenated word part, all ASCII",
	"Space symbols",
	"XML tag",
	"Protocol head",
	"Hyphenated word, letters and digits",
	"Hyphenated word, all ASCII",
	"Hyphenated word, all letters",
	"URL path",
	"File or path name",
	"Decimal notation",
	"Signed integer",
	"Unsigned integer",
	"XML entity"
};
113 
114 
115 /* Parser states */
116 
117 typedef enum
118 {
119  TPS_Base = 0,
196  TPS_Null /* last state (fake value) */
197 } TParserState;
198 
199 /* forward declaration */
200 struct TParser;
201 
202 typedef int (*TParserCharTest) (struct TParser *); /* any p_is* functions
203  * except p_iseq */
204 typedef void (*TParserSpecial) (struct TParser *); /* special handler for
205  * special cases... */
206 
207 typedef struct
208 {
210  char c;
213  int type;
216 
217 /* Flag bits in TParserStateActionItem.flags */
218 #define A_NEXT 0x0000
219 #define A_BINGO 0x0001
220 #define A_POP 0x0002
221 #define A_PUSH 0x0004
222 #define A_RERUN 0x0008
223 #define A_CLEAR 0x0010
224 #define A_MERGE 0x0020
225 #define A_CLRALL 0x0040
226 
227 typedef struct TParserPosition
228 {
229  int posbyte; /* position of parser in bytes */
230  int poschar; /* position of parser in characters */
231  int charlen; /* length of current char */
232  int lenbytetoken; /* length of token-so-far in bytes */
233  int lenchartoken; /* and in chars */
238 
239 typedef struct TParser
240 {
241  /* string and position information */
242  char *str; /* multibyte string */
243  int lenstr; /* length of mbstring */
244  wchar_t *wstr; /* wide character string */
245  pg_wchar *pgwstr; /* wide character string for C-locale */
246  bool usewide;
247 
248  /* State of parse */
251  bool ignore;
252  bool wanthost;
253 
254  /* silly char */
255  char c;
256 
257  /* out */
258  char *token;
261  int type;
262 } TParser;
263 
264 
265 /* forward decls here */
266 static bool TParserGet(TParser *prs);
267 
268 
269 static TParserPosition *
271 {
273 
274  if (prev)
275  memcpy(res, prev, sizeof(TParserPosition));
276  else
277  memset(res, 0, sizeof(TParserPosition));
278 
279  res->prev = prev;
280 
281  res->pushedAtAction = NULL;
282 
283  return res;
284 }
285 
286 static TParser *
287 TParserInit(char *str, int len)
288 {
289  TParser *prs = (TParser *) palloc0(sizeof(TParser));
290 
292  prs->str = str;
293  prs->lenstr = len;
294 
295  /*
296  * Use wide char code only when max encoding length > 1.
297  */
298  if (prs->charmaxlen > 1)
299  {
300  Oid collation = DEFAULT_COLLATION_OID; /* TODO */
301  pg_locale_t mylocale = 0; /* TODO */
302 
303  prs->usewide = true;
304  if (lc_ctype_is_c(collation))
305  {
306  /*
307  * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
308  * be different from sizeof(wchar_t)
309  */
310  prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
311  pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
312  }
313  else
314  {
315  prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
316  char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
317  mylocale);
318  }
319  }
320  else
321  prs->usewide = false;
322 
323  prs->state = newTParserPosition(NULL);
324  prs->state->state = TPS_Base;
325 
326 #ifdef WPARSER_TRACE
327 
328  /*
329  * Use of %.*s here is a bit risky since it can misbehave if the data is
330  * not in what libc thinks is the prevailing encoding. However, since
331  * this is just a debugging aid, we choose to live with that.
332  */
333  fprintf(stderr, "parsing \"%.*s\"\n", len, str);
334 #endif
335 
336  return prs;
337 }
338 
339 /*
340  * As an alternative to a full TParserInit one can create a
341  * TParserCopy which basically is a regular TParser without a private
342  * copy of the string - instead it uses the one from another TParser.
343  * This is useful because at some places TParsers are created
344  * recursively and the repeated copying around of the strings can
345  * cause major inefficiency if the source string is long.
346  * The new parser starts parsing at the original's current position.
347  *
348  * Obviously one must not close the original TParser before the copy.
349  */
350 static TParser *
352 {
353  TParser *prs = (TParser *) palloc0(sizeof(TParser));
354 
355  prs->charmaxlen = orig->charmaxlen;
356  prs->str = orig->str + orig->state->posbyte;
357  prs->lenstr = orig->lenstr - orig->state->posbyte;
358  prs->usewide = orig->usewide;
359 
360  if (orig->pgwstr)
361  prs->pgwstr = orig->pgwstr + orig->state->poschar;
362  if (orig->wstr)
363  prs->wstr = orig->wstr + orig->state->poschar;
364 
365  prs->state = newTParserPosition(NULL);
366  prs->state->state = TPS_Base;
367 
368 #ifdef WPARSER_TRACE
369  /* See note above about %.*s */
370  fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
371 #endif
372 
373  return prs;
374 }
375 
376 
377 static void
379 {
380  while (prs->state)
381  {
382  TParserPosition *ptr = prs->state->prev;
383 
384  pfree(prs->state);
385  prs->state = ptr;
386  }
387 
388  if (prs->wstr)
389  pfree(prs->wstr);
390  if (prs->pgwstr)
391  pfree(prs->pgwstr);
392 
393 #ifdef WPARSER_TRACE
394  fprintf(stderr, "closing parser\n");
395 #endif
396  pfree(prs);
397 }
398 
399 /*
400  * Close a parser created with TParserCopyInit
401  */
402 static void
404 {
405  while (prs->state)
406  {
407  TParserPosition *ptr = prs->state->prev;
408 
409  pfree(prs->state);
410  prs->state = ptr;
411  }
412 
413 #ifdef WPARSER_TRACE
414  fprintf(stderr, "closing parser copy\n");
415 #endif
416  pfree(prs);
417 }
418 
419 
420 /*
421  * Character-type support functions, equivalent to is* macros, but
422  * working with any possible encodings and locales. Notes:
423  * - with multibyte encoding and C-locale isw* function may fail
424  * or give wrong result.
425  * - multibyte encoding and C-locale often are used for
426  * Asian languages.
427  * - if locale is C then we use pgwstr instead of wstr.
428  */
429 
430 #define p_iswhat(type, nonascii) \
431  \
432 static int \
433 p_is##type(TParser *prs) \
434 { \
435  Assert(prs->state); \
436  if (prs->usewide) \
437  { \
438  if (prs->pgwstr) \
439  { \
440  unsigned int c = *(prs->pgwstr + prs->state->poschar); \
441  if (c > 0x7f) \
442  return nonascii; \
443  return is##type(c); \
444  } \
445  return isw##type(*(prs->wstr + prs->state->poschar)); \
446  } \
447  return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \
448 } \
449  \
450 static int \
451 p_isnot##type(TParser *prs) \
452 { \
453  return !p_is##type(prs); \
454 }
455 
456 /*
457  * In C locale with a multibyte encoding, any non-ASCII symbol is considered
458  * an alpha character, but not a member of other char classes.
459  */
460 p_iswhat(alnum, 1)
461 p_iswhat(alpha, 1)
462 p_iswhat(digit, 0)
463 p_iswhat(lower, 0)
464 p_iswhat(print, 0)
465 p_iswhat(punct, 0)
466 p_iswhat(space, 0)
467 p_iswhat(upper, 0)
468 p_iswhat(xdigit, 0)
469 
470 /* p_iseq should be used only for ascii symbols */
471 
472 static int
473 p_iseq(TParser *prs, char c)
474 {
475  Assert(prs->state);
476  return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
477 }
478 
479 static int
481 {
482  Assert(prs->state);
483  return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
484 }
485 
486 static int
488 {
489  return p_iseq(prs, prs->c);
490 }
491 
492 static int
494 {
495  return !p_iseq(prs, prs->c);
496 }
497 
498 static int
500 {
501  return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
502 }
503 
504 static int
506 {
507  return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
508 }
509 
510 static int
512 {
513  char ch;
514 
515  /* no non-ASCII need apply */
516  if (prs->state->charlen != 1)
517  return 0;
518  ch = *(prs->str + prs->state->posbyte);
519  /* no spaces or control characters */
520  if (ch <= 0x20 || ch >= 0x7F)
521  return 0;
522  /* reject characters disallowed by RFC 3986 */
523  switch (ch)
524  {
525  case '"':
526  case '<':
527  case '>':
528  case '\\':
529  case '^':
530  case '`':
531  case '{':
532  case '|':
533  case '}':
534  return 0;
535  }
536  return 1;
537 }
538 
539 
540 /* deliberately suppress unused-function complaints for the above */
541 void _make_compiler_happy(void);
542 void
544 {
545  p_isalnum(NULL);
546  p_isnotalnum(NULL);
547  p_isalpha(NULL);
548  p_isnotalpha(NULL);
549  p_isdigit(NULL);
550  p_isnotdigit(NULL);
551  p_islower(NULL);
552  p_isnotlower(NULL);
553  p_isprint(NULL);
554  p_isnotprint(NULL);
555  p_ispunct(NULL);
556  p_isnotpunct(NULL);
557  p_isspace(NULL);
558  p_isnotspace(NULL);
559  p_isupper(NULL);
560  p_isnotupper(NULL);
561  p_isxdigit(NULL);
562  p_isnotxdigit(NULL);
563  p_isEOF(NULL);
564  p_iseqC(NULL);
565  p_isneC(NULL);
566 }
567 
568 
569 static void
571 {
572  switch (prs->state->lenchartoken)
573  {
574  case 8: /* </script */
575  if (pg_strncasecmp(prs->token, "</script", 8) == 0)
576  prs->ignore = false;
577  break;
578  case 7: /* <script || </style */
579  if (pg_strncasecmp(prs->token, "</style", 7) == 0)
580  prs->ignore = false;
581  else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
582  prs->ignore = true;
583  break;
584  case 6: /* <style */
585  if (pg_strncasecmp(prs->token, "<style", 6) == 0)
586  prs->ignore = true;
587  break;
588  default:
589  break;
590  }
591 }
592 
593 static void
595 {
596  prs->wanthost = true;
597  prs->state->posbyte -= prs->state->lenbytetoken;
598  prs->state->poschar -= prs->state->lenchartoken;
599 }
600 
601 static void
603 {
604  prs->state->posbyte -= prs->state->lenbytetoken;
605  prs->state->poschar -= prs->state->lenchartoken;
606 }
607 
608 static void
610 {
611  prs->state->posbyte -= prs->state->lenbytetoken;
612  prs->state->poschar -= prs->state->lenchartoken;
613  prs->state->lenbytetoken = 0;
614  prs->state->lenchartoken = 0;
615 }
616 
617 static int
619 {
620  if (prs->wanthost)
621  {
622  prs->wanthost = false;
623  return 1;
624  }
625  return 0;
626 }
627 
628 static int
630 {
631  return (prs->ignore) ? 1 : 0;
632 }
633 
634 static int
636 {
637  TParser *tmpprs = TParserCopyInit(prs);
638  int res = 0;
639 
640  tmpprs->wanthost = true;
641 
642  if (TParserGet(tmpprs) && tmpprs->type == HOST)
643  {
644  prs->state->posbyte += tmpprs->lenbytetoken;
645  prs->state->poschar += tmpprs->lenchartoken;
646  prs->state->lenbytetoken += tmpprs->lenbytetoken;
647  prs->state->lenchartoken += tmpprs->lenchartoken;
648  prs->state->charlen = tmpprs->state->charlen;
649  res = 1;
650  }
651  TParserCopyClose(tmpprs);
652 
653  return res;
654 }
655 
656 static int
658 {
659  TParser *tmpprs = TParserCopyInit(prs);
660  int res = 0;
661 
662  tmpprs->state = newTParserPosition(tmpprs->state);
663  tmpprs->state->state = TPS_InURLPathFirst;
664 
665  if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
666  {
667  prs->state->posbyte += tmpprs->lenbytetoken;
668  prs->state->poschar += tmpprs->lenchartoken;
669  prs->state->lenbytetoken += tmpprs->lenbytetoken;
670  prs->state->lenchartoken += tmpprs->lenchartoken;
671  prs->state->charlen = tmpprs->state->charlen;
672  res = 1;
673  }
674  TParserCopyClose(tmpprs);
675 
676  return res;
677 }
678 
679 /*
680  * returns true if current character has zero display length or
681  * it's a special sign in several languages. Such characters
682  * aren't a word-breaker although they aren't an isalpha.
683  * In beginning of word they aren't a part of it.
684  */
685 static int
687 {
688  /*
689  * pg_dsplen could return -1 which means error or control character
690  */
691  if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
692  return 1;
693 
694  /*
695  * Unicode Characters in the 'Mark, Spacing Combining' Category That
696  * characters are not alpha although they are not breakers of word too.
697  * Check that only in utf encoding, because other encodings aren't
698  * supported by postgres or even exists.
699  */
700  if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
701  {
702  static const pg_wchar strange_letter[] = {
703  /*
704  * use binary search, so elements should be ordered
705  */
706  0x0903, /* DEVANAGARI SIGN VISARGA */
707  0x093E, /* DEVANAGARI VOWEL SIGN AA */
708  0x093F, /* DEVANAGARI VOWEL SIGN I */
709  0x0940, /* DEVANAGARI VOWEL SIGN II */
710  0x0949, /* DEVANAGARI VOWEL SIGN CANDRA O */
711  0x094A, /* DEVANAGARI VOWEL SIGN SHORT O */
712  0x094B, /* DEVANAGARI VOWEL SIGN O */
713  0x094C, /* DEVANAGARI VOWEL SIGN AU */
714  0x0982, /* BENGALI SIGN ANUSVARA */
715  0x0983, /* BENGALI SIGN VISARGA */
716  0x09BE, /* BENGALI VOWEL SIGN AA */
717  0x09BF, /* BENGALI VOWEL SIGN I */
718  0x09C0, /* BENGALI VOWEL SIGN II */
719  0x09C7, /* BENGALI VOWEL SIGN E */
720  0x09C8, /* BENGALI VOWEL SIGN AI */
721  0x09CB, /* BENGALI VOWEL SIGN O */
722  0x09CC, /* BENGALI VOWEL SIGN AU */
723  0x09D7, /* BENGALI AU LENGTH MARK */
724  0x0A03, /* GURMUKHI SIGN VISARGA */
725  0x0A3E, /* GURMUKHI VOWEL SIGN AA */
726  0x0A3F, /* GURMUKHI VOWEL SIGN I */
727  0x0A40, /* GURMUKHI VOWEL SIGN II */
728  0x0A83, /* GUJARATI SIGN VISARGA */
729  0x0ABE, /* GUJARATI VOWEL SIGN AA */
730  0x0ABF, /* GUJARATI VOWEL SIGN I */
731  0x0AC0, /* GUJARATI VOWEL SIGN II */
732  0x0AC9, /* GUJARATI VOWEL SIGN CANDRA O */
733  0x0ACB, /* GUJARATI VOWEL SIGN O */
734  0x0ACC, /* GUJARATI VOWEL SIGN AU */
735  0x0B02, /* ORIYA SIGN ANUSVARA */
736  0x0B03, /* ORIYA SIGN VISARGA */
737  0x0B3E, /* ORIYA VOWEL SIGN AA */
738  0x0B40, /* ORIYA VOWEL SIGN II */
739  0x0B47, /* ORIYA VOWEL SIGN E */
740  0x0B48, /* ORIYA VOWEL SIGN AI */
741  0x0B4B, /* ORIYA VOWEL SIGN O */
742  0x0B4C, /* ORIYA VOWEL SIGN AU */
743  0x0B57, /* ORIYA AU LENGTH MARK */
744  0x0BBE, /* TAMIL VOWEL SIGN AA */
745  0x0BBF, /* TAMIL VOWEL SIGN I */
746  0x0BC1, /* TAMIL VOWEL SIGN U */
747  0x0BC2, /* TAMIL VOWEL SIGN UU */
748  0x0BC6, /* TAMIL VOWEL SIGN E */
749  0x0BC7, /* TAMIL VOWEL SIGN EE */
750  0x0BC8, /* TAMIL VOWEL SIGN AI */
751  0x0BCA, /* TAMIL VOWEL SIGN O */
752  0x0BCB, /* TAMIL VOWEL SIGN OO */
753  0x0BCC, /* TAMIL VOWEL SIGN AU */
754  0x0BD7, /* TAMIL AU LENGTH MARK */
755  0x0C01, /* TELUGU SIGN CANDRABINDU */
756  0x0C02, /* TELUGU SIGN ANUSVARA */
757  0x0C03, /* TELUGU SIGN VISARGA */
758  0x0C41, /* TELUGU VOWEL SIGN U */
759  0x0C42, /* TELUGU VOWEL SIGN UU */
760  0x0C43, /* TELUGU VOWEL SIGN VOCALIC R */
761  0x0C44, /* TELUGU VOWEL SIGN VOCALIC RR */
762  0x0C82, /* KANNADA SIGN ANUSVARA */
763  0x0C83, /* KANNADA SIGN VISARGA */
764  0x0CBE, /* KANNADA VOWEL SIGN AA */
765  0x0CC0, /* KANNADA VOWEL SIGN II */
766  0x0CC1, /* KANNADA VOWEL SIGN U */
767  0x0CC2, /* KANNADA VOWEL SIGN UU */
768  0x0CC3, /* KANNADA VOWEL SIGN VOCALIC R */
769  0x0CC4, /* KANNADA VOWEL SIGN VOCALIC RR */
770  0x0CC7, /* KANNADA VOWEL SIGN EE */
771  0x0CC8, /* KANNADA VOWEL SIGN AI */
772  0x0CCA, /* KANNADA VOWEL SIGN O */
773  0x0CCB, /* KANNADA VOWEL SIGN OO */
774  0x0CD5, /* KANNADA LENGTH MARK */
775  0x0CD6, /* KANNADA AI LENGTH MARK */
776  0x0D02, /* MALAYALAM SIGN ANUSVARA */
777  0x0D03, /* MALAYALAM SIGN VISARGA */
778  0x0D3E, /* MALAYALAM VOWEL SIGN AA */
779  0x0D3F, /* MALAYALAM VOWEL SIGN I */
780  0x0D40, /* MALAYALAM VOWEL SIGN II */
781  0x0D46, /* MALAYALAM VOWEL SIGN E */
782  0x0D47, /* MALAYALAM VOWEL SIGN EE */
783  0x0D48, /* MALAYALAM VOWEL SIGN AI */
784  0x0D4A, /* MALAYALAM VOWEL SIGN O */
785  0x0D4B, /* MALAYALAM VOWEL SIGN OO */
786  0x0D4C, /* MALAYALAM VOWEL SIGN AU */
787  0x0D57, /* MALAYALAM AU LENGTH MARK */
788  0x0D82, /* SINHALA SIGN ANUSVARAYA */
789  0x0D83, /* SINHALA SIGN VISARGAYA */
790  0x0DCF, /* SINHALA VOWEL SIGN AELA-PILLA */
791  0x0DD0, /* SINHALA VOWEL SIGN KETTI AEDA-PILLA */
792  0x0DD1, /* SINHALA VOWEL SIGN DIGA AEDA-PILLA */
793  0x0DD8, /* SINHALA VOWEL SIGN GAETTA-PILLA */
794  0x0DD9, /* SINHALA VOWEL SIGN KOMBUVA */
795  0x0DDA, /* SINHALA VOWEL SIGN DIGA KOMBUVA */
796  0x0DDB, /* SINHALA VOWEL SIGN KOMBU DEKA */
797  0x0DDC, /* SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
798  0x0DDD, /* SINHALA VOWEL SIGN KOMBUVA HAA DIGA
799  * AELA-PILLA */
800  0x0DDE, /* SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
801  0x0DDF, /* SINHALA VOWEL SIGN GAYANUKITTA */
802  0x0DF2, /* SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
803  0x0DF3, /* SINHALA VOWEL SIGN DIGA GAYANUKITTA */
804  0x0F3E, /* TIBETAN SIGN YAR TSHES */
805  0x0F3F, /* TIBETAN SIGN MAR TSHES */
806  0x0F7F, /* TIBETAN SIGN RNAM BCAD */
807  0x102B, /* MYANMAR VOWEL SIGN TALL AA */
808  0x102C, /* MYANMAR VOWEL SIGN AA */
809  0x1031, /* MYANMAR VOWEL SIGN E */
810  0x1038, /* MYANMAR SIGN VISARGA */
811  0x103B, /* MYANMAR CONSONANT SIGN MEDIAL YA */
812  0x103C, /* MYANMAR CONSONANT SIGN MEDIAL RA */
813  0x1056, /* MYANMAR VOWEL SIGN VOCALIC R */
814  0x1057, /* MYANMAR VOWEL SIGN VOCALIC RR */
815  0x1062, /* MYANMAR VOWEL SIGN SGAW KAREN EU */
816  0x1063, /* MYANMAR TONE MARK SGAW KAREN HATHI */
817  0x1064, /* MYANMAR TONE MARK SGAW KAREN KE PHO */
818  0x1067, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
819  0x1068, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
820  0x1069, /* MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
821  0x106A, /* MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
822  0x106B, /* MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
823  0x106C, /* MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
824  0x106D, /* MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
825  0x1083, /* MYANMAR VOWEL SIGN SHAN AA */
826  0x1084, /* MYANMAR VOWEL SIGN SHAN E */
827  0x1087, /* MYANMAR SIGN SHAN TONE-2 */
828  0x1088, /* MYANMAR SIGN SHAN TONE-3 */
829  0x1089, /* MYANMAR SIGN SHAN TONE-5 */
830  0x108A, /* MYANMAR SIGN SHAN TONE-6 */
831  0x108B, /* MYANMAR SIGN SHAN COUNCIL TONE-2 */
832  0x108C, /* MYANMAR SIGN SHAN COUNCIL TONE-3 */
833  0x108F, /* MYANMAR SIGN RUMAI PALAUNG TONE-5 */
834  0x17B6, /* KHMER VOWEL SIGN AA */
835  0x17BE, /* KHMER VOWEL SIGN OE */
836  0x17BF, /* KHMER VOWEL SIGN YA */
837  0x17C0, /* KHMER VOWEL SIGN IE */
838  0x17C1, /* KHMER VOWEL SIGN E */
839  0x17C2, /* KHMER VOWEL SIGN AE */
840  0x17C3, /* KHMER VOWEL SIGN AI */
841  0x17C4, /* KHMER VOWEL SIGN OO */
842  0x17C5, /* KHMER VOWEL SIGN AU */
843  0x17C7, /* KHMER SIGN REAHMUK */
844  0x17C8, /* KHMER SIGN YUUKALEAPINTU */
845  0x1923, /* LIMBU VOWEL SIGN EE */
846  0x1924, /* LIMBU VOWEL SIGN AI */
847  0x1925, /* LIMBU VOWEL SIGN OO */
848  0x1926, /* LIMBU VOWEL SIGN AU */
849  0x1929, /* LIMBU SUBJOINED LETTER YA */
850  0x192A, /* LIMBU SUBJOINED LETTER RA */
851  0x192B, /* LIMBU SUBJOINED LETTER WA */
852  0x1930, /* LIMBU SMALL LETTER KA */
853  0x1931, /* LIMBU SMALL LETTER NGA */
854  0x1933, /* LIMBU SMALL LETTER TA */
855  0x1934, /* LIMBU SMALL LETTER NA */
856  0x1935, /* LIMBU SMALL LETTER PA */
857  0x1936, /* LIMBU SMALL LETTER MA */
858  0x1937, /* LIMBU SMALL LETTER RA */
859  0x1938, /* LIMBU SMALL LETTER LA */
860  0x19B0, /* NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
861  0x19B1, /* NEW TAI LUE VOWEL SIGN AA */
862  0x19B2, /* NEW TAI LUE VOWEL SIGN II */
863  0x19B3, /* NEW TAI LUE VOWEL SIGN U */
864  0x19B4, /* NEW TAI LUE VOWEL SIGN UU */
865  0x19B5, /* NEW TAI LUE VOWEL SIGN E */
866  0x19B6, /* NEW TAI LUE VOWEL SIGN AE */
867  0x19B7, /* NEW TAI LUE VOWEL SIGN O */
868  0x19B8, /* NEW TAI LUE VOWEL SIGN OA */
869  0x19B9, /* NEW TAI LUE VOWEL SIGN UE */
870  0x19BA, /* NEW TAI LUE VOWEL SIGN AY */
871  0x19BB, /* NEW TAI LUE VOWEL SIGN AAY */
872  0x19BC, /* NEW TAI LUE VOWEL SIGN UY */
873  0x19BD, /* NEW TAI LUE VOWEL SIGN OY */
874  0x19BE, /* NEW TAI LUE VOWEL SIGN OAY */
875  0x19BF, /* NEW TAI LUE VOWEL SIGN UEY */
876  0x19C0, /* NEW TAI LUE VOWEL SIGN IY */
877  0x19C8, /* NEW TAI LUE TONE MARK-1 */
878  0x19C9, /* NEW TAI LUE TONE MARK-2 */
879  0x1A19, /* BUGINESE VOWEL SIGN E */
880  0x1A1A, /* BUGINESE VOWEL SIGN O */
881  0x1A1B, /* BUGINESE VOWEL SIGN AE */
882  0x1B04, /* BALINESE SIGN BISAH */
883  0x1B35, /* BALINESE VOWEL SIGN TEDUNG */
884  0x1B3B, /* BALINESE VOWEL SIGN RA REPA TEDUNG */
885  0x1B3D, /* BALINESE VOWEL SIGN LA LENGA TEDUNG */
886  0x1B3E, /* BALINESE VOWEL SIGN TALING */
887  0x1B3F, /* BALINESE VOWEL SIGN TALING REPA */
888  0x1B40, /* BALINESE VOWEL SIGN TALING TEDUNG */
889  0x1B41, /* BALINESE VOWEL SIGN TALING REPA TEDUNG */
890  0x1B43, /* BALINESE VOWEL SIGN PEPET TEDUNG */
891  0x1B44, /* BALINESE ADEG ADEG */
892  0x1B82, /* SUNDANESE SIGN PANGWISAD */
893  0x1BA1, /* SUNDANESE CONSONANT SIGN PAMINGKAL */
894  0x1BA6, /* SUNDANESE VOWEL SIGN PANAELAENG */
895  0x1BA7, /* SUNDANESE VOWEL SIGN PANOLONG */
896  0x1BAA, /* SUNDANESE SIGN PAMAAEH */
897  0x1C24, /* LEPCHA SUBJOINED LETTER YA */
898  0x1C25, /* LEPCHA SUBJOINED LETTER RA */
899  0x1C26, /* LEPCHA VOWEL SIGN AA */
900  0x1C27, /* LEPCHA VOWEL SIGN I */
901  0x1C28, /* LEPCHA VOWEL SIGN O */
902  0x1C29, /* LEPCHA VOWEL SIGN OO */
903  0x1C2A, /* LEPCHA VOWEL SIGN U */
904  0x1C2B, /* LEPCHA VOWEL SIGN UU */
905  0x1C34, /* LEPCHA CONSONANT SIGN NYIN-DO */
906  0x1C35, /* LEPCHA CONSONANT SIGN KANG */
907  0xA823, /* SYLOTI NAGRI VOWEL SIGN A */
908  0xA824, /* SYLOTI NAGRI VOWEL SIGN I */
909  0xA827, /* SYLOTI NAGRI VOWEL SIGN OO */
910  0xA880, /* SAURASHTRA SIGN ANUSVARA */
911  0xA881, /* SAURASHTRA SIGN VISARGA */
912  0xA8B4, /* SAURASHTRA CONSONANT SIGN HAARU */
913  0xA8B5, /* SAURASHTRA VOWEL SIGN AA */
914  0xA8B6, /* SAURASHTRA VOWEL SIGN I */
915  0xA8B7, /* SAURASHTRA VOWEL SIGN II */
916  0xA8B8, /* SAURASHTRA VOWEL SIGN U */
917  0xA8B9, /* SAURASHTRA VOWEL SIGN UU */
918  0xA8BA, /* SAURASHTRA VOWEL SIGN VOCALIC R */
919  0xA8BB, /* SAURASHTRA VOWEL SIGN VOCALIC RR */
920  0xA8BC, /* SAURASHTRA VOWEL SIGN VOCALIC L */
921  0xA8BD, /* SAURASHTRA VOWEL SIGN VOCALIC LL */
922  0xA8BE, /* SAURASHTRA VOWEL SIGN E */
923  0xA8BF, /* SAURASHTRA VOWEL SIGN EE */
924  0xA8C0, /* SAURASHTRA VOWEL SIGN AI */
925  0xA8C1, /* SAURASHTRA VOWEL SIGN O */
926  0xA8C2, /* SAURASHTRA VOWEL SIGN OO */
927  0xA8C3, /* SAURASHTRA VOWEL SIGN AU */
928  0xA952, /* REJANG CONSONANT SIGN H */
929  0xA953, /* REJANG VIRAMA */
930  0xAA2F, /* CHAM VOWEL SIGN O */
931  0xAA30, /* CHAM VOWEL SIGN AI */
932  0xAA33, /* CHAM CONSONANT SIGN YA */
933  0xAA34, /* CHAM CONSONANT SIGN RA */
934  0xAA4D /* CHAM CONSONANT SIGN FINAL H */
935  };
936  const pg_wchar *StopLow = strange_letter,
937  *StopHigh = strange_letter + lengthof(strange_letter),
938  *StopMiddle;
939  pg_wchar c;
940 
941  if (prs->pgwstr)
942  c = *(prs->pgwstr + prs->state->poschar);
943  else
944  c = (pg_wchar) *(prs->wstr + prs->state->poschar);
945 
946  while (StopLow < StopHigh)
947  {
948  StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
949  if (*StopMiddle == c)
950  return 1;
951  else if (*StopMiddle < c)
952  StopLow = StopMiddle + 1;
953  else
954  StopHigh = StopMiddle;
955  }
956  }
957 
958  return 0;
959 }
960 
961 /*
962  * Table of state/action of parser
963  */
964 
966  {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
967  {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
968  {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
969  {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
970  {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
971  {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
972  {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
973  {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
974  {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
975  {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
976  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
977  {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
978  {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
979 };
980 
981 
983  {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
984  {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
985  {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
986  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
987  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
988  {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
989  {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
990  {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
991 };
992 
994  {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
995  {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
996  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
997  {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
998  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
999  {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1000  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1001  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1002  {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
1003  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1004  {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1005  {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1006  {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
1007  {p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
1008  {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
1009 };
1010 
1012  {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
1013  {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
1014  {p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
1015  {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1016  {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1017  {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
1018 };
1019 
1021  {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
1022  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1023  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1024  {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
1025  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1026  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1027  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1028  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1029  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1030  {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
1031  {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1032  {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1033  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1034  {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
1035 };
1036 
1038  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1039  {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
1040  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1041 };
1042 
1044  {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
1045  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1046  {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
1047  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1048  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1049  {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
1050 };
1051 
1053  {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
1054  {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
1055  {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
1056  {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
1057  {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
1058  {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
1059  {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
1060  {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
1061  {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
1062 };
1063 
1065  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1066  {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
1067  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1068 };
1069 
1071  {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1072  {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
1073  {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1074  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1075  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1076  {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1077 };
1078 
1080  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1081  {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
1082  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1083 };
1084 
1086  {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1087  {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
1088  {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
1089  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1090  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1091  {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1092 };
1093 
1095  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1096  {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
1097  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1098 };
1099 
1101  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1102  {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
1103  {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1104 };
1105 
1106 
1108  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1109  {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
1110  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1111 };
1112 
1114  {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
1115  {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
1116  {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1117  {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
1118 };
1119 
1121  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1122  {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1123  {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
1124  {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
1125  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1126 };
1127 
1129  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1130  {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1131  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1132 };
1133 
1135  {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
1136  {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
1137  {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
1138 };
1139 
1141  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1142  {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
1143  {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1144  {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1145  {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1146  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1147 };
1148 
1150  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1151  {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1152  {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1153  {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1154  {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
1155  {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
1156  {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1157  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1158 };
1159 
1161  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1162  {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1163  {p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1164  {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1165  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1166 };
1167 
1169  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1170  {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1171  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1172 };
1173 
1175  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1176  {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1177  {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1178  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1179 };
1180 
1182  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1183  {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1184  {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1185  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1186 };
1187 
1189  {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
1190 };
1191 
1193  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1194  {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
1195  {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
1196  {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
1197  {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
1198  {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
1199  {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
1200  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1201 };
1202 
1204  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1205  /* <?xml ... */
1206  /* XXX do we want states for the m and l? Right now this accepts <?xZ */
1207  {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
1208  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1209 };
1210 
1212  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1213  {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
1214  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1215 };
1216 
1218  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1219  /* <br/> case */
1220  {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
1221  {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1222  {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
1223  {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
1224  {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1225  {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1226  {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1227  {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1228  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1229 };
1230 
1232  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1233  {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
1234  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1235 };
1236 
1238  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1239  {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1240  {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
1241  {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
1242  {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1243  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1244  {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
1245  {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1246  {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1247  {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
1248  {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
1249  {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1250  {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1251  {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
1252  {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
1253  {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
1254  {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
1255  {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
1256  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1257 };
1258 
1260  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1261  {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1262  {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
1263  {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
1264 };
1265 
1267  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1268  {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1269  {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
1270  {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
1271 };
1272 
1274  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1275  {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
1276 };
1277 
1279  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1280 };
1281 
1283  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1284  {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
1285  /* <!DOCTYPE ...> */
1286  {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
1287  {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
1288  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1289 };
1290 
1292  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1293  {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
1294  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1295 };
1296 
1298  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1299  {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
1300  {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1301 };
1302 
1304  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1305  {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
1306  {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1307 };
1308 
1310  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1311  {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1312  {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
1313  {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1314 };
1315 
1317  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1318 };
1319 
1321  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1322  {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
1323  {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1324  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1325 };
1326 
1328  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1329  {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1330  {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1331  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1332  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1333  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1334  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1335  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1336 };
1337 
1339  {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1340  {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1341  {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1342  {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
1343  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1344  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1345  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1346  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1347  {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
1349  {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1350  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1351 };
1352 
1354  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1355  {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1356  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1357 };
1358 
1360  {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1361  {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1363  {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1364  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1365 };
1366 
1368  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1369  {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1370  {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1371  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1372 };
1373 
1375  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1376  {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1377  {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1378  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1379  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1380  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1381  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1382  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1383 };
1384 
1386  {p_isstophost, 0, A_POP, TPS_Null, 0, NULL},
1387  {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
1388  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1389 };
1390 
1392  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1393  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1394  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1395  {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1396  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1397  {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1398  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1399 };
1400 
1402  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1403  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1404  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1405  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1406  {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1407  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1408 };
1409 
1411  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1412  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1413  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1414  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1415  {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1416  {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1417  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1418 };
1419 
1421  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1422  {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1423  {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1424  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1425 };
1426 
1428  {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1429  {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
1430  {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1431  {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1432  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1433 };
1434 
1436  {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
1437  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1438  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1439  {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1440  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1441  {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1442  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1443  {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
1444 };
1445 
1447  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1448  {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1449  {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1450  {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1451  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1452 };
1453 
1455  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1456  {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1457  {NULL, 0, A_POP, TPS_Null, 0, NULL},
1458 };
1459 
1461  {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
1462 };
1463 
1465  {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
1466  {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1467  {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
1468 };
1469 
1471  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1473  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1474 };
1475 
1477  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1478  {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
1479  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1480 };
1481 
1483  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1484  {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1485  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1486 };
1487 
1489  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
1490 };
1491 
1493  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1494  {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1495  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1496  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1497  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1498 };
1499 
1502  {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1503  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1504  {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1505  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1506  {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1507  {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
1508 };
1509 
1511  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1512  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1513  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1514  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1515 };
1516 
1519  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1520  {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1521  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1522  {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1523  {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
1524 };
1525 
1527  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1528  {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1529  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1530  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1531 };
1532 
1535  {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1536  {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1537  {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1538  {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
1539 };
1540 
1542  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1543  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1544  {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1545  {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1546  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1547 };
1548 
1550  {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
1552  {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1553  {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
1554  {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
1555  {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
1556 };
1557 
1559  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1560  {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1562  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1563 };
1564 
1566  {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
1567  {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1568  {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1569  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1570  {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
1571 };
1572 
1574  {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
1576  {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1577  {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1578  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1579  {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
1580 };
1581 
1583  {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
1584  {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1585  {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1586  {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
1587 };
1588 
1590  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1591  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1592  {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1594  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1595 };
1596 
1597 
1598 /*
1599  * main table of per-state parser actions
1600  */
1601 typedef struct
1602 {
1603  const TParserStateActionItem *action; /* the actual state info */
1604  TParserState state; /* only for Assert crosscheck */
1605 #ifdef WPARSER_TRACE
1606  const char *state_name; /* only for debug printout */
1607 #endif
1609 
/*
 * TPARSERSTATEACTION(state) expands to the initializer for one
 * TParserStateAction entry.  The first member is formed with
 * CppConcat(action, state) (presumably token pasting, i.e. the per-state
 * table named "action" followed by the state name -- confirm against c.h),
 * the second is the state value itself.  Only when WPARSER_TRACE is defined
 * is a third member, CppAsString(state), supplied for the state_name field
 * used in the debug printout.
 */
1610 #ifdef WPARSER_TRACE
1611 #define TPARSERSTATEACTION(state) \
1612  { CppConcat(action,state), state, CppAsString(state) }
1613 #else
1614 #define TPARSERSTATEACTION(state) \
1615  { CppConcat(action,state), state }
1616 #endif
1617 
1618 /*
1619  * order must be the same as in typedef enum {} TParserState!!
1620  */
1621 
1622 static const TParserStateAction Actions[] = {
1700 };
1701 
1702 
1703 static bool
1705 {
1706  const TParserStateActionItem *item = NULL;
1707 
1708  Assert(prs->state);
1709 
1710  if (prs->state->posbyte >= prs->lenstr)
1711  return false;
1712 
1713  prs->token = prs->str + prs->state->posbyte;
1714  prs->state->pushedAtAction = NULL;
1715 
1716  /* look at string */
1717  while (prs->state->posbyte <= prs->lenstr)
1718  {
1719  if (prs->state->posbyte == prs->lenstr)
1720  prs->state->charlen = 0;
1721  else
1722  prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
1723  pg_mblen(prs->str + prs->state->posbyte);
1724 
1725  Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
1726  Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
1727  Assert(Actions[prs->state->state].state == prs->state->state);
1728 
1729  if (prs->state->pushedAtAction)
1730  {
1731  /* After a POP, pick up at the next test */
1732  item = prs->state->pushedAtAction + 1;
1733  prs->state->pushedAtAction = NULL;
1734  }
1735  else
1736  {
1737  item = Actions[prs->state->state].action;
1738  Assert(item != NULL);
1739  }
1740 
1741  /* find action by character class */
1742  while (item->isclass)
1743  {
1744  prs->c = item->c;
1745  if (item->isclass(prs) != 0)
1746  break;
1747  item++;
1748  }
1749 
1750 #ifdef WPARSER_TRACE
1751  {
1752  TParserPosition *ptr;
1753 
1754  fprintf(stderr, "state ");
1755  /* indent according to stack depth */
1756  for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
1757  fprintf(stderr, " ");
1758  fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
1759  if (prs->state->posbyte < prs->lenstr)
1760  fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
1761  else
1762  fprintf(stderr, "at EOF");
1763  fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1764  (int) (item - Actions[prs->state->state].action),
1765  (item->flags & A_BINGO) ? " BINGO" : "",
1766  (item->flags & A_POP) ? " POP" : "",
1767  (item->flags & A_PUSH) ? " PUSH" : "",
1768  (item->flags & A_RERUN) ? " RERUN" : "",
1769  (item->flags & A_CLEAR) ? " CLEAR" : "",
1770  (item->flags & A_MERGE) ? " MERGE" : "",
1771  (item->flags & A_CLRALL) ? " CLRALL" : "",
1772  (item->tostate != TPS_Null) ? " tostate " : "",
1773  (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
1774  (item->type > 0) ? " type " : "",
1775  tok_alias[item->type]);
1776  }
1777 #endif
1778 
1779  /* call special handler if exists */
1780  if (item->special)
1781  item->special(prs);
1782 
1783  /* BINGO, token is found */
1784  if (item->flags & A_BINGO)
1785  {
1786  Assert(item->type > 0);
1787  prs->lenbytetoken = prs->state->lenbytetoken;
1788  prs->lenchartoken = prs->state->lenchartoken;
1789  prs->state->lenbytetoken = prs->state->lenchartoken = 0;
1790  prs->type = item->type;
1791  }
1792 
1793  /* do various actions by flags */
1794  if (item->flags & A_POP)
1795  { /* pop stored state in stack */
1796  TParserPosition *ptr = prs->state->prev;
1797 
1798  pfree(prs->state);
1799  prs->state = ptr;
1800  Assert(prs->state);
1801  }
1802  else if (item->flags & A_PUSH)
1803  { /* push (store) state in stack */
1804  prs->state->pushedAtAction = item; /* remember where we push */
1805  prs->state = newTParserPosition(prs->state);
1806  }
1807  else if (item->flags & A_CLEAR)
1808  { /* clear previous pushed state */
1809  TParserPosition *ptr;
1810 
1811  Assert(prs->state->prev);
1812  ptr = prs->state->prev->prev;
1813  pfree(prs->state->prev);
1814  prs->state->prev = ptr;
1815  }
1816  else if (item->flags & A_CLRALL)
1817  { /* clear all previous pushed state */
1818  TParserPosition *ptr;
1819 
1820  while (prs->state->prev)
1821  {
1822  ptr = prs->state->prev->prev;
1823  pfree(prs->state->prev);
1824  prs->state->prev = ptr;
1825  }
1826  }
1827  else if (item->flags & A_MERGE)
1828  { /* merge posinfo with current and pushed state */
1829  TParserPosition *ptr = prs->state;
1830 
1831  Assert(prs->state->prev);
1832  prs->state = prs->state->prev;
1833 
1834  prs->state->posbyte = ptr->posbyte;
1835  prs->state->poschar = ptr->poschar;
1836  prs->state->charlen = ptr->charlen;
1837  prs->state->lenbytetoken = ptr->lenbytetoken;
1838  prs->state->lenchartoken = ptr->lenchartoken;
1839  pfree(ptr);
1840  }
1841 
1842  /* set new state if pointed */
1843  if (item->tostate != TPS_Null)
1844  prs->state->state = item->tostate;
1845 
1846  /* check for go away */
1847  if ((item->flags & A_BINGO) ||
1848  (prs->state->posbyte >= prs->lenstr &&
1849  (item->flags & A_RERUN) == 0))
1850  break;
1851 
1852  /* go to beginning of loop if we should rerun or we just restore state */
1853  if (item->flags & (A_RERUN | A_POP))
1854  continue;
1855 
1856  /* move forward */
1857  if (prs->state->charlen)
1858  {
1859  prs->state->posbyte += prs->state->charlen;
1860  prs->state->lenbytetoken += prs->state->charlen;
1861  prs->state->poschar++;
1862  prs->state->lenchartoken++;
1863  }
1864  }
1865 
1866  return (item && (item->flags & A_BINGO)) ? true : false;
1867 }
1868 
1869 Datum
1871 {
1872  LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
1873  int i;
1874 
1875  for (i = 1; i <= LASTNUM; i++)
1876  {
1877  descr[i - 1].lexid = i;
1878  descr[i - 1].alias = pstrdup(tok_alias[i]);
1879  descr[i - 1].descr = pstrdup(lex_descr[i]);
1880  }
1881 
1882  descr[LASTNUM].lexid = 0;
1883 
1884  PG_RETURN_POINTER(descr);
1885 }
1886 
1887 Datum
1889 {
1891 }
1892 
1893 Datum
1895 {
1896  TParser *p = (TParser *) PG_GETARG_POINTER(0);
1897  char **t = (char **) PG_GETARG_POINTER(1);
1898  int *tlen = (int *) PG_GETARG_POINTER(2);
1899 
1900  if (!TParserGet(p))
1901  PG_RETURN_INT32(0);
1902 
1903  *t = p->token;
1904  *tlen = p->lenbytetoken;
1905 
1906  PG_RETURN_INT32(p->type);
1907 }
1908 
1909 Datum
1911 {
1912  TParser *p = (TParser *) PG_GETARG_POINTER(0);
1913 
1914  TParserClose(p);
1915  PG_RETURN_VOID();
1916 }
1917 
1918 
1919 /*
1920  * ts_headline support begins here
1921  */
1922 
1923 /* token type classification macros */
1924 #define LEAVETOKEN(x) ( (x)==SPACE )
1925 #define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
1926 #define ENDPUNCTOKEN(x) ( (x)==SPACE )
1927 
/*
 * Token type sets used by the headline code below:
 *
 * TS_IDIGNORE   - TAG_T, PROTOCOL, SPACE and XMLENTITY tokens
 * HLIDREPLACE   - TAG_T tokens; mark_fragment sets their "replace" flag
 *                 when highlightall is false
 * HLIDSKIP      - URL_T, NUMHWORD, ASCIIHWORD and HWORD tokens;
 *                 mark_fragment sets their "skip" flag when highlightall
 *                 is false
 * XMLHLIDSKIP   - currently the same set as HLIDSKIP; mark_fragment uses it
 *                 to set "skip" when highlightall is true
 * NONWORDTOKEN  - SPACE plus the HLIDREPLACE and HLIDSKIP sets; such tokens
 *                 are not counted when measuring a fragment's word length
 * NOENDTOKEN    - NONWORDTOKEN plus SCIENTIFIC, VERSIONNUMBER, DECIMAL_T,
 *                 SIGNEDINT, UNSIGNEDINT and the TS_IDIGNORE set; tested by
 *                 BADENDPOINT
 */
1928 #define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
1929 #define HLIDREPLACE(x) ( (x)==TAG_T )
1930 #define HLIDSKIP(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
1931 #define XMLHLIDSKIP(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
1932 #define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )
1933 #define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
1934 
1935 /*
1936  * Macros useful in headline selection. These rely on availability of
1937  * "HeadlineParsedText *prs" describing some text, and "int shortword"
1938  * describing the "short word" length parameter.
1939  */
1940 
1941 /* Interesting words are non-repeated search terms */
1942 #define INTERESTINGWORD(j) \
1943  (prs->words[j].item && !prs->words[j].repeated)
1944 
1945 /* Don't want to end at a non-word or a short word, unless interesting */
1946 #define BADENDPOINT(j) \
1947  ((NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword) && \
1948  !INTERESTINGWORD(j))
1949 
1950 typedef struct
1951 {
1952  /* one cover (well, really one fragment) for mark_hl_fragments */
1953  int32 startpos; /* fragment's starting word index */
1954  int32 endpos; /* ending word index (inclusive) */
1955  int32 poslen; /* number of INTERESTINGWORD entries in the fragment */
1956  int32 curlen; /* number of words, not counting NONWORDTOKEN tokens */
1957  bool chosen; /* already picked as a headline fragment? */
1958  bool excluded; /* excluded from further selection? */
1959 } CoverPos;
1960 
1961 typedef struct
1962 {
1963  /* callback data for checkcondition_HL */
1965  int len;
1966 } hlCheck;
1967 
1968 
1969 /*
1970  * TS_execute callback for matching a tsquery operand to headline words
1971  */
1972 static bool
1974 {
1975  hlCheck *checkval = (hlCheck *) opaque;
1976  int i;
1977 
1978  /* scan words array for matching items */
1979  for (i = 0; i < checkval->len; i++)
1980  {
1981  if (checkval->words[i].item == val)
1982  {
1983  /* if data == NULL, don't need to report positions */
1984  if (!data)
1985  return true;
1986 
1987  if (!data->pos)
1988  {
1989  data->pos = palloc(sizeof(WordEntryPos) * checkval->len);
1990  data->allocated = true;
1991  data->npos = 1;
1992  data->pos[0] = checkval->words[i].pos;
1993  }
1994  else if (data->pos[data->npos - 1] < checkval->words[i].pos)
1995  {
1996  data->pos[data->npos++] = checkval->words[i].pos;
1997  }
1998  }
1999  }
2000 
2001  if (data && data->npos > 0)
2002  return true;
2003 
2004  return false;
2005 }
2006 
2007 /*
2008  * hlFirstIndex: find first index >= pos containing any word used in query
2009  *
2010  * Returns -1 if no such index
2011  */
2012 static int
2014 {
2015  int i;
2016 
2017  /* For each word ... */
2018  for (i = pos; i < prs->curwords; i++)
2019  {
2020  /* ... scan the query to see if this word matches any operand */
2021  QueryItem *item = GETQUERY(query);
2022  int j;
2023 
2024  for (j = 0; j < query->size; j++)
2025  {
2026  if (item->type == QI_VAL &&
2027  prs->words[i].item == &item->qoperand)
2028  return i;
2029  item++;
2030  }
2031  }
2032  return -1;
2033 }
2034 
2035 /*
2036  * hlCover: try to find a substring of prs' word list that satisfies query
2037  *
2038  * At entry, *p must be the first word index to consider (initialize this to
2039  * zero, or to the next index after a previous successful search).
2040  *
2041  * On success, sets *p to first word index and *q to last word index of the
2042  * cover substring, and returns true.
2043  *
2044  * The result is a minimal cover, in the sense that both *p and *q will be
2045  * words used in the query.
2046  */
2047 static bool
2048 hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
2049 {
2050  int pmin,
2051  pmax,
2052  nextpmin,
2053  nextpmax;
2054  hlCheck ch;
2055 
2056  /*
2057  * We look for the earliest, shortest substring of prs->words that
2058  * satisfies the query. Both the pmin and pmax indices must be words
2059  * appearing in the query; there's no point in trying endpoints in between
2060  * such points.
2061  */
2062  pmin = hlFirstIndex(prs, query, *p);
2063  while (pmin >= 0)
2064  {
2065  /* This useless assignment just keeps stupider compilers quiet */
2066  nextpmin = -1;
2067  /* Consider substrings starting at pmin */
2068  ch.words = &(prs->words[pmin]);
2069  /* Consider the length-one substring first, then longer substrings */
2070  pmax = pmin;
2071  do
2072  {
2073  /* Try to match query against pmin .. pmax substring */
2074  ch.len = pmax - pmin + 1;
2075  if (TS_execute(GETQUERY(query), &ch,
2077  {
2078  *p = pmin;
2079  *q = pmax;
2080  return true;
2081  }
2082  /* Nope, so advance pmax to next feasible endpoint */
2083  nextpmax = hlFirstIndex(prs, query, pmax + 1);
2084 
2085  /*
2086  * If this is our first advance past pmin, then the result is also
2087  * the next feasible value of pmin; remember it to save a
2088  * redundant search.
2089  */
2090  if (pmax == pmin)
2091  nextpmin = nextpmax;
2092  pmax = nextpmax;
2093  }
2094  while (pmax >= 0);
2095  /* No luck here, so try next feasible startpoint */
2096  pmin = nextpmin;
2097  }
2098  return false;
2099 }
2100 
2101 /*
2102  * Apply suitable highlight marking to words selected by headline selector
2103  *
2104  * The words from startpos to endpos inclusive are marked per highlightall
2105  */
2106 static void
2107 mark_fragment(HeadlineParsedText *prs, bool highlightall,
2108  int startpos, int endpos)
2109 {
2110  int i;
2111 
2112  for (i = startpos; i <= endpos; i++)
2113  {
2114  if (prs->words[i].item)
2115  prs->words[i].selected = 1;
2116  if (!highlightall)
2117  {
2118  if (HLIDREPLACE(prs->words[i].type))
2119  prs->words[i].replace = 1;
2120  else if (HLIDSKIP(prs->words[i].type))
2121  prs->words[i].skip = 1;
2122  }
2123  else
2124  {
2125  if (XMLHLIDSKIP(prs->words[i].type))
2126  prs->words[i].skip = 1;
2127  }
2128 
2129  prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
2130  }
2131 }
2132 
2133 /*
2134  * split a cover substring into fragments not longer than max_words
2135  *
2136  * At entry, *startpos and *endpos are the (remaining) bounds of the cover
2137  * substring. They are updated to hold the bounds of the next fragment.
2138  *
2139  * *curlen and *poslen are set to the fragment's length, in words and
2140  * interesting words respectively.
2141  */
2142 static void
2144  int *curlen, int *poslen, int max_words)
2145 {
2146  int i;
2147 
2148  /*
2149  * Objective: select a fragment of words between startpos and endpos such
2150  * that it has at most max_words and both ends have query words. If the
2151  * startpos and endpos are the endpoints of the cover and the cover has
2152  * fewer words than max_words, then this function should just return the
2153  * cover
2154  */
2155  /* first move startpos to an item */
2156  for (i = *startpos; i <= *endpos; i++)
2157  {
2158  *startpos = i;
2159  if (INTERESTINGWORD(i))
2160  break;
2161  }
2162  /* cut endpos to have only max_words */
2163  *curlen = 0;
2164  *poslen = 0;
2165  for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
2166  {
2167  if (!NONWORDTOKEN(prs->words[i].type))
2168  *curlen += 1;
2169  if (INTERESTINGWORD(i))
2170  *poslen += 1;
2171  }
2172  /* if the cover was cut then move back endpos to a query item */
2173  if (*endpos > i)
2174  {
2175  *endpos = i;
2176  for (i = *endpos; i >= *startpos; i--)
2177  {
2178  *endpos = i;
2179  if (INTERESTINGWORD(i))
2180  break;
2181  if (!NONWORDTOKEN(prs->words[i].type))
2182  *curlen -= 1;
2183  }
2184  }
2185 }
2186 
2187 /*
2188  * Headline selector used when MaxFragments > 0
2189  *
2190  * Note: in this mode, highlightall is disregarded for phrase selection;
2191  * it only controls presentation details.
2192  */
2193 static void
2194 mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
2195  int shortword, int min_words,
2196  int max_words, int max_fragments)
2197 {
2198  int32 poslen,
2199  curlen,
2200  i,
2201  f,
2202  num_f = 0;
2203  int32 stretch,
2204  maxstretch,
2205  posmarker;
2206 
2207  int32 startpos = 0,
2208  endpos = 0,
2209  p = 0,
2210  q = 0;
2211 
2212  int32 numcovers = 0,
2213  maxcovers = 32;
2214 
2215  int32 minI,
2216  minwords,
2217  maxitems;
2218  CoverPos *covers;
2219 
2220  covers = palloc(maxcovers * sizeof(CoverPos));
2221 
2222  /* get all covers */
2223  while (hlCover(prs, query, &p, &q))
2224  {
2225  startpos = p;
2226  endpos = q;
2227 
2228  /*
2229  * Break the cover into smaller fragments such that each fragment has
2230  * at most max_words. Also ensure that each end of each fragment is a
2231  * query word. This will allow us to stretch the fragment in either
2232  * direction
2233  */
2234 
2235  while (startpos <= endpos)
2236  {
2237  get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
2238  if (numcovers >= maxcovers)
2239  {
2240  maxcovers *= 2;
2241  covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
2242  }
2243  covers[numcovers].startpos = startpos;
2244  covers[numcovers].endpos = endpos;
2245  covers[numcovers].curlen = curlen;
2246  covers[numcovers].poslen = poslen;
2247  covers[numcovers].chosen = false;
2248  covers[numcovers].excluded = false;
2249  numcovers++;
2250  startpos = endpos + 1;
2251  endpos = q;
2252  }
2253 
2254  /* move p to generate the next cover */
2255  p++;
2256  }
2257 
2258  /* choose best covers */
2259  for (f = 0; f < max_fragments; f++)
2260  {
2261  maxitems = 0;
2262  minwords = PG_INT32_MAX;
2263  minI = -1;
2264 
2265  /*
2266  * Choose the cover that contains max items. In case of tie choose the
2267  * one with smaller number of words.
2268  */
2269  for (i = 0; i < numcovers; i++)
2270  {
2271  if (!covers[i].chosen && !covers[i].excluded &&
2272  (maxitems < covers[i].poslen ||
2273  (maxitems == covers[i].poslen &&
2274  minwords > covers[i].curlen)))
2275  {
2276  maxitems = covers[i].poslen;
2277  minwords = covers[i].curlen;
2278  minI = i;
2279  }
2280  }
2281  /* if a cover was found mark it */
2282  if (minI >= 0)
2283  {
2284  covers[minI].chosen = true;
2285  /* adjust the size of cover */
2286  startpos = covers[minI].startpos;
2287  endpos = covers[minI].endpos;
2288  curlen = covers[minI].curlen;
2289  /* stretch the cover if cover size is lower than max_words */
2290  if (curlen < max_words)
2291  {
2292  /* divide the stretch on both sides of cover */
2293  maxstretch = (max_words - curlen) / 2;
2294 
2295  /*
2296  * first stretch the startpos stop stretching if 1. we hit the
2297  * beginning of document 2. exceed maxstretch 3. we hit an
2298  * already marked fragment
2299  */
2300  stretch = 0;
2301  posmarker = startpos;
2302  for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
2303  {
2304  if (!NONWORDTOKEN(prs->words[i].type))
2305  {
2306  curlen++;
2307  stretch++;
2308  }
2309  posmarker = i;
2310  }
2311  /* cut back startpos till we find a good endpoint */
2312  for (i = posmarker; i < startpos && BADENDPOINT(i); i++)
2313  {
2314  if (!NONWORDTOKEN(prs->words[i].type))
2315  curlen--;
2316  }
2317  startpos = i;
2318  /* now stretch the endpos as much as possible */
2319  posmarker = endpos;
2320  for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
2321  {
2322  if (!NONWORDTOKEN(prs->words[i].type))
2323  curlen++;
2324  posmarker = i;
2325  }
2326  /* cut back endpos till we find a good endpoint */
2327  for (i = posmarker; i > endpos && BADENDPOINT(i); i--)
2328  {
2329  if (!NONWORDTOKEN(prs->words[i].type))
2330  curlen--;
2331  }
2332  endpos = i;
2333  }
2334  covers[minI].startpos = startpos;
2335  covers[minI].endpos = endpos;
2336  covers[minI].curlen = curlen;
2337  /* Mark the chosen fragments (covers) */
2338  mark_fragment(prs, highlightall, startpos, endpos);
2339  num_f++;
2340  /* Exclude covers overlapping this one from future consideration */
2341  for (i = 0; i < numcovers; i++)
2342  {
2343  if (i != minI &&
2344  ((covers[i].startpos >= startpos &&
2345  covers[i].startpos <= endpos) ||
2346  (covers[i].endpos >= startpos &&
2347  covers[i].endpos <= endpos) ||
2348  (covers[i].startpos < startpos &&
2349  covers[i].endpos > endpos)))
2350  covers[i].excluded = true;
2351  }
2352  }
2353  else
2354  break; /* no selectable covers remain */
2355  }
2356 
2357  /* show the first min_words words if we have not marked anything */
2358  if (num_f <= 0)
2359  {
2360  startpos = endpos = curlen = 0;
2361  for (i = 0; i < prs->curwords && curlen < min_words; i++)
2362  {
2363  if (!NONWORDTOKEN(prs->words[i].type))
2364  curlen++;
2365  endpos = i;
2366  }
2367  mark_fragment(prs, highlightall, startpos, endpos);
2368  }
2369 
2370  pfree(covers);
2371 }
2372 
2373 /*
2374  * Headline selector used when MaxFragments == 0
2375  */
2376 static void
2377 mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
2378  int shortword, int min_words, int max_words)
2379 {
2380  int p = 0,
2381  q = 0;
2382  int bestb = -1,
2383  beste = -1;
2384  int bestlen = -1;
2385  bool bestcover = false;
2386  int pose,
2387  posb,
2388  poslen,
2389  curlen;
2390  bool poscover;
2391  int i;
2392 
2393  if (!highlightall)
2394  {
2395  /* examine all covers, select a headline using the best one */
2396  while (hlCover(prs, query, &p, &q))
2397  {
2398  /*
2399  * Count words (curlen) and interesting words (poslen) within
2400  * cover, but stop once we reach max_words. This step doesn't
2401  * consider whether that's a good stopping point. posb and pose
2402  * are set to the start and end indexes of the possible headline.
2403  */
2404  curlen = 0;
2405  poslen = 0;
2406  posb = pose = p;
2407  for (i = p; i <= q && curlen < max_words; i++)
2408  {
2409  if (!NONWORDTOKEN(prs->words[i].type))
2410  curlen++;
2411  if (INTERESTINGWORD(i))
2412  poslen++;
2413  pose = i;
2414  }
2415 
2416  if (curlen < max_words)
2417  {
2418  /*
2419  * We have room to lengthen the headline, so search forward
2420  * until it's full or we find a good stopping point. We'll
2421  * reconsider the word at "q", then move forward.
2422  */
2423  for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
2424  {
2425  if (i > q)
2426  {
2427  if (!NONWORDTOKEN(prs->words[i].type))
2428  curlen++;
2429  if (INTERESTINGWORD(i))
2430  poslen++;
2431  }
2432  pose = i;
2433  if (BADENDPOINT(i))
2434  continue;
2435  if (curlen >= min_words)
2436  break;
2437  }
2438  if (curlen < min_words)
2439  {
2440  /*
2441  * Reached end of text and our headline is still shorter
2442  * than min_words, so try to extend it to the left.
2443  */
2444  for (i = p - 1; i >= 0; i--)
2445  {
2446  if (!NONWORDTOKEN(prs->words[i].type))
2447  curlen++;
2448  if (INTERESTINGWORD(i))
2449  poslen++;
2450  if (curlen >= max_words)
2451  break;
2452  if (BADENDPOINT(i))
2453  continue;
2454  if (curlen >= min_words)
2455  break;
2456  }
2457  posb = (i >= 0) ? i : 0;
2458  }
2459  }
2460  else
2461  {
2462  /*
2463  * Can't make headline longer, so consider making it shorter
2464  * if needed to avoid a bad endpoint.
2465  */
2466  if (i > q)
2467  i = q;
2468  for (; curlen > min_words; i--)
2469  {
2470  if (!BADENDPOINT(i))
2471  break;
2472  if (!NONWORDTOKEN(prs->words[i].type))
2473  curlen--;
2474  if (INTERESTINGWORD(i))
2475  poslen--;
2476  pose = i - 1;
2477  }
2478  }
2479 
2480  /*
2481  * Check whether the proposed headline includes the original
2482  * cover; it might not if we trimmed it due to max_words.
2483  */
2484  poscover = (posb <= p && pose >= q);
2485 
2486  /*
2487  * Adopt this headline if it's better than the last one, giving
2488  * highest priority to headlines including the cover, then to
2489  * headlines with more interesting words, then to headlines with
2490  * good stopping points. (Since bestlen is initially -1, we will
2491  * certainly adopt the first headline.)
2492  */
2493  if (poscover > bestcover ||
2494  (poscover == bestcover && poslen > bestlen) ||
2495  (poscover == bestcover && poslen == bestlen &&
2496  !BADENDPOINT(pose) && BADENDPOINT(beste)))
2497  {
2498  bestb = posb;
2499  beste = pose;
2500  bestlen = poslen;
2501  bestcover = poscover;
2502  }
2503 
2504  /* move p to generate the next cover */
2505  p++;
2506  }
2507 
2508  /*
2509  * If we found nothing acceptable, select min_words words starting at
2510  * the beginning.
2511  */
2512  if (bestlen < 0)
2513  {
2514  curlen = 0;
2515  pose = 0;
2516  for (i = 0; i < prs->curwords && curlen < min_words; i++)
2517  {
2518  if (!NONWORDTOKEN(prs->words[i].type))
2519  curlen++;
2520  pose = i;
2521  }
2522  bestb = 0;
2523  beste = pose;
2524  }
2525  }
2526  else
2527  {
2528  /* highlightall mode: headline is whole document */
2529  bestb = 0;
2530  beste = prs->curwords - 1;
2531  }
2532 
2533  mark_fragment(prs, highlightall, bestb, beste);
2534 }
2535 
2536 /*
2537  * Default parser's prsheadline function
2538  */
2539 Datum
2541 {
2543  List *prsoptions = (List *) PG_GETARG_POINTER(1);
2544  TSQuery query = PG_GETARG_TSQUERY(2);
2545 
2546  /* default option values: */
2547  int min_words = 15;
2548  int max_words = 35;
2549  int shortword = 3;
2550  int max_fragments = 0;
2551  bool highlightall = false;
2552  ListCell *l;
2553 
2554  /* Extract configuration option values */
2555  prs->startsel = NULL;
2556  prs->stopsel = NULL;
2557  prs->fragdelim = NULL;
2558  foreach(l, prsoptions)
2559  {
2560  DefElem *defel = (DefElem *) lfirst(l);
2561  char *val = defGetString(defel);
2562 
2563  if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
2564  max_words = pg_strtoint32(val);
2565  else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
2566  min_words = pg_strtoint32(val);
2567  else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
2568  shortword = pg_strtoint32(val);
2569  else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
2570  max_fragments = pg_strtoint32(val);
2571  else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
2572  prs->startsel = pstrdup(val);
2573  else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
2574  prs->stopsel = pstrdup(val);
2575  else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
2576  prs->fragdelim = pstrdup(val);
2577  else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
2578  highlightall = (pg_strcasecmp(val, "1") == 0 ||
2579  pg_strcasecmp(val, "on") == 0 ||
2580  pg_strcasecmp(val, "true") == 0 ||
2581  pg_strcasecmp(val, "t") == 0 ||
2582  pg_strcasecmp(val, "y") == 0 ||
2583  pg_strcasecmp(val, "yes") == 0);
2584  else
2585  ereport(ERROR,
2586  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2587  errmsg("unrecognized headline parameter: \"%s\"",
2588  defel->defname)));
2589  }
2590 
2591  /* in HighlightAll mode these parameters are ignored */
2592  if (!highlightall)
2593  {
2594  if (min_words >= max_words)
2595  ereport(ERROR,
2596  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2597  errmsg("MinWords should be less than MaxWords")));
2598  if (min_words <= 0)
2599  ereport(ERROR,
2600  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2601  errmsg("MinWords should be positive")));
2602  if (shortword < 0)
2603  ereport(ERROR,
2604  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2605  errmsg("ShortWord should be >= 0")));
2606  if (max_fragments < 0)
2607  ereport(ERROR,
2608  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2609  errmsg("MaxFragments should be >= 0")));
2610  }
2611 
2612  /* Apply appropriate headline selector */
2613  if (max_fragments == 0)
2614  mark_hl_words(prs, query, highlightall, shortword,
2615  min_words, max_words);
2616  else
2617  mark_hl_fragments(prs, query, highlightall, shortword,
2618  min_words, max_words, max_fragments);
2619 
2620  /* Fill in default values for string options */
2621  if (!prs->startsel)
2622  prs->startsel = pstrdup("<b>");
2623  if (!prs->stopsel)
2624  prs->stopsel = pstrdup("</b>");
2625  if (!prs->fragdelim)
2626  prs->fragdelim = pstrdup(" ... ");
2627 
2628  /* Caller will need these lengths, too */
2629  prs->startsellen = strlen(prs->startsel);
2630  prs->stopsellen = strlen(prs->stopsel);
2631  prs->fragdelimlen = strlen(prs->fragdelim);
2632 
2633  PG_RETURN_POINTER(prs);
2634 }
uint16 WordEntryPos
Definition: ts_type.h:63
bool TS_execute(QueryItem *curitem, void *arg, uint32 flags, TSExecuteCallback chkcond)
Definition: tsvector_op.c:1797
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:360
static bool TParserGet(TParser *prs)
Definition: wparser_def.c:1704
static int p_isstophost(TParser *prs)
Definition: wparser_def.c:618
static const TParserStateActionItem actionTPS_InPortFirst[]
Definition: wparser_def.c:1353
#define A_POP
Definition: wparser_def.c:220
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
static const TParserStateActionItem actionTPS_InWord[]
Definition: wparser_def.c:1011
static const TParserStateActionItem actionTPS_InParseHyphenHyphen[]
Definition: wparser_def.c:1558
static const TParserStateActionItem actionTPS_InXMLEntityFirst[]
Definition: wparser_def.c:1140
static void mark_fragment(HeadlineParsedText *prs, bool highlightall, int startpos, int endpos)
Definition: wparser_def.c:2107
static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[]
Definition: wparser_def.c:1526
static const TParserStateActionItem actionTPS_InHyphenAsciiWord[]
Definition: wparser_def.c:1500
bool wanthost
Definition: wparser_def.c:252
Datum prsd_headline(PG_FUNCTION_ARGS)
Definition: wparser_def.c:2540
struct TParser TParser
static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[]
Definition: wparser_def.c:1160
TParserState state
Definition: wparser_def.c:234
static const TParserStateActionItem actionTPS_InHostDomain[]
Definition: wparser_def.c:1338
bool chosen
Definition: wparser_def.c:1957
static void TParserCopyClose(TParser *prs)
Definition: wparser_def.c:403
static int p_isascii(TParser *prs)
Definition: wparser_def.c:499
void print(const void *obj)
Definition: print.c:36
TParserState
Definition: wparser_def.c:117
#define NUMHWORD
Definition: wparser_def.c:48
static const TParserStateActionItem actionTPS_InUnsignedInt[]
Definition: wparser_def.c:1020
static void SpecialTags(TParser *prs)
Definition: wparser_def.c:570
static const TParserStateActionItem actionTPS_InNumWord[]
Definition: wparser_def.c:982
static const TParserStateActionItem actionTPS_InEmail[]
Definition: wparser_def.c:1385
char * alias
Definition: ts_public.h:28
static const TParserStateActionItem actionTPS_InHostFirstDomain[]
Definition: wparser_def.c:1320
#define XMLENTITY
Definition: wparser_def.c:56
static const TParserStateActionItem actionTPS_InDecimalFirst[]
Definition: wparser_def.c:1079
#define HLIDSKIP(x)
Definition: wparser_def.c:1930
Datum lower(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:43
int32 curlen
Definition: wparser_def.c:1956
#define UNSIGNEDINT
Definition: wparser_def.c:55
#define p_iswhat(type, nonascii)
Definition: wparser_def.c:430
static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[]
Definition: wparser_def.c:1492
Datum prsd_lextype(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1870
#define URLPATH
Definition: wparser_def.c:51
static const TParserStateActionItem actionTPS_InSignedIntFirst[]
Definition: wparser_def.c:1037
bool allocated
Definition: ts_utils.h:152
struct TParserPosition * prev
Definition: wparser_def.c:235
#define PG_GETARG_TSQUERY(n)
Definition: ts_type.h:238
char * pstrdup(const char *in)
Definition: mcxt.c:1186
static const TParserStateActionItem actionTPS_InURLPathStart[]
Definition: wparser_def.c:1460
#define NONWORDTOKEN(x)
Definition: wparser_def.c:1932
static int p_isignore(TParser *prs)
Definition: wparser_def.c:629
static const TParserStateActionItem actionTPS_InCommentEnd[]
Definition: wparser_def.c:1316
static void get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos, int *curlen, int *poslen, int max_words)
Definition: wparser_def.c:2143
bool usewide
Definition: wparser_def.c:246
static const TParserStateActionItem actionTPS_InTagBackSleshed[]
Definition: wparser_def.c:1273
#define PG_RETURN_INT32(x)
Definition: fmgr.h:353
static const TParserStateActionItem actionTPS_InCommentFirst[]
Definition: wparser_def.c:1282
int errcode(int sqlerrcode)
Definition: elog.c:610
QueryOperand * item
Definition: ts_public.h:47
static const TParserStateActionItem actionTPS_InXMLEntityEnd[]
Definition: wparser_def.c:1188
static int p_isEOF(TParser *prs)
Definition: wparser_def.c:480
#define QI_VAL
Definition: ts_type.h:134
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276
static const TParserStateActionItem actionTPS_InComment[]
Definition: wparser_def.c:1297
static const TParserStateActionItem actionTPS_InCommentLast[]
Definition: wparser_def.c:1291
char * str
Definition: wparser_def.c:242
#define INTERESTINGWORD(j)
Definition: wparser_def.c:1942
static const TParserStateActionItem actionTPS_InTag[]
Definition: wparser_def.c:1237
static const char *const lex_descr[]
Definition: wparser_def.c:87
#define A_CLRALL
Definition: wparser_def.c:225
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
Datum upper(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:74
#define lengthof(array)
Definition: c.h:668
#define VERSIONNUMBER
Definition: wparser_def.c:41
unsigned int Oid
Definition: postgres_ext.h:31
static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[]
Definition: wparser_def.c:1589
#define fprintf
Definition: port.h:197
static const TParserStateActionItem actionTPS_InHyphenWordFirst[]
Definition: wparser_def.c:1510
static void TParserClose(TParser *prs)
Definition: wparser_def.c:378
int32 startpos
Definition: wparser_def.c:1953
TParserState state
Definition: wparser_def.c:1604
#define GETQUERY(x)
Definition: _int.h:157
WordEntryPos pos
Definition: ts_public.h:45
signed int int32
Definition: c.h:355
static const char *const tok_alias[]
Definition: wparser_def.c:60
Datum prsd_end(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1910
static const TParserStateActionItem actionTPS_InHostFirstAN[]
Definition: wparser_def.c:1367
static const TParserStateActionItem actionTPS_InXMLEntityHexNum[]
Definition: wparser_def.c:1181
#define ASCIIWORD
Definition: wparser_def.c:34
Datum prsd_start(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1888
static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[]
Definition: wparser_def.c:1573
#define PROTOCOL
Definition: wparser_def.c:47
int pg_strncasecmp(const char *s1, const char *s2, size_t n)
Definition: pgstrcasecmp.c:69
#define true
Definition: c.h:321
#define TAG_T
Definition: wparser_def.c:46
static void SpecialHyphen(TParser *prs)
Definition: wparser_def.c:602
static const TParserStateActionItem actionTPS_InURLPath[]
Definition: wparser_def.c:1464
unsigned short uint16
Definition: c.h:366
void pfree(void *pointer)
Definition: mcxt.c:1056
#define NUMPARTHWORD
Definition: wparser_def.c:42
static int p_isurlchar(TParser *prs)
Definition: wparser_def.c:511
static const TParserStateActionItem actionTPS_InAsciiWord[]
Definition: wparser_def.c:993
TParserSpecial special
Definition: wparser_def.c:214
TParserPosition * state
Definition: wparser_def.c:250
#define ERROR
Definition: elog.h:43
static const TParserStateActionItem actionTPS_InPathFirst[]
Definition: wparser_def.c:1410
const TParserStateActionItem * pushedAtAction
Definition: wparser_def.c:236
char * defGetString(DefElem *def)
Definition: define.c:49
#define SPACE
Definition: wparser_def.c:45
int32 endpos
Definition: wparser_def.c:1954
static XLogRecPtr endpos
Definition: pg_receivewal.c:46
const TParserStateActionItem * action
Definition: wparser_def.c:1603
static const TParserStateActionItem actionTPS_InXMLEntityNum[]
Definition: wparser_def.c:1174
#define HLIDREPLACE(x)
Definition: wparser_def.c:1929
static TParser * TParserCopyInit(const TParser *orig)
Definition: wparser_def.c:351
static const TParserStateActionItem actionTPS_InUDecimalFirst[]
Definition: wparser_def.c:1064
char * c
struct TParserPosition TParserPosition
HeadlineWordEntry * words
Definition: ts_public.h:52
#define A_BINGO
Definition: wparser_def.c:219
#define NUMWORD
Definition: wparser_def.c:36
#define XMLHLIDSKIP(x)
Definition: wparser_def.c:1931
WordEntryPos * pos
Definition: ts_utils.h:154
int lenbytetoken
Definition: wparser_def.c:259
static const TParserStateActionItem actionTPS_InHost[]
Definition: wparser_def.c:1374
void(* TParserSpecial)(struct TParser *)
Definition: wparser_def.c:204
static const TParserStateActionItem actionTPS_Base[]
Definition: wparser_def.c:965
static const TParserStateActionItem actionTPS_InUDecimal[]
Definition: wparser_def.c:1070
void _make_compiler_happy(void)
Definition: wparser_def.c:543
static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[]
Definition: wparser_def.c:1168
#define LASTNUM
Definition: wparser_def.c:58
int type
Definition: wparser_def.c:261
static const TParserStateActionItem actionTPS_InTagEscapeKK[]
Definition: wparser_def.c:1266
unsigned int pg_wchar
Definition: mbprint.c:31
char * token
Definition: wparser_def.c:258
#define A_CLEAR
Definition: wparser_def.c:223
static const TParserStateActionItem actionTPS_InDecimal[]
Definition: wparser_def.c:1085
size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, pg_locale_t locale)
Definition: pg_locale.c:2061
static int p_isspecial(TParser *prs)
Definition: wparser_def.c:686
static const TParserStateActionItem actionTPS_InParseHyphen[]
Definition: wparser_def.c:1549
char * descr
Definition: ts_public.h:29
#define PARTHWORD
Definition: wparser_def.c:43
#define WORD_T
Definition: wparser_def.c:35
static const TParserStateActionItem actionTPS_InFURL[]
Definition: wparser_def.c:1470
int lexid
Definition: ts_public.h:27
static const TParserStateActionItem actionTPS_InFile[]
Definition: wparser_def.c:1435
static const TParserStateActionItem actionTPS_InPort[]
Definition: wparser_def.c:1359
int lenchartoken
Definition: wparser_def.c:260
QueryItemType type
Definition: ts_type.h:195
static const TParserStateActionItem actionTPS_InFileTwiddle[]
Definition: wparser_def.c:1401
static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[]
Definition: wparser_def.c:1541
static const TParserStateActionItem actionTPS_InHostDomainSecond[]
Definition: wparser_def.c:1327
void * palloc0(Size size)
Definition: mcxt.c:980
static const TParserStateActionItem actionTPS_InMantissaFirst[]
Definition: wparser_def.c:1120
static int p_isasclet(TParser *prs)
Definition: wparser_def.c:505
uintptr_t Datum
Definition: postgres.h:367
static int hlFirstIndex(HeadlineParsedText *prs, TSQuery query, int pos)
Definition: wparser_def.c:2013
int GetDatabaseEncoding(void)
Definition: mbutils.c:1151
static const TParserStateActionItem actionTPS_InSignedInt[]
Definition: wparser_def.c:1043
static const TParserStateActionItem actionTPS_InHyphenWordPart[]
Definition: wparser_def.c:1565
int32 poslen
Definition: wparser_def.c:1955
static const TParserStateActionItem actionTPS_InVerVersion[]
Definition: wparser_def.c:1094
static const TParserStateAction Actions[]
Definition: wparser_def.c:1622
#define URL_T
Definition: wparser_def.c:38
#define TPARSERSTATEACTION(state)
Definition: wparser_def.c:1614
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
Definition: mbutils.c:870
static bool hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
Definition: wparser_def.c:2048
char c
Definition: wparser_def.c:255
#define ereport(elevel,...)
Definition: elog.h:144
int pg_dsplen(const char *mbstr)
Definition: mbutils.c:914
#define PG_RETURN_VOID()
Definition: fmgr.h:348
static TParser * TParserInit(char *str, int len)
Definition: wparser_def.c:287
#define ASCIIPARTHWORD
Definition: wparser_def.c:44
static int p_ishost(TParser *prs)
Definition: wparser_def.c:635
TParserCharTest isclass
Definition: wparser_def.c:209
static const TParserStateActionItem actionTPS_InTagFirst[]
Definition: wparser_def.c:1192
#define Assert(condition)
Definition: c.h:738
#define lfirst(lc)
Definition: pg_list.h:190
static const TParserStateActionItem actionTPS_InFileNext[]
Definition: wparser_def.c:1446
static const TParserStateActionItem actionTPS_InVersion[]
Definition: wparser_def.c:1113
#define FILEPATH
Definition: wparser_def.c:52
static const TParserStateActionItem actionTPS_InPathFirstFirst[]
Definition: wparser_def.c:1420
static const TParserStateActionItem actionTPS_InMantissaSign[]
Definition: wparser_def.c:1128
pg_wchar * pgwstr
Definition: wparser_def.c:245
static const TParserStateActionItem actionTPS_InTagEscapeK[]
Definition: wparser_def.c:1259
static int p_isURLPath(TParser *prs)
Definition: wparser_def.c:657
bool ignore
Definition: wparser_def.c:251
int(* TParserCharTest)(struct TParser *)
Definition: wparser_def.c:202
int lenstr
Definition: wparser_def.c:243
static XLogRecPtr startpos
#define A_MERGE
Definition: wparser_def.c:224
int pg_mblen(const char *mbstr)
Definition: mbutils.c:907
static void SpecialVerVersion(TParser *prs)
Definition: wparser_def.c:609
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1069
int charmaxlen
Definition: wparser_def.c:249
static const TParserStateActionItem actionTPS_InHyphenNumWord[]
Definition: wparser_def.c:1533
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1436
int32 pg_strtoint32(const char *s)
Definition: numutils.c:263
static const TParserStateActionItem actionTPS_InTagName[]
Definition: wparser_def.c:1217
static const TParserStateActionItem actionTPS_InMantissa[]
Definition: wparser_def.c:1134
static void mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall, int shortword, int min_words, int max_words, int max_fragments)
Definition: wparser_def.c:2194
wchar_t * wstr
Definition: wparser_def.c:244
static TParserPosition * newTParserPosition(TParserPosition *prev)
Definition: wparser_def.c:270
static int p_iseqC(TParser *prs)
Definition: wparser_def.c:487
#define BADENDPOINT(j)
Definition: wparser_def.c:1946
void * palloc(Size size)
Definition: mcxt.c:949
int errmsg(const char *fmt,...)
Definition: elog.c:824
#define SCIENTIFIC
Definition: wparser_def.c:40
static const TParserStateActionItem actionTPS_InProtocolFirst[]
Definition: wparser_def.c:1476
#define PG_INT32_MAX
Definition: c.h:450
static const TParserStateActionItem actionTPS_InCloseCommentFirst[]
Definition: wparser_def.c:1303
static int p_isneC(TParser *prs)
Definition: wparser_def.c:493
int32 size
Definition: ts_type.h:208
static void SpecialFURL(TParser *prs)
Definition: wparser_def.c:594
int i
static const TParserStateActionItem actionTPS_InSVerVersion[]
Definition: wparser_def.c:1100
bool excluded
Definition: wparser_def.c:1958
static const TParserStateActionItem actionTPS_InTagCloseFirst[]
Definition: wparser_def.c:1211
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
char * defname
Definition: parsenodes.h:732
static const TParserStateActionItem actionTPS_InHyphenWord[]
Definition: wparser_def.c:1517
static const TParserStateActionItem actionTPS_InSpace[]
Definition: wparser_def.c:1052
#define A_PUSH
Definition: wparser_def.c:221
bool lc_ctype_is_c(Oid collation)
Definition: pg_locale.c:1406
#define SIGNEDINT
Definition: wparser_def.c:54
#define A_NEXT
Definition: wparser_def.c:218
QueryOperand qoperand
Definition: ts_type.h:197
#define A_RERUN
Definition: wparser_def.c:222
#define EMAIL
Definition: wparser_def.c:37
static const TParserStateActionItem actionTPS_InTagEnd[]
Definition: wparser_def.c:1278
#define DECIMAL_T
Definition: wparser_def.c:53
static const TParserStateActionItem actionTPS_InHyphenNumWordPart[]
Definition: wparser_def.c:1582
static const TParserStateActionItem actionTPS_InProtocolEnd[]
Definition: wparser_def.c:1488
Definition: pg_list.h:50
static const TParserStateActionItem actionTPS_InProtocolSecond[]
Definition: wparser_def.c:1482
TParserState tostate
Definition: wparser_def.c:212
#define HWORD
Definition: wparser_def.c:50
#define TS_EXEC_EMPTY
Definition: ts_utils.h:175
long val
Definition: informix.c:664
#define HOST
Definition: wparser_def.c:39
static const TParserStateActionItem actionTPS_InVersionFirst[]
Definition: wparser_def.c:1107
#define ASCIIHWORD
Definition: wparser_def.c:49
static const TParserStateActionItem actionTPS_InXMLBegin[]
Definition: wparser_def.c:1203
static bool checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
Definition: wparser_def.c:1973
static const TParserStateActionItem actionTPS_InCloseCommentLast[]
Definition: wparser_def.c:1309
static const TParserStateActionItem actionTPS_InPathSecond[]
Definition: wparser_def.c:1427
Datum prsd_nexttoken(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1894
static const TParserStateActionItem actionTPS_InFileFirst[]
Definition: wparser_def.c:1391
static void mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall, int shortword, int min_words, int max_words)
Definition: wparser_def.c:2377
HeadlineWordEntry * words
Definition: wparser_def.c:1964
static const TParserStateActionItem actionTPS_InURLPathFirst[]
Definition: wparser_def.c:1454
static const TParserStateActionItem actionTPS_InTagBeginEnd[]
Definition: wparser_def.c:1231
static const TParserStateActionItem actionTPS_InXMLEntity[]
Definition: wparser_def.c:1149