PostgreSQL Source Code  git master
wparser_def.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * wparser_def.c
4  * Default text search parser
5  *
6  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  * src/backend/tsearch/wparser_def.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include <limits.h>
18 
19 #include "catalog/pg_collation.h"
20 #include "commands/defrem.h"
21 #include "tsearch/ts_locale.h"
22 #include "tsearch/ts_public.h"
23 #include "tsearch/ts_type.h"
24 #include "tsearch/ts_utils.h"
25 #include "utils/builtins.h"
26 
27 
28 /* Define me to enable tracing of parser behavior */
29 /* #define WPARSER_TRACE */
30 
31 
32 /*
 * Output token categories
 *
 * Each number is also the position of the matching entry in tok_alias[]
 * and lex_descr[] (ASCIIWORD 1 <-> "asciiword", ..., XMLENTITY 23 <->
 * "entity"); position 0 of those arrays is an empty placeholder.
 */
33
34 #define ASCIIWORD 1
35 #define WORD_T 2
36 #define NUMWORD 3
37 #define EMAIL 4
38 #define URL_T 5
39 #define HOST 6
40 #define SCIENTIFIC 7
41 #define VERSIONNUMBER 8
42 #define NUMPARTHWORD 9
43 #define PARTHWORD 10
44 #define ASCIIPARTHWORD 11
45 #define SPACE 12
46 #define TAG_T 13
47 #define PROTOCOL 14
48 #define NUMHWORD 15
49 #define ASCIIHWORD 16
50 #define HWORD 17
51 #define URLPATH 18
52 #define FILEPATH 19
53 #define DECIMAL_T 20
54 #define SIGNEDINT 21
55 #define UNSIGNEDINT 22
56 #define XMLENTITY 23
57
/* Highest token category number in use (same value as XMLENTITY) */
58 #define LASTNUM 23
59 
/*
 * Short alias of every token category, in category-number order
 * (ASCIIWORD .. XMLENTITY).  Entry 0 is an empty placeholder because the
 * category numbers start at 1.
 */
60 static const char *const tok_alias[] = {
61  "",
62  "asciiword",
63  "word",
64  "numword",
65  "email",
66  "url",
67  "host",
68  "sfloat",
69  "version",
70  "hword_numpart",
71  "hword_part",
72  "hword_asciipart",
73  "blank",
74  "tag",
75  "protocol",
76  "numhword",
77  "asciihword",
78  "hword",
79  "url_path",
80  "file",
81  "float",
82  "int",
83  "uint",
84  "entity"
85 };
86 
/*
 * Human-readable description of every token category, in the same order as
 * tok_alias[]: entry N describes category number N, and entry 0 is an empty
 * placeholder.
 */
87 static const char *const lex_descr[] = {
88  "",
89  "Word, all ASCII",
90  "Word, all letters",
91  "Word, letters and digits",
92  "Email address",
93  "URL",
94  "Host",
95  "Scientific notation",
96  "Version number",
97  "Hyphenated word part, letters and digits",
98  "Hyphenated word part, all letters",
99  "Hyphenated word part, all ASCII",
100  "Space symbols",
101  "XML tag",
102  "Protocol head",
103  "Hyphenated word, letters and digits",
104  "Hyphenated word, all ASCII",
105  "Hyphenated word, all letters",
106  "URL path",
107  "File or path name",
108  "Decimal notation",
109  "Signed integer",
110  "Unsigned integer",
111  "XML entity"
112 };
113 
114 
115 /*
 * Parser states
 *
 * NOTE(review): only the first and the last enumerator are present in this
 * listing.  The state/action tables refer to many more TPS_* states (for
 * example TPS_InSpace and TPS_InWord), so the intermediate enumerators and
 * the line closing the typedef appear to be missing here -- confirm against
 * the complete file.
 */
116
117 typedef enum
118 {
119  TPS_Base = 0,
196  TPS_Null /* last state (fake value) */
198 
199 /* forward declaration */
200 struct TParser;
201
/* Character-test callback: receives the parser, returns an int truth value */
202 typedef int (*TParserCharTest) (struct TParser *); /* any p_is* functions
203  * except p_iseq */
/* Special-case callback: receives the parser, returns nothing */
204 typedef void (*TParserSpecial) (struct TParser *); /* special handler for
205  * special cases... */
206 
/*
 * NOTE(review): presumably TParserStateActionItem, one row of a state/action
 * table.  Only two members are visible in this listing, while every table
 * row supplies six initializers, and the closing line of the typedef is
 * absent -- confirm the full member list against the complete file.
 */
207 typedef struct
208 {
210  char c;
213  int type;
216 
217 /*
 * Flag bits in TParserStateActionItem.flags
 *
 * A_NEXT is zero, i.e. it sets none of the bits below.  The other values
 * are distinct single bits and are OR-ed together where needed (the tables
 * use A_NEXT | A_CLEAR, A_BINGO | A_CLEAR and A_BINGO | A_CLRALL).
 *
 * NOTE(review): what each bit makes the parser do is decided by code that
 * is not part of this excerpt.
 */
218 #define A_NEXT 0x0000
219 #define A_BINGO 0x0001
220 #define A_POP 0x0002
221 #define A_PUSH 0x0004
222 #define A_RERUN 0x0008
223 #define A_CLEAR 0x0010
224 #define A_MERGE 0x0020
225 #define A_CLRALL 0x0040
226 
/*
 * One entry of the parser's position stack: where the parser stands in the
 * input and how long the token collected so far is.  Entries are chained
 * through a "prev" link; the close functions walk that chain to free it.
 *
 * NOTE(review): the code also uses the members state, prev and
 * pushedAtAction, and the typedef needs a closing line; none of these are
 * visible in this listing -- confirm against the complete file.
 */
227 typedef struct TParserPosition
228 {
229  int posbyte; /* position of parser in bytes */
230  int poschar; /* position of parser in characters */
231  int charlen; /* length of current char */
232  int lenbytetoken; /* length of token-so-far in bytes */
233  int lenchartoken; /* and in chars */
238 
/*
 * Parser object: the input string, its optional wide-character rendition,
 * the parse state and the most recent output token.
 *
 * NOTE(review): the code also uses the members charmaxlen, state,
 * lenbytetoken and lenchartoken, and the typedef needs a closing line; none
 * of these are visible in this listing -- confirm against the complete file.
 */
239 typedef struct TParser
240 {
241  /* string and position information */
242  char *str; /* multibyte string */
243  int lenstr; /* length of mbstring */
244  wchar_t *wstr; /* wide character string */
245  pg_wchar *pgwstr; /* wide character string for C-locale */
246  bool usewide; /* true when wstr or pgwstr is to be consulted */
247
248  /* State of parse */
251  bool ignore; /* switched on by <script / <style, off by </script / </style */
252  bool wanthost; /* one-shot request: cleared by the test that reads it */
253
254  /* silly char */
255  char c; /* character handed to p_iseq() by the equality tests */
256
257  /* out */
258  char *token;
261  int type;
263 
264 
265 /* forward decls here */
266 static bool TParserGet(TParser *prs);
267 
268 
/*
 * Creates a position entry stacked on top of "prev".  The new entry starts
 * as a copy of "prev" when one is given and as all zeroes otherwise; its
 * prev link is then set to "prev" and its pushedAtAction to NULL.  Returns
 * the new entry.
 *
 * NOTE(review): two lines are absent from this listing: the declarator
 * (presumably newTParserPosition(TParserPosition *prev), the name the
 * callers use) and the line that declares and allocates "res" -- confirm
 * both against the complete file.
 */
269 static TParserPosition *
271 {
273
274  if (prev)
275  memcpy(res, prev, sizeof(TParserPosition));
276  else
277  memset(res, 0, sizeof(TParserPosition));
278
279  res->prev = prev;
280
281  res->pushedAtAction = NULL;
282
283  return res;
284 }
285 
/*
 * TParserInit: creates a parser for the "len" bytes starting at "str".
 *
 * The parser stores the "str" pointer itself; no copy of the bytes is made
 * here.  When charmaxlen is greater than 1 a wide-character rendition of
 * the input is built as well: pgwstr when lc_ctype_is_c() holds for the
 * default collation, wstr otherwise, so at most one of the two is set.
 * The parser starts with a single position entry in state TPS_Base.
 *
 * Returns the new parser.
 */
286 static TParser *
287 TParserInit(char *str, int len)
288 {
289  TParser *prs = (TParser *) palloc0(sizeof(TParser));
290
/*
 * NOTE(review): prs->charmaxlen is tested below, but no assignment to it is
 * visible in this listing (a line appears to be missing at this point) --
 * confirm against the complete file.
 */
292  prs->str = str;
293  prs->lenstr = len;
294
295  /*
296  * Use wide char code only when max encoding length > 1.
297  */
298  if (prs->charmaxlen > 1)
299  {
300  Oid collation = DEFAULT_COLLATION_OID; /* TODO */
301  pg_locale_t mylocale = 0; /* TODO */
302
303  prs->usewide = true;
304  if (lc_ctype_is_c(collation))
305  {
306  /*
307  * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
308  * be different from sizeof(wchar_t)
309  */
/* room for one wide char per input byte, plus one */
310  prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
311  pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
312  }
313  else
314  {
/* same sizing as the pg_wchar case above */
315  prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
316  char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
317  mylocale);
318  }
319  }
320  else
321  prs->usewide = false;
322
/* start with one position entry, in the base state */
323  prs->state = newTParserPosition(NULL);
324  prs->state->state = TPS_Base;
325
326 #ifdef WPARSER_TRACE
327  fprintf(stderr, "parsing \"%.*s\"\n", len, str);
328 #endif
329
330  return prs;
331 }
332 
333 /*
 * As an alternative to a full TParserInit one can create a TParserCopy: a
 * regular TParser that has no private copy of the string and instead points
 * into the buffers of another TParser ("orig").  This matters because
 * TParsers are created recursively in some places, and copying the strings
 * again and again would be very inefficient for a long source string.
 *
 * The new parser starts parsing at the original's current position: str,
 * pgwstr and wstr are the original's pointers advanced by its current byte
 * or character position, and lenstr is the number of bytes that remain.
 *
 * Obviously one must not close the original TParser before the copy.
 *
 * NOTE(review): the declarator line is absent from this listing; presumably
 * TParserCopyInit(TParser *orig), the name the callers and the comment on
 * the matching close function use -- confirm against the complete file.
 */
344 static TParser *
346 {
347  TParser *prs = (TParser *) palloc0(sizeof(TParser));
348
349  prs->charmaxlen = orig->charmaxlen;
/* byte-based fields are offset by posbyte, wide strings by poschar */
350  prs->str = orig->str + orig->state->posbyte;
351  prs->lenstr = orig->lenstr - orig->state->posbyte;
352  prs->usewide = orig->usewide;
353
354  if (orig->pgwstr)
355  prs->pgwstr = orig->pgwstr + orig->state->poschar;
356  if (orig->wstr)
357  prs->wstr = orig->wstr + orig->state->poschar;
358
/* the copy gets a fresh position stack of its own, in the base state */
359  prs->state = newTParserPosition(NULL);
360  prs->state->state = TPS_Base;
361
362 #ifdef WPARSER_TRACE
363  fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
364 #endif
365
366  return prs;
367 }
368 
369 
/*
 * Releases a parser: frees every entry of its position stack, then the wide
 * strings wstr and pgwstr where present, then the parser object itself.
 * Because it frees the wide strings, it is not suitable for a parser copy,
 * whose wide-string pointers point into another parser's buffers.
 *
 * NOTE(review): the declarator line is absent from this listing; presumably
 * TParserClose(TParser *prs), by analogy with TParserCopyClose -- confirm
 * against the complete file.
 */
370 static void
372 {
373  while (prs->state)
374  {
375  TParserPosition *ptr = prs->state->prev;
376
377  pfree(prs->state);
378  prs->state = ptr;
379  }
380
381  if (prs->wstr)
382  pfree(prs->wstr);
383  if (prs->pgwstr)
384  pfree(prs->pgwstr);
385
386 #ifdef WPARSER_TRACE
387  fprintf(stderr, "closing parser\n");
388 #endif
389  pfree(prs);
390 }
391 
392 /*
 * Close a parser created with TParserCopyInit: frees every entry of its
 * position stack and then the parser object.  The string buffers are left
 * alone, since a copy only points into the buffers of the original parser.
 *
 * NOTE(review): the declarator line is absent from this listing; presumably
 * TParserCopyClose(TParser *prs), the name the callers use -- confirm
 * against the complete file.
 */
395 static void
397 {
398  while (prs->state)
399  {
400  TParserPosition *ptr = prs->state->prev;
401
402  pfree(prs->state);
403  prs->state = ptr;
404  }
405
406 #ifdef WPARSER_TRACE
407  fprintf(stderr, "closing parser copy\n");
408 #endif
409  pfree(prs);
410 }
411 
412 
413 /*
414  * Character-type support functions, equivalent to is* macros, but
415  * working with any possible encodings and locales. Notes:
416  * - with multibyte encoding and C-locale isw* function may fail
417  * or give wrong result.
418  * - multibyte encoding and C-locale often are used for
419  * Asian languages.
420  * - if locale is C then we use pgwstr instead of wstr.
421  */
422 
423 #define p_iswhat(type, nonascii) \
424  \
425 static int \
426 p_is##type(TParser *prs) \
427 { \
428  Assert(prs->state); \
429  if (prs->usewide) \
430  { \
431  if (prs->pgwstr) \
432  { \
433  unsigned int c = *(prs->pgwstr + prs->state->poschar); \
434  if (c > 0x7f) \
435  return nonascii; \
436  return is##type(c); \
437  } \
438  return isw##type(*(prs->wstr + prs->state->poschar)); \
439  } \
440  return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \
441 } \
442  \
443 static int \
444 p_isnot##type(TParser *prs) \
445 { \
446  return !p_is##type(prs); \
447 }
448 
449 /*
450  * In C locale with a multibyte encoding, any non-ASCII symbol is considered
451  * an alpha character, but not a member of other char classes.
452  */
453 p_iswhat(alnum, 1)
454 p_iswhat(alpha, 1)
455 p_iswhat(digit, 0)
456 p_iswhat(lower, 0)
457 p_iswhat(print, 0)
458 p_iswhat(punct, 0)
459 p_iswhat(space, 0)
460 p_iswhat(upper, 0)
461 p_iswhat(xdigit, 0)
462 
463 /* p_iseq should be used only for ascii symbols */
464 
465 static int
466 p_iseq(TParser *prs, char c)
467 {
468  Assert(prs->state);
469  return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
470 }
471 
/*
 * End-of-input test: returns 1 when the byte position has reached lenstr or
 * the current character has zero length, otherwise 0.
 *
 * NOTE(review): the declarator line is absent from this listing; presumably
 * p_isEOF(TParser *prs), the test that opens most state tables -- confirm
 * against the complete file.
 */
472 static int
474 {
475  Assert(prs->state);
476  return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
477 }
478 
/*
 * Returns p_iseq() of the current character against the parser's own
 * comparison character prs->c.
 *
 * NOTE(review): the declarator line is absent from this listing; presumably
 * p_iseqC(TParser *prs) -- confirm against the complete file.
 */
479 static int
481 {
482  return p_iseq(prs, prs->c);
483 }
484 
/*
 * Negated form of the test above: returns 1 when p_iseq() of the current
 * character against prs->c is 0, and 0 otherwise.
 *
 * NOTE(review): the declarator line is absent from this listing; presumably
 * p_isneC(TParser *prs) -- confirm against the complete file.
 */
485 static int
487 {
488  return !p_iseq(prs, prs->c);
489 }
490 
/*
 * Returns 1 when the current character is one byte long and that byte
 * satisfies isascii(), otherwise 0.
 *
 * NOTE(review): the declarator line is absent from this listing; presumably
 * p_isascii(TParser *prs), the name the next function calls -- confirm
 * against the complete file.
 */
491 static int
493 {
494  return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
495 }
496 
/*
 * ASCII-letter test: returns 1 when the current character passes both
 * p_isascii() and p_isalpha(), otherwise 0.
 *
 * NOTE(review): the declarator line is absent from this listing; presumably
 * p_isasclet(TParser *prs), a name the state tables reference -- confirm
 * against the complete file.
 */
497 static int
499 {
500  return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
501 }
502 
/*
 * Returns 1 when the current character is acceptable by the rules below,
 * otherwise 0: it must be a single byte, lie strictly between 0x20 and
 * 0x7F, and not be one of the nine characters rejected by the switch.
 *
 * NOTE(review): the declarator line is absent from this listing and nothing
 * in this excerpt calls the function, so its name cannot be confirmed from
 * here; the RFC 3986 remark suggests a URL-character test.
 */
503 static int
505 {
506  char ch;
507
508  /* no non-ASCII need apply */
509  if (prs->state->charlen != 1)
510  return 0;
511  ch = *(prs->str + prs->state->posbyte);
512  /* no spaces or control characters */
513  if (ch <= 0x20 || ch >= 0x7F)
514  return 0;
515  /* reject characters disallowed by RFC 3986 */
516  switch (ch)
517  {
518  case '"':
519  case '<':
520  case '>':
521  case '\\':
522  case '^':
523  case '`':
524  case '{':
525  case '|':
526  case '}':
527  return 0;
528  }
529  return 1;
530 }
531 
532 
/*
 * Deliberately suppress unused-function complaints for the character tests
 * defined above: naming each of them here means the compiler sees every one
 * referenced at least once, whether or not anything else uses it.
 *
 * This function exists only for that purpose and is not meant to be run:
 * each callee reads through its argument, so actually executing these calls
 * with NULL would dereference a null pointer.
 */
void _make_compiler_happy(void);

void
_make_compiler_happy(void)
{
    p_isalnum(NULL);
    p_isnotalnum(NULL);
    p_isalpha(NULL);
    p_isnotalpha(NULL);
    p_isdigit(NULL);
    p_isnotdigit(NULL);
    p_islower(NULL);
    p_isnotlower(NULL);
    p_isprint(NULL);
    p_isnotprint(NULL);
    p_ispunct(NULL);
    p_isnotpunct(NULL);
    p_isspace(NULL);
    p_isnotspace(NULL);
    p_isupper(NULL);
    p_isnotupper(NULL);
    p_isxdigit(NULL);
    p_isnotxdigit(NULL);
    p_isEOF(NULL);
    p_iseqC(NULL);
    p_isneC(NULL);
}
560 
561 
/*
 * Switches prs->ignore according to the token collected so far.  The token
 * length in characters selects which prefixes are compared:
 *  8: "</script" turns ignore off
 *  7: "</style" turns ignore off, "<script" turns it on
 *  6: "<style" turns ignore on
 * Any other length leaves the flag untouched.  The comparison is done with
 * pg_strncasecmp(), presumably case-insensitive.
 *
 * NOTE(review): the declarator line is absent from this listing and nothing
 * in this excerpt refers to the function, so its name cannot be confirmed
 * from here.
 */
562 static void
564 {
565  switch (prs->state->lenchartoken)
566  {
567  case 8: /* </script */
568  if (pg_strncasecmp(prs->token, "</script", 8) == 0)
569  prs->ignore = false;
570  break;
571  case 7: /* <script || </style */
572  if (pg_strncasecmp(prs->token, "</style", 7) == 0)
573  prs->ignore = false;
574  else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
575  prs->ignore = true;
576  break;
577  case 6: /* <style */
578  if (pg_strncasecmp(prs->token, "<style", 6) == 0)
579  prs->ignore = true;
580  break;
581  default:
582  break;
583  }
584 }
585 
/*
 * Sets prs->wanthost and moves the current position back to the start of
 * the token collected so far (byte and character positions are reduced by
 * the token lengths).  The token length counters themselves are unchanged.
 *
 * NOTE(review): the declarator line is absent from this listing and nothing
 * in this excerpt refers to the function, so its name cannot be confirmed
 * from here.
 */
586 static void
588 {
589  prs->wanthost = true;
590  prs->state->posbyte -= prs->state->lenbytetoken;
591  prs->state->poschar -= prs->state->lenchartoken;
592 }
593 
/*
 * Moves the current position back to the start of the token collected so
 * far (byte and character positions are reduced by the token lengths).  The
 * token length counters themselves are unchanged.
 *
 * NOTE(review): the declarator line is absent from this listing and nothing
 * in this excerpt refers to the function, so its name cannot be confirmed
 * from here.
 */
594 static void
596 {
597  prs->state->posbyte -= prs->state->lenbytetoken;
598  prs->state->poschar -= prs->state->lenchartoken;
599 }
600 
/*
 * Moves the current position back to the start of the token collected so
 * far and then resets both token length counters to zero.
 *
 * NOTE(review): the declarator line is absent from this listing, so the
 * name of this function cannot be confirmed from here.
 */
601 static void
603 {
604  prs->state->posbyte -= prs->state->lenbytetoken;
605  prs->state->poschar -= prs->state->lenchartoken;
606  prs->state->lenbytetoken = 0;
607  prs->state->lenchartoken = 0;
608 }
609 
/*
 * One-shot test of prs->wanthost: when the flag is set it is cleared and 1
 * is returned; otherwise 0 is returned and nothing changes.
 *
 * NOTE(review): the declarator line is absent from this listing and nothing
 * in this excerpt refers to the function, so its name cannot be confirmed
 * from here.
 */
610 static int
612 {
613  if (prs->wanthost)
614  {
615  prs->wanthost = false;
616  return 1;
617  }
618  return 0;
619 }
620 
/*
 * Returns 1 while prs->ignore is set, otherwise 0.  It looks at the flag
 * only, not at the current character.
 *
 * NOTE(review): the declarator line is absent from this listing; presumably
 * p_isignore(TParser *prs), a name the state tables reference -- confirm
 * against the complete file.
 */
621 static int
623 {
624  return (prs->ignore) ? 1 : 0;
625 }
626 
/*
 * Tries to read a HOST token at the current position.  The attempt is made
 * with a temporary parser copy that has wanthost set.  When the copy yields
 * a token of type HOST, the position and the token lengths of "prs" are
 * advanced by that token's byte and character lengths, charlen is taken
 * over from the copy, and 1 is returned.  Otherwise "prs" is left unchanged
 * and 0 is returned.  The temporary copy is closed in both cases.
 *
 * NOTE(review): the declarator line is absent from this listing and nothing
 * in this excerpt refers to the function, so its name cannot be confirmed
 * from here.
 */
627 static int
629 {
630  TParser *tmpprs = TParserCopyInit(prs);
631  int res = 0;
632
633  tmpprs->wanthost = true;
634
635  if (TParserGet(tmpprs) && tmpprs->type == HOST)
636  {
637  prs->state->posbyte += tmpprs->lenbytetoken;
638  prs->state->poschar += tmpprs->lenchartoken;
639  prs->state->lenbytetoken += tmpprs->lenbytetoken;
640  prs->state->lenchartoken += tmpprs->lenchartoken;
641  prs->state->charlen = tmpprs->state->charlen;
642  res = 1;
643  }
644  TParserCopyClose(tmpprs);
645
646  return res;
647 }
648 
/*
 * Tries to read a URLPATH token at the current position.  The attempt is
 * made with a temporary parser copy on which an extra position entry is
 * stacked and put into state TPS_InURLPathFirst.  When the copy yields a
 * token of type URLPATH, the position and the token lengths of "prs" are
 * advanced by that token's byte and character lengths, charlen is taken
 * over from the copy, and 1 is returned.  Otherwise "prs" is left unchanged
 * and 0 is returned.  The temporary copy is closed in both cases.
 *
 * NOTE(review): the declarator line is absent from this listing and nothing
 * in this excerpt refers to the function, so its name cannot be confirmed
 * from here.
 */
649 static int
651 {
652  TParser *tmpprs = TParserCopyInit(prs);
653  int res = 0;
654
655  tmpprs->state = newTParserPosition(tmpprs->state);
656  tmpprs->state->state = TPS_InURLPathFirst;
657
658  if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
659  {
660  prs->state->posbyte += tmpprs->lenbytetoken;
661  prs->state->poschar += tmpprs->lenchartoken;
662  prs->state->lenbytetoken += tmpprs->lenbytetoken;
663  prs->state->lenchartoken += tmpprs->lenchartoken;
664  prs->state->charlen = tmpprs->state->charlen;
665  res = 1;
666  }
667  TParserCopyClose(tmpprs);
668
669  return res;
670 }
671 
672 /*
 * Returns 1 if the current character has zero display length or is one of
 * the special signs, used in several languages, that are listed below;
 * returns 0 otherwise.  Such characters are not alphabetic, yet they must
 * not break a word.  At the beginning of a word they are not part of it.
 *
 * NOTE(review): the declarator line is absent from this listing; presumably
 * p_isspecial(TParser *prs), a name the state tables reference -- confirm
 * against the complete file.
 */
678 static int
680 {
681  /*
 * pg_dsplen could return -1, which means error or control character, so
 * only an exact 0 counts as zero display length.
 */
684  if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
685  return 1;
686
687  /*
 * Unicode characters of the 'Mark, Spacing Combining' category: these
 * characters are not alphabetic, but they do not break a word either.
 * Check this only under UTF-8: other encodings that could hold these
 * characters are either not supported by postgres or do not exist.
 */
693  if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
694  {
695  static const pg_wchar strange_letter[] = {
696  /*
697  * use binary search, so elements should be ordered
698  */
699  0x0903, /* DEVANAGARI SIGN VISARGA */
700  0x093E, /* DEVANAGARI VOWEL SIGN AA */
701  0x093F, /* DEVANAGARI VOWEL SIGN I */
702  0x0940, /* DEVANAGARI VOWEL SIGN II */
703  0x0949, /* DEVANAGARI VOWEL SIGN CANDRA O */
704  0x094A, /* DEVANAGARI VOWEL SIGN SHORT O */
705  0x094B, /* DEVANAGARI VOWEL SIGN O */
706  0x094C, /* DEVANAGARI VOWEL SIGN AU */
707  0x0982, /* BENGALI SIGN ANUSVARA */
708  0x0983, /* BENGALI SIGN VISARGA */
709  0x09BE, /* BENGALI VOWEL SIGN AA */
710  0x09BF, /* BENGALI VOWEL SIGN I */
711  0x09C0, /* BENGALI VOWEL SIGN II */
712  0x09C7, /* BENGALI VOWEL SIGN E */
713  0x09C8, /* BENGALI VOWEL SIGN AI */
714  0x09CB, /* BENGALI VOWEL SIGN O */
715  0x09CC, /* BENGALI VOWEL SIGN AU */
716  0x09D7, /* BENGALI AU LENGTH MARK */
717  0x0A03, /* GURMUKHI SIGN VISARGA */
718  0x0A3E, /* GURMUKHI VOWEL SIGN AA */
719  0x0A3F, /* GURMUKHI VOWEL SIGN I */
720  0x0A40, /* GURMUKHI VOWEL SIGN II */
721  0x0A83, /* GUJARATI SIGN VISARGA */
722  0x0ABE, /* GUJARATI VOWEL SIGN AA */
723  0x0ABF, /* GUJARATI VOWEL SIGN I */
724  0x0AC0, /* GUJARATI VOWEL SIGN II */
725  0x0AC9, /* GUJARATI VOWEL SIGN CANDRA O */
726  0x0ACB, /* GUJARATI VOWEL SIGN O */
727  0x0ACC, /* GUJARATI VOWEL SIGN AU */
728  0x0B02, /* ORIYA SIGN ANUSVARA */
729  0x0B03, /* ORIYA SIGN VISARGA */
730  0x0B3E, /* ORIYA VOWEL SIGN AA */
731  0x0B40, /* ORIYA VOWEL SIGN II */
732  0x0B47, /* ORIYA VOWEL SIGN E */
733  0x0B48, /* ORIYA VOWEL SIGN AI */
734  0x0B4B, /* ORIYA VOWEL SIGN O */
735  0x0B4C, /* ORIYA VOWEL SIGN AU */
736  0x0B57, /* ORIYA AU LENGTH MARK */
737  0x0BBE, /* TAMIL VOWEL SIGN AA */
738  0x0BBF, /* TAMIL VOWEL SIGN I */
739  0x0BC1, /* TAMIL VOWEL SIGN U */
740  0x0BC2, /* TAMIL VOWEL SIGN UU */
741  0x0BC6, /* TAMIL VOWEL SIGN E */
742  0x0BC7, /* TAMIL VOWEL SIGN EE */
743  0x0BC8, /* TAMIL VOWEL SIGN AI */
744  0x0BCA, /* TAMIL VOWEL SIGN O */
745  0x0BCB, /* TAMIL VOWEL SIGN OO */
746  0x0BCC, /* TAMIL VOWEL SIGN AU */
747  0x0BD7, /* TAMIL AU LENGTH MARK */
748  0x0C01, /* TELUGU SIGN CANDRABINDU */
749  0x0C02, /* TELUGU SIGN ANUSVARA */
750  0x0C03, /* TELUGU SIGN VISARGA */
751  0x0C41, /* TELUGU VOWEL SIGN U */
752  0x0C42, /* TELUGU VOWEL SIGN UU */
753  0x0C43, /* TELUGU VOWEL SIGN VOCALIC R */
754  0x0C44, /* TELUGU VOWEL SIGN VOCALIC RR */
755  0x0C82, /* KANNADA SIGN ANUSVARA */
756  0x0C83, /* KANNADA SIGN VISARGA */
757  0x0CBE, /* KANNADA VOWEL SIGN AA */
758  0x0CC0, /* KANNADA VOWEL SIGN II */
759  0x0CC1, /* KANNADA VOWEL SIGN U */
760  0x0CC2, /* KANNADA VOWEL SIGN UU */
761  0x0CC3, /* KANNADA VOWEL SIGN VOCALIC R */
762  0x0CC4, /* KANNADA VOWEL SIGN VOCALIC RR */
763  0x0CC7, /* KANNADA VOWEL SIGN EE */
764  0x0CC8, /* KANNADA VOWEL SIGN AI */
765  0x0CCA, /* KANNADA VOWEL SIGN O */
766  0x0CCB, /* KANNADA VOWEL SIGN OO */
767  0x0CD5, /* KANNADA LENGTH MARK */
768  0x0CD6, /* KANNADA AI LENGTH MARK */
769  0x0D02, /* MALAYALAM SIGN ANUSVARA */
770  0x0D03, /* MALAYALAM SIGN VISARGA */
771  0x0D3E, /* MALAYALAM VOWEL SIGN AA */
772  0x0D3F, /* MALAYALAM VOWEL SIGN I */
773  0x0D40, /* MALAYALAM VOWEL SIGN II */
774  0x0D46, /* MALAYALAM VOWEL SIGN E */
775  0x0D47, /* MALAYALAM VOWEL SIGN EE */
776  0x0D48, /* MALAYALAM VOWEL SIGN AI */
777  0x0D4A, /* MALAYALAM VOWEL SIGN O */
778  0x0D4B, /* MALAYALAM VOWEL SIGN OO */
779  0x0D4C, /* MALAYALAM VOWEL SIGN AU */
780  0x0D57, /* MALAYALAM AU LENGTH MARK */
781  0x0D82, /* SINHALA SIGN ANUSVARAYA */
782  0x0D83, /* SINHALA SIGN VISARGAYA */
783  0x0DCF, /* SINHALA VOWEL SIGN AELA-PILLA */
784  0x0DD0, /* SINHALA VOWEL SIGN KETTI AEDA-PILLA */
785  0x0DD1, /* SINHALA VOWEL SIGN DIGA AEDA-PILLA */
786  0x0DD8, /* SINHALA VOWEL SIGN GAETTA-PILLA */
787  0x0DD9, /* SINHALA VOWEL SIGN KOMBUVA */
788  0x0DDA, /* SINHALA VOWEL SIGN DIGA KOMBUVA */
789  0x0DDB, /* SINHALA VOWEL SIGN KOMBU DEKA */
790  0x0DDC, /* SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
791  0x0DDD, /* SINHALA VOWEL SIGN KOMBUVA HAA DIGA
792  * AELA-PILLA */
793  0x0DDE, /* SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
794  0x0DDF, /* SINHALA VOWEL SIGN GAYANUKITTA */
795  0x0DF2, /* SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
796  0x0DF3, /* SINHALA VOWEL SIGN DIGA GAYANUKITTA */
797  0x0F3E, /* TIBETAN SIGN YAR TSHES */
798  0x0F3F, /* TIBETAN SIGN MAR TSHES */
799  0x0F7F, /* TIBETAN SIGN RNAM BCAD */
800  0x102B, /* MYANMAR VOWEL SIGN TALL AA */
801  0x102C, /* MYANMAR VOWEL SIGN AA */
802  0x1031, /* MYANMAR VOWEL SIGN E */
803  0x1038, /* MYANMAR SIGN VISARGA */
804  0x103B, /* MYANMAR CONSONANT SIGN MEDIAL YA */
805  0x103C, /* MYANMAR CONSONANT SIGN MEDIAL RA */
806  0x1056, /* MYANMAR VOWEL SIGN VOCALIC R */
807  0x1057, /* MYANMAR VOWEL SIGN VOCALIC RR */
808  0x1062, /* MYANMAR VOWEL SIGN SGAW KAREN EU */
809  0x1063, /* MYANMAR TONE MARK SGAW KAREN HATHI */
810  0x1064, /* MYANMAR TONE MARK SGAW KAREN KE PHO */
811  0x1067, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
812  0x1068, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
813  0x1069, /* MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
814  0x106A, /* MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
815  0x106B, /* MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
816  0x106C, /* MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
817  0x106D, /* MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
818  0x1083, /* MYANMAR VOWEL SIGN SHAN AA */
819  0x1084, /* MYANMAR VOWEL SIGN SHAN E */
820  0x1087, /* MYANMAR SIGN SHAN TONE-2 */
821  0x1088, /* MYANMAR SIGN SHAN TONE-3 */
822  0x1089, /* MYANMAR SIGN SHAN TONE-5 */
823  0x108A, /* MYANMAR SIGN SHAN TONE-6 */
824  0x108B, /* MYANMAR SIGN SHAN COUNCIL TONE-2 */
825  0x108C, /* MYANMAR SIGN SHAN COUNCIL TONE-3 */
826  0x108F, /* MYANMAR SIGN RUMAI PALAUNG TONE-5 */
827  0x17B6, /* KHMER VOWEL SIGN AA */
828  0x17BE, /* KHMER VOWEL SIGN OE */
829  0x17BF, /* KHMER VOWEL SIGN YA */
830  0x17C0, /* KHMER VOWEL SIGN IE */
831  0x17C1, /* KHMER VOWEL SIGN E */
832  0x17C2, /* KHMER VOWEL SIGN AE */
833  0x17C3, /* KHMER VOWEL SIGN AI */
834  0x17C4, /* KHMER VOWEL SIGN OO */
835  0x17C5, /* KHMER VOWEL SIGN AU */
836  0x17C7, /* KHMER SIGN REAHMUK */
837  0x17C8, /* KHMER SIGN YUUKALEAPINTU */
838  0x1923, /* LIMBU VOWEL SIGN EE */
839  0x1924, /* LIMBU VOWEL SIGN AI */
840  0x1925, /* LIMBU VOWEL SIGN OO */
841  0x1926, /* LIMBU VOWEL SIGN AU */
842  0x1929, /* LIMBU SUBJOINED LETTER YA */
843  0x192A, /* LIMBU SUBJOINED LETTER RA */
844  0x192B, /* LIMBU SUBJOINED LETTER WA */
845  0x1930, /* LIMBU SMALL LETTER KA */
846  0x1931, /* LIMBU SMALL LETTER NGA */
847  0x1933, /* LIMBU SMALL LETTER TA */
848  0x1934, /* LIMBU SMALL LETTER NA */
849  0x1935, /* LIMBU SMALL LETTER PA */
850  0x1936, /* LIMBU SMALL LETTER MA */
851  0x1937, /* LIMBU SMALL LETTER RA */
852  0x1938, /* LIMBU SMALL LETTER LA */
853  0x19B0, /* NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
854  0x19B1, /* NEW TAI LUE VOWEL SIGN AA */
855  0x19B2, /* NEW TAI LUE VOWEL SIGN II */
856  0x19B3, /* NEW TAI LUE VOWEL SIGN U */
857  0x19B4, /* NEW TAI LUE VOWEL SIGN UU */
858  0x19B5, /* NEW TAI LUE VOWEL SIGN E */
859  0x19B6, /* NEW TAI LUE VOWEL SIGN AE */
860  0x19B7, /* NEW TAI LUE VOWEL SIGN O */
861  0x19B8, /* NEW TAI LUE VOWEL SIGN OA */
862  0x19B9, /* NEW TAI LUE VOWEL SIGN UE */
863  0x19BA, /* NEW TAI LUE VOWEL SIGN AY */
864  0x19BB, /* NEW TAI LUE VOWEL SIGN AAY */
865  0x19BC, /* NEW TAI LUE VOWEL SIGN UY */
866  0x19BD, /* NEW TAI LUE VOWEL SIGN OY */
867  0x19BE, /* NEW TAI LUE VOWEL SIGN OAY */
868  0x19BF, /* NEW TAI LUE VOWEL SIGN UEY */
869  0x19C0, /* NEW TAI LUE VOWEL SIGN IY */
870  0x19C8, /* NEW TAI LUE TONE MARK-1 */
871  0x19C9, /* NEW TAI LUE TONE MARK-2 */
872  0x1A19, /* BUGINESE VOWEL SIGN E */
873  0x1A1A, /* BUGINESE VOWEL SIGN O */
874  0x1A1B, /* BUGINESE VOWEL SIGN AE */
875  0x1B04, /* BALINESE SIGN BISAH */
876  0x1B35, /* BALINESE VOWEL SIGN TEDUNG */
877  0x1B3B, /* BALINESE VOWEL SIGN RA REPA TEDUNG */
878  0x1B3D, /* BALINESE VOWEL SIGN LA LENGA TEDUNG */
879  0x1B3E, /* BALINESE VOWEL SIGN TALING */
880  0x1B3F, /* BALINESE VOWEL SIGN TALING REPA */
881  0x1B40, /* BALINESE VOWEL SIGN TALING TEDUNG */
882  0x1B41, /* BALINESE VOWEL SIGN TALING REPA TEDUNG */
883  0x1B43, /* BALINESE VOWEL SIGN PEPET TEDUNG */
884  0x1B44, /* BALINESE ADEG ADEG */
885  0x1B82, /* SUNDANESE SIGN PANGWISAD */
886  0x1BA1, /* SUNDANESE CONSONANT SIGN PAMINGKAL */
887  0x1BA6, /* SUNDANESE VOWEL SIGN PANAELAENG */
888  0x1BA7, /* SUNDANESE VOWEL SIGN PANOLONG */
889  0x1BAA, /* SUNDANESE SIGN PAMAAEH */
890  0x1C24, /* LEPCHA SUBJOINED LETTER YA */
891  0x1C25, /* LEPCHA SUBJOINED LETTER RA */
892  0x1C26, /* LEPCHA VOWEL SIGN AA */
893  0x1C27, /* LEPCHA VOWEL SIGN I */
894  0x1C28, /* LEPCHA VOWEL SIGN O */
895  0x1C29, /* LEPCHA VOWEL SIGN OO */
896  0x1C2A, /* LEPCHA VOWEL SIGN U */
897  0x1C2B, /* LEPCHA VOWEL SIGN UU */
898  0x1C34, /* LEPCHA CONSONANT SIGN NYIN-DO */
899  0x1C35, /* LEPCHA CONSONANT SIGN KANG */
900  0xA823, /* SYLOTI NAGRI VOWEL SIGN A */
901  0xA824, /* SYLOTI NAGRI VOWEL SIGN I */
902  0xA827, /* SYLOTI NAGRI VOWEL SIGN OO */
903  0xA880, /* SAURASHTRA SIGN ANUSVARA */
904  0xA881, /* SAURASHTRA SIGN VISARGA */
905  0xA8B4, /* SAURASHTRA CONSONANT SIGN HAARU */
906  0xA8B5, /* SAURASHTRA VOWEL SIGN AA */
907  0xA8B6, /* SAURASHTRA VOWEL SIGN I */
908  0xA8B7, /* SAURASHTRA VOWEL SIGN II */
909  0xA8B8, /* SAURASHTRA VOWEL SIGN U */
910  0xA8B9, /* SAURASHTRA VOWEL SIGN UU */
911  0xA8BA, /* SAURASHTRA VOWEL SIGN VOCALIC R */
912  0xA8BB, /* SAURASHTRA VOWEL SIGN VOCALIC RR */
913  0xA8BC, /* SAURASHTRA VOWEL SIGN VOCALIC L */
914  0xA8BD, /* SAURASHTRA VOWEL SIGN VOCALIC LL */
915  0xA8BE, /* SAURASHTRA VOWEL SIGN E */
916  0xA8BF, /* SAURASHTRA VOWEL SIGN EE */
917  0xA8C0, /* SAURASHTRA VOWEL SIGN AI */
918  0xA8C1, /* SAURASHTRA VOWEL SIGN O */
919  0xA8C2, /* SAURASHTRA VOWEL SIGN OO */
920  0xA8C3, /* SAURASHTRA VOWEL SIGN AU */
921  0xA952, /* REJANG CONSONANT SIGN H */
922  0xA953, /* REJANG VIRAMA */
923  0xAA2F, /* CHAM VOWEL SIGN O */
924  0xAA30, /* CHAM VOWEL SIGN AI */
925  0xAA33, /* CHAM CONSONANT SIGN YA */
926  0xAA34, /* CHAM CONSONANT SIGN RA */
927  0xAA4D /* CHAM CONSONANT SIGN FINAL H */
928  };
/* search window is the half-open range [StopLow, StopHigh) */
929  const pg_wchar *StopLow = strange_letter,
930  *StopHigh = strange_letter + lengthof(strange_letter),
931  *StopMiddle;
932  pg_wchar c;
933
/* take the current character from whichever wide string is present */
934  if (prs->pgwstr)
935  c = *(prs->pgwstr + prs->state->poschar);
936  else
937  c = (pg_wchar) *(prs->wstr + prs->state->poschar);
938
/* binary search of the sorted table for c */
939  while (StopLow < StopHigh)
940  {
941  StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
942  if (*StopMiddle == c)
943  return 1;
944  else if (*StopMiddle < c)
945  StopLow = StopMiddle + 1;
946  else
947  StopHigh = StopMiddle;
948  }
949  }
950
951  return 0;
952 }
953 
954 /*
955  * Table of state/action of parser
956  */
957 
959  {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
960  {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
961  {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
962  {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
963  {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
964  {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
965  {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
966  {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
967  {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
968  {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
969  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
970  {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
971  {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
972 };
973 
974 
976  {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
977  {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
978  {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
979  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
980  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
981  {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
982  {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
983  {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
984 };
985 
987  {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
988  {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
989  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
990  {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
991  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
992  {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
993  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
994  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
995  {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
996  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
997  {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
998  {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
999  {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
1000  {p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
1001  {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
1002 };
1003 
1005  {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
1006  {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
1007  {p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
1008  {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1009  {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1010  {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
1011 };
1012 
1014  {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
1015  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1016  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1017  {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
1018  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1019  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1020  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1021  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1022  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1023  {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
1024  {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1025  {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1026  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1027  {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
1028 };
1029 
1031  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1032  {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
1033  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1034 };
1035 
1037  {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
1038  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1039  {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
1040  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1041  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1042  {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
1043 };
1044 
1046  {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
1047  {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
1048  {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
1049  {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
1050  {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
1051  {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
1052  {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
1053  {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
1054  {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
1055 };
1056 
1058  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1059  {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
1060  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1061 };
1062 
1064  {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1065  {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
1066  {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1067  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1068  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1069  {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1070 };
1071 
1073  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1074  {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
1075  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1076 };
1077 
1079  {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1080  {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
1081  {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
1082  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1083  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1084  {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1085 };
1086 
1088  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1089  {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
1090  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1091 };
1092 
1094  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1095  {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
1096  {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1097 };
1098 
1099 
1101  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1102  {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
1103  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1104 };
1105 
1107  {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
1108  {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
1109  {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1110  {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
1111 };
1112 
1114  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1115  {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1116  {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
1117  {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
1118  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1119 };
1120 
1122  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1123  {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1124  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1125 };
1126 
1128  {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
1129  {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
1130  {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
1131 };
1132 
1134  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1135  {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
1136  {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1137  {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1138  {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1139  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1140 };
1141 
1143  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1144  {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1145  {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1146  {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1147  {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
1148  {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
1149  {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1150  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1151 };
1152 
1154  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1155  {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1156  {p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1157  {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1158  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1159 };
1160 
1162  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1163  {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1164  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1165 };
1166 
1168  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1169  {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1170  {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1171  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1172 };
1173 
1175  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1176  {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1177  {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1178  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1179 };
1180 
1182  {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
1183 };
1184 
1186  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1187  {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
1188  {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
1189  {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
1190  {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
1191  {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
1192  {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
1193  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1194 };
1195 
1197  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1198  /* <?xml ... */
1199  /* XXX do we want states for the m and l ? Right now this accepts <?xZ */
1200  {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
1201  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1202 };
1203 
1205  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1206  {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
1207  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1208 };
1209 
1211  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1212  /* <br/> case */
1213  {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
1214  {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1215  {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
1216  {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
1217  {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1218  {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1219  {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1220  {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1221  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1222 };
1223 
1225  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1226  {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
1227  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1228 };
1229 
1231  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1232  {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1233  {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
1234  {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
1235  {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1236  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1237  {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
1238  {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1239  {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1240  {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
1241  {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
1242  {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1243  {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1244  {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
1245  {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
1246  {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
1247  {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
1248  {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
1249  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1250 };
1251 
1253  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1254  {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1255  {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
1256  {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
1257 };
1258 
1260  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1261  {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1262  {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
1263  {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
1264 };
1265 
1267  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1268  {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
1269 };
1270 
1272  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1273 };
1274 
1276  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1277  {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
1278  /* <!DOCTYPE ...> */
1279  {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
1280  {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
1281  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1282 };
1283 
1285  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1286  {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
1287  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1288 };
1289 
1291  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1292  {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
1293  {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1294 };
1295 
1297  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1298  {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
1299  {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1300 };
1301 
1303  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1304  {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1305  {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
1306  {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1307 };
1308 
1310  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1311 };
1312 
1314  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1315  {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
1316  {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1317  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1318 };
1319 
1321  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1322  {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1323  {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1324  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1325  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1326  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1327  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1328  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1329 };
1330 
1332  {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1333  {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1334  {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1335  {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
1336  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1337  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1338  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1339  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1340  {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
1342  {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1343  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1344 };
1345 
1347  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1348  {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1349  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1350 };
1351 
1353  {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1354  {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1356  {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1357  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1358 };
1359 
1361  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1362  {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1363  {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1364  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1365 };
1366 
1368  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1369  {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1370  {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1371  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1372  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1373  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1374  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1375  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1376 };
1377 
1379  {p_isstophost, 0, A_POP, TPS_Null, 0, NULL},
1380  {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
1381  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1382 };
1383 
1385  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1386  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1387  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1388  {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1389  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1390  {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1391  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1392 };
1393 
1395  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1396  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1397  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1398  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1399  {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1400  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1401 };
1402 
1404  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1405  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1406  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1407  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1408  {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1409  {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1410  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1411 };
1412 
1414  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1415  {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1416  {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1417  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1418 };
1419 
1421  {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1422  {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
1423  {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1424  {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1425  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1426 };
1427 
1429  {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
1430  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1431  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1432  {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1433  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1434  {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1435  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1436  {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
1437 };
1438 
1440  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1441  {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1442  {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1443  {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1444  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1445 };
1446 
1448  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1449  {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1450  {NULL, 0, A_POP, TPS_Null, 0, NULL},
1451 };
1452 
1454  {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
1455 };
1456 
1458  {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
1459  {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1460  {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
1461 };
1462 
1464  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1466  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1467 };
1468 
1470  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1471  {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
1472  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1473 };
1474 
1476  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1477  {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1478  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1479 };
1480 
1482  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
1483 };
1484 
1486  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1487  {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1488  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1489  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1490  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1491 };
1492 
1495  {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1496  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1497  {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1498  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1499  {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1501 };
1502 
1504  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1505  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1506  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1507  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1508 };
1509 
1512  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1513  {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1514  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1515  {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1517 };
1518 
1520  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1521  {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1522  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1523  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1524 };
1525 
1528  {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1529  {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1530  {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1532 };
1533 
1535  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1536  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1537  {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1538  {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1539  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1540 };
1541 
1543  {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
1545  {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1546  {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
1547  {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
1548  {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
1549 };
1550 
1552  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1553  {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1555  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1556 };
1557 
1559  {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
1560  {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1561  {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1562  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1563  {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
1564 };
1565 
1567  {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
1569  {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1570  {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1571  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1572  {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
1573 };
1574 
1576  {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
1577  {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1578  {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1579  {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
1580 };
1581 
1583  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1584  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1585  {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1587  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1588 };
1589 
1590 
1591 /*
1592  * main table of per-state parser actions
1593  */
1594 typedef struct
1595 {
1596  const TParserStateActionItem *action; /* the actual state info */
1597  TParserState state; /* only for Assert crosscheck */
1598 #ifdef WPARSER_TRACE
1599  const char *state_name; /* only for debug printout */
1600 #endif
1602 
1603 #ifdef WPARSER_TRACE
1604 #define TPARSERSTATEACTION(state) \
1605  { CppConcat(action,state), state, CppAsString(state) }
1606 #else
1607 #define TPARSERSTATEACTION(state) \
1608  { CppConcat(action,state), state }
1609 #endif
1610 
1611 /*
1612  * order must be the same as in typedef enum {} TParserState!!
1613  */
1614 
1615 static const TParserStateAction Actions[] = {
1693 };
1694 
1695 
/*
 * Advance the parser state machine until it recognizes one token or runs
 * out of input.
 *
 * Starting at prs->state->posbyte, each iteration looks up the action list
 * of the current state in Actions[], picks the first item whose isclass
 * test accepts the current character (an item with a NULL isclass ends the
 * search and is applied unconditionally), runs the item's optional special
 * handler, and then applies the item's flags and target state.
 *
 * Returns false at once when the position is already at or past the end of
 * the string.  Otherwise returns true iff the loop stopped on an item with
 * A_BINGO set; in that case prs->token, prs->lenbytetoken,
 * prs->lenchartoken and prs->type describe the token that was found.
 */
1696 static bool
1698 {
1699  const TParserStateActionItem *item = NULL;
1700 
1701  Assert(prs->state);
1702 
1703  if (prs->state->posbyte >= prs->lenstr)
1704  return false;
1705 
      /* the token, if any, starts at the current byte position */
1706  prs->token = prs->str + prs->state->posbyte;
1707  prs->state->pushedAtAction = NULL;
1708 
1709  /* look at string */
1710  while (prs->state->posbyte <= prs->lenstr)
1711  {
      /*
       * charlen is 0 at end of input.  When charmaxlen is 1 it is always 1
       * and pg_mblen() is not consulted.
       */
1712  if (prs->state->posbyte == prs->lenstr)
1713  prs->state->charlen = 0;
1714  else
1715  prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
1716  pg_mblen(prs->str + prs->state->posbyte);
1717 
1718  Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
1719  Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
1720  Assert(Actions[prs->state->state].state == prs->state->state);
1721 
1722  if (prs->state->pushedAtAction)
1723  {
1724  /* After a POP, pick up at the next test */
1725  item = prs->state->pushedAtAction + 1;
1726  prs->state->pushedAtAction = NULL;
1727  }
1728  else
1729  {
1730  item = Actions[prs->state->state].action;
1731  Assert(item != NULL);
1732  }
1733 
1734  /* find action by character class */
      /* prs->c carries the item's comparison character to the isclass test */
1735  while (item->isclass)
1736  {
1737  prs->c = item->c;
1738  if (item->isclass(prs) != 0)
1739  break;
1740  item++;
1741  }
1742 
1743 #ifdef WPARSER_TRACE
1744  {
1745  TParserPosition *ptr;
1746 
1747  fprintf(stderr, "state ");
1748  /* indent according to stack depth */
1749  for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
1750  fprintf(stderr, " ");
1751  fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
1752  if (prs->state->posbyte < prs->lenstr)
1753  fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
1754  else
1755  fprintf(stderr, "at EOF");
1756  fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1757  (int) (item - Actions[prs->state->state].action),
1758  (item->flags & A_BINGO) ? " BINGO" : "",
1759  (item->flags & A_POP) ? " POP" : "",
1760  (item->flags & A_PUSH) ? " PUSH" : "",
1761  (item->flags & A_RERUN) ? " RERUN" : "",
1762  (item->flags & A_CLEAR) ? " CLEAR" : "",
1763  (item->flags & A_MERGE) ? " MERGE" : "",
1764  (item->flags & A_CLRALL) ? " CLRALL" : "",
1765  (item->tostate != TPS_Null) ? " tostate " : "",
1766  (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
1767  (item->type > 0) ? " type " : "",
1768  tok_alias[item->type]);
1769  }
1770 #endif
1771 
1772  /* call special handler if exists */
1773  if (item->special)
1774  item->special(prs);
1775 
1776  /* BINGO, token is found */
      /*
       * Publish the lengths accumulated in the current position and reset
       * them, so the next token starts counting from zero.
       */
1777  if (item->flags & A_BINGO)
1778  {
1779  Assert(item->type > 0);
1780  prs->lenbytetoken = prs->state->lenbytetoken;
1781  prs->lenchartoken = prs->state->lenchartoken;
1782  prs->state->lenbytetoken = prs->state->lenchartoken = 0;
1783  prs->type = item->type;
1784  }
1785 
1786  /* do various actions by flags */
      /*
       * The tests below form one else-if chain, so at most one of POP, PUSH,
       * CLEAR, CLRALL and MERGE takes effect per item, in that priority.
       */
1787  if (item->flags & A_POP)
1788  { /* pop stored state in stack */
1789  TParserPosition *ptr = prs->state->prev;
1790 
1791  pfree(prs->state);
1792  prs->state = ptr;
1793  Assert(prs->state);
1794  }
1795  else if (item->flags & A_PUSH)
1796  { /* push (store) state in stack */
1797  prs->state->pushedAtAction = item; /* remember where we push */
1798  prs->state = newTParserPosition(prs->state);
1799  }
1800  else if (item->flags & A_CLEAR)
1801  { /* clear previous pushed state */
1802  TParserPosition *ptr;
1803 
1804  Assert(prs->state->prev);
1805  ptr = prs->state->prev->prev;
1806  pfree(prs->state->prev);
1807  prs->state->prev = ptr;
1808  }
1809  else if (item->flags & A_CLRALL)
1810  { /* clear all previous pushed state */
1811  TParserPosition *ptr;
1812 
1813  while (prs->state->prev)
1814  {
1815  ptr = prs->state->prev->prev;
1816  pfree(prs->state->prev);
1817  prs->state->prev = ptr;
1818  }
1819  }
1820  else if (item->flags & A_MERGE)
1821  { /* merge posinfo with current and pushed state */
      /* drop the current entry but keep its position and length counters */
1822  TParserPosition *ptr = prs->state;
1823 
1824  Assert(prs->state->prev);
1825  prs->state = prs->state->prev;
1826 
1827  prs->state->posbyte = ptr->posbyte;
1828  prs->state->poschar = ptr->poschar;
1829  prs->state->charlen = ptr->charlen;
1830  prs->state->lenbytetoken = ptr->lenbytetoken;
1831  prs->state->lenchartoken = ptr->lenchartoken;
1832  pfree(ptr);
1833  }
1834 
1835  /* set new state if pointed */
      /* TPS_Null as target means "stay in whatever state is now current" */
1836  if (item->tostate != TPS_Null)
1837  prs->state->state = item->tostate;
1838 
1839  /* check for go away */
      /* stop on a token, or at end of input unless the item asks for RERUN */
1840  if ((item->flags & A_BINGO) ||
1841  (prs->state->posbyte >= prs->lenstr &&
1842  (item->flags & A_RERUN) == 0))
1843  break;
1844 
1845  /* go to beginning of loop if we should rerun or we just restore state */
1846  if (item->flags & (A_RERUN | A_POP))
1847  continue;
1848 
1849  /* move forward */
      /* position and token length are tracked in both bytes and characters */
1850  if (prs->state->charlen)
1851  {
1852  prs->state->posbyte += prs->state->charlen;
1853  prs->state->lenbytetoken += prs->state->charlen;
1854  prs->state->poschar++;
1855  prs->state->lenchartoken++;
1856  }
1857  }
1858 
1859  return (item && (item->flags & A_BINGO));
1860 }
1861 
/*
 * Build and return the list of token types as a palloc'd LexDescr array.
 *
 * The array has LASTNUM + 1 entries.  Entries 0 .. LASTNUM - 1 describe
 * token types 1 .. LASTNUM: lexid is the type number, alias and descr are
 * pstrdup'd copies of tok_alias[] and lex_descr[].  The final entry only has
 * lexid set to 0 (presumably the end-of-list marker expected by the caller);
 * its alias and descr fields are left unset.
 */
1862 Datum
1864 {
1865  LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
1866  int i;
1867 
      /* token type numbers are 1-based, array slots are 0-based */
1868  for (i = 1; i <= LASTNUM; i++)
1869  {
1870  descr[i - 1].lexid = i;
1871  descr[i - 1].alias = pstrdup(tok_alias[i]);
1872  descr[i - 1].descr = pstrdup(lex_descr[i]);
1873  }
1874 
1875  descr[LASTNUM].lexid = 0;
1876 
1877  PG_RETURN_POINTER(descr);
1878 }
1879 
1880 Datum
1882 {
1884 }
1885 
/*
 * Fetch the next token from the parser.
 *
 * Argument 0 is the TParser, argument 1 receives the token start pointer
 * and argument 2 receives the token length in bytes.  Returns the token
 * type, or 0 when TParserGet() finds no further token (the output arguments
 * are then left untouched).  The token pointer is p->token itself, not a
 * copy.
 */
1886 Datum
1888 {
1889  TParser *p = (TParser *) PG_GETARG_POINTER(0);
1890  char **t = (char **) PG_GETARG_POINTER(1);
1891  int *tlen = (int *) PG_GETARG_POINTER(2);
1892 
1893  if (!TParserGet(p))
1894  PG_RETURN_INT32(0);
1895 
1896  *t = p->token;
1897  *tlen = p->lenbytetoken;
1898 
1899  PG_RETURN_INT32(p->type);
1900 }
1901 
/*
 * Shut down the parser passed as argument 0 by handing it to TParserClose().
 * Returns void.
 */
1902 Datum
1904 {
1905  TParser *p = (TParser *) PG_GETARG_POINTER(0);
1906 
1907  TParserClose(p);
1908  PG_RETURN_VOID();
1909 }
1910 
1911 
1912 /*
1913  * ts_headline support begins here
1914  */
1915 
/*
 * Each macro below tests a token type number (one of the "Output token
 * categories" constants) and evaluates its argument more than once, so the
 * argument must be free of side effects.
 *
 * TS_IDIGNORE   - tag, protocol, blank or XML entity
 * HLIDREPLACE   - tag; mark_fragment() sets the word's "replace" flag for
 *                 these when highlightall is false
 * HLIDSKIP      - url or one of the three whole-hyphenated-word types;
 *                 mark_fragment() sets "skip" for these when highlightall
 *                 is false
 * XMLHLIDSKIP   - the set used instead of HLIDSKIP when highlightall is
 *                 true; currently the same set of types
 * NONWORDTOKEN  - blank, or anything matched by HLIDREPLACE or HLIDSKIP;
 *                 such tokens are not counted as words by
 *                 get_next_fragment()
 * NOENDTOKEN    - NONWORDTOKEN, any of the numeric types, or TS_IDIGNORE;
 *                 used by BADENDPOINT()
 */
1916 /* token type classification macros */
1917 #define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
1918 #define HLIDREPLACE(x) ( (x)==TAG_T )
1919 #define HLIDSKIP(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
1920 #define XMLHLIDSKIP(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
1921 #define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )
1922 #define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
1923 
1924 /*
1925  * Macros useful in headline selection. These rely on availability of
1926  * "HeadlineParsedText *prs" describing some text, and "int shortword"
1927  * describing the "short word" length parameter.
 *
 * The argument j is an index into prs->words[].  Both macros expand j
 * several times, so pass an expression without side effects.
1928  */
1929 
1930 /* Interesting words are non-repeated search terms */
/* (a word counts as a search term when its "item" pointer is set) */
1931 #define INTERESTINGWORD(j) \
1932  (prs->words[j].item && !prs->words[j].repeated)
1933 
1934 /* Don't want to end at a non-word or a short word, unless interesting */
/* ("short" means len <= shortword; "non-word" is decided by NOENDTOKEN) */
1935 #define BADENDPOINT(j) \
1936  ((NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword) && \
1937  !INTERESTINGWORD(j))
1938 
/*
 * Bookkeeping for one candidate headline fragment.
 *
 * mark_hl_fragments() fills startpos/endpos/curlen/poslen from the outputs
 * of get_next_fragment() and initializes both flags to false.  curlen counts
 * only tokens for which NONWORDTOKEN() is false; poslen counts words for
 * which INTERESTINGWORD() is true.
 */
1939 typedef struct
1940 {
1941  /* one cover (well, really one fragment) for mark_hl_fragments */
1942  int32 startpos; /* fragment's starting word index */
1943  int32 endpos; /* ending word index (inclusive) */
1944  int32 poslen; /* number of interesting words */
1945  int32 curlen; /* total number of words */
1946  bool chosen; /* chosen? */
1947  bool excluded; /* excluded? */
1948 } CoverPos;
1949 
/*
 * Describes the slice of headline words that checkcondition_HL() scans.
 * hlCover() points "words" at the first word of the candidate range and
 * sets "len" to the number of words in that range.
 */
1950 typedef struct
1951 {
1952  /* callback data for checkcondition_HL */
1954  int len; /* number of entries to scan, starting at words[0] */
1955 } hlCheck;
1956 
1957 
1958 /*
1959  * TS_execute callback for matching a tsquery operand to headline words
1960  *
1961  * Note: it's tempting to report words[] indexes as pos values to save
1962  * searching in hlCover; but that would screw up phrase matching, which
1963  * expects to measure distances in lexemes not tokens.
 *
 * opaque is the hlCheck describing the word range to scan, val is the
 * query operand to look for, and data (may be NULL) receives the lexeme
 * positions of the matching words.
 *
 * Returns TS_YES if at least one word in the range has item == val,
 * otherwise TS_NO.  With data == NULL the scan stops at the first match.
1964  */
1965 static TSTernaryValue
1967 {
1968  hlCheck *checkval = (hlCheck *) opaque;
1969  int i;
1970 
1971  /* scan words array for matching items */
1972  for (i = 0; i < checkval->len; i++)
1973  {
1974  if (checkval->words[i].item == val)
1975  {
1976  /* if data == NULL, don't need to report positions */
1977  if (!data)
1978  return TS_YES;
1979 
1980  if (!data->pos)
1981  {
      /* first match: size the array for the case that every word matches */
1982  data->pos = palloc(sizeof(WordEntryPos) * checkval->len);
1983  data->allocated = true;
1984  data->npos = 1;
1985  data->pos[0] = checkval->words[i].pos;
1986  }
1987  else if (data->pos[data->npos - 1] < checkval->words[i].pos)
1988  {
      /* append only strictly larger positions, so repeats are dropped */
1989  data->pos[data->npos++] = checkval->words[i].pos;
1990  }
1991  }
1992  }
1993 
      /*
       * NOTE(review): this relies on data->npos being 0 on entry when no
       * match is recorded -- confirm the caller initializes it.
       */
1994  if (data && data->npos > 0)
1995  return TS_YES;
1996 
1997  return TS_NO;
1998 }
1999 
2000 /*
2001  * hlCover: try to find a substring of prs' word list that satisfies query
2002  *
2003  * locations is the result of TS_execute_locations() for the query.
2004  * We use this to identify plausible subranges of the query.
2005  *
2006  * *nextpos is the lexeme position (NOT word index) to start the search
2007  * at. Caller should initialize this to zero. If successful, we'll
2008  * advance it to the next place to search at.
2009  *
2010  * On success, sets *p to first word index and *q to last word index of the
2011  * cover substring, and returns true.
2012  *
2013  * The result is a minimal cover, in the sense that both *p and *q will be
2014  * words used in the query.
 *
 * Returns false, leaving *nextpos, *p and *q untouched, when locations is
 * empty or when some entry of locations has no position at or after the
 * current search position.
2015  */
2016 static bool
2017 hlCover(HeadlineParsedText *prs, TSQuery query, List *locations,
2018  int *nextpos, int *p, int *q)
2019 {
2020  int pos = *nextpos;
2021 
2022  /* This loop repeats when our selected word-range fails the query */
2023  for (;;)
2024  {
2025  int posb,
2026  pose;
2027  ListCell *lc;
2028 
2029  /*
2030  * For each AND'ed query term or phrase, find its first occurrence at
2031  * or after pos; set pose to the maximum of those positions.
2032  *
2033  * We need not consider ORs or NOTs here; see the comments for
2034  * TS_execute_locations(). Rechecking the match with TS_execute(),
2035  * below, will deal with any ensuing imprecision.
2036  */
2037  pose = -1;
2038  foreach(lc, locations)
2039  {
2040  ExecPhraseData *pdata = (ExecPhraseData *) lfirst(lc);
2041  int first = -1;
2042 
      /* taking the first hit assumes pos[] is in ascending order -- TODO confirm */
2043  for (int i = 0; i < pdata->npos; i++)
2044  {
2045  /* For phrase matches, use the ending lexeme */
2046  int endp = pdata->pos[i];
2047 
2048  if (endp >= pos)
2049  {
2050  first = endp;
2051  break;
2052  }
2053  }
2054  if (first < 0)
2055  return false; /* no more matches for this term */
2056  if (first > pose)
2057  pose = first;
2058  }
2059 
2060  if (pose < 0)
2061  return false; /* we only get here if empty list */
2062 
2063  /*
2064  * Now, for each AND'ed query term or phrase, find its last occurrence
2065  * at or before pose; set posb to the minimum of those positions.
2066  *
2067  * We start posb at INT_MAX - 1 to guarantee no overflow if we compute
2068  * posb + 1 below.
2069  */
2070  posb = INT_MAX - 1;
2071  foreach(lc, locations)
2072  {
2073  ExecPhraseData *pdata = (ExecPhraseData *) lfirst(lc);
2074  int last = -1;
2075 
2076  for (int i = pdata->npos - 1; i >= 0; i--)
2077  {
2078  /* For phrase matches, use the starting lexeme */
2079  int startp = pdata->pos[i] - pdata->width;
2080 
2081  if (startp <= pose)
2082  {
2083  last = startp;
2084  break;
2085  }
2086  }
      /*
       * If no occurrence qualified, last is still -1 and becomes posb; the
       * Max() below then raises posb back to at least pos.
       */
2087  if (last < posb)
2088  posb = last;
2089  }
2090 
2091  /*
2092  * We could end up with posb to the left of pos, in case some phrase
2093  * match crosses pos. Try the match starting at pos anyway, since the
2094  * result of TS_execute_locations is imprecise for phrase matches OR'd
2095  * with plain matches; that is, if the query is "(A <-> B) | C" then C
2096  * could match at pos even though the phrase match would have to
2097  * extend to the left of pos.
2098  */
2099  posb = Max(posb, pos);
2100 
2101  /* This test probably always succeeds, but be paranoid */
2102  if (posb <= pose)
2103  {
2104  /*
2105  * posb .. pose is now the shortest, earliest-after-pos range of
2106  * lexeme positions containing all the query terms. It will
2107  * contain all phrase matches, too, except in the corner case
2108  * described just above.
2109  *
2110  * Now convert these lexeme positions to indexes in prs->words[].
2111  */
2112  int idxb = -1;
2113  int idxe = -1;
2114 
      /*
       * Only words with a non-NULL item are considered.  idxb becomes the
       * first such word with pos >= posb, idxe the last one with
       * pos <= pose.  Stopping at the first word beyond pose assumes that
       * positions in words[] never decrease -- TODO confirm.
       */
2115  for (int i = 0; i < prs->curwords; i++)
2116  {
2117  if (prs->words[i].item == NULL)
2118  continue;
2119  if (idxb < 0 && prs->words[i].pos >= posb)
2120  idxb = i;
2121  if (prs->words[i].pos <= pose)
2122  idxe = i;
2123  else
2124  break;
2125  }
2126 
2127  /* This test probably always succeeds, but be paranoid */
2128  if (idxb >= 0 && idxe >= idxb)
2129  {
2130  /*
2131  * Finally, check that the selected range satisfies the query.
2132  * This should succeed in all simple cases; but odd cases
2133  * involving non-top-level NOT conditions or phrase matches
2134  * OR'd with other things could fail, since the result of
2135  * TS_execute_locations doesn't fully represent such things.
2136  */
2137  hlCheck ch;
2138 
      /* restrict the recheck to exactly the words idxb .. idxe */
2139  ch.words = &(prs->words[idxb]);
2140  ch.len = idxe - idxb + 1;
2141  if (TS_execute(GETQUERY(query), &ch,
2143  {
2144  /* Match! Advance *nextpos and return the word range. */
2145  *nextpos = posb + 1;
2146  *p = idxb;
2147  *q = idxe;
2148  return true;
2149  }
2150  }
2151  }
2152 
2153  /*
2154  * Advance pos and try again. Any later workable match must start
2155  * beyond posb.
2156  */
2157  pos = posb + 1;
2158  }
2159  /* Can't get here, but stupider compilers complain if we leave it off */
2160  return false;
2161 }
2162 
2163 /*
2164  * Apply suitable highlight marking to words selected by headline selector
2165  *
2166  * The words from startpos to endpos inclusive are marked per highlightall
2167  */
2168 static void
2169 mark_fragment(HeadlineParsedText *prs, bool highlightall,
2170  int startpos, int endpos)
2171 {
2172  int i;
2173 
2174  for (i = startpos; i <= endpos; i++)
2175  {
2176  if (prs->words[i].item)
2177  prs->words[i].selected = 1;
2178  if (!highlightall)
2179  {
2180  if (HLIDREPLACE(prs->words[i].type))
2181  prs->words[i].replace = 1;
2182  else if (HLIDSKIP(prs->words[i].type))
2183  prs->words[i].skip = 1;
2184  }
2185  else
2186  {
2187  if (XMLHLIDSKIP(prs->words[i].type))
2188  prs->words[i].skip = 1;
2189  }
2190 
2191  prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
2192  }
2193 }
2194 
2195 /*
2196  * split a cover substring into fragments not longer than max_words
2197  *
2198  * At entry, *startpos and *endpos are the (remaining) bounds of the cover
2199  * substring. They are updated to hold the bounds of the next fragment.
2200  *
2201  * *curlen and *poslen are set to the fragment's length, in words and
2202  * interesting words respectively.
 *
 * "Words" here means tokens for which NONWORDTOKEN() is false; "interesting
 * words" are those for which INTERESTINGWORD() is true.
2203  */
2204 static void
2206  int *curlen, int *poslen, int max_words)
2207 {
2208  int i;
2209 
2210  /*
2211  * Objective: select a fragment of words between startpos and endpos such
2212  * that it has at most max_words and both ends have query words. If the
2213  * startpos and endpos are the endpoints of the cover and the cover has
2214  * fewer words than max_words, then this function should just return the
2215  * cover
2216  */
2217  /* first move startpos to an item */
      /* if no interesting word is found, *startpos ends up equal to *endpos */
2218  for (i = *startpos; i <= *endpos; i++)
2219  {
2220  *startpos = i;
2221  if (INTERESTINGWORD(i))
2222  break;
2223  }
2224  /* cut endpos to have only max_words */
2225  *curlen = 0;
2226  *poslen = 0;
      /* on exit, i is one past the last word that was counted */
2227  for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
2228  {
2229  if (!NONWORDTOKEN(prs->words[i].type))
2230  *curlen += 1;
2231  if (INTERESTINGWORD(i))
2232  *poslen += 1;
2233  }
2234  /* if the cover was cut then move back endpos to a query item */
2235  if (*endpos > i)
2236  {
      /*
       * NOTE(review): the backward scan starts at index i, a word the
       * counting loop above did not include.  If that word is interesting
       * it becomes the fragment end without being added to *curlen or
       * *poslen; if it is an ordinary word, *curlen is decremented for it
       * anyway.  Verify that this bookkeeping is intended.
       */
2237  *endpos = i;
2238  for (i = *endpos; i >= *startpos; i--)
2239  {
2240  *endpos = i;
2241  if (INTERESTINGWORD(i))
2242  break;
2243  if (!NONWORDTOKEN(prs->words[i].type))
2244  *curlen -= 1;
2245  }
2246  }
2247 }
2248 
2249 /*
2250  * Headline selector used when MaxFragments > 0
2251  *
2252  * Note: in this mode, highlightall is disregarded for phrase selection;
2253  * it only controls presentation details.
2254  */
2255 static void
2257  bool highlightall,
2258  int shortword, int min_words,
2259  int max_words, int max_fragments)
2260 {
2261  int32 poslen,
2262  curlen,
2263  i,
2264  f,
2265  num_f = 0;
2266  int32 stretch,
2267  maxstretch,
2268  posmarker;
2269 
2270  int32 startpos = 0,
2271  endpos = 0,
2272  nextpos = 0,
2273  p = 0,
2274  q = 0;
2275 
2276  int32 numcovers = 0,
2277  maxcovers = 32;
2278 
2279  int32 minI,
2280  minwords,
2281  maxitems;
2282  CoverPos *covers;
2283 
2284  covers = palloc(maxcovers * sizeof(CoverPos));
2285 
2286  /* get all covers */
2287  while (hlCover(prs, query, locations, &nextpos, &p, &q))
2288  {
2289  startpos = p;
2290  endpos = q;
2291 
2292  /*
2293  * Break the cover into smaller fragments such that each fragment has
2294  * at most max_words. Also ensure that each end of each fragment is a
2295  * query word. This will allow us to stretch the fragment in either
2296  * direction
2297  */
2298 
2299  while (startpos <= endpos)
2300  {
2301  get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
2302  if (numcovers >= maxcovers)
2303  {
2304  maxcovers *= 2;
2305  covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
2306  }
2307  covers[numcovers].startpos = startpos;
2308  covers[numcovers].endpos = endpos;
2309  covers[numcovers].curlen = curlen;
2310  covers[numcovers].poslen = poslen;
2311  covers[numcovers].chosen = false;
2312  covers[numcovers].excluded = false;
2313  numcovers++;
2314  startpos = endpos + 1;
2315  endpos = q;
2316  }
2317  }
2318 
2319  /* choose best covers */
2320  for (f = 0; f < max_fragments; f++)
2321  {
2322  maxitems = 0;
2323  minwords = PG_INT32_MAX;
2324  minI = -1;
2325 
2326  /*
2327  * Choose the cover that contains max items. In case of tie choose the
2328  * one with smaller number of words.
2329  */
2330  for (i = 0; i < numcovers; i++)
2331  {
2332  if (!covers[i].chosen && !covers[i].excluded &&
2333  (maxitems < covers[i].poslen ||
2334  (maxitems == covers[i].poslen &&
2335  minwords > covers[i].curlen)))
2336  {
2337  maxitems = covers[i].poslen;
2338  minwords = covers[i].curlen;
2339  minI = i;
2340  }
2341  }
2342  /* if a cover was found mark it */
2343  if (minI >= 0)
2344  {
2345  covers[minI].chosen = true;
2346  /* adjust the size of cover */
2347  startpos = covers[minI].startpos;
2348  endpos = covers[minI].endpos;
2349  curlen = covers[minI].curlen;
2350  /* stretch the cover if cover size is lower than max_words */
2351  if (curlen < max_words)
2352  {
2353  /* divide the stretch on both sides of cover */
2354  maxstretch = (max_words - curlen) / 2;
2355 
2356  /*
2357  * first stretch the startpos stop stretching if 1. we hit the
2358  * beginning of document 2. exceed maxstretch 3. we hit an
2359  * already marked fragment
2360  */
2361  stretch = 0;
2362  posmarker = startpos;
2363  for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
2364  {
2365  if (!NONWORDTOKEN(prs->words[i].type))
2366  {
2367  curlen++;
2368  stretch++;
2369  }
2370  posmarker = i;
2371  }
2372  /* cut back startpos till we find a good endpoint */
2373  for (i = posmarker; i < startpos && BADENDPOINT(i); i++)
2374  {
2375  if (!NONWORDTOKEN(prs->words[i].type))
2376  curlen--;
2377  }
2378  startpos = i;
2379  /* now stretch the endpos as much as possible */
2380  posmarker = endpos;
2381  for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
2382  {
2383  if (!NONWORDTOKEN(prs->words[i].type))
2384  curlen++;
2385  posmarker = i;
2386  }
2387  /* cut back endpos till we find a good endpoint */
2388  for (i = posmarker; i > endpos && BADENDPOINT(i); i--)
2389  {
2390  if (!NONWORDTOKEN(prs->words[i].type))
2391  curlen--;
2392  }
2393  endpos = i;
2394  }
2395  covers[minI].startpos = startpos;
2396  covers[minI].endpos = endpos;
2397  covers[minI].curlen = curlen;
2398  /* Mark the chosen fragments (covers) */
2399  mark_fragment(prs, highlightall, startpos, endpos);
2400  num_f++;
2401  /* Exclude covers overlapping this one from future consideration */
2402  for (i = 0; i < numcovers; i++)
2403  {
2404  if (i != minI &&
2405  ((covers[i].startpos >= startpos &&
2406  covers[i].startpos <= endpos) ||
2407  (covers[i].endpos >= startpos &&
2408  covers[i].endpos <= endpos) ||
2409  (covers[i].startpos < startpos &&
2410  covers[i].endpos > endpos)))
2411  covers[i].excluded = true;
2412  }
2413  }
2414  else
2415  break; /* no selectable covers remain */
2416  }
2417 
2418  /* show the first min_words words if we have not marked anything */
2419  if (num_f <= 0)
2420  {
2421  startpos = endpos = curlen = 0;
2422  for (i = 0; i < prs->curwords && curlen < min_words; i++)
2423  {
2424  if (!NONWORDTOKEN(prs->words[i].type))
2425  curlen++;
2426  endpos = i;
2427  }
2428  mark_fragment(prs, highlightall, startpos, endpos);
2429  }
2430 
2431  pfree(covers);
2432 }
2433 
2434 /*
2435  * Headline selector used when MaxFragments == 0
2436  */
2437 static void
2439  bool highlightall,
2440  int shortword, int min_words, int max_words)
2441 {
2442  int nextpos = 0,
2443  p = 0,
2444  q = 0;
2445  int bestb = -1,
2446  beste = -1;
2447  int bestlen = -1;
2448  bool bestcover = false;
2449  int pose,
2450  posb,
2451  poslen,
2452  curlen;
2453  bool poscover;
2454  int i;
2455 
2456  if (!highlightall)
2457  {
2458  /* examine all covers, select a headline using the best one */
2459  while (hlCover(prs, query, locations, &nextpos, &p, &q))
2460  {
2461  /*
2462  * Count words (curlen) and interesting words (poslen) within
2463  * cover, but stop once we reach max_words. This step doesn't
2464  * consider whether that's a good stopping point. posb and pose
2465  * are set to the start and end indexes of the possible headline.
2466  */
2467  curlen = 0;
2468  poslen = 0;
2469  posb = pose = p;
2470  for (i = p; i <= q && curlen < max_words; i++)
2471  {
2472  if (!NONWORDTOKEN(prs->words[i].type))
2473  curlen++;
2474  if (INTERESTINGWORD(i))
2475  poslen++;
2476  pose = i;
2477  }
2478 
2479  if (curlen < max_words)
2480  {
2481  /*
2482  * We have room to lengthen the headline, so search forward
2483  * until it's full or we find a good stopping point. We'll
2484  * reconsider the word at "q", then move forward.
2485  */
2486  for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
2487  {
2488  if (i > q)
2489  {
2490  if (!NONWORDTOKEN(prs->words[i].type))
2491  curlen++;
2492  if (INTERESTINGWORD(i))
2493  poslen++;
2494  }
2495  pose = i;
2496  if (BADENDPOINT(i))
2497  continue;
2498  if (curlen >= min_words)
2499  break;
2500  }
2501  if (curlen < min_words)
2502  {
2503  /*
2504  * Reached end of text and our headline is still shorter
2505  * than min_words, so try to extend it to the left.
2506  */
2507  for (i = p - 1; i >= 0; i--)
2508  {
2509  if (!NONWORDTOKEN(prs->words[i].type))
2510  curlen++;
2511  if (INTERESTINGWORD(i))
2512  poslen++;
2513  if (curlen >= max_words)
2514  break;
2515  if (BADENDPOINT(i))
2516  continue;
2517  if (curlen >= min_words)
2518  break;
2519  }
2520  posb = (i >= 0) ? i : 0;
2521  }
2522  }
2523  else
2524  {
2525  /*
2526  * Can't make headline longer, so consider making it shorter
2527  * if needed to avoid a bad endpoint.
2528  */
2529  if (i > q)
2530  i = q;
2531  for (; curlen > min_words; i--)
2532  {
2533  if (!BADENDPOINT(i))
2534  break;
2535  if (!NONWORDTOKEN(prs->words[i].type))
2536  curlen--;
2537  if (INTERESTINGWORD(i))
2538  poslen--;
2539  pose = i - 1;
2540  }
2541  }
2542 
2543  /*
2544  * Check whether the proposed headline includes the original
2545  * cover; it might not if we trimmed it due to max_words.
2546  */
2547  poscover = (posb <= p && pose >= q);
2548 
2549  /*
2550  * Adopt this headline if it's better than the last one, giving
2551  * highest priority to headlines including the cover, then to
2552  * headlines with more interesting words, then to headlines with
2553  * good stopping points. (Since bestlen is initially -1, we will
2554  * certainly adopt the first headline.)
2555  */
2556  if (poscover > bestcover ||
2557  (poscover == bestcover && poslen > bestlen) ||
2558  (poscover == bestcover && poslen == bestlen &&
2559  !BADENDPOINT(pose) && BADENDPOINT(beste)))
2560  {
2561  bestb = posb;
2562  beste = pose;
2563  bestlen = poslen;
2564  bestcover = poscover;
2565  }
2566  }
2567 
2568  /*
2569  * If we found nothing acceptable, select min_words words starting at
2570  * the beginning.
2571  */
2572  if (bestlen < 0)
2573  {
2574  curlen = 0;
2575  pose = 0;
2576  for (i = 0; i < prs->curwords && curlen < min_words; i++)
2577  {
2578  if (!NONWORDTOKEN(prs->words[i].type))
2579  curlen++;
2580  pose = i;
2581  }
2582  bestb = 0;
2583  beste = pose;
2584  }
2585  }
2586  else
2587  {
2588  /* highlightall mode: headline is whole document */
2589  bestb = 0;
2590  beste = prs->curwords - 1;
2591  }
2592 
2593  mark_fragment(prs, highlightall, bestb, beste);
2594 }
2595 
2596 /*
2597  * Default parser's prsheadline function
2598  */
2599 Datum
2601 {
2603  List *prsoptions = (List *) PG_GETARG_POINTER(1);
2604  TSQuery query = PG_GETARG_TSQUERY(2);
2605  hlCheck ch;
2606  List *locations;
2607 
2608  /* default option values: */
2609  int min_words = 15;
2610  int max_words = 35;
2611  int shortword = 3;
2612  int max_fragments = 0;
2613  bool highlightall = false;
2614  ListCell *l;
2615 
2616  /* Extract configuration option values */
2617  prs->startsel = NULL;
2618  prs->stopsel = NULL;
2619  prs->fragdelim = NULL;
2620  foreach(l, prsoptions)
2621  {
2622  DefElem *defel = (DefElem *) lfirst(l);
2623  char *val = defGetString(defel);
2624 
2625  if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
2626  max_words = pg_strtoint32(val);
2627  else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
2628  min_words = pg_strtoint32(val);
2629  else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
2630  shortword = pg_strtoint32(val);
2631  else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
2632  max_fragments = pg_strtoint32(val);
2633  else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
2634  prs->startsel = pstrdup(val);
2635  else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
2636  prs->stopsel = pstrdup(val);
2637  else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
2638  prs->fragdelim = pstrdup(val);
2639  else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
2640  highlightall = (pg_strcasecmp(val, "1") == 0 ||
2641  pg_strcasecmp(val, "on") == 0 ||
2642  pg_strcasecmp(val, "true") == 0 ||
2643  pg_strcasecmp(val, "t") == 0 ||
2644  pg_strcasecmp(val, "y") == 0 ||
2645  pg_strcasecmp(val, "yes") == 0);
2646  else
2647  ereport(ERROR,
2648  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2649  errmsg("unrecognized headline parameter: \"%s\"",
2650  defel->defname)));
2651  }
2652 
2653  /* in HighlightAll mode these parameters are ignored */
2654  if (!highlightall)
2655  {
2656  if (min_words >= max_words)
2657  ereport(ERROR,
2658  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2659  errmsg("MinWords should be less than MaxWords")));
2660  if (min_words <= 0)
2661  ereport(ERROR,
2662  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2663  errmsg("MinWords should be positive")));
2664  if (shortword < 0)
2665  ereport(ERROR,
2666  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2667  errmsg("ShortWord should be >= 0")));
2668  if (max_fragments < 0)
2669  ereport(ERROR,
2670  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2671  errmsg("MaxFragments should be >= 0")));
2672  }
2673 
2674  /* Locate words and phrases matching the query */
2675  ch.words = prs->words;
2676  ch.len = prs->curwords;
2677  locations = TS_execute_locations(GETQUERY(query), &ch, TS_EXEC_EMPTY,
2679 
2680  /* Apply appropriate headline selector */
2681  if (max_fragments == 0)
2682  mark_hl_words(prs, query, locations, highlightall, shortword,
2683  min_words, max_words);
2684  else
2685  mark_hl_fragments(prs, query, locations, highlightall, shortword,
2686  min_words, max_words, max_fragments);
2687 
2688  /* Fill in default values for string options */
2689  if (!prs->startsel)
2690  prs->startsel = pstrdup("<b>");
2691  if (!prs->stopsel)
2692  prs->stopsel = pstrdup("</b>");
2693  if (!prs->fragdelim)
2694  prs->fragdelim = pstrdup(" ... ");
2695 
2696  /* Caller will need these lengths, too */
2697  prs->startsellen = strlen(prs->startsel);
2698  prs->stopsellen = strlen(prs->stopsel);
2699  prs->fragdelimlen = strlen(prs->fragdelim);
2700 
2701  PG_RETURN_POINTER(prs);
2702 }
#define GETQUERY(x)
Definition: _int.h:157
void print(const void *obj)
Definition: print.c:36
unsigned short uint16
Definition: c.h:489
#define PG_INT32_MAX
Definition: c.h:573
signed int int32
Definition: c.h:478
#define Max(x, y)
Definition: c.h:982
#define lengthof(array)
Definition: c.h:772
char * defGetString(DefElem *def)
Definition: define.c:49
int errcode(int sqlerrcode)
Definition: elog.c:858
int errmsg(const char *fmt,...)
Definition: elog.c:1069
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
#define PG_RETURN_VOID()
Definition: fmgr.h:349
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276
#define PG_RETURN_INT32(x)
Definition: fmgr.h:354
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
long val
Definition: informix.c:664
int i
Definition: isn.c:73
Assert(fmt[strlen(fmt) - 1] !='\n')
unsigned int pg_wchar
Definition: mbprint.c:31
int GetDatabaseEncoding(void)
Definition: mbutils.c:1268
int pg_dsplen(const char *mbstr)
Definition: mbutils.c:1031
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1553
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
Definition: mbutils.c:987
int pg_mblen(const char *mbstr)
Definition: mbutils.c:1024
char * pstrdup(const char *in)
Definition: mcxt.c:1624
void pfree(void *pointer)
Definition: mcxt.c:1436
void * palloc0(Size size)
Definition: mcxt.c:1241
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1456
void * palloc(Size size)
Definition: mcxt.c:1210
int32 pg_strtoint32(const char *s)
Definition: numutils.c:240
Datum lower(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:49
Datum upper(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:80
const void size_t len
const void * data
#define lfirst(lc)
Definition: pg_list.h:172
bool lc_ctype_is_c(Oid collation)
Definition: pg_locale.c:1352
size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, pg_locale_t locale)
Definition: pg_locale.c:2061
static XLogRecPtr endpos
Definition: pg_receivewal.c:56
static XLogRecPtr startpos
@ PG_UTF8
Definition: pg_wchar.h:232
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
#define fprintf
Definition: port.h:242
int pg_strncasecmp(const char *s1, const char *s2, size_t n)
Definition: pgstrcasecmp.c:69
uintptr_t Datum
Definition: postgres.h:64
unsigned int Oid
Definition: postgres_ext.h:31
char * c
bool chosen
Definition: wparser_def.c:1946
int32 endpos
Definition: wparser_def.c:1943
int32 curlen
Definition: wparser_def.c:1945
int32 startpos
Definition: wparser_def.c:1942
bool excluded
Definition: wparser_def.c:1947
int32 poslen
Definition: wparser_def.c:1944
char * defname
Definition: parsenodes.h:810
WordEntryPos * pos
Definition: ts_utils.h:166
HeadlineWordEntry * words
Definition: ts_public.h:76
WordEntryPos pos
Definition: ts_public.h:68
QueryOperand * item
Definition: ts_public.h:70
char * alias
Definition: ts_public.h:28
int lexid
Definition: ts_public.h:27
char * descr
Definition: ts_public.h:29
Definition: pg_list.h:54
const TParserStateActionItem * pushedAtAction
Definition: wparser_def.c:236
struct TParserPosition * prev
Definition: wparser_def.c:235
TParserState state
Definition: wparser_def.c:234
TParserCharTest isclass
Definition: wparser_def.c:209
TParserState tostate
Definition: wparser_def.c:212
TParserSpecial special
Definition: wparser_def.c:214
const TParserStateActionItem * action
Definition: wparser_def.c:1596
TParserState state
Definition: wparser_def.c:1597
char * str
Definition: wparser_def.c:242
pg_wchar * pgwstr
Definition: wparser_def.c:245
wchar_t * wstr
Definition: wparser_def.c:244
int lenstr
Definition: wparser_def.c:243
char * token
Definition: wparser_def.c:258
int type
Definition: wparser_def.c:261
int charmaxlen
Definition: wparser_def.c:249
bool wanthost
Definition: wparser_def.c:252
int lenbytetoken
Definition: wparser_def.c:259
bool ignore
Definition: wparser_def.c:251
TParserPosition * state
Definition: wparser_def.c:250
int lenchartoken
Definition: wparser_def.c:260
char c
Definition: wparser_def.c:255
bool usewide
Definition: wparser_def.c:246
HeadlineWordEntry * words
Definition: wparser_def.c:1953
#define PG_GETARG_TSQUERY(n)
Definition: ts_type.h:266
uint16 WordEntryPos
Definition: ts_type.h:63
TSTernaryValue
Definition: ts_utils.h:133
@ TS_NO
Definition: ts_utils.h:134
@ TS_YES
Definition: ts_utils.h:135
#define TS_EXEC_EMPTY
Definition: ts_utils.h:188
bool TS_execute(QueryItem *curitem, void *arg, uint32 flags, TSExecuteCallback chkcond)
Definition: tsvector_op.c:1856
List * TS_execute_locations(QueryItem *curitem, void *arg, uint32 flags, TSExecuteCallback chkcond)
Definition: tsvector_op.c:2009
static const TParserStateActionItem actionTPS_InParseHyphen[]
Definition: wparser_def.c:1542
static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[]
Definition: wparser_def.c:1153
static const TParserStateActionItem actionTPS_InHyphenWordFirst[]
Definition: wparser_def.c:1503
#define NONWORDTOKEN(x)
Definition: wparser_def.c:1921
static const TParserStateActionItem actionTPS_InXMLEntityFirst[]
Definition: wparser_def.c:1133
static const TParserStateActionItem actionTPS_InHostFirstAN[]
Definition: wparser_def.c:1360
#define VERSIONNUMBER
Definition: wparser_def.c:41
static const TParserStateActionItem actionTPS_InHyphenNumWordPart[]
Definition: wparser_def.c:1575
#define BADENDPOINT(j)
Definition: wparser_def.c:1935
#define ASCIIWORD
Definition: wparser_def.c:34
#define PROTOCOL
Definition: wparser_def.c:47
static const TParserStateActionItem actionTPS_InPathSecond[]
Definition: wparser_def.c:1420
static const TParserStateActionItem actionTPS_InPathFirst[]
Definition: wparser_def.c:1403
static const TParserStateActionItem actionTPS_InHostDomainSecond[]
Definition: wparser_def.c:1320
static const TParserStateActionItem actionTPS_InCloseCommentFirst[]
Definition: wparser_def.c:1296
static void SpecialFURL(TParser *prs)
Definition: wparser_def.c:587
static const TParserStateActionItem actionTPS_InCommentEnd[]
Definition: wparser_def.c:1309
struct TParser TParser
static TSTernaryValue checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
Definition: wparser_def.c:1966
void _make_compiler_happy(void)
Definition: wparser_def.c:536
static const TParserStateActionItem actionTPS_InURLPathStart[]
Definition: wparser_def.c:1453
static const TParserStateActionItem actionTPS_InHostFirstDomain[]
Definition: wparser_def.c:1313
static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[]
Definition: wparser_def.c:1534
static const TParserStateActionItem actionTPS_InHostDomain[]
Definition: wparser_def.c:1331
static const TParserStateActionItem actionTPS_InVersion[]
Definition: wparser_def.c:1106
#define XMLHLIDSKIP(x)
Definition: wparser_def.c:1920
static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[]
Definition: wparser_def.c:1485
Datum prsd_nexttoken(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1887
static const TParserStateActionItem actionTPS_InTagName[]
Definition: wparser_def.c:1210
#define DECIMAL_T
Definition: wparser_def.c:53
static const TParserStateActionItem actionTPS_InFileNext[]
Definition: wparser_def.c:1439
static const TParserStateActionItem actionTPS_InXMLEntity[]
Definition: wparser_def.c:1142
#define ASCIIPARTHWORD
Definition: wparser_def.c:44
static const TParserStateActionItem actionTPS_InFURL[]
Definition: wparser_def.c:1463
#define p_iswhat(type, nonascii)
Definition: wparser_def.c:423
static const TParserStateActionItem actionTPS_InMantissaSign[]
Definition: wparser_def.c:1121
static void mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, List *locations, bool highlightall, int shortword, int min_words, int max_words, int max_fragments)
Definition: wparser_def.c:2256
#define WORD_T
Definition: wparser_def.c:35
TParserState
Definition: wparser_def.c:118
@ TPS_InXMLEntityHexNumFirst
Definition: wparser_def.c:142
@ TPS_InPort
Definition: wparser_def.c:165
@ TPS_InXMLEntityHexNum
Definition: wparser_def.c:143
@ TPS_InHostDomainSecond
Definition: wparser_def.c:162
@ TPS_InMantissaFirst
Definition: wparser_def.c:135
@ TPS_InTagName
Definition: wparser_def.c:148
@ TPS_InHyphenAsciiWordFirst
Definition: wparser_def.c:183
@ TPS_Null
Definition: wparser_def.c:196
@ TPS_InPathFirstFirst
Definition: wparser_def.c:172
@ TPS_InSignedIntFirst
Definition: wparser_def.c:124
@ TPS_InSignedInt
Definition: wparser_def.c:125
@ TPS_InUnsignedInt
Definition: wparser_def.c:123
@ TPS_InMantissa
Definition: wparser_def.c:137
@ TPS_InProtocolFirst
Definition: wparser_def.c:180
@ TPS_InFURL
Definition: wparser_def.c:179
@ TPS_InMantissaSign
Definition: wparser_def.c:136
@ TPS_InXMLBegin
Definition: wparser_def.c:146
@ TPS_InCommentEnd
Definition: wparser_def.c:160
@ TPS_InHyphenWordFirst
Definition: wparser_def.c:185
@ TPS_InHyphenNumWordPart
Definition: wparser_def.c:194
@ TPS_InPortFirst
Definition: wparser_def.c:164
@ TPS_InProtocolEnd
Definition: wparser_def.c:182
@ TPS_InXMLEntityFirst
Definition: wparser_def.c:138
@ TPS_InHyphenNumWordFirst
Definition: wparser_def.c:187
@ TPS_InCommentLast
Definition: wparser_def.c:156
@ TPS_InFileTwiddle
Definition: wparser_def.c:170
@ TPS_InURLPathStart
Definition: wparser_def.c:177
@ TPS_InURLPathFirst
Definition: wparser_def.c:176
@ TPS_InPathFirst
Definition: wparser_def.c:171
@ TPS_InPathSecond
Definition: wparser_def.c:173
@ TPS_InHyphenUnsignedInt
Definition: wparser_def.c:195
@ TPS_InFileFirst
Definition: wparser_def.c:169
@ TPS_InXMLEntityNumFirst
Definition: wparser_def.c:140
@ TPS_InHyphenWordPart
Definition: wparser_def.c:192
@ TPS_InNumWord
Definition: wparser_def.c:120
@ TPS_InAsciiWord
Definition: wparser_def.c:121
@ TPS_InVersion
Definition: wparser_def.c:134
@ TPS_InHost
Definition: wparser_def.c:167
@ TPS_InFile
Definition: wparser_def.c:174
@ TPS_InProtocolSecond
Definition: wparser_def.c:181
@ TPS_InCloseCommentFirst
Definition: wparser_def.c:158
@ TPS_InTagEscapeK
Definition: wparser_def.c:151
@ TPS_InParseHyphenHyphen
Definition: wparser_def.c:191
@ TPS_InTagBackSleshed
Definition: wparser_def.c:153
@ TPS_InTagFirst
Definition: wparser_def.c:145
@ TPS_InTagEnd
Definition: wparser_def.c:154
@ TPS_InComment
Definition: wparser_def.c:157
@ TPS_InHyphenWord
Definition: wparser_def.c:186
@ TPS_InHyphenAsciiWord
Definition: wparser_def.c:184
@ TPS_InWord
Definition: wparser_def.c:122
@ TPS_InXMLEntityEnd
Definition: wparser_def.c:144
@ TPS_InTagEscapeKK
Definition: wparser_def.c:152
@ TPS_InSpace
Definition: wparser_def.c:126
@ TPS_InFileNext
Definition: wparser_def.c:175
@ TPS_InURLPath
Definition: wparser_def.c:178
@ TPS_Base
Definition: wparser_def.c:119
@ TPS_InUDecimal
Definition: wparser_def.c:128
@ TPS_InParseHyphen
Definition: wparser_def.c:190
@ TPS_InHostFirstAN
Definition: wparser_def.c:166
@ TPS_InEmail
Definition: wparser_def.c:168
@ TPS_InDecimalFirst
Definition: wparser_def.c:129
@ TPS_InVersionFirst
Definition: wparser_def.c:133
@ TPS_InCloseCommentLast
Definition: wparser_def.c:159
@ TPS_InSVerVersion
Definition: wparser_def.c:132
@ TPS_InHyphenAsciiWordPart
Definition: wparser_def.c:193
@ TPS_InCommentFirst
Definition: wparser_def.c:155
@ TPS_InUDecimalFirst
Definition: wparser_def.c:127
@ TPS_InHostFirstDomain
Definition: wparser_def.c:161
@ TPS_InHostDomain
Definition: wparser_def.c:163
@ TPS_InHyphenDigitLookahead
Definition: wparser_def.c:189
@ TPS_InVerVersion
Definition: wparser_def.c:131
@ TPS_InXMLEntityNum
Definition: wparser_def.c:141
@ TPS_InTag
Definition: wparser_def.c:150
@ TPS_InDecimal
Definition: wparser_def.c:130
@ TPS_InTagCloseFirst
Definition: wparser_def.c:147
@ TPS_InXMLEntity
Definition: wparser_def.c:139
@ TPS_InHyphenNumWord
Definition: wparser_def.c:188
@ TPS_InTagBeginEnd
Definition: wparser_def.c:149
static void mark_fragment(HeadlineParsedText *prs, bool highlightall, int startpos, int endpos)
Definition: wparser_def.c:2169
static const TParserStateActionItem actionTPS_InXMLEntityEnd[]
Definition: wparser_def.c:1181
static const TParserStateActionItem actionTPS_InHyphenNumWord[]
Definition: wparser_def.c:1526
static const TParserStateActionItem actionTPS_InDecimal[]
Definition: wparser_def.c:1078
#define A_POP
Definition: wparser_def.c:220
static const TParserStateActionItem actionTPS_InSignedIntFirst[]
Definition: wparser_def.c:1030
static const TParserStateActionItem actionTPS_InTagEscapeK[]
Definition: wparser_def.c:1252
static const TParserStateActionItem actionTPS_InSpace[]
Definition: wparser_def.c:1045
static const TParserStateActionItem actionTPS_InFile[]
Definition: wparser_def.c:1428
static TParser * TParserCopyInit(const TParser *orig)
Definition: wparser_def.c:345
static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[]
Definition: wparser_def.c:1566
#define LASTNUM
Definition: wparser_def.c:58
static int p_iseqC(TParser *prs)
Definition: wparser_def.c:480
Datum prsd_headline(PG_FUNCTION_ARGS)
Definition: wparser_def.c:2600
#define NUMHWORD
Definition: wparser_def.c:48
static bool hlCover(HeadlineParsedText *prs, TSQuery query, List *locations, int *nextpos, int *p, int *q)
Definition: wparser_def.c:2017
#define SPACE
Definition: wparser_def.c:45
static const TParserStateActionItem actionTPS_InUDecimal[]
Definition: wparser_def.c:1063
int(* TParserCharTest)(struct TParser *)
Definition: wparser_def.c:202
static const TParserStateActionItem actionTPS_InSignedInt[]
Definition: wparser_def.c:1036
static int p_isurlchar(TParser *prs)
Definition: wparser_def.c:504
static const TParserStateActionItem actionTPS_InTagBeginEnd[]
Definition: wparser_def.c:1224
static const TParserStateActionItem actionTPS_InTagFirst[]
Definition: wparser_def.c:1185
struct TParserPosition TParserPosition
#define NUMWORD
Definition: wparser_def.c:36
#define FILEPATH
Definition: wparser_def.c:52
static const TParserStateActionItem actionTPS_InTagEscapeKK[]
Definition: wparser_def.c:1259
static int p_isneC(TParser *prs)
Definition: wparser_def.c:486
#define EMAIL
Definition: wparser_def.c:37
static const TParserStateActionItem actionTPS_InCommentLast[]
Definition: wparser_def.c:1284
static const TParserStateActionItem actionTPS_InHyphenWordPart[]
Definition: wparser_def.c:1558
static const TParserStateActionItem actionTPS_InMantissaFirst[]
Definition: wparser_def.c:1113
static const TParserStateActionItem actionTPS_Base[]
Definition: wparser_def.c:958
static void SpecialHyphen(TParser *prs)
Definition: wparser_def.c:595
static void mark_hl_words(HeadlineParsedText *prs, TSQuery query, List *locations, bool highlightall, int shortword, int min_words, int max_words)
Definition: wparser_def.c:2438
static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[]
Definition: wparser_def.c:1519
#define UNSIGNEDINT
Definition: wparser_def.c:55
void(* TParserSpecial)(struct TParser *)
Definition: wparser_def.c:204
static const TParserStateActionItem actionTPS_InEmail[]
Definition: wparser_def.c:1378
static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[]
Definition: wparser_def.c:1161
static const TParserStateActionItem actionTPS_InURLPath[]
Definition: wparser_def.c:1457
#define A_RERUN
Definition: wparser_def.c:222
static const TParserStateActionItem actionTPS_InSVerVersion[]
Definition: wparser_def.c:1093
static const TParserStateActionItem actionTPS_InAsciiWord[]
Definition: wparser_def.c:986
static const char *const tok_alias[]
Definition: wparser_def.c:60
static int p_isstophost(TParser *prs)
Definition: wparser_def.c:611
#define HLIDSKIP(x)
Definition: wparser_def.c:1919
static void get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos, int *curlen, int *poslen, int max_words)
Definition: wparser_def.c:2205
static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[]
Definition: wparser_def.c:1582
#define SIGNEDINT
Definition: wparser_def.c:54
static int p_isasclet(TParser *prs)
Definition: wparser_def.c:498
static const TParserStateAction Actions[]
Definition: wparser_def.c:1615
static const TParserStateActionItem actionTPS_InXMLBegin[]
Definition: wparser_def.c:1196
#define PARTHWORD
Definition: wparser_def.c:43
#define HLIDREPLACE(x)
Definition: wparser_def.c:1918
#define A_MERGE
Definition: wparser_def.c:224
static const TParserStateActionItem actionTPS_InMantissa[]
Definition: wparser_def.c:1127
static const TParserStateActionItem actionTPS_InVersionFirst[]
Definition: wparser_def.c:1100
static int p_isascii(TParser *prs)
Definition: wparser_def.c:492
static const TParserStateActionItem actionTPS_InCommentFirst[]
Definition: wparser_def.c:1275
static const TParserStateActionItem actionTPS_InHyphenWord[]
Definition: wparser_def.c:1510
static int p_isignore(TParser *prs)
Definition: wparser_def.c:622
static const TParserStateActionItem actionTPS_InParseHyphenHyphen[]
Definition: wparser_def.c:1551
static const TParserStateActionItem actionTPS_InPort[]
Definition: wparser_def.c:1352
#define TAG_T
Definition: wparser_def.c:46
static const TParserStateActionItem actionTPS_InDecimalFirst[]
Definition: wparser_def.c:1072
static TParserPosition * newTParserPosition(TParserPosition *prev)
Definition: wparser_def.c:270
#define URLPATH
Definition: wparser_def.c:51
Datum prsd_lextype(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1863
#define ASCIIHWORD
Definition: wparser_def.c:49
#define HOST
Definition: wparser_def.c:39
static const TParserStateActionItem actionTPS_InTag[]
Definition: wparser_def.c:1230
Datum prsd_start(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1881
static TParser * TParserInit(char *str, int len)
Definition: wparser_def.c:287
#define A_BINGO
Definition: wparser_def.c:219
#define TPARSERSTATEACTION(state)
Definition: wparser_def.c:1607
static bool TParserGet(TParser *prs)
Definition: wparser_def.c:1697
#define XMLENTITY
Definition: wparser_def.c:56
static int p_ishost(TParser *prs)
Definition: wparser_def.c:628
Datum prsd_end(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1903
#define A_CLRALL
Definition: wparser_def.c:225
static int p_isURLPath(TParser *prs)
Definition: wparser_def.c:650
static void SpecialVerVersion(TParser *prs)
Definition: wparser_def.c:602
static const TParserStateActionItem actionTPS_InProtocolFirst[]
Definition: wparser_def.c:1469
static const TParserStateActionItem actionTPS_InUnsignedInt[]
Definition: wparser_def.c:1013
static const TParserStateActionItem actionTPS_InUDecimalFirst[]
Definition: wparser_def.c:1057
static const TParserStateActionItem actionTPS_InTagCloseFirst[]
Definition: wparser_def.c:1204
static int p_isEOF(TParser *prs)
Definition: wparser_def.c:473
static const TParserStateActionItem actionTPS_InCloseCommentLast[]
Definition: wparser_def.c:1302
static void TParserCopyClose(TParser *prs)
Definition: wparser_def.c:396
#define A_CLEAR
Definition: wparser_def.c:223
static const TParserStateActionItem actionTPS_InFileFirst[]
Definition: wparser_def.c:1384
static const TParserStateActionItem actionTPS_InNumWord[]
Definition: wparser_def.c:975
static const TParserStateActionItem actionTPS_InFileTwiddle[]
Definition: wparser_def.c:1394
static const TParserStateActionItem actionTPS_InHost[]
Definition: wparser_def.c:1367
#define A_PUSH
Definition: wparser_def.c:221
static const TParserStateActionItem actionTPS_InTagBackSleshed[]
Definition: wparser_def.c:1266
static const TParserStateActionItem actionTPS_InProtocolSecond[]
Definition: wparser_def.c:1475
static const TParserStateActionItem actionTPS_InWord[]
Definition: wparser_def.c:1004
static int p_isspecial(TParser *prs)
Definition: wparser_def.c:679
static void TParserClose(TParser *prs)
Definition: wparser_def.c:371
#define URL_T
Definition: wparser_def.c:38
static const TParserStateActionItem actionTPS_InXMLEntityNum[]
Definition: wparser_def.c:1167
static const TParserStateActionItem actionTPS_InVerVersion[]
Definition: wparser_def.c:1087
static const TParserStateActionItem actionTPS_InHyphenAsciiWord[]
Definition: wparser_def.c:1493
static const TParserStateActionItem actionTPS_InXMLEntityHexNum[]
Definition: wparser_def.c:1174
#define A_NEXT
Definition: wparser_def.c:218
static const TParserStateActionItem actionTPS_InPortFirst[]
Definition: wparser_def.c:1346
#define HWORD
Definition: wparser_def.c:50
#define NUMPARTHWORD
Definition: wparser_def.c:42
static const char *const lex_descr[]
Definition: wparser_def.c:87
#define INTERESTINGWORD(j)
Definition: wparser_def.c:1931
#define SCIENTIFIC
Definition: wparser_def.c:40
static void SpecialTags(TParser *prs)
Definition: wparser_def.c:563
static const TParserStateActionItem actionTPS_InTagEnd[]
Definition: wparser_def.c:1271
static const TParserStateActionItem actionTPS_InComment[]
Definition: wparser_def.c:1290
static const TParserStateActionItem actionTPS_InProtocolEnd[]
Definition: wparser_def.c:1481
static const TParserStateActionItem actionTPS_InURLPathFirst[]
Definition: wparser_def.c:1447
static const TParserStateActionItem actionTPS_InPathFirstFirst[]
Definition: wparser_def.c:1413