PostgreSQL Source Code  git master
wparser_def.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * wparser_def.c
4  * Default text search parser
5  *
6  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  * src/backend/tsearch/wparser_def.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include <limits.h>
18 
19 #include "catalog/pg_collation.h"
20 #include "commands/defrem.h"
21 #include "tsearch/ts_locale.h"
22 #include "tsearch/ts_public.h"
23 #include "tsearch/ts_type.h"
24 #include "tsearch/ts_utils.h"
25 #include "utils/builtins.h"
26 
27 
28 /* Define me to enable tracing of parser behavior */
29 /* #define WPARSER_TRACE */
30 
31 
/*
 * Output token categories.
 *
 * The numbering is dense and starts at 1.  The tok_alias[] and lex_descr[]
 * tables list their entries in this same order, with an empty placeholder
 * string at index 0.
 */

#define ASCIIWORD       1
#define WORD_T          2
#define NUMWORD         3
#define EMAIL           4
#define URL_T           5
#define HOST            6
#define SCIENTIFIC      7
#define VERSIONNUMBER   8
#define NUMPARTHWORD    9
#define PARTHWORD       10
#define ASCIIPARTHWORD  11
#define SPACE           12
#define TAG_T           13
#define PROTOCOL        14
#define NUMHWORD        15
#define ASCIIHWORD      16
#define HWORD           17
#define URLPATH         18
#define FILEPATH        19
#define DECIMAL_T       20
#define SIGNEDINT       21
#define UNSIGNEDINT     22
#define XMLENTITY       23

/* Highest token category number defined above */
#define LASTNUM         23
59 
/*
 * Short alias of each token category, in category-number order.
 * Index 0 is an empty placeholder; index 1 is the first real category.
 */
static const char *const tok_alias[] = {
    "",                         /* 0: placeholder */
    "asciiword",                /* 1 */
    "word",                     /* 2 */
    "numword",                  /* 3 */
    "email",                    /* 4 */
    "url",                      /* 5 */
    "host",                     /* 6 */
    "sfloat",                   /* 7 */
    "version",                  /* 8 */
    "hword_numpart",            /* 9 */
    "hword_part",               /* 10 */
    "hword_asciipart",          /* 11 */
    "blank",                    /* 12 */
    "tag",                      /* 13 */
    "protocol",                 /* 14 */
    "numhword",                 /* 15 */
    "asciihword",               /* 16 */
    "hword",                    /* 17 */
    "url_path",                 /* 18 */
    "file",                     /* 19 */
    "float",                    /* 20 */
    "int",                      /* 21 */
    "uint",                     /* 22 */
    "entity"                    /* 23 */
};
86 
/*
 * Human-readable description of each token category, in category-number
 * order.  Index 0 is an empty placeholder; index 1 is the first real category.
 */
static const char *const lex_descr[] = {
    "",                                             /* 0: placeholder */
    "Word, all ASCII",                              /* 1 */
    "Word, all letters",                            /* 2 */
    "Word, letters and digits",                     /* 3 */
    "Email address",                                /* 4 */
    "URL",                                          /* 5 */
    "Host",                                         /* 6 */
    "Scientific notation",                          /* 7 */
    "Version number",                               /* 8 */
    "Hyphenated word part, letters and digits",     /* 9 */
    "Hyphenated word part, all letters",            /* 10 */
    "Hyphenated word part, all ASCII",              /* 11 */
    "Space symbols",                                /* 12 */
    "XML tag",                                      /* 13 */
    "Protocol head",                                /* 14 */
    "Hyphenated word, letters and digits",          /* 15 */
    "Hyphenated word, all ASCII",                   /* 16 */
    "Hyphenated word, all letters",                 /* 17 */
    "URL path",                                     /* 18 */
    "File or path name",                            /* 19 */
    "Decimal notation",                             /* 20 */
    "Signed integer",                               /* 21 */
    "Unsigned integer",                             /* 22 */
    "XML entity"                                    /* 23 */
};
113 
114 
115 /* Parser states */
116 
117 typedef enum
118 {
119  TPS_Base = 0,
196  TPS_Null /* last state (fake value) */
198 
/* forward declaration */
struct TParser;

/*
 * Character-test callback: receives the parser and returns an int result.
 * Any p_is* function except p_iseq has this shape.
 */
typedef int (*TParserCharTest) (struct TParser *);

/* Special handler for special cases: receives the parser, returns nothing. */
typedef void (*TParserSpecial) (struct TParser *);
206 
207 typedef struct
208 {
210  char c;
213  int type;
216 
/*
 * Flag bits in TParserStateActionItem.flags.
 *
 * A_NEXT is zero, i.e. it stands for "no flag bit set"; every other flag is a
 * distinct single bit, so flags can be combined with bitwise OR.
 */
#define A_NEXT      0x0000
#define A_BINGO     0x0001
#define A_POP       0x0002
#define A_PUSH      0x0004
#define A_RERUN     0x0008
#define A_CLEAR     0x0010
#define A_MERGE     0x0020
#define A_CLRALL    0x0040
226 
227 typedef struct TParserPosition
228 {
229  int posbyte; /* position of parser in bytes */
230  int poschar; /* position of parser in characters */
231  int charlen; /* length of current char */
232  int lenbytetoken; /* length of token-so-far in bytes */
233  int lenchartoken; /* and in chars */
238 
239 typedef struct TParser
240 {
241  /* string and position information */
242  char *str; /* multibyte string */
243  int lenstr; /* length of mbstring */
244  wchar_t *wstr; /* wide character string */
245  pg_wchar *pgwstr; /* wide character string for C-locale */
246  bool usewide;
247 
248  /* State of parse */
251  bool ignore;
252  bool wanthost;
253 
254  /* silly char */
255  char c;
256 
257  /* out */
258  char *token;
261  int type;
263 
264 
265 /* forward decls here */
266 static bool TParserGet(TParser *prs);
267 
268 
/*
 * Sets up a parser position "res" from "prev": when prev is non-NULL its
 * contents are copied into res, otherwise res is zero-filled.  Afterwards
 * res->prev is linked to prev, res->pushedAtAction is cleared, and res is
 * returned.
 *
 * NOTE(review): the lines carrying the function name/parameter list and the
 * declaration (and allocation) of "res" are absent from this listing.  The
 * calls newTParserPosition(...) elsewhere in the file suggest this is
 * newTParserPosition() -- confirm against the upstream file.
 */
269 static TParserPosition *
271 {
273 
274  if (prev)
275  memcpy(res, prev, sizeof(TParserPosition));
276  else
277  memset(res, 0, sizeof(TParserPosition));
278 
279  res->prev = prev;
280 
281  res->pushedAtAction = NULL;
282 
283  return res;
284 }
285 
/*
 * TParserInit -- allocate a zeroed TParser for the len-byte string str.
 *
 * The parser keeps the caller's str pointer (no copy of the bytes is made
 * here).  When charmaxlen > 1, usewide is set and a wide-character version
 * of the input is built, either in pgwstr (via pg_mb2wchar_with_len) or in
 * wstr (via char2wchar); otherwise usewide is false.  The initial position
 * is created with state TPS_Base.
 *
 * NOTE(review): two statements are absent from this listing: the one that
 * assigns prs->charmaxlen, and the condition that selects between the pgwstr
 * and wstr branches (the comment inside the first branch indicates it is the
 * C-locale case).  Confirm against the upstream file.
 */
286 static TParser *
287 TParserInit(char *str, int len)
288 {
289  TParser *prs = (TParser *) palloc0(sizeof(TParser));
290 
292  prs->str = str;
293  prs->lenstr = len;
294 
295  /*
296  * Use wide char code only when max encoding length > 1.
297  */
298  if (prs->charmaxlen > 1)
299  {
300  pg_locale_t mylocale = 0; /* TODO */
301 
302  prs->usewide = true;
304  {
305  /*
306  * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
307  * be different from sizeof(wchar_t)
308  */
309  prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
310  pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
311  }
312  else
313  {
314  prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
315  char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
316  mylocale);
317  }
318  }
319  else
320  prs->usewide = false;
321 
322  prs->state = newTParserPosition(NULL);
323  prs->state->state = TPS_Base;
324 
325 #ifdef WPARSER_TRACE
326  fprintf(stderr, "parsing \"%.*s\"\n", len, str);
327 #endif
328 
329  return prs;
330 }
331 
332 /*
333  * As an alternative to a full TParserInit one can create a
334  * TParserCopy which basically is a regular TParser without a private
335  * copy of the string - instead it uses the one from another TParser.
336  * This is useful because at some places TParsers are created
337  * recursively and the repeated copying around of the strings can
338  * cause major inefficiency if the source string is long.
339  * The new parser starts parsing at the original's current position.
340  *
341  * Obviously one must not close the original TParser before the copy.
342  */
/*
 * Builds a new zeroed TParser that shares the buffers of "orig": str, pgwstr
 * and wstr point into orig's buffers at orig's current byte/character
 * position, and lenstr is the number of bytes remaining from there.  The new
 * parser gets its own fresh position with state TPS_Base.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing; the calls TParserCopyInit(prs) elsewhere in the
 * file suggest this is TParserCopyInit() taking the original parser as
 * "orig" -- confirm against the upstream file.
 */
343 static TParser *
345 {
346  TParser *prs = (TParser *) palloc0(sizeof(TParser));
347 
348  prs->charmaxlen = orig->charmaxlen;
349  prs->str = orig->str + orig->state->posbyte;
350  prs->lenstr = orig->lenstr - orig->state->posbyte;
351  prs->usewide = orig->usewide;
352 
353  if (orig->pgwstr)
354  prs->pgwstr = orig->pgwstr + orig->state->poschar;
355  if (orig->wstr)
356  prs->wstr = orig->wstr + orig->state->poschar;
357 
358  prs->state = newTParserPosition(NULL);
359  prs->state->state = TPS_Base;
360 
361 #ifdef WPARSER_TRACE
362  fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
363 #endif
364 
365  return prs;
366 }
367 
368 
/*
 * Releases a parser: frees every position on the prs->state chain (following
 * the prev links), then the wstr and pgwstr buffers if present, then the
 * parser structure itself.  prs->str is not freed here.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing; presumably this is the close routine paired with
 * TParserInit -- confirm the name against the upstream file.
 */
369 static void
371 {
372  while (prs->state)
373  {
374  TParserPosition *ptr = prs->state->prev;
375 
376  pfree(prs->state);
377  prs->state = ptr;
378  }
379 
380  if (prs->wstr)
381  pfree(prs->wstr);
382  if (prs->pgwstr)
383  pfree(prs->pgwstr);
384 
385 #ifdef WPARSER_TRACE
386  fprintf(stderr, "closing parser\n");
387 #endif
388  pfree(prs);
389 }
390 
391 /*
392  * Close a parser created with TParserCopyInit
393  */
/*
 * Frees every position on the prs->state chain and then the parser structure
 * itself.  Unlike the full close routine, wstr and pgwstr are left alone
 * (in a copy they point into the original parser's buffers).
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing; the calls TParserCopyClose(tmpprs) elsewhere in
 * the file suggest this is TParserCopyClose() -- confirm against the
 * upstream file.
 */
394 static void
396 {
397  while (prs->state)
398  {
399  TParserPosition *ptr = prs->state->prev;
400 
401  pfree(prs->state);
402  prs->state = ptr;
403  }
404 
405 #ifdef WPARSER_TRACE
406  fprintf(stderr, "closing parser copy\n");
407 #endif
408  pfree(prs);
409 }
410 
411 
412 /*
413  * Character-type support functions, equivalent to is* macros, but
414  * working with any possible encodings and locales. Notes:
415  * - with multibyte encoding and C-locale isw* function may fail
416  * or give wrong result.
417  * - multibyte encoding and C-locale often are used for
418  * Asian languages.
419  * - if locale is C then we use pgwstr instead of wstr.
420  */
421 
/*
 * p_iswhat(type, nonascii) defines two static functions:
 *
 *   p_is<type>(prs)     classifies the character at the parser's current
 *                       position
 *   p_isnot<type>(prs)  logical negation of p_is<type>(prs)
 *
 * Classification rules of p_is<type>:
 *   - usewide set and pgwstr present: a character above 0x7f yields
 *     "nonascii"; anything else is passed to is<type>().
 *   - usewide set, no pgwstr: isw<type>() is applied to the wstr character.
 *   - otherwise: is<type>() is applied to the current byte of str, read as
 *     unsigned char.
 */
#define p_iswhat(type, nonascii) \
 \
static int \
p_is##type(TParser *prs) \
{ \
    Assert(prs->state); \
    if (prs->usewide) \
    { \
        if (prs->pgwstr) \
        { \
            unsigned int c = *(prs->pgwstr + prs->state->poschar); \
            if (c > 0x7f) \
                return nonascii; \
            return is##type(c); \
        } \
        return isw##type(*(prs->wstr + prs->state->poschar)); \
    } \
    return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \
} \
 \
static int \
p_isnot##type(TParser *prs) \
{ \
    return !p_is##type(prs); \
}
447 
448 /*
449  * In C locale with a multibyte encoding, any non-ASCII symbol is considered
450  * an alpha character, but not a member of other char classes.
451  */
452 p_iswhat(alnum, 1)
453 p_iswhat(alpha, 1)
454 p_iswhat(digit, 0)
455 p_iswhat(lower, 0)
456 p_iswhat(print, 0)
457 p_iswhat(punct, 0)
458 p_iswhat(space, 0)
459 p_iswhat(upper, 0)
460 p_iswhat(xdigit, 0)
461 
462 /* p_iseq should be used only for ascii symbols */
463 
464 static int
465 p_iseq(TParser *prs, char c)
466 {
467  Assert(prs->state);
468  return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
469 }
470 
/*
 * Returns 1 when the parser has reached the end of its input, i.e. the byte
 * position equals lenstr or the current character length is 0; otherwise 0.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing; presumably this is p_isEOF(TParser *prs), which
 * is referenced further down in the file -- confirm against upstream.
 */
471 static int
473 {
474  Assert(prs->state);
475  return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
476 }
477 
/*
 * Returns the result of p_iseq() for the character stored in prs->c, i.e.
 * 1 when the current input character equals prs->c.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing; presumably this is p_iseqC(TParser *prs), which
 * is referenced further down in the file -- confirm against upstream.
 */
478 static int
480 {
481  return p_iseq(prs, prs->c);
482 }
483 
/*
 * Negation of the p_iseq() test against prs->c: returns 1 when the current
 * input character does NOT equal prs->c, else 0.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing; presumably this is p_isneC(TParser *prs), which
 * is referenced further down in the file -- confirm against upstream.
 */
484 static int
486 {
487  return !p_iseq(prs, prs->c);
488 }
489 
/*
 * Returns 1 when the current character is one byte long and isascii() accepts
 * that byte (read as unsigned char); otherwise 0.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing; presumably this is p_isascii(TParser *prs), which
 * is called by the next function in the file -- confirm against upstream.
 */
490 static int
492 {
493  return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
494 }
495 
/*
 * Returns 1 when the current character passes both p_isascii() and
 * p_isalpha(), i.e. it is an ASCII letter; otherwise 0.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing; presumably this is p_isasclet(TParser *prs),
 * which the state tables further down reference -- confirm against upstream.
 */
496 static int
498 {
499  return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
500 }
501 
/*
 * Tests whether the current character may appear in a URL.  Returns 0 for
 * multi-byte characters, for bytes at or below 0x20 (space and control
 * characters), for bytes at or above 0x7F, and for the punctuation characters
 * listed in the switch; returns 1 for everything else.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing, so the name cannot be confirmed from here.
 * NOTE(review): "ch" is a plain char; whether bytes >= 0x80 can reach the
 * range check depends on char signedness and on charlen -- confirm.
 */
502 static int
504 {
505  char ch;
506 
507  /* no non-ASCII need apply */
508  if (prs->state->charlen != 1)
509  return 0;
510  ch = *(prs->str + prs->state->posbyte);
511  /* no spaces or control characters */
512  if (ch <= 0x20 || ch >= 0x7F)
513  return 0;
514  /* reject characters disallowed by RFC 3986 */
515  switch (ch)
516  {
517  case '"':
518  case '<':
519  case '>':
520  case '\\':
521  case '^':
522  case '`':
523  case '{':
524  case '|':
525  case '}':
526  return 0;
527  }
528  return 1;
529 }
530 
531 
532 /* deliberately suppress unused-function complaints for the above */
/*
 * References every generated p_is<type> / p_isnot<type> function, plus
 * p_isEOF, p_iseqC and p_isneC, by calling each once with a NULL argument.
 * Its only purpose is to keep the compiler from reporting those static
 * functions as unused.
 *
 * NOTE(review): the line with the function name in the definition is absent
 * from this listing; the prototype just below names it _make_compiler_happy.
 */
533 void _make_compiler_happy(void);
534 void
536 {
537  p_isalnum(NULL);
538  p_isnotalnum(NULL);
539  p_isalpha(NULL);
540  p_isnotalpha(NULL);
541  p_isdigit(NULL);
542  p_isnotdigit(NULL);
543  p_islower(NULL);
544  p_isnotlower(NULL);
545  p_isprint(NULL);
546  p_isnotprint(NULL);
547  p_ispunct(NULL);
548  p_isnotpunct(NULL);
549  p_isspace(NULL);
550  p_isnotspace(NULL);
551  p_isupper(NULL);
552  p_isnotupper(NULL);
553  p_isxdigit(NULL);
554  p_isnotxdigit(NULL);
555  p_isEOF(NULL);
556  p_iseqC(NULL);
557  p_isneC(NULL);
558 }
559 
560 
/*
 * Special handler that toggles prs->ignore depending on the tag text held in
 * prs->token.  The token length in characters selects which prefixes are
 * compared (case-insensitively): "<script" and "<style" set ignore to true,
 * "</script" and "</style" set it back to false.  Any other length leaves
 * the flag untouched.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing, so the name cannot be confirmed from here.
 */
561 static void
563 {
564  switch (prs->state->lenchartoken)
565  {
566  case 8: /* </script */
567  if (pg_strncasecmp(prs->token, "</script", 8) == 0)
568  prs->ignore = false;
569  break;
570  case 7: /* <script || </style */
571  if (pg_strncasecmp(prs->token, "</style", 7) == 0)
572  prs->ignore = false;
573  else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
574  prs->ignore = true;
575  break;
576  case 6: /* <style */
577  if (pg_strncasecmp(prs->token, "<style", 6) == 0)
578  prs->ignore = true;
579  break;
580  default:
581  break;
582  }
583 }
584 
/*
 * Special handler that sets prs->wanthost and moves the current position
 * back to the start of the token-so-far (byte and character positions are
 * reduced by the token lengths).  The token length counters are not reset.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing, so the name cannot be confirmed from here.
 */
585 static void
587 {
588  prs->wanthost = true;
589  prs->state->posbyte -= prs->state->lenbytetoken;
590  prs->state->poschar -= prs->state->lenchartoken;
591 }
592 
/*
 * Special handler that moves the current position back to the start of the
 * token-so-far (byte and character positions are reduced by the token
 * lengths).  The token length counters are not reset.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing, so the name cannot be confirmed from here.
 */
593 static void
595 {
596  prs->state->posbyte -= prs->state->lenbytetoken;
597  prs->state->poschar -= prs->state->lenchartoken;
598 }
599 
/*
 * Special handler that moves the current position back to the start of the
 * token-so-far and then resets both token length counters to zero, so the
 * token is effectively restarted from its first character.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing, so the name cannot be confirmed from here.
 */
600 static void
602 {
603  prs->state->posbyte -= prs->state->lenbytetoken;
604  prs->state->poschar -= prs->state->lenchartoken;
605  prs->state->lenbytetoken = 0;
606  prs->state->lenchartoken = 0;
607 }
608 
/*
 * One-shot test of the wanthost flag: when prs->wanthost is set, clears it
 * and returns 1; otherwise returns 0 and changes nothing.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing, so the name cannot be confirmed from here.
 */
609 static int
611 {
612  if (prs->wanthost)
613  {
614  prs->wanthost = false;
615  return 1;
616  }
617  return 0;
618 }
619 
/*
 * Returns 1 when prs->ignore is set, else 0.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing; presumably this is p_isignore(TParser *prs),
 * which the state tables further down reference -- confirm against upstream.
 */
620 static int
622 {
623  return (prs->ignore) ? 1 : 0;
624 }
625 
/*
 * Look-ahead test for a host name at the current position.  A temporary
 * parser sharing this parser's buffers is created with wanthost set and asked
 * for one token.  If that token is of type HOST, the current position and the
 * token-so-far lengths of prs are advanced by the token's byte and character
 * lengths, charlen is taken over from the temporary parser, and 1 is
 * returned.  Otherwise prs is left unchanged and 0 is returned.  The
 * temporary parser is closed in both cases.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing, so the name cannot be confirmed from here.
 */
626 static int
628 {
629  TParser *tmpprs = TParserCopyInit(prs);
630  int res = 0;
631 
632  tmpprs->wanthost = true;
633 
634  if (TParserGet(tmpprs) && tmpprs->type == HOST)
635  {
636  prs->state->posbyte += tmpprs->lenbytetoken;
637  prs->state->poschar += tmpprs->lenchartoken;
638  prs->state->lenbytetoken += tmpprs->lenbytetoken;
639  prs->state->lenchartoken += tmpprs->lenchartoken;
640  prs->state->charlen = tmpprs->state->charlen;
641  res = 1;
642  }
643  TParserCopyClose(tmpprs);
644 
645  return res;
646 }
647 
/*
 * Look-ahead test for a URL path at the current position.  A temporary
 * parser sharing this parser's buffers is created, a new position in state
 * TPS_InURLPathFirst is pushed onto it, and it is asked for one token.  If
 * that token is of type URLPATH, the current position and the token-so-far
 * lengths of prs are advanced by the token's byte and character lengths,
 * charlen is taken over from the temporary parser, and 1 is returned.
 * Otherwise prs is left unchanged and 0 is returned.  The temporary parser
 * is closed in both cases.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing, so the name cannot be confirmed from here.
 */
648 static int
650 {
651  TParser *tmpprs = TParserCopyInit(prs);
652  int res = 0;
653 
654  tmpprs->state = newTParserPosition(tmpprs->state);
655  tmpprs->state->state = TPS_InURLPathFirst;
656 
657  if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
658  {
659  prs->state->posbyte += tmpprs->lenbytetoken;
660  prs->state->poschar += tmpprs->lenchartoken;
661  prs->state->lenbytetoken += tmpprs->lenbytetoken;
662  prs->state->lenchartoken += tmpprs->lenchartoken;
663  prs->state->charlen = tmpprs->state->charlen;
664  res = 1;
665  }
666  TParserCopyClose(tmpprs);
667 
668  return res;
669 }
670 
/*
 * Returns true if the current character has zero display length or is one
 * of the special signs used in several languages.  Such characters do not
 * break a word even though they are not alphabetic (isalpha is false for
 * them).  At the beginning of a word they are not considered part of it.
 */
677 static int
679 {
680  /*
681  * pg_dsplen could return -1 which means error or control character
682  */
683  if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
684  return 1;
685 
686  /*
687  * Unicode Characters in the 'Mark, Spacing Combining' Category That
688  * characters are not alpha although they are not breakers of word too.
689  * Check that only in utf encoding, because other encodings aren't
690  * supported by postgres or even exists.
691  */
692  if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
693  {
694  static const pg_wchar strange_letter[] = {
695  /*
696  * use binary search, so elements should be ordered
697  */
698  0x0903, /* DEVANAGARI SIGN VISARGA */
699  0x093E, /* DEVANAGARI VOWEL SIGN AA */
700  0x093F, /* DEVANAGARI VOWEL SIGN I */
701  0x0940, /* DEVANAGARI VOWEL SIGN II */
702  0x0949, /* DEVANAGARI VOWEL SIGN CANDRA O */
703  0x094A, /* DEVANAGARI VOWEL SIGN SHORT O */
704  0x094B, /* DEVANAGARI VOWEL SIGN O */
705  0x094C, /* DEVANAGARI VOWEL SIGN AU */
706  0x0982, /* BENGALI SIGN ANUSVARA */
707  0x0983, /* BENGALI SIGN VISARGA */
708  0x09BE, /* BENGALI VOWEL SIGN AA */
709  0x09BF, /* BENGALI VOWEL SIGN I */
710  0x09C0, /* BENGALI VOWEL SIGN II */
711  0x09C7, /* BENGALI VOWEL SIGN E */
712  0x09C8, /* BENGALI VOWEL SIGN AI */
713  0x09CB, /* BENGALI VOWEL SIGN O */
714  0x09CC, /* BENGALI VOWEL SIGN AU */
715  0x09D7, /* BENGALI AU LENGTH MARK */
716  0x0A03, /* GURMUKHI SIGN VISARGA */
717  0x0A3E, /* GURMUKHI VOWEL SIGN AA */
718  0x0A3F, /* GURMUKHI VOWEL SIGN I */
719  0x0A40, /* GURMUKHI VOWEL SIGN II */
720  0x0A83, /* GUJARATI SIGN VISARGA */
721  0x0ABE, /* GUJARATI VOWEL SIGN AA */
722  0x0ABF, /* GUJARATI VOWEL SIGN I */
723  0x0AC0, /* GUJARATI VOWEL SIGN II */
724  0x0AC9, /* GUJARATI VOWEL SIGN CANDRA O */
725  0x0ACB, /* GUJARATI VOWEL SIGN O */
726  0x0ACC, /* GUJARATI VOWEL SIGN AU */
727  0x0B02, /* ORIYA SIGN ANUSVARA */
728  0x0B03, /* ORIYA SIGN VISARGA */
729  0x0B3E, /* ORIYA VOWEL SIGN AA */
730  0x0B40, /* ORIYA VOWEL SIGN II */
731  0x0B47, /* ORIYA VOWEL SIGN E */
732  0x0B48, /* ORIYA VOWEL SIGN AI */
733  0x0B4B, /* ORIYA VOWEL SIGN O */
734  0x0B4C, /* ORIYA VOWEL SIGN AU */
735  0x0B57, /* ORIYA AU LENGTH MARK */
736  0x0BBE, /* TAMIL VOWEL SIGN AA */
737  0x0BBF, /* TAMIL VOWEL SIGN I */
738  0x0BC1, /* TAMIL VOWEL SIGN U */
739  0x0BC2, /* TAMIL VOWEL SIGN UU */
740  0x0BC6, /* TAMIL VOWEL SIGN E */
741  0x0BC7, /* TAMIL VOWEL SIGN EE */
742  0x0BC8, /* TAMIL VOWEL SIGN AI */
743  0x0BCA, /* TAMIL VOWEL SIGN O */
744  0x0BCB, /* TAMIL VOWEL SIGN OO */
745  0x0BCC, /* TAMIL VOWEL SIGN AU */
746  0x0BD7, /* TAMIL AU LENGTH MARK */
747  0x0C01, /* TELUGU SIGN CANDRABINDU */
748  0x0C02, /* TELUGU SIGN ANUSVARA */
749  0x0C03, /* TELUGU SIGN VISARGA */
750  0x0C41, /* TELUGU VOWEL SIGN U */
751  0x0C42, /* TELUGU VOWEL SIGN UU */
752  0x0C43, /* TELUGU VOWEL SIGN VOCALIC R */
753  0x0C44, /* TELUGU VOWEL SIGN VOCALIC RR */
754  0x0C82, /* KANNADA SIGN ANUSVARA */
755  0x0C83, /* KANNADA SIGN VISARGA */
756  0x0CBE, /* KANNADA VOWEL SIGN AA */
757  0x0CC0, /* KANNADA VOWEL SIGN II */
758  0x0CC1, /* KANNADA VOWEL SIGN U */
759  0x0CC2, /* KANNADA VOWEL SIGN UU */
760  0x0CC3, /* KANNADA VOWEL SIGN VOCALIC R */
761  0x0CC4, /* KANNADA VOWEL SIGN VOCALIC RR */
762  0x0CC7, /* KANNADA VOWEL SIGN EE */
763  0x0CC8, /* KANNADA VOWEL SIGN AI */
764  0x0CCA, /* KANNADA VOWEL SIGN O */
765  0x0CCB, /* KANNADA VOWEL SIGN OO */
766  0x0CD5, /* KANNADA LENGTH MARK */
767  0x0CD6, /* KANNADA AI LENGTH MARK */
768  0x0D02, /* MALAYALAM SIGN ANUSVARA */
769  0x0D03, /* MALAYALAM SIGN VISARGA */
770  0x0D3E, /* MALAYALAM VOWEL SIGN AA */
771  0x0D3F, /* MALAYALAM VOWEL SIGN I */
772  0x0D40, /* MALAYALAM VOWEL SIGN II */
773  0x0D46, /* MALAYALAM VOWEL SIGN E */
774  0x0D47, /* MALAYALAM VOWEL SIGN EE */
775  0x0D48, /* MALAYALAM VOWEL SIGN AI */
776  0x0D4A, /* MALAYALAM VOWEL SIGN O */
777  0x0D4B, /* MALAYALAM VOWEL SIGN OO */
778  0x0D4C, /* MALAYALAM VOWEL SIGN AU */
779  0x0D57, /* MALAYALAM AU LENGTH MARK */
780  0x0D82, /* SINHALA SIGN ANUSVARAYA */
781  0x0D83, /* SINHALA SIGN VISARGAYA */
782  0x0DCF, /* SINHALA VOWEL SIGN AELA-PILLA */
783  0x0DD0, /* SINHALA VOWEL SIGN KETTI AEDA-PILLA */
784  0x0DD1, /* SINHALA VOWEL SIGN DIGA AEDA-PILLA */
785  0x0DD8, /* SINHALA VOWEL SIGN GAETTA-PILLA */
786  0x0DD9, /* SINHALA VOWEL SIGN KOMBUVA */
787  0x0DDA, /* SINHALA VOWEL SIGN DIGA KOMBUVA */
788  0x0DDB, /* SINHALA VOWEL SIGN KOMBU DEKA */
789  0x0DDC, /* SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
790  0x0DDD, /* SINHALA VOWEL SIGN KOMBUVA HAA DIGA
791  * AELA-PILLA */
792  0x0DDE, /* SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
793  0x0DDF, /* SINHALA VOWEL SIGN GAYANUKITTA */
794  0x0DF2, /* SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
795  0x0DF3, /* SINHALA VOWEL SIGN DIGA GAYANUKITTA */
796  0x0F3E, /* TIBETAN SIGN YAR TSHES */
797  0x0F3F, /* TIBETAN SIGN MAR TSHES */
798  0x0F7F, /* TIBETAN SIGN RNAM BCAD */
799  0x102B, /* MYANMAR VOWEL SIGN TALL AA */
800  0x102C, /* MYANMAR VOWEL SIGN AA */
801  0x1031, /* MYANMAR VOWEL SIGN E */
802  0x1038, /* MYANMAR SIGN VISARGA */
803  0x103B, /* MYANMAR CONSONANT SIGN MEDIAL YA */
804  0x103C, /* MYANMAR CONSONANT SIGN MEDIAL RA */
805  0x1056, /* MYANMAR VOWEL SIGN VOCALIC R */
806  0x1057, /* MYANMAR VOWEL SIGN VOCALIC RR */
807  0x1062, /* MYANMAR VOWEL SIGN SGAW KAREN EU */
808  0x1063, /* MYANMAR TONE MARK SGAW KAREN HATHI */
809  0x1064, /* MYANMAR TONE MARK SGAW KAREN KE PHO */
810  0x1067, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
811  0x1068, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
812  0x1069, /* MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
813  0x106A, /* MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
814  0x106B, /* MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
815  0x106C, /* MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
816  0x106D, /* MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
817  0x1083, /* MYANMAR VOWEL SIGN SHAN AA */
818  0x1084, /* MYANMAR VOWEL SIGN SHAN E */
819  0x1087, /* MYANMAR SIGN SHAN TONE-2 */
820  0x1088, /* MYANMAR SIGN SHAN TONE-3 */
821  0x1089, /* MYANMAR SIGN SHAN TONE-5 */
822  0x108A, /* MYANMAR SIGN SHAN TONE-6 */
823  0x108B, /* MYANMAR SIGN SHAN COUNCIL TONE-2 */
824  0x108C, /* MYANMAR SIGN SHAN COUNCIL TONE-3 */
825  0x108F, /* MYANMAR SIGN RUMAI PALAUNG TONE-5 */
826  0x17B6, /* KHMER VOWEL SIGN AA */
827  0x17BE, /* KHMER VOWEL SIGN OE */
828  0x17BF, /* KHMER VOWEL SIGN YA */
829  0x17C0, /* KHMER VOWEL SIGN IE */
830  0x17C1, /* KHMER VOWEL SIGN E */
831  0x17C2, /* KHMER VOWEL SIGN AE */
832  0x17C3, /* KHMER VOWEL SIGN AI */
833  0x17C4, /* KHMER VOWEL SIGN OO */
834  0x17C5, /* KHMER VOWEL SIGN AU */
835  0x17C7, /* KHMER SIGN REAHMUK */
836  0x17C8, /* KHMER SIGN YUUKALEAPINTU */
837  0x1923, /* LIMBU VOWEL SIGN EE */
838  0x1924, /* LIMBU VOWEL SIGN AI */
839  0x1925, /* LIMBU VOWEL SIGN OO */
840  0x1926, /* LIMBU VOWEL SIGN AU */
841  0x1929, /* LIMBU SUBJOINED LETTER YA */
842  0x192A, /* LIMBU SUBJOINED LETTER RA */
843  0x192B, /* LIMBU SUBJOINED LETTER WA */
844  0x1930, /* LIMBU SMALL LETTER KA */
845  0x1931, /* LIMBU SMALL LETTER NGA */
846  0x1933, /* LIMBU SMALL LETTER TA */
847  0x1934, /* LIMBU SMALL LETTER NA */
848  0x1935, /* LIMBU SMALL LETTER PA */
849  0x1936, /* LIMBU SMALL LETTER MA */
850  0x1937, /* LIMBU SMALL LETTER RA */
851  0x1938, /* LIMBU SMALL LETTER LA */
852  0x19B0, /* NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
853  0x19B1, /* NEW TAI LUE VOWEL SIGN AA */
854  0x19B2, /* NEW TAI LUE VOWEL SIGN II */
855  0x19B3, /* NEW TAI LUE VOWEL SIGN U */
856  0x19B4, /* NEW TAI LUE VOWEL SIGN UU */
857  0x19B5, /* NEW TAI LUE VOWEL SIGN E */
858  0x19B6, /* NEW TAI LUE VOWEL SIGN AE */
859  0x19B7, /* NEW TAI LUE VOWEL SIGN O */
860  0x19B8, /* NEW TAI LUE VOWEL SIGN OA */
861  0x19B9, /* NEW TAI LUE VOWEL SIGN UE */
862  0x19BA, /* NEW TAI LUE VOWEL SIGN AY */
863  0x19BB, /* NEW TAI LUE VOWEL SIGN AAY */
864  0x19BC, /* NEW TAI LUE VOWEL SIGN UY */
865  0x19BD, /* NEW TAI LUE VOWEL SIGN OY */
866  0x19BE, /* NEW TAI LUE VOWEL SIGN OAY */
867  0x19BF, /* NEW TAI LUE VOWEL SIGN UEY */
868  0x19C0, /* NEW TAI LUE VOWEL SIGN IY */
869  0x19C8, /* NEW TAI LUE TONE MARK-1 */
870  0x19C9, /* NEW TAI LUE TONE MARK-2 */
871  0x1A19, /* BUGINESE VOWEL SIGN E */
872  0x1A1A, /* BUGINESE VOWEL SIGN O */
873  0x1A1B, /* BUGINESE VOWEL SIGN AE */
874  0x1B04, /* BALINESE SIGN BISAH */
875  0x1B35, /* BALINESE VOWEL SIGN TEDUNG */
876  0x1B3B, /* BALINESE VOWEL SIGN RA REPA TEDUNG */
877  0x1B3D, /* BALINESE VOWEL SIGN LA LENGA TEDUNG */
878  0x1B3E, /* BALINESE VOWEL SIGN TALING */
879  0x1B3F, /* BALINESE VOWEL SIGN TALING REPA */
880  0x1B40, /* BALINESE VOWEL SIGN TALING TEDUNG */
881  0x1B41, /* BALINESE VOWEL SIGN TALING REPA TEDUNG */
882  0x1B43, /* BALINESE VOWEL SIGN PEPET TEDUNG */
883  0x1B44, /* BALINESE ADEG ADEG */
884  0x1B82, /* SUNDANESE SIGN PANGWISAD */
885  0x1BA1, /* SUNDANESE CONSONANT SIGN PAMINGKAL */
886  0x1BA6, /* SUNDANESE VOWEL SIGN PANAELAENG */
887  0x1BA7, /* SUNDANESE VOWEL SIGN PANOLONG */
888  0x1BAA, /* SUNDANESE SIGN PAMAAEH */
889  0x1C24, /* LEPCHA SUBJOINED LETTER YA */
890  0x1C25, /* LEPCHA SUBJOINED LETTER RA */
891  0x1C26, /* LEPCHA VOWEL SIGN AA */
892  0x1C27, /* LEPCHA VOWEL SIGN I */
893  0x1C28, /* LEPCHA VOWEL SIGN O */
894  0x1C29, /* LEPCHA VOWEL SIGN OO */
895  0x1C2A, /* LEPCHA VOWEL SIGN U */
896  0x1C2B, /* LEPCHA VOWEL SIGN UU */
897  0x1C34, /* LEPCHA CONSONANT SIGN NYIN-DO */
898  0x1C35, /* LEPCHA CONSONANT SIGN KANG */
899  0xA823, /* SYLOTI NAGRI VOWEL SIGN A */
900  0xA824, /* SYLOTI NAGRI VOWEL SIGN I */
901  0xA827, /* SYLOTI NAGRI VOWEL SIGN OO */
902  0xA880, /* SAURASHTRA SIGN ANUSVARA */
903  0xA881, /* SAURASHTRA SIGN VISARGA */
904  0xA8B4, /* SAURASHTRA CONSONANT SIGN HAARU */
905  0xA8B5, /* SAURASHTRA VOWEL SIGN AA */
906  0xA8B6, /* SAURASHTRA VOWEL SIGN I */
907  0xA8B7, /* SAURASHTRA VOWEL SIGN II */
908  0xA8B8, /* SAURASHTRA VOWEL SIGN U */
909  0xA8B9, /* SAURASHTRA VOWEL SIGN UU */
910  0xA8BA, /* SAURASHTRA VOWEL SIGN VOCALIC R */
911  0xA8BB, /* SAURASHTRA VOWEL SIGN VOCALIC RR */
912  0xA8BC, /* SAURASHTRA VOWEL SIGN VOCALIC L */
913  0xA8BD, /* SAURASHTRA VOWEL SIGN VOCALIC LL */
914  0xA8BE, /* SAURASHTRA VOWEL SIGN E */
915  0xA8BF, /* SAURASHTRA VOWEL SIGN EE */
916  0xA8C0, /* SAURASHTRA VOWEL SIGN AI */
917  0xA8C1, /* SAURASHTRA VOWEL SIGN O */
918  0xA8C2, /* SAURASHTRA VOWEL SIGN OO */
919  0xA8C3, /* SAURASHTRA VOWEL SIGN AU */
920  0xA952, /* REJANG CONSONANT SIGN H */
921  0xA953, /* REJANG VIRAMA */
922  0xAA2F, /* CHAM VOWEL SIGN O */
923  0xAA30, /* CHAM VOWEL SIGN AI */
924  0xAA33, /* CHAM CONSONANT SIGN YA */
925  0xAA34, /* CHAM CONSONANT SIGN RA */
926  0xAA4D /* CHAM CONSONANT SIGN FINAL H */
927  };
928  const pg_wchar *StopLow = strange_letter,
929  *StopHigh = strange_letter + lengthof(strange_letter),
930  *StopMiddle;
931  pg_wchar c;
932 
933  if (prs->pgwstr)
934  c = *(prs->pgwstr + prs->state->poschar);
935  else
936  c = (pg_wchar) *(prs->wstr + prs->state->poschar);
937 
938  while (StopLow < StopHigh)
939  {
940  StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
941  if (*StopMiddle == c)
942  return 1;
943  else if (*StopMiddle < c)
944  StopLow = StopMiddle + 1;
945  else
946  StopHigh = StopMiddle;
947  }
948  }
949 
950  return 0;
951 }
952 
953 /*
954  * Table of state/action of parser
955  */
956 
958  {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
959  {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
960  {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
961  {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
962  {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
963  {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
964  {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
965  {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
966  {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
967  {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
968  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
969  {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
970  {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
971 };
972 
973 
975  {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
976  {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
977  {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
978  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
979  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
980  {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
981  {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
982  {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
983 };
984 
986  {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
987  {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
988  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
989  {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
990  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
991  {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
992  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
993  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
994  {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
995  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
996  {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
997  {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
998  {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
999  {p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
1000  {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
1001 };
1002 
1004  {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
1005  {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
1006  {p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
1007  {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1008  {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1009  {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
1010 };
1011 
1013  {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
1014  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1015  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1016  {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
1017  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1018  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1019  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1020  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1021  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1022  {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
1023  {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1024  {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1025  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1026  {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
1027 };
1028 
1030  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1031  {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
1032  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1033 };
1034 
1036  {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
1037  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1038  {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
1039  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1040  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1041  {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
1042 };
1043 
1045  {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
1046  {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
1047  {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
1048  {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
1049  {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
1050  {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
1051  {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
1052  {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
1053  {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
1054 };
1055 
1057  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1058  {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
1059  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1060 };
1061 
1063  {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1064  {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
1065  {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1066  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1067  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1068  {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1069 };
1070 
1072  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1073  {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
1074  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1075 };
1076 
1078  {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1079  {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
1080  {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
1081  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1082  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1083  {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1084 };
1085 
1087  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1088  {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
1089  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1090 };
1091 
1093  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1094  {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
1095  {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1096 };
1097 
1098 
1100  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1101  {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
1102  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1103 };
1104 
1106  {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
1107  {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
1108  {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1109  {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
1110 };
1111 
1113  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1114  {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1115  {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
1116  {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
1117  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1118 };
1119 
1121  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1122  {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1123  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1124 };
1125 
1127  {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
1128  {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
1129  {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
1130 };
1131 
1133  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1134  {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
1135  {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1136  {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1137  {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1138  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1139 };
1140 
1142  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1143  {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1144  {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1145  {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1146  {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
1147  {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
1148  {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1149  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1150 };
1151 
1153  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1154  {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1155  {p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1156  {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1157  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1158 };
1159 
1161  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1162  {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1163  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1164 };
1165 
1167  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1168  {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1169  {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1170  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1171 };
1172 
1174  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1175  {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1176  {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1177  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1178 };
1179 
1181  {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
1182 };
1183 
1185  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1186  {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
1187  {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
1188  {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
1189  {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
1190  {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
1191  {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
1192  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1193 };
1194 
1196  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1197  /* <?xml ... */
1198  /* XXX do we want states for the m and l? Right now this accepts <?xZ */
1199  {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
1200  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1201 };
1202 
1204  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1205  {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
1206  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1207 };
1208 
1210  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1211  /* <br/> case */
1212  {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
1213  {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1214  {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
1215  {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
1216  {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1217  {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1218  {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1219  {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1220  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1221 };
1222 
1224  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1225  {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
1226  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1227 };
1228 
1230  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1231  {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1232  {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
1233  {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
1234  {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1235  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1236  {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
1237  {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1238  {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1239  {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
1240  {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
1241  {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1242  {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1243  {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
1244  {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
1245  {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
1246  {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
1247  {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
1248  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1249 };
1250 
1252  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1253  {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1254  {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
1255  {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
1256 };
1257 
1259  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1260  {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1261  {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
1262  {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
1263 };
1264 
1266  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1267  {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
1268 };
1269 
1271  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1272 };
1273 
1275  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1276  {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
1277  /* <!DOCTYPE ...> */
1278  {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
1279  {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
1280  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1281 };
1282 
1284  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1285  {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
1286  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1287 };
1288 
1290  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1291  {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
1292  {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1293 };
1294 
1296  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1297  {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
1298  {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1299 };
1300 
1302  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1303  {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1304  {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
1305  {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1306 };
1307 
1309  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1310 };
1311 
1313  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1314  {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
1315  {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1316  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1317 };
1318 
1320  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1321  {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1322  {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1323  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1324  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1325  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1326  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1327  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1328 };
1329 
1331  {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1332  {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1333  {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1334  {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
1335  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1336  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1337  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1338  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1339  {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
1341  {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1342  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1343 };
1344 
1346  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1347  {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1348  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1349 };
1350 
1352  {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1353  {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1355  {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1356  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1357 };
1358 
1360  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1361  {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1362  {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1363  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1364 };
1365 
1367  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1368  {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1369  {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1370  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1371  {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1372  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1373  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1374  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1375 };
1376 
1378  {p_isstophost, 0, A_POP, TPS_Null, 0, NULL},
1379  {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
1380  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1381 };
1382 
1384  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1385  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1386  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1387  {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1388  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1389  {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1390  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1391 };
1392 
1394  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1395  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1396  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1397  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1398  {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1399  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1400 };
1401 
1403  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1404  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1405  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1406  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1407  {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1408  {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1409  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1410 };
1411 
1413  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1414  {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1415  {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1416  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1417 };
1418 
1420  {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1421  {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
1422  {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1423  {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1424  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1425 };
1426 
1428  {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
1429  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1430  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1431  {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1432  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1433  {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1434  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1435  {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
1436 };
1437 
1439  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1440  {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1441  {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1442  {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1443  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1444 };
1445 
1447  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1448  {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1449  {NULL, 0, A_POP, TPS_Null, 0, NULL},
1450 };
1451 
1453  {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
1454 };
1455 
1457  {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
1458  {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1459  {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
1460 };
1461 
1463  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1465  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1466 };
1467 
1469  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1470  {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
1471  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1472 };
1473 
1475  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1476  {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1477  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1478 };
1479 
1481  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
1482 };
1483 
1485  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1486  {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1487  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1488  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1489  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1490 };
1491 
1494  {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1495  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1496  {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1497  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1498  {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1500 };
1501 
1503  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1504  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1505  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1506  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1507 };
1508 
1511  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1512  {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1513  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1514  {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1516 };
1517 
1519  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1520  {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1521  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1522  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1523 };
1524 
1527  {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1528  {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1529  {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1531 };
1532 
1534  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1535  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1536  {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1537  {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1538  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1539 };
1540 
1542  {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
1544  {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1545  {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
1546  {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
1547  {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
1548 };
1549 
1551  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1552  {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1554  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1555 };
1556 
1558  {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
1559  {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1560  {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1561  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1562  {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
1563 };
1564 
1566  {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
1568  {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1569  {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1570  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1571  {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
1572 };
1573 
1575  {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
1576  {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1577  {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1578  {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
1579 };
1580 
1582  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1583  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1584  {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1586  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1587 };
1588 
1589 
1590 /*
1591  * main table of per-state parser actions
1592  */
1593 typedef struct
1594 {
1595  const TParserStateActionItem *action; /* the actual state info */
1596  TParserState state; /* only for Assert crosscheck */
1597 #ifdef WPARSER_TRACE
1598  const char *state_name; /* only for debug printout */
1599 #endif
1601 
/*
 * TPARSERSTATEACTION(state) expands to a brace-enclosed initializer for one
 * entry of the per-state table: CppConcat(action,state) supplies the first
 * member and the state value itself the second.  When WPARSER_TRACE is
 * defined, CppAsString(state) is appended as a third member.
 * NOTE(review): presumably CppConcat pastes the two tokens into the name of
 * the matching action array and CppAsString yields the state's name as a
 * string literal; both helpers are defined outside this chunk -- confirm.
 */
1602 #ifdef WPARSER_TRACE
1603 #define TPARSERSTATEACTION(state) \
1604  { CppConcat(action,state), state, CppAsString(state) }
1605 #else
1606 #define TPARSERSTATEACTION(state) \
1607  { CppConcat(action,state), state }
1608 #endif
1609 
1610 /*
1611  * order must be the same as in typedef enum {} TParserState!!
1612  */
1613 
1614 static const TParserStateAction Actions[] = {
1692 };
1693 
1694 
/*
 * Run the parser state machine until the next token is recognized.
 *
 * Starting from the current position record (prs->state), each pass of the
 * main loop picks the first entry of the current state's action table whose
 * isclass test accepts the current character (an entry with a NULL isclass
 * ends the search and therefore acts as the default), runs the entry's
 * optional special handler, applies its A_* flags, switches to
 * item->tostate unless that is TPS_Null, and then normally advances by one
 * character.
 *
 * Returns false immediately if the position is already at or past
 * prs->lenstr; otherwise returns true exactly when the last action taken had
 * A_BINGO set.  On BINGO, prs->lenbytetoken, prs->lenchartoken and prs->type
 * are filled in; prs->token is set on entry to the current scan position.
 */
1695 static bool
1697 {
1698  const TParserStateActionItem *item = NULL;
1699 
1700  Assert(prs->state);
1701 
1702  if (prs->state->posbyte >= prs->lenstr)
1703  return false;
1704 
1705  prs->token = prs->str + prs->state->posbyte;
1706  prs->state->pushedAtAction = NULL;
1707 
1708  /* look at string */
 /*
  * The condition is "<=", not "<": one more pass is made with the position
  * equal to lenstr (charlen is then 0), so the tables are consulted once at
  * end of input as well.
  */
1709  while (prs->state->posbyte <= prs->lenstr)
1710  {
 /* width of the current character: 0 at end of input, 1 when charmaxlen is 1, else whatever pg_mblen reports */
1711  if (prs->state->posbyte == prs->lenstr)
1712  prs->state->charlen = 0;
1713  else
1714  prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
1715  pg_mblen(prs->str + prs->state->posbyte);
1716 
1717  Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
1718  Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
1719  Assert(Actions[prs->state->state].state == prs->state->state);
1720 
1721  if (prs->state->pushedAtAction)
1722  {
1723  /* After a POP, pick up at the next test */
1724  item = prs->state->pushedAtAction + 1;
1725  prs->state->pushedAtAction = NULL;
1726  }
1727  else
1728  {
 /* normal case: start from the first entry of this state's table */
1729  item = Actions[prs->state->state].action;
1730  Assert(item != NULL);
1731  }
1732 
1733  /* find action by character class */
 /* prs->c carries the entry's character argument to the isclass test */
1734  while (item->isclass)
1735  {
1736  prs->c = item->c;
1737  if (item->isclass(prs) != 0)
1738  break;
1739  item++;
1740  }
1741 
1742 #ifdef WPARSER_TRACE
1743  {
1744  TParserPosition *ptr;
1745 
1746  fprintf(stderr, "state ");
1747  /* indent according to stack depth */
1748  for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
1749  fprintf(stderr, " ");
1750  fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
1751  if (prs->state->posbyte < prs->lenstr)
1752  fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
1753  else
1754  fprintf(stderr, "at EOF");
1755  fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1756  (int) (item - Actions[prs->state->state].action),
1757  (item->flags & A_BINGO) ? " BINGO" : "",
1758  (item->flags & A_POP) ? " POP" : "",
1759  (item->flags & A_PUSH) ? " PUSH" : "",
1760  (item->flags & A_RERUN) ? " RERUN" : "",
1761  (item->flags & A_CLEAR) ? " CLEAR" : "",
1762  (item->flags & A_MERGE) ? " MERGE" : "",
1763  (item->flags & A_CLRALL) ? " CLRALL" : "",
1764  (item->tostate != TPS_Null) ? " tostate " : "",
1765  (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
1766  (item->type > 0) ? " type " : "",
1767  tok_alias[item->type]);
1768  }
1769 #endif
1770 
1771  /* call special handler if exists */
 /* note: the handler runs before any of the flags below are applied */
1772  if (item->special)
1773  item->special(prs);
1774 
1775  /* BINGO, token is found */
 /* publish the accumulated token length and type, then reset the accumulators */
1776  if (item->flags & A_BINGO)
1777  {
1778  Assert(item->type > 0);
1779  prs->lenbytetoken = prs->state->lenbytetoken;
1780  prs->lenchartoken = prs->state->lenchartoken;
1781  prs->state->lenbytetoken = prs->state->lenchartoken = 0;
1782  prs->type = item->type;
1783  }
1784 
1785  /* do various actions by flags */
 /*
  * This is an if / else-if chain, so at most one stack operation is carried
  * out per action, tested in the order POP, PUSH, CLEAR, CLRALL, MERGE.
  */
1786  if (item->flags & A_POP)
1787  { /* pop stored state in stack */
1788  TParserPosition *ptr = prs->state->prev;
1789 
1790  pfree(prs->state);
1791  prs->state = ptr;
1792  Assert(prs->state);
1793  }
1794  else if (item->flags & A_PUSH)
1795  { /* push (store) state in stack */
1796  prs->state->pushedAtAction = item; /* remember where we push */
1797  prs->state = newTParserPosition(prs->state);
1798  }
1799  else if (item->flags & A_CLEAR)
1800  { /* clear previous pushed state */
1801  TParserPosition *ptr;
1802 
1803  Assert(prs->state->prev);
1804  ptr = prs->state->prev->prev;
1805  pfree(prs->state->prev);
1806  prs->state->prev = ptr;
1807  }
1808  else if (item->flags & A_CLRALL)
1809  { /* clear all previous pushed state */
1810  TParserPosition *ptr;
1811 
1812  while (prs->state->prev)
1813  {
1814  ptr = prs->state->prev->prev;
1815  pfree(prs->state->prev);
1816  prs->state->prev = ptr;
1817  }
1818  }
1819  else if (item->flags & A_MERGE)
1820  { /* merge posinfo with current and pushed state */
1821  TParserPosition *ptr = prs->state;
1822 
1823  Assert(prs->state->prev);
1824  prs->state = prs->state->prev;
1825 
 /* the pushed record keeps its own state but takes over the position and token lengths reached so far */
1826  prs->state->posbyte = ptr->posbyte;
1827  prs->state->poschar = ptr->poschar;
1828  prs->state->charlen = ptr->charlen;
1829  prs->state->lenbytetoken = ptr->lenbytetoken;
1830  prs->state->lenchartoken = ptr->lenchartoken;
1831  pfree(ptr);
1832  }
1833 
1834  /* set new state if pointed */
 /* applied to whichever position record is current after the stack operation above */
1835  if (item->tostate != TPS_Null)
1836  prs->state->state = item->tostate;
1837 
1838  /* check for go away */
 /* stop on a token, or at end of input unless the action asked for a rerun */
1839  if ((item->flags & A_BINGO) ||
1840  (prs->state->posbyte >= prs->lenstr &&
1841  (item->flags & A_RERUN) == 0))
1842  break;
1843 
1844  /* go to beginning of loop if we should rerun or we just restore state */
 /* i.e. the same character is examined again without advancing */
1845  if (item->flags & (A_RERUN | A_POP))
1846  continue;
1847 
1848  /* move forward */
 /* byte counters grow by the character's width, character counters by one */
1849  if (prs->state->charlen)
1850  {
1851  prs->state->posbyte += prs->state->charlen;
1852  prs->state->lenbytetoken += prs->state->charlen;
1853  prs->state->poschar++;
1854  prs->state->lenchartoken++;
1855  }
1856  }
1857 
 /* item is still NULL only if the loop body never ran */
1858  return (item && (item->flags & A_BINGO));
1859 }
1860 
/*
 * Build and return an array describing the token types of this parser.
 *
 * The array has LASTNUM + 1 elements.  Elements 0 .. LASTNUM-1 describe
 * token types 1 .. LASTNUM: the numeric id plus pstrdup'ed copies of the
 * matching tok_alias[] and lex_descr[] strings.  The final element only gets
 * lexid = 0 (presumably an end-of-list marker for the caller -- confirm);
 * its alias and descr members are not set here.
 */
1861 Datum
1863 {
1864  LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
1865  int i;
1866 
 /* token ids are 1-based, array slots 0-based */
1867  for (i = 1; i <= LASTNUM; i++)
1868  {
1869  descr[i - 1].lexid = i;
1870  descr[i - 1].alias = pstrdup(tok_alias[i]);
1871  descr[i - 1].descr = pstrdup(lex_descr[i]);
1872  }
1873 
1874  descr[LASTNUM].lexid = 0;
1875 
1876  PG_RETURN_POINTER(descr);
1877 }
1878 
1879 Datum
1881 {
1883 }
1884 
/*
 * Fetch the next token from an open parser.
 *
 * Argument 0 is the TParser, argument 1 a char ** that receives the token
 * start, argument 2 an int * that receives the token length in bytes.
 * Returns the token type as an int32, or 0 when TParserGet() reports that
 * no token was found; in that case neither output argument is written.
 */
1885 Datum
1887 {
1888  TParser *p = (TParser *) PG_GETARG_POINTER(0);
1889  char **t = (char **) PG_GETARG_POINTER(1);
1890  int *tlen = (int *) PG_GETARG_POINTER(2);
1891 
1892  if (!TParserGet(p))
1893  PG_RETURN_INT32(0);
1894 
 /* hand back the parser's own token pointer; no copy is made here */
1895  *t = p->token;
1896  *tlen = p->lenbytetoken;
1897 
1898  PG_RETURN_INT32(p->type);
1899 }
1900 
/*
 * Shut down a parser: passes the TParser given as argument 0 to
 * TParserClose() and returns void.
 */
1901 Datum
1903 {
1904  TParser *p = (TParser *) PG_GETARG_POINTER(0);
1905 
1906  TParserClose(p);
1907  PG_RETURN_VOID();
1908 }
1909 
1910 
1911 /*
1912  * ts_headline support begins here
1913  */
1914 
1915 /* token type classification macros */
/* true for tag, protocol, blank (SPACE) and XML entity tokens */
1916 #define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
/* tag tokens: mark_fragment sets their "replace" flag when highlightall is false */
1917 #define HLIDREPLACE(x) ( (x)==TAG_T )
/* url, numhword, asciihword and hword tokens: mark_fragment sets their "skip" flag when highlightall is false */
1918 #define HLIDSKIP(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* the set mark_fragment uses in place of HLIDSKIP when highlightall is true; currently the same list */
1919 #define XMLHLIDSKIP(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* tokens that are not counted as words (see the curlen counting in get_next_fragment) */
1920 #define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )
/* token types that BADENDPOINT rejects as a fragment end point */
1921 #define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
1922 
1923 /*
1924  * Macros useful in headline selection. These rely on availability of
1925  * "HeadlineParsedText *prs" describing some text, and "int shortword"
1926  * describing the "short word" length parameter.
1927  */
1928 
1929 /* Interesting words are non-repeated search terms */
/* j is a word index into prs->words[] and is expanded twice */
1930 #define INTERESTINGWORD(j) \
1931  (prs->words[j].item && !prs->words[j].repeated)
1932 
1933 /* Don't want to end at a non-word or a short word, unless interesting */
/*
 * "Short" means len <= shortword.  j is expanded several times (also inside
 * NOENDTOKEN and INTERESTINGWORD), so pass an expression without side
 * effects.
 */
1934 #define BADENDPOINT(j) \
1935  ((NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword) && \
1936  !INTERESTINGWORD(j))
1937 
/*
 * Bookkeeping for one candidate headline fragment.  mark_hl_fragments keeps
 * an array of these, one per fragment produced by get_next_fragment.
 */
1938 typedef struct
1939 {
1940  /* one cover (well, really one fragment) for mark_hl_fragments */
1941  int32 startpos; /* fragment's starting word index */
1942  int32 endpos; /* ending word index (inclusive) */
1943  int32 poslen; /* number of interesting words (INTERESTINGWORD) in the fragment */
1944  int32 curlen; /* number of words, as counted by get_next_fragment (non-word tokens are not counted) */
1945  bool chosen; /* set once the fragment has been picked as a best cover */
1946  bool excluded; /* fragment is skipped during selection; initialized to false (set elsewhere -- outside this chunk) */
1947 } CoverPos;
1948 
/*
 * Describes the slice of headline words that checkcondition_HL is allowed
 * to look at; hlCover fills one in for each candidate word range.
 */
1949 typedef struct
1950 {
1951  /* callback data for checkcondition_HL */
1953  int len; /* number of words[] entries the callback scans */
1954 } hlCheck;
1955 
1956 
1957 /*
1958  * TS_execute callback for matching a tsquery operand to headline words
1959  *
1960  * Note: it's tempting to report words[] indexes as pos values to save
1961  * searching in hlCover; but that would screw up phrase matching, which
1962  * expects to measure distances in lexemes not tokens.
1963  */
/*
 * opaque is the hlCheck describing the word slice to scan; val is the query
 * operand, matched by pointer identity against each word's item; data may
 * be NULL when the caller does not want positions.
 *
 * With data == NULL, returns TS_YES on the first matching word.  Otherwise
 * every matching word's pos is collected into data->pos (allocated here on
 * the first match, with room for checkval->len entries) and the result is
 * TS_YES if data->npos is positive after the scan.  Returns TS_NO in all
 * other cases.
 */
1964 static TSTernaryValue
1966 {
1967  hlCheck *checkval = (hlCheck *) opaque;
1968  int i;
1969 
1970  /* scan words array for matching items */
1971  for (i = 0; i < checkval->len; i++)
1972  {
1973  if (checkval->words[i].item == val)
1974  {
1975  /* if data == NULL, don't need to report positions */
1976  if (!data)
1977  return TS_YES;
1978 
1979  if (!data->pos)
1980  {
 /* first match: at most checkval->len positions can ever be stored */
1981  data->pos = palloc(sizeof(WordEntryPos) * checkval->len);
1982  data->allocated = true;
1983  data->npos = 1;
1984  data->pos[0] = checkval->words[i].pos;
1985  }
 /* append only positions greater than the last one stored, so equal positions are not repeated */
1986  else if (data->pos[data->npos - 1] < checkval->words[i].pos)
1987  {
1988  data->pos[data->npos++] = checkval->words[i].pos;
1989  }
1990  }
1991  }
1992 
1993  if (data && data->npos > 0)
1994  return TS_YES;
1995 
1996  return TS_NO;
1997 }
1998 
1999 /*
2000  * hlCover: try to find a substring of prs' word list that satisfies query
2001  *
2002  * locations is the result of TS_execute_locations() for the query.
2003  * We use this to identify plausible subranges of the query.
2004  *
2005  * *nextpos is the lexeme position (NOT word index) to start the search
2006  * at. Caller should initialize this to zero. If successful, we'll
2007  * advance it to the next place to search at.
2008  *
2009  * On success, sets *p to first word index and *q to last word index of the
2010  * cover substring, and returns true.
2011  *
2012  * The result is a minimal cover, in the sense that both *p and *q will be
2013  * words used in the query.
2014  */
/*
 * Additional notes:
 * - Returns false as soon as some entry of locations has no position at or
 *   after the current search position, and also when locations is empty.
 * - *nextpos, *p and *q are written only on success.
 * - Every retry sets pos = posb + 1 after posb has been clamped to be >= pos,
 *   so the search position strictly increases from one iteration to the
 *   next.
 */
2015 static bool
2016 hlCover(HeadlineParsedText *prs, TSQuery query, List *locations,
2017  int *nextpos, int *p, int *q)
2018 {
2019  int pos = *nextpos;
2020 
2021  /* This loop repeats when our selected word-range fails the query */
2022  for (;;)
2023  {
2024  int posb,
2025  pose;
2026  ListCell *lc;
2027 
2028  /*
2029  * For each AND'ed query term or phrase, find its first occurrence at
2030  * or after pos; set pose to the maximum of those positions.
2031  *
2032  * We need not consider ORs or NOTs here; see the comments for
2033  * TS_execute_locations(). Rechecking the match with TS_execute(),
2034  * below, will deal with any ensuing imprecision.
2035  */
2036  pose = -1;
2037  foreach(lc, locations)
2038  {
2039  ExecPhraseData *pdata = (ExecPhraseData *) lfirst(lc);
2040  int first = -1;
2041 
 /* taking the first hit as the earliest one assumes pos[] is in ascending order */
2042  for (int i = 0; i < pdata->npos; i++)
2043  {
2044  /* For phrase matches, use the ending lexeme */
2045  int endp = pdata->pos[i];
2046 
2047  if (endp >= pos)
2048  {
2049  first = endp;
2050  break;
2051  }
2052  }
2053  if (first < 0)
2054  return false; /* no more matches for this term */
2055  if (first > pose)
2056  pose = first;
2057  }
2058 
2059  if (pose < 0)
2060  return false; /* we only get here if empty list */
2061 
2062  /*
2063  * Now, for each AND'ed query term or phrase, find its last occurrence
2064  * at or before pose; set posb to the minimum of those positions.
2065  *
2066  * We start posb at INT_MAX - 1 to guarantee no overflow if we compute
2067  * posb + 1 below.
2068  */
2069  posb = INT_MAX - 1;
2070  foreach(lc, locations)
2071  {
2072  ExecPhraseData *pdata = (ExecPhraseData *) lfirst(lc);
2073  int last = -1;
2074 
2075  for (int i = pdata->npos - 1; i >= 0; i--)
2076  {
2077  /* For phrase matches, use the starting lexeme */
2078  int startp = pdata->pos[i] - pdata->width;
2079 
2080  if (startp <= pose)
2081  {
2082  last = startp;
2083  break;
2084  }
2085  }
 /* last is still -1 if no start position <= pose was found; posb then drops to -1 and is clamped to pos below */
2086  if (last < posb)
2087  posb = last;
2088  }
2089 
2090  /*
2091  * We could end up with posb to the left of pos, in case some phrase
2092  * match crosses pos. Try the match starting at pos anyway, since the
2093  * result of TS_execute_locations is imprecise for phrase matches OR'd
2094  * with plain matches; that is, if the query is "(A <-> B) | C" then C
2095  * could match at pos even though the phrase match would have to
2096  * extend to the left of pos.
2097  */
2098  posb = Max(posb, pos);
2099 
2100  /* This test probably always succeeds, but be paranoid */
2101  if (posb <= pose)
2102  {
2103  /*
2104  * posb .. pose is now the shortest, earliest-after-pos range of
2105  * lexeme positions containing all the query terms. It will
2106  * contain all phrase matches, too, except in the corner case
2107  * described just above.
2108  *
2109  * Now convert these lexeme positions to indexes in prs->words[].
2110  */
2111  int idxb = -1;
2112  int idxe = -1;
2113 
 /*
  * Only words attached to a query item are considered.  idxb becomes the
  * first such word positioned at or after posb, idxe the last one
  * positioned at or before pose; the scan stops at the first item word
  * lying beyond pose.
  */
2114  for (int i = 0; i < prs->curwords; i++)
2115  {
2116  if (prs->words[i].item == NULL)
2117  continue;
2118  if (idxb < 0 && prs->words[i].pos >= posb)
2119  idxb = i;
2120  if (prs->words[i].pos <= pose)
2121  idxe = i;
2122  else
2123  break;
2124  }
2125 
2126  /* This test probably always succeeds, but be paranoid */
2127  if (idxb >= 0 && idxe >= idxb)
2128  {
2129  /*
2130  * Finally, check that the selected range satisfies the query.
2131  * This should succeed in all simple cases; but odd cases
2132  * involving non-top-level NOT conditions or phrase matches
2133  * OR'd with other things could fail, since the result of
2134  * TS_execute_locations doesn't fully represent such things.
2135  */
2136  hlCheck ch;
2137 
 /* restrict the recheck to exactly the candidate word range idxb .. idxe */
2138  ch.words = &(prs->words[idxb]);
2139  ch.len = idxe - idxb + 1;
2140  if (TS_execute(GETQUERY(query), &ch,
2142  {
2143  /* Match! Advance *nextpos and return the word range. */
2144  *nextpos = posb + 1;
2145  *p = idxb;
2146  *q = idxe;
2147  return true;
2148  }
2149  }
2150  }
2151 
2152  /*
2153  * Advance pos and try again. Any later workable match must start
2154  * beyond posb.
2155  */
2156  pos = posb + 1;
2157  }
2158  /* Can't get here, but stupider compilers complain if we leave it off */
2159  return false;
2160 }
2161 
2162 /*
2163  * Apply suitable highlight marking to words selected by headline selector
2164  *
2165  * The words from startpos to endpos inclusive are marked per highlightall
2166  */
2167 static void
2168 mark_fragment(HeadlineParsedText *prs, bool highlightall,
2169  int startpos, int endpos)
2170 {
2171  int i;
2172 
2173  for (i = startpos; i <= endpos; i++)
2174  {
2175  if (prs->words[i].item)
2176  prs->words[i].selected = 1;
2177  if (!highlightall)
2178  {
2179  if (HLIDREPLACE(prs->words[i].type))
2180  prs->words[i].replace = 1;
2181  else if (HLIDSKIP(prs->words[i].type))
2182  prs->words[i].skip = 1;
2183  }
2184  else
2185  {
2186  if (XMLHLIDSKIP(prs->words[i].type))
2187  prs->words[i].skip = 1;
2188  }
2189 
2190  prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
2191  }
2192 }
2193 
2194 /*
2195  * split a cover substring into fragments not longer than max_words
2196  *
2197  * At entry, *startpos and *endpos are the (remaining) bounds of the cover
2198  * substring. They are updated to hold the bounds of the next fragment.
2199  *
2200  * *curlen and *poslen are set to the fragment's length, in words and
2201  * interesting words respectively.
2202  */
/*
 * "Words" here means tokens for which NONWORDTOKEN() is false; "interesting
 * words" are those satisfying INTERESTINGWORD().  Both macros read
 * prs->words[].
 */
2203 static void
2205  int *curlen, int *poslen, int max_words)
2206 {
2207  int i;
2208 
2209  /*
2210  * Objective: select a fragment of words between startpos and endpos such
2211  * that it has at most max_words and both ends have query words. If the
2212  * startpos and endpos are the endpoints of the cover and the cover has
2213  * fewer words than max_words, then this function should just return the
2214  * cover
2215  */
2216  /* first move startpos to an item */
 /* if no interesting word exists in the range, *startpos ends up equal to *endpos */
2217  for (i = *startpos; i <= *endpos; i++)
2218  {
2219  *startpos = i;
2220  if (INTERESTINGWORD(i))
2221  break;
2222  }
2223  /* cut endpos to have only max_words */
2224  *curlen = 0;
2225  *poslen = 0;
 /* on exit, i is one past the last word that was examined and counted */
2226  for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
2227  {
2228  if (!NONWORDTOKEN(prs->words[i].type))
2229  *curlen += 1;
2230  if (INTERESTINGWORD(i))
2231  *poslen += 1;
2232  }
2233  /* if the cover was cut then move back endpos to a query item */
 /*
  * NOTE(review): the walk-back below starts at index i itself, which the
  * counting loop above never examined.  If that word is interesting it
  * becomes the new end without being added to *curlen / *poslen; if it is
  * an ordinary word it is subtracted from *curlen although it was never
  * counted.  Confirm that this slight undercount is acceptable.
  */
2234  if (*endpos > i)
2235  {
2236  *endpos = i;
2237  for (i = *endpos; i >= *startpos; i--)
2238  {
2239  *endpos = i;
2240  if (INTERESTINGWORD(i))
2241  break;
 /* words dropped from the tail are never interesting, so only *curlen needs adjusting */
2242  if (!NONWORDTOKEN(prs->words[i].type))
2243  *curlen -= 1;
2244  }
2245  }
2246 }
2247 
2248 /*
2249  * Headline selector used when MaxFragments > 0
2250  *
2251  * Note: in this mode, highlightall is disregarded for phrase selection;
2252  * it only controls presentation details.
2253  */
2254 static void
2256  bool highlightall,
2257  int shortword, int min_words,
2258  int max_words, int max_fragments)
2259 {
2260  int32 poslen,
2261  curlen,
2262  i,
2263  f,
2264  num_f = 0;
2265  int32 stretch,
2266  maxstretch,
2267  posmarker;
2268 
2269  int32 startpos = 0,
2270  endpos = 0,
2271  nextpos = 0,
2272  p = 0,
2273  q = 0;
2274 
2275  int32 numcovers = 0,
2276  maxcovers = 32;
2277 
2278  int32 minI,
2279  minwords,
2280  maxitems;
2281  CoverPos *covers;
2282 
2283  covers = palloc(maxcovers * sizeof(CoverPos));
2284 
2285  /* get all covers */
2286  while (hlCover(prs, query, locations, &nextpos, &p, &q))
2287  {
2288  startpos = p;
2289  endpos = q;
2290 
2291  /*
2292  * Break the cover into smaller fragments such that each fragment has
2293  * at most max_words. Also ensure that each end of each fragment is a
2294  * query word. This will allow us to stretch the fragment in either
2295  * direction
2296  */
2297 
2298  while (startpos <= endpos)
2299  {
2300  get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
2301  if (numcovers >= maxcovers)
2302  {
2303  maxcovers *= 2;
2304  covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
2305  }
2306  covers[numcovers].startpos = startpos;
2307  covers[numcovers].endpos = endpos;
2308  covers[numcovers].curlen = curlen;
2309  covers[numcovers].poslen = poslen;
2310  covers[numcovers].chosen = false;
2311  covers[numcovers].excluded = false;
2312  numcovers++;
2313  startpos = endpos + 1;
2314  endpos = q;
2315  }
2316  }
2317 
2318  /* choose best covers */
2319  for (f = 0; f < max_fragments; f++)
2320  {
2321  maxitems = 0;
2322  minwords = PG_INT32_MAX;
2323  minI = -1;
2324 
2325  /*
2326  * Choose the cover that contains max items. In case of tie choose the
2327  * one with smaller number of words.
2328  */
2329  for (i = 0; i < numcovers; i++)
2330  {
2331  if (!covers[i].chosen && !covers[i].excluded &&
2332  (maxitems < covers[i].poslen ||
2333  (maxitems == covers[i].poslen &&
2334  minwords > covers[i].curlen)))
2335  {
2336  maxitems = covers[i].poslen;
2337  minwords = covers[i].curlen;
2338  minI = i;
2339  }
2340  }
2341  /* if a cover was found mark it */
2342  if (minI >= 0)
2343  {
2344  covers[minI].chosen = true;
2345  /* adjust the size of cover */
2346  startpos = covers[minI].startpos;
2347  endpos = covers[minI].endpos;
2348  curlen = covers[minI].curlen;
2349  /* stretch the cover if cover size is lower than max_words */
2350  if (curlen < max_words)
2351  {
2352  /* divide the stretch on both sides of cover */
2353  maxstretch = (max_words - curlen) / 2;
2354 
2355  /*
2356  * first stretch the startpos stop stretching if 1. we hit the
2357  * beginning of document 2. exceed maxstretch 3. we hit an
2358  * already marked fragment
2359  */
2360  stretch = 0;
2361  posmarker = startpos;
2362  for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
2363  {
2364  if (!NONWORDTOKEN(prs->words[i].type))
2365  {
2366  curlen++;
2367  stretch++;
2368  }
2369  posmarker = i;
2370  }
2371  /* cut back startpos till we find a good endpoint */
2372  for (i = posmarker; i < startpos && BADENDPOINT(i); i++)
2373  {
2374  if (!NONWORDTOKEN(prs->words[i].type))
2375  curlen--;
2376  }
2377  startpos = i;
2378  /* now stretch the endpos as much as possible */
2379  posmarker = endpos;
2380  for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
2381  {
2382  if (!NONWORDTOKEN(prs->words[i].type))
2383  curlen++;
2384  posmarker = i;
2385  }
2386  /* cut back endpos till we find a good endpoint */
2387  for (i = posmarker; i > endpos && BADENDPOINT(i); i--)
2388  {
2389  if (!NONWORDTOKEN(prs->words[i].type))
2390  curlen--;
2391  }
2392  endpos = i;
2393  }
2394  covers[minI].startpos = startpos;
2395  covers[minI].endpos = endpos;
2396  covers[minI].curlen = curlen;
2397  /* Mark the chosen fragments (covers) */
2398  mark_fragment(prs, highlightall, startpos, endpos);
2399  num_f++;
2400  /* Exclude covers overlapping this one from future consideration */
2401  for (i = 0; i < numcovers; i++)
2402  {
2403  if (i != minI &&
2404  ((covers[i].startpos >= startpos &&
2405  covers[i].startpos <= endpos) ||
2406  (covers[i].endpos >= startpos &&
2407  covers[i].endpos <= endpos) ||
2408  (covers[i].startpos < startpos &&
2409  covers[i].endpos > endpos)))
2410  covers[i].excluded = true;
2411  }
2412  }
2413  else
2414  break; /* no selectable covers remain */
2415  }
2416 
2417  /* show the first min_words words if we have not marked anything */
2418  if (num_f <= 0)
2419  {
2420  startpos = curlen = 0;
2421  endpos = -1;
2422  for (i = 0; i < prs->curwords && curlen < min_words; i++)
2423  {
2424  if (!NONWORDTOKEN(prs->words[i].type))
2425  curlen++;
2426  endpos = i;
2427  }
2428  mark_fragment(prs, highlightall, startpos, endpos);
2429  }
2430 
2431  pfree(covers);
2432 }
2433 
2434 /*
2435  * Headline selector used when MaxFragments == 0
2436  */
2437 static void
2439  bool highlightall,
2440  int shortword, int min_words, int max_words)
2441 {
2442  int nextpos = 0,
2443  p = 0,
2444  q = 0;
2445  int bestb = -1,
2446  beste = -1;
2447  int bestlen = -1;
2448  bool bestcover = false;
2449  int pose,
2450  posb,
2451  poslen,
2452  curlen;
2453  bool poscover;
2454  int i;
2455 
2456  if (!highlightall)
2457  {
2458  /* examine all covers, select a headline using the best one */
2459  while (hlCover(prs, query, locations, &nextpos, &p, &q))
2460  {
2461  /*
2462  * Count words (curlen) and interesting words (poslen) within
2463  * cover, but stop once we reach max_words. This step doesn't
2464  * consider whether that's a good stopping point. posb and pose
2465  * are set to the start and end indexes of the possible headline.
2466  */
2467  curlen = 0;
2468  poslen = 0;
2469  posb = pose = p;
2470  for (i = p; i <= q && curlen < max_words; i++)
2471  {
2472  if (!NONWORDTOKEN(prs->words[i].type))
2473  curlen++;
2474  if (INTERESTINGWORD(i))
2475  poslen++;
2476  pose = i;
2477  }
2478 
2479  if (curlen < max_words)
2480  {
2481  /*
2482  * We have room to lengthen the headline, so search forward
2483  * until it's full or we find a good stopping point. We'll
2484  * reconsider the word at "q", then move forward.
2485  */
2486  for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
2487  {
2488  if (i > q)
2489  {
2490  if (!NONWORDTOKEN(prs->words[i].type))
2491  curlen++;
2492  if (INTERESTINGWORD(i))
2493  poslen++;
2494  }
2495  pose = i;
2496  if (BADENDPOINT(i))
2497  continue;
2498  if (curlen >= min_words)
2499  break;
2500  }
2501  if (curlen < min_words)
2502  {
2503  /*
2504  * Reached end of text and our headline is still shorter
2505  * than min_words, so try to extend it to the left.
2506  */
2507  for (i = p - 1; i >= 0; i--)
2508  {
2509  if (!NONWORDTOKEN(prs->words[i].type))
2510  curlen++;
2511  if (INTERESTINGWORD(i))
2512  poslen++;
2513  if (curlen >= max_words)
2514  break;
2515  if (BADENDPOINT(i))
2516  continue;
2517  if (curlen >= min_words)
2518  break;
2519  }
2520  posb = (i >= 0) ? i : 0;
2521  }
2522  }
2523  else
2524  {
2525  /*
2526  * Can't make headline longer, so consider making it shorter
2527  * if needed to avoid a bad endpoint.
2528  */
2529  if (i > q)
2530  i = q;
2531  for (; curlen > min_words; i--)
2532  {
2533  if (!BADENDPOINT(i))
2534  break;
2535  if (!NONWORDTOKEN(prs->words[i].type))
2536  curlen--;
2537  if (INTERESTINGWORD(i))
2538  poslen--;
2539  pose = i - 1;
2540  }
2541  }
2542 
2543  /*
2544  * Check whether the proposed headline includes the original
2545  * cover; it might not if we trimmed it due to max_words.
2546  */
2547  poscover = (posb <= p && pose >= q);
2548 
2549  /*
2550  * Adopt this headline if it's better than the last one, giving
2551  * highest priority to headlines including the cover, then to
2552  * headlines with more interesting words, then to headlines with
2553  * good stopping points. (Since bestlen is initially -1, we will
2554  * certainly adopt the first headline.)
2555  */
2556  if (poscover > bestcover ||
2557  (poscover == bestcover && poslen > bestlen) ||
2558  (poscover == bestcover && poslen == bestlen &&
2559  !BADENDPOINT(pose) && BADENDPOINT(beste)))
2560  {
2561  bestb = posb;
2562  beste = pose;
2563  bestlen = poslen;
2564  bestcover = poscover;
2565  }
2566  }
2567 
2568  /*
2569  * If we found nothing acceptable, select min_words words starting at
2570  * the beginning.
2571  */
2572  if (bestlen < 0)
2573  {
2574  curlen = 0;
2575  pose = -1;
2576  for (i = 0; i < prs->curwords && curlen < min_words; i++)
2577  {
2578  if (!NONWORDTOKEN(prs->words[i].type))
2579  curlen++;
2580  pose = i;
2581  }
2582  bestb = 0;
2583  beste = pose;
2584  }
2585  }
2586  else
2587  {
2588  /* highlightall mode: headline is whole document */
2589  bestb = 0;
2590  beste = prs->curwords - 1;
2591  }
2592 
2593  mark_fragment(prs, highlightall, bestb, beste);
2594 }
2595 
2596 /*
2597  * Default parser's prsheadline function
2598  */
2599 Datum
2601 {
2603  List *prsoptions = (List *) PG_GETARG_POINTER(1);
2604  TSQuery query = PG_GETARG_TSQUERY(2);
2605  List *locations;
2606 
2607  /* default option values: */
2608  int min_words = 15;
2609  int max_words = 35;
2610  int shortword = 3;
2611  int max_fragments = 0;
2612  bool highlightall = false;
2613  ListCell *l;
2614 
2615  /* Extract configuration option values */
2616  prs->startsel = NULL;
2617  prs->stopsel = NULL;
2618  prs->fragdelim = NULL;
2619  foreach(l, prsoptions)
2620  {
2621  DefElem *defel = (DefElem *) lfirst(l);
2622  char *val = defGetString(defel);
2623 
2624  if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
2625  max_words = pg_strtoint32(val);
2626  else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
2627  min_words = pg_strtoint32(val);
2628  else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
2629  shortword = pg_strtoint32(val);
2630  else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
2631  max_fragments = pg_strtoint32(val);
2632  else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
2633  prs->startsel = pstrdup(val);
2634  else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
2635  prs->stopsel = pstrdup(val);
2636  else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
2637  prs->fragdelim = pstrdup(val);
2638  else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
2639  highlightall = (pg_strcasecmp(val, "1") == 0 ||
2640  pg_strcasecmp(val, "on") == 0 ||
2641  pg_strcasecmp(val, "true") == 0 ||
2642  pg_strcasecmp(val, "t") == 0 ||
2643  pg_strcasecmp(val, "y") == 0 ||
2644  pg_strcasecmp(val, "yes") == 0);
2645  else
2646  ereport(ERROR,
2647  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2648  errmsg("unrecognized headline parameter: \"%s\"",
2649  defel->defname)));
2650  }
2651 
2652  /* in HighlightAll mode these parameters are ignored */
2653  if (!highlightall)
2654  {
2655  if (min_words >= max_words)
2656  ereport(ERROR,
2657  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2658  errmsg("MinWords should be less than MaxWords")));
2659  if (min_words <= 0)
2660  ereport(ERROR,
2661  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2662  errmsg("MinWords should be positive")));
2663  if (shortword < 0)
2664  ereport(ERROR,
2665  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2666  errmsg("ShortWord should be >= 0")));
2667  if (max_fragments < 0)
2668  ereport(ERROR,
2669  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2670  errmsg("MaxFragments should be >= 0")));
2671  }
2672 
2673  /* Locate words and phrases matching the query */
2674  if (query->size > 0)
2675  {
2676  hlCheck ch;
2677 
2678  ch.words = prs->words;
2679  ch.len = prs->curwords;
2680  locations = TS_execute_locations(GETQUERY(query), &ch, TS_EXEC_EMPTY,
2682  }
2683  else
2684  locations = NIL; /* empty query matches nothing */
2685 
2686  /* Apply appropriate headline selector */
2687  if (max_fragments == 0)
2688  mark_hl_words(prs, query, locations, highlightall, shortword,
2689  min_words, max_words);
2690  else
2691  mark_hl_fragments(prs, query, locations, highlightall, shortword,
2692  min_words, max_words, max_fragments);
2693 
2694  /* Fill in default values for string options */
2695  if (!prs->startsel)
2696  prs->startsel = pstrdup("<b>");
2697  if (!prs->stopsel)
2698  prs->stopsel = pstrdup("</b>");
2699  if (!prs->fragdelim)
2700  prs->fragdelim = pstrdup(" ... ");
2701 
2702  /* Caller will need these lengths, too */
2703  prs->startsellen = strlen(prs->startsel);
2704  prs->stopsellen = strlen(prs->stopsel);
2705  prs->fragdelimlen = strlen(prs->fragdelim);
2706 
2707  PG_RETURN_POINTER(prs);
2708 }
#define GETQUERY(x)
Definition: _int.h:157
void print(const void *obj)
Definition: print.c:36
unsigned short uint16
Definition: c.h:489
#define PG_INT32_MAX
Definition: c.h:573
signed int int32
Definition: c.h:478
#define Max(x, y)
Definition: c.h:982
#define lengthof(array)
Definition: c.h:772
char * defGetString(DefElem *def)
Definition: define.c:49
int errcode(int sqlerrcode)
Definition: elog.c:858
int errmsg(const char *fmt,...)
Definition: elog.c:1069
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
#define PG_RETURN_VOID()
Definition: fmgr.h:349
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276
#define PG_RETURN_INT32(x)
Definition: fmgr.h:354
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
long val
Definition: informix.c:664
int i
Definition: isn.c:73
Assert(fmt[strlen(fmt) - 1] !='\n')
unsigned int pg_wchar
Definition: mbprint.c:31
int GetDatabaseEncoding(void)
Definition: mbutils.c:1268
int pg_dsplen(const char *mbstr)
Definition: mbutils.c:1031
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1553
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
Definition: mbutils.c:987
int pg_mblen(const char *mbstr)
Definition: mbutils.c:1024
char * pstrdup(const char *in)
Definition: mcxt.c:1644
void pfree(void *pointer)
Definition: mcxt.c:1456
void * palloc0(Size size)
Definition: mcxt.c:1257
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1476
void * palloc(Size size)
Definition: mcxt.c:1226
int32 pg_strtoint32(const char *s)
Definition: numutils.c:291
Datum lower(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:49
Datum upper(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:80
const void size_t len
const void * data
#define lfirst(lc)
Definition: pg_list.h:172
#define NIL
Definition: pg_list.h:68
bool database_ctype_is_c
Definition: pg_locale.c:114
size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, pg_locale_t locale)
Definition: pg_locale.c:3007
static XLogRecPtr endpos
Definition: pg_receivewal.c:56
static XLogRecPtr startpos
@ PG_UTF8
Definition: pg_wchar.h:232
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
#define fprintf
Definition: port.h:242
int pg_strncasecmp(const char *s1, const char *s2, size_t n)
Definition: pgstrcasecmp.c:69
uintptr_t Datum
Definition: postgres.h:64
char * c
bool chosen
Definition: wparser_def.c:1945
int32 endpos
Definition: wparser_def.c:1942
int32 curlen
Definition: wparser_def.c:1944
int32 startpos
Definition: wparser_def.c:1941
bool excluded
Definition: wparser_def.c:1946
int32 poslen
Definition: wparser_def.c:1943
char * defname
Definition: parsenodes.h:810
WordEntryPos * pos
Definition: ts_utils.h:166
HeadlineWordEntry * words
Definition: ts_public.h:76
WordEntryPos pos
Definition: ts_public.h:68
QueryOperand * item
Definition: ts_public.h:70
char * alias
Definition: ts_public.h:28
int lexid
Definition: ts_public.h:27
char * descr
Definition: ts_public.h:29
Definition: pg_list.h:54
const TParserStateActionItem * pushedAtAction
Definition: wparser_def.c:236
struct TParserPosition * prev
Definition: wparser_def.c:235
TParserState state
Definition: wparser_def.c:234
TParserCharTest isclass
Definition: wparser_def.c:209
TParserState tostate
Definition: wparser_def.c:212
TParserSpecial special
Definition: wparser_def.c:214
const TParserStateActionItem * action
Definition: wparser_def.c:1595
TParserState state
Definition: wparser_def.c:1596
char * str
Definition: wparser_def.c:242
pg_wchar * pgwstr
Definition: wparser_def.c:245
wchar_t * wstr
Definition: wparser_def.c:244
int lenstr
Definition: wparser_def.c:243
char * token
Definition: wparser_def.c:258
int type
Definition: wparser_def.c:261
int charmaxlen
Definition: wparser_def.c:249
bool wanthost
Definition: wparser_def.c:252
int lenbytetoken
Definition: wparser_def.c:259
bool ignore
Definition: wparser_def.c:251
TParserPosition * state
Definition: wparser_def.c:250
int lenchartoken
Definition: wparser_def.c:260
char c
Definition: wparser_def.c:255
bool usewide
Definition: wparser_def.c:246
int32 size
Definition: ts_type.h:221
HeadlineWordEntry * words
Definition: wparser_def.c:1952
#define PG_GETARG_TSQUERY(n)
Definition: ts_type.h:266
uint16 WordEntryPos
Definition: ts_type.h:63
TSTernaryValue
Definition: ts_utils.h:133
@ TS_NO
Definition: ts_utils.h:134
@ TS_YES
Definition: ts_utils.h:135
#define TS_EXEC_EMPTY
Definition: ts_utils.h:188
bool TS_execute(QueryItem *curitem, void *arg, uint32 flags, TSExecuteCallback chkcond)
Definition: tsvector_op.c:1856
List * TS_execute_locations(QueryItem *curitem, void *arg, uint32 flags, TSExecuteCallback chkcond)
Definition: tsvector_op.c:2009
static const TParserStateActionItem actionTPS_InParseHyphen[]
Definition: wparser_def.c:1541
static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[]
Definition: wparser_def.c:1152
static const TParserStateActionItem actionTPS_InHyphenWordFirst[]
Definition: wparser_def.c:1502
#define NONWORDTOKEN(x)
Definition: wparser_def.c:1920
static const TParserStateActionItem actionTPS_InXMLEntityFirst[]
Definition: wparser_def.c:1132
static const TParserStateActionItem actionTPS_InHostFirstAN[]
Definition: wparser_def.c:1359
#define VERSIONNUMBER
Definition: wparser_def.c:41
static const TParserStateActionItem actionTPS_InHyphenNumWordPart[]
Definition: wparser_def.c:1574
#define BADENDPOINT(j)
Definition: wparser_def.c:1934
#define ASCIIWORD
Definition: wparser_def.c:34
#define PROTOCOL
Definition: wparser_def.c:47
static const TParserStateActionItem actionTPS_InPathSecond[]
Definition: wparser_def.c:1419
static const TParserStateActionItem actionTPS_InPathFirst[]
Definition: wparser_def.c:1402
static const TParserStateActionItem actionTPS_InHostDomainSecond[]
Definition: wparser_def.c:1319
static const TParserStateActionItem actionTPS_InCloseCommentFirst[]
Definition: wparser_def.c:1295
static void SpecialFURL(TParser *prs)
Definition: wparser_def.c:586
static const TParserStateActionItem actionTPS_InCommentEnd[]
Definition: wparser_def.c:1308
struct TParser TParser
static TSTernaryValue checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
Definition: wparser_def.c:1965
void _make_compiler_happy(void)
Definition: wparser_def.c:535
static const TParserStateActionItem actionTPS_InURLPathStart[]
Definition: wparser_def.c:1452
static const TParserStateActionItem actionTPS_InHostFirstDomain[]
Definition: wparser_def.c:1312
static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[]
Definition: wparser_def.c:1533
static const TParserStateActionItem actionTPS_InHostDomain[]
Definition: wparser_def.c:1330
static const TParserStateActionItem actionTPS_InVersion[]
Definition: wparser_def.c:1105
#define XMLHLIDSKIP(x)
Definition: wparser_def.c:1919
static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[]
Definition: wparser_def.c:1484
Datum prsd_nexttoken(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1886
static const TParserStateActionItem actionTPS_InTagName[]
Definition: wparser_def.c:1209
#define DECIMAL_T
Definition: wparser_def.c:53
static const TParserStateActionItem actionTPS_InFileNext[]
Definition: wparser_def.c:1438
static const TParserStateActionItem actionTPS_InXMLEntity[]
Definition: wparser_def.c:1141
#define ASCIIPARTHWORD
Definition: wparser_def.c:44
static const TParserStateActionItem actionTPS_InFURL[]
Definition: wparser_def.c:1462
#define p_iswhat(type, nonascii)
Definition: wparser_def.c:422
static const TParserStateActionItem actionTPS_InMantissaSign[]
Definition: wparser_def.c:1120
static void mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, List *locations, bool highlightall, int shortword, int min_words, int max_words, int max_fragments)
Definition: wparser_def.c:2255
#define WORD_T
Definition: wparser_def.c:35
TParserState
Definition: wparser_def.c:118
@ TPS_InXMLEntityHexNumFirst
Definition: wparser_def.c:142
@ TPS_InPort
Definition: wparser_def.c:165
@ TPS_InXMLEntityHexNum
Definition: wparser_def.c:143
@ TPS_InHostDomainSecond
Definition: wparser_def.c:162
@ TPS_InMantissaFirst
Definition: wparser_def.c:135
@ TPS_InTagName
Definition: wparser_def.c:148
@ TPS_InHyphenAsciiWordFirst
Definition: wparser_def.c:183
@ TPS_Null
Definition: wparser_def.c:196
@ TPS_InPathFirstFirst
Definition: wparser_def.c:172
@ TPS_InSignedIntFirst
Definition: wparser_def.c:124
@ TPS_InSignedInt
Definition: wparser_def.c:125
@ TPS_InUnsignedInt
Definition: wparser_def.c:123
@ TPS_InMantissa
Definition: wparser_def.c:137
@ TPS_InProtocolFirst
Definition: wparser_def.c:180
@ TPS_InFURL
Definition: wparser_def.c:179
@ TPS_InMantissaSign
Definition: wparser_def.c:136
@ TPS_InXMLBegin
Definition: wparser_def.c:146
@ TPS_InCommentEnd
Definition: wparser_def.c:160
@ TPS_InHyphenWordFirst
Definition: wparser_def.c:185
@ TPS_InHyphenNumWordPart
Definition: wparser_def.c:194
@ TPS_InPortFirst
Definition: wparser_def.c:164
@ TPS_InProtocolEnd
Definition: wparser_def.c:182
@ TPS_InXMLEntityFirst
Definition: wparser_def.c:138
@ TPS_InHyphenNumWordFirst
Definition: wparser_def.c:187
@ TPS_InCommentLast
Definition: wparser_def.c:156
@ TPS_InFileTwiddle
Definition: wparser_def.c:170
@ TPS_InURLPathStart
Definition: wparser_def.c:177
@ TPS_InURLPathFirst
Definition: wparser_def.c:176
@ TPS_InPathFirst
Definition: wparser_def.c:171
@ TPS_InPathSecond
Definition: wparser_def.c:173
@ TPS_InHyphenUnsignedInt
Definition: wparser_def.c:195
@ TPS_InFileFirst
Definition: wparser_def.c:169
@ TPS_InXMLEntityNumFirst
Definition: wparser_def.c:140
@ TPS_InHyphenWordPart
Definition: wparser_def.c:192
@ TPS_InNumWord
Definition: wparser_def.c:120
@ TPS_InAsciiWord
Definition: wparser_def.c:121
@ TPS_InVersion
Definition: wparser_def.c:134
@ TPS_InHost
Definition: wparser_def.c:167
@ TPS_InFile
Definition: wparser_def.c:174
@ TPS_InProtocolSecond
Definition: wparser_def.c:181
@ TPS_InCloseCommentFirst
Definition: wparser_def.c:158
@ TPS_InTagEscapeK
Definition: wparser_def.c:151
@ TPS_InParseHyphenHyphen
Definition: wparser_def.c:191
@ TPS_InTagBackSleshed
Definition: wparser_def.c:153
@ TPS_InTagFirst
Definition: wparser_def.c:145
@ TPS_InTagEnd
Definition: wparser_def.c:154
@ TPS_InComment
Definition: wparser_def.c:157
@ TPS_InHyphenWord
Definition: wparser_def.c:186
@ TPS_InHyphenAsciiWord
Definition: wparser_def.c:184
@ TPS_InWord
Definition: wparser_def.c:122
@ TPS_InXMLEntityEnd
Definition: wparser_def.c:144
@ TPS_InTagEscapeKK
Definition: wparser_def.c:152
@ TPS_InSpace
Definition: wparser_def.c:126
@ TPS_InFileNext
Definition: wparser_def.c:175
@ TPS_InURLPath
Definition: wparser_def.c:178
@ TPS_Base
Definition: wparser_def.c:119
@ TPS_InUDecimal
Definition: wparser_def.c:128
@ TPS_InParseHyphen
Definition: wparser_def.c:190
@ TPS_InHostFirstAN
Definition: wparser_def.c:166
@ TPS_InEmail
Definition: wparser_def.c:168
@ TPS_InDecimalFirst
Definition: wparser_def.c:129
@ TPS_InVersionFirst
Definition: wparser_def.c:133
@ TPS_InCloseCommentLast
Definition: wparser_def.c:159
@ TPS_InSVerVersion
Definition: wparser_def.c:132
@ TPS_InHyphenAsciiWordPart
Definition: wparser_def.c:193
@ TPS_InCommentFirst
Definition: wparser_def.c:155
@ TPS_InUDecimalFirst
Definition: wparser_def.c:127
@ TPS_InHostFirstDomain
Definition: wparser_def.c:161
@ TPS_InHostDomain
Definition: wparser_def.c:163
@ TPS_InHyphenDigitLookahead
Definition: wparser_def.c:189
@ TPS_InVerVersion
Definition: wparser_def.c:131
@ TPS_InXMLEntityNum
Definition: wparser_def.c:141
@ TPS_InTag
Definition: wparser_def.c:150
@ TPS_InDecimal
Definition: wparser_def.c:130
@ TPS_InTagCloseFirst
Definition: wparser_def.c:147
@ TPS_InXMLEntity
Definition: wparser_def.c:139
@ TPS_InHyphenNumWord
Definition: wparser_def.c:188
@ TPS_InTagBeginEnd
Definition: wparser_def.c:149
static void mark_fragment(HeadlineParsedText *prs, bool highlightall, int startpos, int endpos)
Definition: wparser_def.c:2168
static const TParserStateActionItem actionTPS_InXMLEntityEnd[]
Definition: wparser_def.c:1180
static const TParserStateActionItem actionTPS_InHyphenNumWord[]
Definition: wparser_def.c:1525
static const TParserStateActionItem actionTPS_InDecimal[]
Definition: wparser_def.c:1077
#define A_POP
Definition: wparser_def.c:220
static const TParserStateActionItem actionTPS_InSignedIntFirst[]
Definition: wparser_def.c:1029
static const TParserStateActionItem actionTPS_InTagEscapeK[]
Definition: wparser_def.c:1251
static const TParserStateActionItem actionTPS_InSpace[]
Definition: wparser_def.c:1044
static const TParserStateActionItem actionTPS_InFile[]
Definition: wparser_def.c:1427
static TParser * TParserCopyInit(const TParser *orig)
Definition: wparser_def.c:344
static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[]
Definition: wparser_def.c:1565
#define LASTNUM
Definition: wparser_def.c:58
static int p_iseqC(TParser *prs)
Definition: wparser_def.c:479
Datum prsd_headline(PG_FUNCTION_ARGS)
Definition: wparser_def.c:2600
#define NUMHWORD
Definition: wparser_def.c:48
static bool hlCover(HeadlineParsedText *prs, TSQuery query, List *locations, int *nextpos, int *p, int *q)
Definition: wparser_def.c:2016
#define SPACE
Definition: wparser_def.c:45
static const TParserStateActionItem actionTPS_InUDecimal[]
Definition: wparser_def.c:1062
int(* TParserCharTest)(struct TParser *)
Definition: wparser_def.c:202
static const TParserStateActionItem actionTPS_InSignedInt[]
Definition: wparser_def.c:1035
static int p_isurlchar(TParser *prs)
Definition: wparser_def.c:503
static const TParserStateActionItem actionTPS_InTagBeginEnd[]
Definition: wparser_def.c:1223
static const TParserStateActionItem actionTPS_InTagFirst[]
Definition: wparser_def.c:1184
struct TParserPosition TParserPosition
#define NUMWORD
Definition: wparser_def.c:36
#define FILEPATH
Definition: wparser_def.c:52
static const TParserStateActionItem actionTPS_InTagEscapeKK[]
Definition: wparser_def.c:1258
static int p_isneC(TParser *prs)
Definition: wparser_def.c:485
#define EMAIL
Definition: wparser_def.c:37
static const TParserStateActionItem actionTPS_InCommentLast[]
Definition: wparser_def.c:1283
static const TParserStateActionItem actionTPS_InHyphenWordPart[]
Definition: wparser_def.c:1557
static const TParserStateActionItem actionTPS_InMantissaFirst[]
Definition: wparser_def.c:1112
static const TParserStateActionItem actionTPS_Base[]
Definition: wparser_def.c:957
static void SpecialHyphen(TParser *prs)
Definition: wparser_def.c:594
static void mark_hl_words(HeadlineParsedText *prs, TSQuery query, List *locations, bool highlightall, int shortword, int min_words, int max_words)
Definition: wparser_def.c:2438
static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[]
Definition: wparser_def.c:1518
#define UNSIGNEDINT
Definition: wparser_def.c:55
void(* TParserSpecial)(struct TParser *)
Definition: wparser_def.c:204
static const TParserStateActionItem actionTPS_InEmail[]
Definition: wparser_def.c:1377
static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[]
Definition: wparser_def.c:1160
static const TParserStateActionItem actionTPS_InURLPath[]
Definition: wparser_def.c:1456
#define A_RERUN
Definition: wparser_def.c:222
static const TParserStateActionItem actionTPS_InSVerVersion[]
Definition: wparser_def.c:1092
static const TParserStateActionItem actionTPS_InAsciiWord[]
Definition: wparser_def.c:985
static const char *const tok_alias[]
Definition: wparser_def.c:60
static int p_isstophost(TParser *prs)
Definition: wparser_def.c:610
#define HLIDSKIP(x)
Definition: wparser_def.c:1918
static void get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos, int *curlen, int *poslen, int max_words)
Definition: wparser_def.c:2204
static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[]
Definition: wparser_def.c:1581
#define SIGNEDINT
Definition: wparser_def.c:54
static int p_isasclet(TParser *prs)
Definition: wparser_def.c:497
static const TParserStateAction Actions[]
Definition: wparser_def.c:1614
static const TParserStateActionItem actionTPS_InXMLBegin[]
Definition: wparser_def.c:1195
#define PARTHWORD
Definition: wparser_def.c:43
#define HLIDREPLACE(x)
Definition: wparser_def.c:1917
#define A_MERGE
Definition: wparser_def.c:224
static const TParserStateActionItem actionTPS_InMantissa[]
Definition: wparser_def.c:1126
static const TParserStateActionItem actionTPS_InVersionFirst[]
Definition: wparser_def.c:1099
static int p_isascii(TParser *prs)
Definition: wparser_def.c:491
static const TParserStateActionItem actionTPS_InCommentFirst[]
Definition: wparser_def.c:1274
static const TParserStateActionItem actionTPS_InHyphenWord[]
Definition: wparser_def.c:1509
static int p_isignore(TParser *prs)
Definition: wparser_def.c:621
static const TParserStateActionItem actionTPS_InParseHyphenHyphen[]
Definition: wparser_def.c:1550
static const TParserStateActionItem actionTPS_InPort[]
Definition: wparser_def.c:1351
#define TAG_T
Definition: wparser_def.c:46
static const TParserStateActionItem actionTPS_InDecimalFirst[]
Definition: wparser_def.c:1071
static TParserPosition * newTParserPosition(TParserPosition *prev)
Definition: wparser_def.c:270
#define URLPATH
Definition: wparser_def.c:51
Datum prsd_lextype(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1862
#define ASCIIHWORD
Definition: wparser_def.c:49
#define HOST
Definition: wparser_def.c:39
static const TParserStateActionItem actionTPS_InTag[]
Definition: wparser_def.c:1229
Datum prsd_start(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1880
static TParser * TParserInit(char *str, int len)
Definition: wparser_def.c:287
#define A_BINGO
Definition: wparser_def.c:219
#define TPARSERSTATEACTION(state)
Definition: wparser_def.c:1606
static bool TParserGet(TParser *prs)
Definition: wparser_def.c:1696
#define XMLENTITY
Definition: wparser_def.c:56
static int p_ishost(TParser *prs)
Definition: wparser_def.c:627
Datum prsd_end(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1902
#define A_CLRALL
Definition: wparser_def.c:225
static int p_isURLPath(TParser *prs)
Definition: wparser_def.c:649
static void SpecialVerVersion(TParser *prs)
Definition: wparser_def.c:601
static const TParserStateActionItem actionTPS_InProtocolFirst[]
Definition: wparser_def.c:1468
static const TParserStateActionItem actionTPS_InUnsignedInt[]
Definition: wparser_def.c:1012
static const TParserStateActionItem actionTPS_InUDecimalFirst[]
Definition: wparser_def.c:1056
static const TParserStateActionItem actionTPS_InTagCloseFirst[]
Definition: wparser_def.c:1203
static int p_isEOF(TParser *prs)
Definition: wparser_def.c:472
static const TParserStateActionItem actionTPS_InCloseCommentLast[]
Definition: wparser_def.c:1301
static void TParserCopyClose(TParser *prs)
Definition: wparser_def.c:395
#define A_CLEAR
Definition: wparser_def.c:223
static const TParserStateActionItem actionTPS_InFileFirst[]
Definition: wparser_def.c:1383
static const TParserStateActionItem actionTPS_InNumWord[]
Definition: wparser_def.c:974
static const TParserStateActionItem actionTPS_InFileTwiddle[]
Definition: wparser_def.c:1393
static const TParserStateActionItem actionTPS_InHost[]
Definition: wparser_def.c:1366
#define A_PUSH
Definition: wparser_def.c:221
static const TParserStateActionItem actionTPS_InTagBackSleshed[]
Definition: wparser_def.c:1265
static const TParserStateActionItem actionTPS_InProtocolSecond[]
Definition: wparser_def.c:1474
static const TParserStateActionItem actionTPS_InWord[]
Definition: wparser_def.c:1003
static int p_isspecial(TParser *prs)
Definition: wparser_def.c:678
static void TParserClose(TParser *prs)
Definition: wparser_def.c:370
#define URL_T
Definition: wparser_def.c:38
static const TParserStateActionItem actionTPS_InXMLEntityNum[]
Definition: wparser_def.c:1166
static const TParserStateActionItem actionTPS_InVerVersion[]
Definition: wparser_def.c:1086
static const TParserStateActionItem actionTPS_InHyphenAsciiWord[]
Definition: wparser_def.c:1492
static const TParserStateActionItem actionTPS_InXMLEntityHexNum[]
Definition: wparser_def.c:1173
#define A_NEXT
Definition: wparser_def.c:218
static const TParserStateActionItem actionTPS_InPortFirst[]
Definition: wparser_def.c:1345
#define HWORD
Definition: wparser_def.c:50
#define NUMPARTHWORD
Definition: wparser_def.c:42
static const char *const lex_descr[]
Definition: wparser_def.c:87
#define INTERESTINGWORD(j)
Definition: wparser_def.c:1930
#define SCIENTIFIC
Definition: wparser_def.c:40
static void SpecialTags(TParser *prs)
Definition: wparser_def.c:562
static const TParserStateActionItem actionTPS_InTagEnd[]
Definition: wparser_def.c:1270
static const TParserStateActionItem actionTPS_InComment[]
Definition: wparser_def.c:1289
static const TParserStateActionItem actionTPS_InProtocolEnd[]
Definition: wparser_def.c:1480
static const TParserStateActionItem actionTPS_InURLPathFirst[]
Definition: wparser_def.c:1446
static const TParserStateActionItem actionTPS_InPathFirstFirst[]
Definition: wparser_def.c:1412