PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
wparser_def.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * wparser_def.c
4  * Default text search parser
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  * src/backend/tsearch/wparser_def.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include <limits.h>
18 
19 #include "catalog/pg_collation.h"
20 #include "commands/defrem.h"
21 #include "tsearch/ts_locale.h"
22 #include "tsearch/ts_public.h"
23 #include "tsearch/ts_type.h"
24 #include "tsearch/ts_utils.h"
25 #include "utils/builtins.h"
26 
27 
28 /* Define me to enable tracing of parser behavior */
29 /* #define WPARSER_TRACE */
30 
31 
32 /* Output token categories */
33 
#define ASCIIWORD		1		/* Word, all ASCII */
#define WORD_T			2		/* Word, all letters */
#define NUMWORD			3		/* Word, letters and digits */
#define EMAIL			4		/* Email address */
#define URL_T			5		/* URL */
#define HOST			6		/* Host */
#define SCIENTIFIC		7		/* Scientific notation */
#define VERSIONNUMBER	8		/* Version number */
#define NUMPARTHWORD	9		/* Hyphenated word part, letters and digits */
#define PARTHWORD		10		/* Hyphenated word part, all letters */
#define ASCIIPARTHWORD	11		/* Hyphenated word part, all ASCII */
#define SPACE			12		/* Space symbols */
#define TAG_T			13		/* XML tag */
#define PROTOCOL		14		/* Protocol head */
#define NUMHWORD		15		/* Hyphenated word, letters and digits */
#define ASCIIHWORD		16		/* Hyphenated word, all ASCII */
#define HWORD			17		/* Hyphenated word, all letters */
#define URLPATH			18		/* URL path */
#define FILEPATH		19		/* File or path name */
#define DECIMAL_T		20		/* Decimal notation */
#define SIGNEDINT		21		/* Signed integer */
#define UNSIGNEDINT		22		/* Unsigned integer */
#define XMLENTITY		23		/* XML entity */

/* Highest token category number in use */
#define LASTNUM			23
59 
/*
 * Short alias for each token category, indexed by category number.
 * Slot 0 is an empty placeholder because category numbers start at 1.
 */
static const char *const tok_alias[] = {
	"",							/* 0: unused */
	"asciiword",				/* 1: ASCIIWORD */
	"word",						/* 2: WORD_T */
	"numword",					/* 3: NUMWORD */
	"email",					/* 4: EMAIL */
	"url",						/* 5: URL_T */
	"host",						/* 6: HOST */
	"sfloat",					/* 7: SCIENTIFIC */
	"version",					/* 8: VERSIONNUMBER */
	"hword_numpart",			/* 9: NUMPARTHWORD */
	"hword_part",				/* 10: PARTHWORD */
	"hword_asciipart",			/* 11: ASCIIPARTHWORD */
	"blank",					/* 12: SPACE */
	"tag",						/* 13: TAG_T */
	"protocol",					/* 14: PROTOCOL */
	"numhword",					/* 15: NUMHWORD */
	"asciihword",				/* 16: ASCIIHWORD */
	"hword",					/* 17: HWORD */
	"url_path",					/* 18: URLPATH */
	"file",						/* 19: FILEPATH */
	"float",					/* 20: DECIMAL_T */
	"int",						/* 21: SIGNEDINT */
	"uint",						/* 22: UNSIGNEDINT */
	"entity"					/* 23: XMLENTITY */
};
86 
/*
 * Human-readable description of each token category, indexed by category
 * number.  Slot 0 is an empty placeholder because category numbers start
 * at 1.
 */
static const char *const lex_descr[] = {
	"",												/* 0: unused */
	"Word, all ASCII",								/* 1: ASCIIWORD */
	"Word, all letters",							/* 2: WORD_T */
	"Word, letters and digits",						/* 3: NUMWORD */
	"Email address",								/* 4: EMAIL */
	"URL",											/* 5: URL_T */
	"Host",											/* 6: HOST */
	"Scientific notation",							/* 7: SCIENTIFIC */
	"Version number",								/* 8: VERSIONNUMBER */
	"Hyphenated word part, letters and digits",		/* 9: NUMPARTHWORD */
	"Hyphenated word part, all letters",			/* 10: PARTHWORD */
	"Hyphenated word part, all ASCII",				/* 11: ASCIIPARTHWORD */
	"Space symbols",								/* 12: SPACE */
	"XML tag",										/* 13: TAG_T */
	"Protocol head",								/* 14: PROTOCOL */
	"Hyphenated word, letters and digits",			/* 15: NUMHWORD */
	"Hyphenated word, all ASCII",					/* 16: ASCIIHWORD */
	"Hyphenated word, all letters",					/* 17: HWORD */
	"URL path",										/* 18: URLPATH */
	"File or path name",							/* 19: FILEPATH */
	"Decimal notation",								/* 20: DECIMAL_T */
	"Signed integer",								/* 21: SIGNEDINT */
	"Unsigned integer",								/* 22: UNSIGNEDINT */
	"XML entity"									/* 23: XMLENTITY */
};
113 
114 
115 /* Parser states */
116 
117 typedef enum
118 {
119  TPS_Base = 0,
196  TPS_Null /* last state (fake value) */
197 } TParserState;
198 
/* forward declaration, needed by the callback types below */
struct TParser;

/* Character test callback: any of the p_is* functions except p_iseq */
typedef int (*TParserCharTest) (struct TParser *);

/* Handler callback for special cases */
typedef void (*TParserSpecial) (struct TParser *);
206 
207 typedef struct
208 {
210  char c;
213  int type;
216 
/*
 * Flag bits in TParserStateActionItem.flags.  A_NEXT is the absence of any
 * flag; each of the others occupies its own bit so they can be OR'ed.
 */
#define A_NEXT		0
#define A_BINGO		(1 << 0)
#define A_POP		(1 << 1)
#define A_PUSH		(1 << 2)
#define A_RERUN		(1 << 3)
#define A_CLEAR		(1 << 4)
#define A_MERGE		(1 << 5)
#define A_CLRALL	(1 << 6)
226 
227 typedef struct TParserPosition
228 {
229  int posbyte; /* position of parser in bytes */
230  int poschar; /* position of parser in characters */
231  int charlen; /* length of current char */
232  int lenbytetoken; /* length of token-so-far in bytes */
233  int lenchartoken; /* and in chars */
238 
239 typedef struct TParser
240 {
241  /* string and position information */
242  char *str; /* multibyte string */
243  int lenstr; /* length of mbstring */
244 #ifdef USE_WIDE_UPPER_LOWER
245  wchar_t *wstr; /* wide character string */
246  pg_wchar *pgwstr; /* wide character string for C-locale */
247  bool usewide;
248 #endif
249 
250  /* State of parse */
253  bool ignore;
254  bool wanthost;
255 
256  /* silly char */
257  char c;
258 
259  /* out */
260  char *token;
263  int type;
264 } TParser;
265 
266 
267 /* forward decls here */
268 static bool TParserGet(TParser *prs);
269 
270 
271 static TParserPosition *
273 {
275 
276  if (prev)
277  memcpy(res, prev, sizeof(TParserPosition));
278  else
279  memset(res, 0, sizeof(TParserPosition));
280 
281  res->prev = prev;
282 
283  res->pushedAtAction = NULL;
284 
285  return res;
286 }
287 
288 static TParser *
289 TParserInit(char *str, int len)
290 {
291  TParser *prs = (TParser *) palloc0(sizeof(TParser));
292 
294  prs->str = str;
295  prs->lenstr = len;
296 
297 #ifdef USE_WIDE_UPPER_LOWER
298 
299  /*
300  * Use wide char code only when max encoding length > 1.
301  */
302  if (prs->charmaxlen > 1)
303  {
304  Oid collation = DEFAULT_COLLATION_OID; /* TODO */
305  pg_locale_t mylocale = 0; /* TODO */
306 
307  prs->usewide = true;
308  if (lc_ctype_is_c(collation))
309  {
310  /*
311  * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
312  * be different from sizeof(wchar_t)
313  */
314  prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
315  pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
316  }
317  else
318  {
319  prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
320  char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
321  mylocale);
322  }
323  }
324  else
325  prs->usewide = false;
326 #endif
327 
328  prs->state = newTParserPosition(NULL);
329  prs->state->state = TPS_Base;
330 
331 #ifdef WPARSER_TRACE
332 
333  /*
334  * Use of %.*s here is a bit risky since it can misbehave if the data is
335  * not in what libc thinks is the prevailing encoding. However, since
336  * this is just a debugging aid, we choose to live with that.
337  */
338  fprintf(stderr, "parsing \"%.*s\"\n", len, str);
339 #endif
340 
341  return prs;
342 }
343 
344 /*
345  * As an alternative to a full TParserInit one can create a
346  * TParserCopy which basically is a regular TParser without a private
347  * copy of the string - instead it uses the one from another TParser.
348  * This is useful because at some places TParsers are created
349  * recursively and the repeated copying around of the strings can
350  * cause major inefficiency if the source string is long.
351  * The new parser starts parsing at the original's current position.
352  *
353  * Obviously one must not close the original TParser before the copy.
354  */
355 static TParser *
357 {
358  TParser *prs = (TParser *) palloc0(sizeof(TParser));
359 
360  prs->charmaxlen = orig->charmaxlen;
361  prs->str = orig->str + orig->state->posbyte;
362  prs->lenstr = orig->lenstr - orig->state->posbyte;
363 
364 #ifdef USE_WIDE_UPPER_LOWER
365  prs->usewide = orig->usewide;
366 
367  if (orig->pgwstr)
368  prs->pgwstr = orig->pgwstr + orig->state->poschar;
369  if (orig->wstr)
370  prs->wstr = orig->wstr + orig->state->poschar;
371 #endif
372 
373  prs->state = newTParserPosition(NULL);
374  prs->state->state = TPS_Base;
375 
376 #ifdef WPARSER_TRACE
377  /* See note above about %.*s */
378  fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
379 #endif
380 
381  return prs;
382 }
383 
384 
385 static void
387 {
388  while (prs->state)
389  {
390  TParserPosition *ptr = prs->state->prev;
391 
392  pfree(prs->state);
393  prs->state = ptr;
394  }
395 
396 #ifdef USE_WIDE_UPPER_LOWER
397  if (prs->wstr)
398  pfree(prs->wstr);
399  if (prs->pgwstr)
400  pfree(prs->pgwstr);
401 #endif
402 
403 #ifdef WPARSER_TRACE
404  fprintf(stderr, "closing parser\n");
405 #endif
406  pfree(prs);
407 }
408 
409 /*
410  * Close a parser created with TParserCopyInit
411  */
412 static void
414 {
415  while (prs->state)
416  {
417  TParserPosition *ptr = prs->state->prev;
418 
419  pfree(prs->state);
420  prs->state = ptr;
421  }
422 
423 #ifdef WPARSER_TRACE
424  fprintf(stderr, "closing parser copy\n");
425 #endif
426  pfree(prs);
427 }
428 
429 
/*
 * Character-type support functions, equivalent to the is* macros, but
 * working with any possible encoding and locale. Notes:
 *	- with a multibyte encoding and the C locale, the isw* functions may
 *	  fail or give a wrong result.
 *	- a multibyte encoding with the C locale is often used for
 *	  Asian languages.
 *	- if the locale is C then we use pgwstr instead of wstr.
 */
439 
440 #ifdef USE_WIDE_UPPER_LOWER
441 
/*
 * p_iswhat(type) expands to a pair of tests on the character at the parser's
 * current position: p_is<type>(), built on is<type>()/isw<type>(), and its
 * negation p_isnot<type>().
 *
 * If the parser uses wide characters and pgwstr is set, any value above 0x7f
 * fails the test and anything else goes through is<type>(); with wstr the
 * character goes through isw<type>().  Without wide characters the current
 * byte of str is tested with is<type>().
 */
#define p_iswhat(type) \
static int \
p_is##type(TParser *prs) \
{ \
	Assert(prs->state); \
	if (!prs->usewide) \
		return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \
	if (prs->pgwstr == NULL) \
		return isw##type(prs->wstr[prs->state->poschar]); \
	else \
	{ \
		unsigned int wc = prs->pgwstr[prs->state->poschar]; \
		\
		return (wc > 0x7f) ? 0 : is##type(wc); \
	} \
} \
\
static int \
p_isnot##type(TParser *prs) \
{ \
	return p_is##type(prs) ? 0 : 1; \
}
465 
466 static int
467 p_isalnum(TParser *prs)
468 {
469  Assert(prs->state);
470 
471  if (prs->usewide)
472  {
473  if (prs->pgwstr)
474  {
475  unsigned int c = *(prs->pgwstr + prs->state->poschar);
476 
477  /*
478  * any non-ascii symbol with multibyte encoding with C-locale is
479  * an alpha character
480  */
481  if (c > 0x7f)
482  return 1;
483 
484  return isalnum(c);
485  }
486 
487  return iswalnum(*(prs->wstr + prs->state->poschar));
488  }
489 
490  return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));
491 }
492 static int
493 p_isnotalnum(TParser *prs)
494 {
495  return !p_isalnum(prs);
496 }
497 
498 static int
499 p_isalpha(TParser *prs)
500 {
501  Assert(prs->state);
502 
503  if (prs->usewide)
504  {
505  if (prs->pgwstr)
506  {
507  unsigned int c = *(prs->pgwstr + prs->state->poschar);
508 
509  /*
510  * any non-ascii symbol with multibyte encoding with C-locale is
511  * an alpha character
512  */
513  if (c > 0x7f)
514  return 1;
515 
516  return isalpha(c);
517  }
518 
519  return iswalpha(*(prs->wstr + prs->state->poschar));
520  }
521 
522  return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));
523 }
524 
525 static int
526 p_isnotalpha(TParser *prs)
527 {
528  return !p_isalpha(prs);
529 }
530 
531 /* p_iseq should be used only for ascii symbols */
532 
533 static int
534 p_iseq(TParser *prs, char c)
535 {
536  Assert(prs->state);
537  return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
538 }
539 #else /* USE_WIDE_UPPER_LOWER */
540 
/*
 * p_iswhat(type) expands to a pair of tests on the byte at the parser's
 * current position: p_is<type>(), which applies is<type>() to that byte,
 * and its negation p_isnot<type>().
 */
#define p_iswhat(type) \
static int \
p_is##type(TParser *prs) \
{ \
	Assert(prs->state); \
	return is##type((unsigned char) prs->str[prs->state->posbyte]); \
} \
\
static int \
p_isnot##type(TParser *prs) \
{ \
	return p_is##type(prs) ? 0 : 1; \
}
552 
553 
554 static int
555 p_iseq(TParser *prs, char c)
556 {
557  Assert(prs->state);
558  return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
559 }
560 
561 p_iswhat(alnum)
562 p_iswhat(alpha)
563 #endif /* USE_WIDE_UPPER_LOWER */
564 
565 p_iswhat(digit)
568 p_iswhat(punct)
569 p_iswhat(space)
571 p_iswhat(xdigit)
572 
573 static int
574 p_isEOF(TParser *prs)
575 {
576  Assert(prs->state);
577  return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
578 }
579 
580 static int
582 {
583  return p_iseq(prs, prs->c);
584 }
585 
586 static int
588 {
589  return !p_iseq(prs, prs->c);
590 }
591 
592 static int
594 {
595  return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
596 }
597 
598 static int
600 {
601  return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
602 }
603 
604 static int
606 {
607  char ch;
608 
609  /* no non-ASCII need apply */
610  if (prs->state->charlen != 1)
611  return 0;
612  ch = *(prs->str + prs->state->posbyte);
613  /* no spaces or control characters */
614  if (ch <= 0x20 || ch >= 0x7F)
615  return 0;
616  /* reject characters disallowed by RFC 3986 */
617  switch (ch)
618  {
619  case '"':
620  case '<':
621  case '>':
622  case '\\':
623  case '^':
624  case '`':
625  case '{':
626  case '|':
627  case '}':
628  return 0;
629  }
630  return 1;
631 }
632 
633 
634 /* deliberately suppress unused-function complaints for the above */
635 void _make_compiler_happy(void);
636 void
638 {
639  p_isalnum(NULL);
640  p_isnotalnum(NULL);
641  p_isalpha(NULL);
642  p_isnotalpha(NULL);
643  p_isdigit(NULL);
644  p_isnotdigit(NULL);
645  p_islower(NULL);
646  p_isnotlower(NULL);
647  p_isprint(NULL);
648  p_isnotprint(NULL);
649  p_ispunct(NULL);
650  p_isnotpunct(NULL);
651  p_isspace(NULL);
652  p_isnotspace(NULL);
653  p_isupper(NULL);
654  p_isnotupper(NULL);
655  p_isxdigit(NULL);
656  p_isnotxdigit(NULL);
657  p_isEOF(NULL);
658  p_iseqC(NULL);
659  p_isneC(NULL);
660 }
661 
662 
663 static void
665 {
666  switch (prs->state->lenchartoken)
667  {
668  case 8: /* </script */
669  if (pg_strncasecmp(prs->token, "</script", 8) == 0)
670  prs->ignore = false;
671  break;
672  case 7: /* <script || </style */
673  if (pg_strncasecmp(prs->token, "</style", 7) == 0)
674  prs->ignore = false;
675  else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
676  prs->ignore = true;
677  break;
678  case 6: /* <style */
679  if (pg_strncasecmp(prs->token, "<style", 6) == 0)
680  prs->ignore = true;
681  break;
682  default:
683  break;
684  }
685 }
686 
687 static void
689 {
690  prs->wanthost = true;
691  prs->state->posbyte -= prs->state->lenbytetoken;
692  prs->state->poschar -= prs->state->lenchartoken;
693 }
694 
695 static void
697 {
698  prs->state->posbyte -= prs->state->lenbytetoken;
699  prs->state->poschar -= prs->state->lenchartoken;
700 }
701 
702 static void
704 {
705  prs->state->posbyte -= prs->state->lenbytetoken;
706  prs->state->poschar -= prs->state->lenchartoken;
707  prs->state->lenbytetoken = 0;
708  prs->state->lenchartoken = 0;
709 }
710 
711 static int
713 {
714  if (prs->wanthost)
715  {
716  prs->wanthost = false;
717  return 1;
718  }
719  return 0;
720 }
721 
722 static int
724 {
725  return (prs->ignore) ? 1 : 0;
726 }
727 
728 static int
730 {
731  TParser *tmpprs = TParserCopyInit(prs);
732  int res = 0;
733 
734  tmpprs->wanthost = true;
735 
736  if (TParserGet(tmpprs) && tmpprs->type == HOST)
737  {
738  prs->state->posbyte += tmpprs->lenbytetoken;
739  prs->state->poschar += tmpprs->lenchartoken;
740  prs->state->lenbytetoken += tmpprs->lenbytetoken;
741  prs->state->lenchartoken += tmpprs->lenchartoken;
742  prs->state->charlen = tmpprs->state->charlen;
743  res = 1;
744  }
745  TParserCopyClose(tmpprs);
746 
747  return res;
748 }
749 
750 static int
752 {
753  TParser *tmpprs = TParserCopyInit(prs);
754  int res = 0;
755 
756  tmpprs->state = newTParserPosition(tmpprs->state);
757  tmpprs->state->state = TPS_InURLPathFirst;
758 
759  if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
760  {
761  prs->state->posbyte += tmpprs->lenbytetoken;
762  prs->state->poschar += tmpprs->lenchartoken;
763  prs->state->lenbytetoken += tmpprs->lenbytetoken;
764  prs->state->lenchartoken += tmpprs->lenchartoken;
765  prs->state->charlen = tmpprs->state->charlen;
766  res = 1;
767  }
768  TParserCopyClose(tmpprs);
769 
770  return res;
771 }
772 
/*
 * Returns true if the current character has zero display length or
 * is a special sign used in several languages. Such characters do not
 * break a word, even though they are not alphabetic (isalpha is false).
 * At the beginning of a word they are not part of it.
 */
779 static int
781 {
782  /*
783  * pg_dsplen could return -1 which means error or control character
784  */
785  if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
786  return 1;
787 
788 #ifdef USE_WIDE_UPPER_LOWER
789 
790  /*
791  * Unicode Characters in the 'Mark, Spacing Combining' Category That
792  * characters are not alpha although they are not breakers of word too.
793  * Check that only in utf encoding, because other encodings aren't
794  * supported by postgres or even exists.
795  */
796  if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
797  {
798  static const pg_wchar strange_letter[] = {
799  /*
800  * use binary search, so elements should be ordered
801  */
802  0x0903, /* DEVANAGARI SIGN VISARGA */
803  0x093E, /* DEVANAGARI VOWEL SIGN AA */
804  0x093F, /* DEVANAGARI VOWEL SIGN I */
805  0x0940, /* DEVANAGARI VOWEL SIGN II */
806  0x0949, /* DEVANAGARI VOWEL SIGN CANDRA O */
807  0x094A, /* DEVANAGARI VOWEL SIGN SHORT O */
808  0x094B, /* DEVANAGARI VOWEL SIGN O */
809  0x094C, /* DEVANAGARI VOWEL SIGN AU */
810  0x0982, /* BENGALI SIGN ANUSVARA */
811  0x0983, /* BENGALI SIGN VISARGA */
812  0x09BE, /* BENGALI VOWEL SIGN AA */
813  0x09BF, /* BENGALI VOWEL SIGN I */
814  0x09C0, /* BENGALI VOWEL SIGN II */
815  0x09C7, /* BENGALI VOWEL SIGN E */
816  0x09C8, /* BENGALI VOWEL SIGN AI */
817  0x09CB, /* BENGALI VOWEL SIGN O */
818  0x09CC, /* BENGALI VOWEL SIGN AU */
819  0x09D7, /* BENGALI AU LENGTH MARK */
820  0x0A03, /* GURMUKHI SIGN VISARGA */
821  0x0A3E, /* GURMUKHI VOWEL SIGN AA */
822  0x0A3F, /* GURMUKHI VOWEL SIGN I */
823  0x0A40, /* GURMUKHI VOWEL SIGN II */
824  0x0A83, /* GUJARATI SIGN VISARGA */
825  0x0ABE, /* GUJARATI VOWEL SIGN AA */
826  0x0ABF, /* GUJARATI VOWEL SIGN I */
827  0x0AC0, /* GUJARATI VOWEL SIGN II */
828  0x0AC9, /* GUJARATI VOWEL SIGN CANDRA O */
829  0x0ACB, /* GUJARATI VOWEL SIGN O */
830  0x0ACC, /* GUJARATI VOWEL SIGN AU */
831  0x0B02, /* ORIYA SIGN ANUSVARA */
832  0x0B03, /* ORIYA SIGN VISARGA */
833  0x0B3E, /* ORIYA VOWEL SIGN AA */
834  0x0B40, /* ORIYA VOWEL SIGN II */
835  0x0B47, /* ORIYA VOWEL SIGN E */
836  0x0B48, /* ORIYA VOWEL SIGN AI */
837  0x0B4B, /* ORIYA VOWEL SIGN O */
838  0x0B4C, /* ORIYA VOWEL SIGN AU */
839  0x0B57, /* ORIYA AU LENGTH MARK */
840  0x0BBE, /* TAMIL VOWEL SIGN AA */
841  0x0BBF, /* TAMIL VOWEL SIGN I */
842  0x0BC1, /* TAMIL VOWEL SIGN U */
843  0x0BC2, /* TAMIL VOWEL SIGN UU */
844  0x0BC6, /* TAMIL VOWEL SIGN E */
845  0x0BC7, /* TAMIL VOWEL SIGN EE */
846  0x0BC8, /* TAMIL VOWEL SIGN AI */
847  0x0BCA, /* TAMIL VOWEL SIGN O */
848  0x0BCB, /* TAMIL VOWEL SIGN OO */
849  0x0BCC, /* TAMIL VOWEL SIGN AU */
850  0x0BD7, /* TAMIL AU LENGTH MARK */
851  0x0C01, /* TELUGU SIGN CANDRABINDU */
852  0x0C02, /* TELUGU SIGN ANUSVARA */
853  0x0C03, /* TELUGU SIGN VISARGA */
854  0x0C41, /* TELUGU VOWEL SIGN U */
855  0x0C42, /* TELUGU VOWEL SIGN UU */
856  0x0C43, /* TELUGU VOWEL SIGN VOCALIC R */
857  0x0C44, /* TELUGU VOWEL SIGN VOCALIC RR */
858  0x0C82, /* KANNADA SIGN ANUSVARA */
859  0x0C83, /* KANNADA SIGN VISARGA */
860  0x0CBE, /* KANNADA VOWEL SIGN AA */
861  0x0CC0, /* KANNADA VOWEL SIGN II */
862  0x0CC1, /* KANNADA VOWEL SIGN U */
863  0x0CC2, /* KANNADA VOWEL SIGN UU */
864  0x0CC3, /* KANNADA VOWEL SIGN VOCALIC R */
865  0x0CC4, /* KANNADA VOWEL SIGN VOCALIC RR */
866  0x0CC7, /* KANNADA VOWEL SIGN EE */
867  0x0CC8, /* KANNADA VOWEL SIGN AI */
868  0x0CCA, /* KANNADA VOWEL SIGN O */
869  0x0CCB, /* KANNADA VOWEL SIGN OO */
870  0x0CD5, /* KANNADA LENGTH MARK */
871  0x0CD6, /* KANNADA AI LENGTH MARK */
872  0x0D02, /* MALAYALAM SIGN ANUSVARA */
873  0x0D03, /* MALAYALAM SIGN VISARGA */
874  0x0D3E, /* MALAYALAM VOWEL SIGN AA */
875  0x0D3F, /* MALAYALAM VOWEL SIGN I */
876  0x0D40, /* MALAYALAM VOWEL SIGN II */
877  0x0D46, /* MALAYALAM VOWEL SIGN E */
878  0x0D47, /* MALAYALAM VOWEL SIGN EE */
879  0x0D48, /* MALAYALAM VOWEL SIGN AI */
880  0x0D4A, /* MALAYALAM VOWEL SIGN O */
881  0x0D4B, /* MALAYALAM VOWEL SIGN OO */
882  0x0D4C, /* MALAYALAM VOWEL SIGN AU */
883  0x0D57, /* MALAYALAM AU LENGTH MARK */
884  0x0D82, /* SINHALA SIGN ANUSVARAYA */
885  0x0D83, /* SINHALA SIGN VISARGAYA */
886  0x0DCF, /* SINHALA VOWEL SIGN AELA-PILLA */
887  0x0DD0, /* SINHALA VOWEL SIGN KETTI AEDA-PILLA */
888  0x0DD1, /* SINHALA VOWEL SIGN DIGA AEDA-PILLA */
889  0x0DD8, /* SINHALA VOWEL SIGN GAETTA-PILLA */
890  0x0DD9, /* SINHALA VOWEL SIGN KOMBUVA */
891  0x0DDA, /* SINHALA VOWEL SIGN DIGA KOMBUVA */
892  0x0DDB, /* SINHALA VOWEL SIGN KOMBU DEKA */
893  0x0DDC, /* SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
894  0x0DDD, /* SINHALA VOWEL SIGN KOMBUVA HAA DIGA
895  * AELA-PILLA */
896  0x0DDE, /* SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
897  0x0DDF, /* SINHALA VOWEL SIGN GAYANUKITTA */
898  0x0DF2, /* SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
899  0x0DF3, /* SINHALA VOWEL SIGN DIGA GAYANUKITTA */
900  0x0F3E, /* TIBETAN SIGN YAR TSHES */
901  0x0F3F, /* TIBETAN SIGN MAR TSHES */
902  0x0F7F, /* TIBETAN SIGN RNAM BCAD */
903  0x102B, /* MYANMAR VOWEL SIGN TALL AA */
904  0x102C, /* MYANMAR VOWEL SIGN AA */
905  0x1031, /* MYANMAR VOWEL SIGN E */
906  0x1038, /* MYANMAR SIGN VISARGA */
907  0x103B, /* MYANMAR CONSONANT SIGN MEDIAL YA */
908  0x103C, /* MYANMAR CONSONANT SIGN MEDIAL RA */
909  0x1056, /* MYANMAR VOWEL SIGN VOCALIC R */
910  0x1057, /* MYANMAR VOWEL SIGN VOCALIC RR */
911  0x1062, /* MYANMAR VOWEL SIGN SGAW KAREN EU */
912  0x1063, /* MYANMAR TONE MARK SGAW KAREN HATHI */
913  0x1064, /* MYANMAR TONE MARK SGAW KAREN KE PHO */
914  0x1067, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
915  0x1068, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
916  0x1069, /* MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
917  0x106A, /* MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
918  0x106B, /* MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
919  0x106C, /* MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
920  0x106D, /* MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
921  0x1083, /* MYANMAR VOWEL SIGN SHAN AA */
922  0x1084, /* MYANMAR VOWEL SIGN SHAN E */
923  0x1087, /* MYANMAR SIGN SHAN TONE-2 */
924  0x1088, /* MYANMAR SIGN SHAN TONE-3 */
925  0x1089, /* MYANMAR SIGN SHAN TONE-5 */
926  0x108A, /* MYANMAR SIGN SHAN TONE-6 */
927  0x108B, /* MYANMAR SIGN SHAN COUNCIL TONE-2 */
928  0x108C, /* MYANMAR SIGN SHAN COUNCIL TONE-3 */
929  0x108F, /* MYANMAR SIGN RUMAI PALAUNG TONE-5 */
930  0x17B6, /* KHMER VOWEL SIGN AA */
931  0x17BE, /* KHMER VOWEL SIGN OE */
932  0x17BF, /* KHMER VOWEL SIGN YA */
933  0x17C0, /* KHMER VOWEL SIGN IE */
934  0x17C1, /* KHMER VOWEL SIGN E */
935  0x17C2, /* KHMER VOWEL SIGN AE */
936  0x17C3, /* KHMER VOWEL SIGN AI */
937  0x17C4, /* KHMER VOWEL SIGN OO */
938  0x17C5, /* KHMER VOWEL SIGN AU */
939  0x17C7, /* KHMER SIGN REAHMUK */
940  0x17C8, /* KHMER SIGN YUUKALEAPINTU */
941  0x1923, /* LIMBU VOWEL SIGN EE */
942  0x1924, /* LIMBU VOWEL SIGN AI */
943  0x1925, /* LIMBU VOWEL SIGN OO */
944  0x1926, /* LIMBU VOWEL SIGN AU */
945  0x1929, /* LIMBU SUBJOINED LETTER YA */
946  0x192A, /* LIMBU SUBJOINED LETTER RA */
947  0x192B, /* LIMBU SUBJOINED LETTER WA */
948  0x1930, /* LIMBU SMALL LETTER KA */
949  0x1931, /* LIMBU SMALL LETTER NGA */
950  0x1933, /* LIMBU SMALL LETTER TA */
951  0x1934, /* LIMBU SMALL LETTER NA */
952  0x1935, /* LIMBU SMALL LETTER PA */
953  0x1936, /* LIMBU SMALL LETTER MA */
954  0x1937, /* LIMBU SMALL LETTER RA */
955  0x1938, /* LIMBU SMALL LETTER LA */
956  0x19B0, /* NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
957  0x19B1, /* NEW TAI LUE VOWEL SIGN AA */
958  0x19B2, /* NEW TAI LUE VOWEL SIGN II */
959  0x19B3, /* NEW TAI LUE VOWEL SIGN U */
960  0x19B4, /* NEW TAI LUE VOWEL SIGN UU */
961  0x19B5, /* NEW TAI LUE VOWEL SIGN E */
962  0x19B6, /* NEW TAI LUE VOWEL SIGN AE */
963  0x19B7, /* NEW TAI LUE VOWEL SIGN O */
964  0x19B8, /* NEW TAI LUE VOWEL SIGN OA */
965  0x19B9, /* NEW TAI LUE VOWEL SIGN UE */
966  0x19BA, /* NEW TAI LUE VOWEL SIGN AY */
967  0x19BB, /* NEW TAI LUE VOWEL SIGN AAY */
968  0x19BC, /* NEW TAI LUE VOWEL SIGN UY */
969  0x19BD, /* NEW TAI LUE VOWEL SIGN OY */
970  0x19BE, /* NEW TAI LUE VOWEL SIGN OAY */
971  0x19BF, /* NEW TAI LUE VOWEL SIGN UEY */
972  0x19C0, /* NEW TAI LUE VOWEL SIGN IY */
973  0x19C8, /* NEW TAI LUE TONE MARK-1 */
974  0x19C9, /* NEW TAI LUE TONE MARK-2 */
975  0x1A19, /* BUGINESE VOWEL SIGN E */
976  0x1A1A, /* BUGINESE VOWEL SIGN O */
977  0x1A1B, /* BUGINESE VOWEL SIGN AE */
978  0x1B04, /* BALINESE SIGN BISAH */
979  0x1B35, /* BALINESE VOWEL SIGN TEDUNG */
980  0x1B3B, /* BALINESE VOWEL SIGN RA REPA TEDUNG */
981  0x1B3D, /* BALINESE VOWEL SIGN LA LENGA TEDUNG */
982  0x1B3E, /* BALINESE VOWEL SIGN TALING */
983  0x1B3F, /* BALINESE VOWEL SIGN TALING REPA */
984  0x1B40, /* BALINESE VOWEL SIGN TALING TEDUNG */
985  0x1B41, /* BALINESE VOWEL SIGN TALING REPA TEDUNG */
986  0x1B43, /* BALINESE VOWEL SIGN PEPET TEDUNG */
987  0x1B44, /* BALINESE ADEG ADEG */
988  0x1B82, /* SUNDANESE SIGN PANGWISAD */
989  0x1BA1, /* SUNDANESE CONSONANT SIGN PAMINGKAL */
990  0x1BA6, /* SUNDANESE VOWEL SIGN PANAELAENG */
991  0x1BA7, /* SUNDANESE VOWEL SIGN PANOLONG */
992  0x1BAA, /* SUNDANESE SIGN PAMAAEH */
993  0x1C24, /* LEPCHA SUBJOINED LETTER YA */
994  0x1C25, /* LEPCHA SUBJOINED LETTER RA */
995  0x1C26, /* LEPCHA VOWEL SIGN AA */
996  0x1C27, /* LEPCHA VOWEL SIGN I */
997  0x1C28, /* LEPCHA VOWEL SIGN O */
998  0x1C29, /* LEPCHA VOWEL SIGN OO */
999  0x1C2A, /* LEPCHA VOWEL SIGN U */
1000  0x1C2B, /* LEPCHA VOWEL SIGN UU */
1001  0x1C34, /* LEPCHA CONSONANT SIGN NYIN-DO */
1002  0x1C35, /* LEPCHA CONSONANT SIGN KANG */
1003  0xA823, /* SYLOTI NAGRI VOWEL SIGN A */
1004  0xA824, /* SYLOTI NAGRI VOWEL SIGN I */
1005  0xA827, /* SYLOTI NAGRI VOWEL SIGN OO */
1006  0xA880, /* SAURASHTRA SIGN ANUSVARA */
1007  0xA881, /* SAURASHTRA SIGN VISARGA */
1008  0xA8B4, /* SAURASHTRA CONSONANT SIGN HAARU */
1009  0xA8B5, /* SAURASHTRA VOWEL SIGN AA */
1010  0xA8B6, /* SAURASHTRA VOWEL SIGN I */
1011  0xA8B7, /* SAURASHTRA VOWEL SIGN II */
1012  0xA8B8, /* SAURASHTRA VOWEL SIGN U */
1013  0xA8B9, /* SAURASHTRA VOWEL SIGN UU */
1014  0xA8BA, /* SAURASHTRA VOWEL SIGN VOCALIC R */
1015  0xA8BB, /* SAURASHTRA VOWEL SIGN VOCALIC RR */
1016  0xA8BC, /* SAURASHTRA VOWEL SIGN VOCALIC L */
1017  0xA8BD, /* SAURASHTRA VOWEL SIGN VOCALIC LL */
1018  0xA8BE, /* SAURASHTRA VOWEL SIGN E */
1019  0xA8BF, /* SAURASHTRA VOWEL SIGN EE */
1020  0xA8C0, /* SAURASHTRA VOWEL SIGN AI */
1021  0xA8C1, /* SAURASHTRA VOWEL SIGN O */
1022  0xA8C2, /* SAURASHTRA VOWEL SIGN OO */
1023  0xA8C3, /* SAURASHTRA VOWEL SIGN AU */
1024  0xA952, /* REJANG CONSONANT SIGN H */
1025  0xA953, /* REJANG VIRAMA */
1026  0xAA2F, /* CHAM VOWEL SIGN O */
1027  0xAA30, /* CHAM VOWEL SIGN AI */
1028  0xAA33, /* CHAM CONSONANT SIGN YA */
1029  0xAA34, /* CHAM CONSONANT SIGN RA */
1030  0xAA4D /* CHAM CONSONANT SIGN FINAL H */
1031  };
1032  const pg_wchar *StopLow = strange_letter,
1033  *StopHigh = strange_letter + lengthof(strange_letter),
1034  *StopMiddle;
1035  pg_wchar c;
1036 
1037  if (prs->pgwstr)
1038  c = *(prs->pgwstr + prs->state->poschar);
1039  else
1040  c = (pg_wchar) *(prs->wstr + prs->state->poschar);
1041 
1042  while (StopLow < StopHigh)
1043  {
1044  StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
1045  if (*StopMiddle == c)
1046  return 1;
1047  else if (*StopMiddle < c)
1048  StopLow = StopMiddle + 1;
1049  else
1050  StopHigh = StopMiddle;
1051  }
1052  }
1053 #endif
1054 
1055  return 0;
1056 }
1057 
1058 /*
1059  * Table of state/action of parser
1060  */
1061 
1063  {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
1064  {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
1065  {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
1067  {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
1068  {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
1069  {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
1070  {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
1071  {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
1072  {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1073  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1074  {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
1075  {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
1076 };
1077 
1078 
1080  {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
1081  {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1082  {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1083  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1084  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1085  {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1087  {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
1088 };
1089 
1091  {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
1092  {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1094  {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1095  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1097  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1098  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1099  {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
1100  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1101  {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1102  {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1103  {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
1104  {p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
1106 };
1107 
1109  {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
1110  {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
1111  {p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
1112  {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1114  {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
1115 };
1116 
1118  {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
1119  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1121  {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
1122  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1123  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1124  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1125  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1126  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1127  {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
1128  {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1129  {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1130  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1132 };
1133 
1135  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1136  {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
1137  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1138 };
1139 
1141  {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
1142  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1143  {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
1144  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1145  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1147 };
1148 
1150  {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
1151  {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
1152  {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
1153  {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
1154  {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
1155  {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
1156  {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
1157  {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
1158  {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
1159 };
1160 
1162  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1163  {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
1164  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1165 };
1166 
1168  {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1169  {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
1170  {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1171  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1172  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1174 };
1175 
1177  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1178  {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
1179  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1180 };
1181 
1183  {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1184  {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
1185  {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
1186  {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1187  {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1189 };
1190 
1192  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1193  {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
1194  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1195 };
1196 
1198  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1199  {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
1200  {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1201 };
1202 
1203 
1205  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1206  {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
1207  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1208 };
1209 
1211  {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
1212  {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
1213  {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1215 };
1216 
1218  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1219  {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1220  {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
1221  {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
1222  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1223 };
1224 
1226  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1227  {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1228  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1229 };
1230 
1232  {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
1233  {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
1235 };
1236 
1238  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1241  {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1242  {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1243  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1244 };
1245 
1247  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1248  {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1249  {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1250  {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1251  {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
1252  {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
1253  {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1254  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1255 };
1256 
1258  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1261  {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1262  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1263 };
1264 
1266  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1267  {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1268  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1269 };
1270 
1272  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1273  {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1274  {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1275  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1276 };
1277 
1279  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1280  {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1281  {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1282  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1283 };
1284 
1287 };
1288 
1290  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1291  {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
1292  {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
1293  {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
1294  {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
1295  {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
1296  {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
1297  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1298 };
1299 
1301  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1302  /* <?xml ... */
1303  /* XXX do we want states for the m and l? Right now this accepts <?xZ */
1304  {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
1305  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1306 };
1307 
1309  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1310  {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
1311  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1312 };
1313 
1315  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1316  /* <br/> case */
1317  {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
1318  {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1319  {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
1320  {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
1321  {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1322  {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1323  {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1324  {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1325  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1326 };
1327 
1329  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1330  {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
1331  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1332 };
1333 
1335  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1336  {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1337  {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
1338  {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
1339  {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1340  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1341  {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
1342  {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1343  {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1344  {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
1345  {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
1346  {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1347  {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1348  {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
1349  {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
1350  {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
1351  {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
1352  {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
1353  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1354 };
1355 
1357  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1358  {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1359  {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
1360  {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
1361 };
1362 
1364  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1365  {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1366  {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
1367  {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
1368 };
1369 
1371  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1372  {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
1373 };
1374 
1376  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1377 };
1378 
1380  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1381  {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
1382  /* <!DOCTYPE ...> */
1383  {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
1384  {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
1385  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1386 };
1387 
1389  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1390  {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
1391  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1392 };
1393 
1395  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1397  {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1398 };
1399 
1401  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1403  {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1404 };
1405 
1407  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1408  {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1409  {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
1410  {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1411 };
1412 
1414  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1415 };
1416 
1418  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1420  {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1421  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1422 };
1423 
1425  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1427  {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1428  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1429  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1431  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1432  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1433 };
1434 
1436  {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1438  {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1439  {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
1440  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1441  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1443  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1444  {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
1446  {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1447  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1448 };
1449 
1451  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1452  {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1453  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1454 };
1455 
1457  {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1458  {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1460  {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1461  {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1462 };
1463 
1465  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1466  {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1467  {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1468  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1469 };
1470 
1472  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1473  {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1474  {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1475  {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1477  {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1478  {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1479  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1480 };
1481 
1483  {p_isstophost, 0, A_POP, TPS_Null, 0, NULL},
1485  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1486 };
1487 
1489  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1490  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1491  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1492  {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1493  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1494  {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1495  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1496 };
1497 
1499  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1500  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1501  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1502  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1503  {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1504  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1505 };
1506 
1508  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1509  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1510  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1511  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1512  {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1513  {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1514  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1515 };
1516 
1518  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1519  {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1520  {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1521  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1522 };
1523 
1525  {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1526  {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
1528  {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1529  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1530 };
1531 
1533  {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
1534  {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1535  {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1536  {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1537  {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1538  {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1539  {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1540  {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
1541 };
1542 
1544  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1545  {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1546  {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1547  {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1548  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1549 };
1550 
1552  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1553  {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1554  {NULL, 0, A_POP, TPS_Null, 0, NULL},
1555 };
1556 
1558  {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
1559 };
1560 
1562  {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
1563  {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1564  {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
1565 };
1566 
1568  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1570  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1571 };
1572 
1574  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1575  {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
1576  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1577 };
1578 
1580  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1581  {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1582  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1583 };
1584 
1587 };
1588 
1590  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1592  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1593  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1594  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1595 };
1596 
1600  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1602  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1605 };
1606 
1608  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1609  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1610  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1611  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1612 };
1613 
1616  {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1618  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1621 };
1622 
1624  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1625  {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1626  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1627  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1628 };
1629 
1632  {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1636 };
1637 
1639  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1640  {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1641  {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1643  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1644 };
1645 
1647  {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
1649  {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1650  {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
1652  {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
1653 };
1654 
1656  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1657  {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1659  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1660 };
1661 
1663  {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
1664  {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1666  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1668 };
1669 
1671  {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
1673  {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1675  {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1677 };
1678 
1680  {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
1681  {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1684 };
1685 
1687  {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1688  {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1689  {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1691  {NULL, 0, A_POP, TPS_Null, 0, NULL}
1692 };
1693 
1694 
1695 /*
1696  * main table of per-state parser actions
1697  */
1698 typedef struct
1699 {
1700  const TParserStateActionItem *action; /* the actual state info */
1701  TParserState state; /* only for Assert crosscheck */
1702 #ifdef WPARSER_TRACE
1703  const char *state_name; /* only for debug printout */
1704 #endif
1706 
1707 #ifdef WPARSER_TRACE
1708 #define TPARSERSTATEACTION(state) \
1709  { CppConcat(action,state), state, CppAsString(state) }
1710 #else
1711 #define TPARSERSTATEACTION(state) \
1712  { CppConcat(action,state), state }
1713 #endif
1714 
1715 /*
1716  * order must be the same as in typedef enum {} TParserState!!
1717  */
1718 
1719 static const TParserStateAction Actions[] = {
1797 };
1798 
1799 
1800 static bool
1802 {
1803  const TParserStateActionItem *item = NULL;
1804 
1805  Assert(prs->state);
1806 
1807  if (prs->state->posbyte >= prs->lenstr)
1808  return false;
1809 
1810  prs->token = prs->str + prs->state->posbyte;
1811  prs->state->pushedAtAction = NULL;
1812 
1813  /* look at string */
1814  while (prs->state->posbyte <= prs->lenstr)
1815  {
1816  if (prs->state->posbyte == prs->lenstr)
1817  prs->state->charlen = 0;
1818  else
1819  prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
1820  pg_mblen(prs->str + prs->state->posbyte);
1821 
1822  Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
1823  Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
1824  Assert(Actions[prs->state->state].state == prs->state->state);
1825 
1826  if (prs->state->pushedAtAction)
1827  {
1828  /* After a POP, pick up at the next test */
1829  item = prs->state->pushedAtAction + 1;
1830  prs->state->pushedAtAction = NULL;
1831  }
1832  else
1833  {
1834  item = Actions[prs->state->state].action;
1835  Assert(item != NULL);
1836  }
1837 
1838  /* find action by character class */
1839  while (item->isclass)
1840  {
1841  prs->c = item->c;
1842  if (item->isclass(prs) != 0)
1843  break;
1844  item++;
1845  }
1846 
1847 #ifdef WPARSER_TRACE
1848  {
1849  TParserPosition *ptr;
1850 
1851  fprintf(stderr, "state ");
1852  /* indent according to stack depth */
1853  for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
1854  fprintf(stderr, " ");
1855  fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
1856  if (prs->state->posbyte < prs->lenstr)
1857  fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
1858  else
1859  fprintf(stderr, "at EOF");
1860  fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1861  (int) (item - Actions[prs->state->state].action),
1862  (item->flags & A_BINGO) ? " BINGO" : "",
1863  (item->flags & A_POP) ? " POP" : "",
1864  (item->flags & A_PUSH) ? " PUSH" : "",
1865  (item->flags & A_RERUN) ? " RERUN" : "",
1866  (item->flags & A_CLEAR) ? " CLEAR" : "",
1867  (item->flags & A_MERGE) ? " MERGE" : "",
1868  (item->flags & A_CLRALL) ? " CLRALL" : "",
1869  (item->tostate != TPS_Null) ? " tostate " : "",
1870  (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
1871  (item->type > 0) ? " type " : "",
1872  tok_alias[item->type]);
1873  }
1874 #endif
1875 
1876  /* call special handler if exists */
1877  if (item->special)
1878  item->special(prs);
1879 
1880  /* BINGO, token is found */
1881  if (item->flags & A_BINGO)
1882  {
1883  Assert(item->type > 0);
1884  prs->lenbytetoken = prs->state->lenbytetoken;
1885  prs->lenchartoken = prs->state->lenchartoken;
1886  prs->state->lenbytetoken = prs->state->lenchartoken = 0;
1887  prs->type = item->type;
1888  }
1889 
1890  /* do various actions by flags */
1891  if (item->flags & A_POP)
1892  { /* pop stored state in stack */
1893  TParserPosition *ptr = prs->state->prev;
1894 
1895  pfree(prs->state);
1896  prs->state = ptr;
1897  Assert(prs->state);
1898  }
1899  else if (item->flags & A_PUSH)
1900  { /* push (store) state in stack */
1901  prs->state->pushedAtAction = item; /* remember where we push */
1902  prs->state = newTParserPosition(prs->state);
1903  }
1904  else if (item->flags & A_CLEAR)
1905  { /* clear previous pushed state */
1906  TParserPosition *ptr;
1907 
1908  Assert(prs->state->prev);
1909  ptr = prs->state->prev->prev;
1910  pfree(prs->state->prev);
1911  prs->state->prev = ptr;
1912  }
1913  else if (item->flags & A_CLRALL)
1914  { /* clear all previous pushed state */
1915  TParserPosition *ptr;
1916 
1917  while (prs->state->prev)
1918  {
1919  ptr = prs->state->prev->prev;
1920  pfree(prs->state->prev);
1921  prs->state->prev = ptr;
1922  }
1923  }
1924  else if (item->flags & A_MERGE)
1925  { /* merge posinfo with current and pushed state */
1926  TParserPosition *ptr = prs->state;
1927 
1928  Assert(prs->state->prev);
1929  prs->state = prs->state->prev;
1930 
1931  prs->state->posbyte = ptr->posbyte;
1932  prs->state->poschar = ptr->poschar;
1933  prs->state->charlen = ptr->charlen;
1934  prs->state->lenbytetoken = ptr->lenbytetoken;
1935  prs->state->lenchartoken = ptr->lenchartoken;
1936  pfree(ptr);
1937  }
1938 
1939  /* set new state if pointed */
1940  if (item->tostate != TPS_Null)
1941  prs->state->state = item->tostate;
1942 
1943  /* check for go away */
1944  if ((item->flags & A_BINGO) ||
1945  (prs->state->posbyte >= prs->lenstr &&
1946  (item->flags & A_RERUN) == 0))
1947  break;
1948 
1949  /* go to beginning of loop if we should rerun or we just restore state */
1950  if (item->flags & (A_RERUN | A_POP))
1951  continue;
1952 
1953  /* move forward */
1954  if (prs->state->charlen)
1955  {
1956  prs->state->posbyte += prs->state->charlen;
1957  prs->state->lenbytetoken += prs->state->charlen;
1958  prs->state->poschar++;
1959  prs->state->lenchartoken++;
1960  }
1961  }
1962 
1963  return (item && (item->flags & A_BINGO)) ? true : false;
1964 }
1965 
1966 Datum
1968 {
1969  LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
1970  int i;
1971 
1972  for (i = 1; i <= LASTNUM; i++)
1973  {
1974  descr[i - 1].lexid = i;
1975  descr[i - 1].alias = pstrdup(tok_alias[i]);
1976  descr[i - 1].descr = pstrdup(lex_descr[i]);
1977  }
1978 
1979  descr[LASTNUM].lexid = 0;
1980 
1981  PG_RETURN_POINTER(descr);
1982 }
1983 
1984 Datum
1986 {
1988 }
1989 
1990 Datum
1992 {
1993  TParser *p = (TParser *) PG_GETARG_POINTER(0);
1994  char **t = (char **) PG_GETARG_POINTER(1);
1995  int *tlen = (int *) PG_GETARG_POINTER(2);
1996 
1997  if (!TParserGet(p))
1998  PG_RETURN_INT32(0);
1999 
2000  *t = p->token;
2001  *tlen = p->lenbytetoken;
2002 
2003  PG_RETURN_INT32(p->type);
2004 }
2005 
2006 Datum
2008 {
2009  TParser *p = (TParser *) PG_GETARG_POINTER(0);
2010 
2011  TParserClose(p);
2012  PG_RETURN_VOID();
2013 }
2014 
/*
 * Token-type classification macros.  Each takes a token type code and
 * evaluates its argument more than once, so pass side-effect-free
 * expressions only.
 *
 * No use of the first three macros is visible in this part of the file.
 */
#define LEAVETOKEN(x) \
	( (x)==SPACE )
#define COMPLEXTOKEN(x) \
	( (x)==URL_T || \
	  (x)==NUMHWORD || \
	  (x)==ASCIIHWORD || \
	  (x)==HWORD )
#define ENDPUNCTOKEN(x) \
	( (x)==SPACE )

/* Here this is only used as one of the alternatives of NOENDTOKEN(). */
#define TS_IDIGNORE(x) \
	( (x)==TAG_T || \
	  (x)==PROTOCOL || \
	  (x)==SPACE || \
	  (x)==XMLENTITY )
/* Types whose words get the "replace" flag when highlight == 0. */
#define HLIDREPLACE(x) \
	( (x)==TAG_T )
/* Types whose words get the "skip" flag when highlight == 0. */
#define HLIDSKIP(x) \
	( (x)==URL_T || \
	  (x)==NUMHWORD || \
	  (x)==ASCIIHWORD || \
	  (x)==HWORD )
/* Types whose words get the "skip" flag when highlight != 0. */
#define XMLHLIDSKIP(x) \
	( (x)==URL_T || \
	  (x)==NUMHWORD || \
	  (x)==ASCIIHWORD || \
	  (x)==HWORD )
/* Types that are not counted as words when measuring headline length. */
#define NONWORDTOKEN(x) \
	( (x)==SPACE || \
	  HLIDREPLACE(x) || \
	  HLIDSKIP(x) )
/* Types the headline code avoids leaving at the edge of a fragment. */
#define NOENDTOKEN(x) \
	( NONWORDTOKEN(x) || \
	  (x)==SCIENTIFIC || \
	  (x)==VERSIONNUMBER || \
	  (x)==DECIMAL_T || \
	  (x)==SIGNEDINT || \
	  (x)==UNSIGNEDINT || \
	  TS_IDIGNORE(x) )
2025 
2026 typedef struct
2027 {
2029  int len;
2030 } hlCheck;
2031 
2032 static bool
2034 {
2035  int i;
2036  hlCheck *checkval = (hlCheck *) opaque;
2037 
2038  for (i = 0; i < checkval->len; i++)
2039  {
2040  if (checkval->words[i].item == val)
2041  {
2042  /* don't need to find all positions */
2043  if (!data)
2044  return true;
2045 
2046  if (!data->pos)
2047  {
2048  data->pos = palloc(sizeof(WordEntryPos) * checkval->len);
2049  data->allocated = true;
2050  data->npos = 1;
2051  data->pos[0] = checkval->words[i].pos;
2052  }
2053  else if (data->pos[data->npos - 1] < checkval->words[i].pos)
2054  {
2055  data->pos[data->npos++] = checkval->words[i].pos;
2056  }
2057  }
2058  }
2059 
2060  if (data && data->npos > 0)
2061  return true;
2062 
2063  return false;
2064 }
2065 
2066 
2067 static bool
2068 hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
2069 {
2070  int i,
2071  j;
2072  QueryItem *item = GETQUERY(query);
2073  int pos = *p;
2074 
2075  *q = -1;
2076  *p = INT_MAX;
2077 
2078  for (j = 0; j < query->size; j++)
2079  {
2080  if (item->type != QI_VAL)
2081  {
2082  item++;
2083  continue;
2084  }
2085  for (i = pos; i < prs->curwords; i++)
2086  {
2087  if (prs->words[i].item == &item->qoperand)
2088  {
2089  if (i > *q)
2090  *q = i;
2091  break;
2092  }
2093  }
2094  item++;
2095  }
2096 
2097  if (*q < 0)
2098  return false;
2099 
2100  item = GETQUERY(query);
2101  for (j = 0; j < query->size; j++)
2102  {
2103  if (item->type != QI_VAL)
2104  {
2105  item++;
2106  continue;
2107  }
2108  for (i = *q; i >= pos; i--)
2109  {
2110  if (prs->words[i].item == &item->qoperand)
2111  {
2112  if (i < *p)
2113  *p = i;
2114  break;
2115  }
2116  }
2117  item++;
2118  }
2119 
2120  if (*p <= *q)
2121  {
2122  hlCheck ch;
2123 
2124  ch.words = &(prs->words[*p]);
2125  ch.len = *q - *p + 1;
2127  return true;
2128  else
2129  {
2130  (*p)++;
2131  return hlCover(prs, query, p, q);
2132  }
2133  }
2134 
2135  return false;
2136 }
2137 
2138 static void
2139 mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
2140 {
2141  int i;
2142 
2143  for (i = startpos; i <= endpos; i++)
2144  {
2145  if (prs->words[i].item)
2146  prs->words[i].selected = 1;
2147  if (highlight == 0)
2148  {
2149  if (HLIDREPLACE(prs->words[i].type))
2150  prs->words[i].replace = 1;
2151  else if (HLIDSKIP(prs->words[i].type))
2152  prs->words[i].skip = 1;
2153  }
2154  else
2155  {
2156  if (XMLHLIDSKIP(prs->words[i].type))
2157  prs->words[i].skip = 1;
2158  }
2159 
2160  prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
2161  }
2162 }
2163 
2164 typedef struct
2165 {
2172 } CoverPos;
2173 
2174 static void
2176  int *curlen, int *poslen, int max_words)
2177 {
2178  int i;
2179 
2180  /*
2181  * Objective: Generate a fragment of words between startpos and endpos
2182  * such that it has at most max_words and both ends have query words. If
2183  * the startpos and endpos are the endpoints of the cover and the cover
2184  * has fewer words than max_words, then this function should just return
2185  * the cover.
2186  */
2187  /* first move startpos to an item */
2188  for (i = *startpos; i <= *endpos; i++)
2189  {
2190  *startpos = i;
2191  if (prs->words[i].item && !prs->words[i].repeated)
2192  break;
2193  }
2194  /* cut endpos to have only max_words */
2195  *curlen = 0;
2196  *poslen = 0;
2197  for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
2198  {
2199  if (!NONWORDTOKEN(prs->words[i].type))
2200  *curlen += 1;
2201  if (prs->words[i].item && !prs->words[i].repeated)
2202  *poslen += 1;
2203  }
2204  /* if the cover was cut then move back endpos to a query item */
2205  if (*endpos > i)
2206  {
2207  *endpos = i;
2208  for (i = *endpos; i >= *startpos; i--)
2209  {
2210  *endpos = i;
2211  if (prs->words[i].item && !prs->words[i].repeated)
2212  break;
2213  if (!NONWORDTOKEN(prs->words[i].type))
2214  *curlen -= 1;
2215  }
2216  }
2217 }
2218 
2219 static void
2220 mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
2221  int shortword, int min_words,
2222  int max_words, int max_fragments)
2223 {
2224  int32 poslen,
2225  curlen,
2226  i,
2227  f,
2228  num_f = 0;
2229  int32 stretch,
2230  maxstretch,
2231  posmarker;
2232 
2233  int32 startpos = 0,
2234  endpos = 0,
2235  p = 0,
2236  q = 0;
2237 
2238  int32 numcovers = 0,
2239  maxcovers = 32;
2240 
2241  int32 minI,
2242  minwords,
2243  maxitems;
2244  CoverPos *covers;
2245 
2246  covers = palloc(maxcovers * sizeof(CoverPos));
2247 
2248  /* get all covers */
2249  while (hlCover(prs, query, &p, &q))
2250  {
2251  startpos = p;
2252  endpos = q;
2253 
2254  /*
2255  * Break the cover into smaller fragments such that each fragment has
2256  * at most max_words. Also ensure that each end of the fragment is a
2257  * query word. This will allow us to stretch the fragment in either
2258  * direction
2259  */
2260 
2261  while (startpos <= endpos)
2262  {
2263  get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
2264  if (numcovers >= maxcovers)
2265  {
2266  maxcovers *= 2;
2267  covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
2268  }
2269  covers[numcovers].startpos = startpos;
2270  covers[numcovers].endpos = endpos;
2271  covers[numcovers].curlen = curlen;
2272  covers[numcovers].poslen = poslen;
2273  covers[numcovers].in = 0;
2274  covers[numcovers].excluded = 0;
2275  numcovers++;
2276  startpos = endpos + 1;
2277  endpos = q;
2278  }
2279  /* move p to generate the next cover */
2280  p++;
2281  }
2282 
2283  /* choose best covers */
2284  for (f = 0; f < max_fragments; f++)
2285  {
2286  maxitems = 0;
2287  minwords = PG_INT32_MAX;
2288  minI = -1;
2289 
2290  /*
2291  * Choose the cover that contains max items. In case of tie choose the
2292  * one with smaller number of words.
2293  */
2294  for (i = 0; i < numcovers; i++)
2295  {
2296  if (!covers[i].in && !covers[i].excluded &&
2297  (maxitems < covers[i].poslen || (maxitems == covers[i].poslen
2298  && minwords > covers[i].curlen)))
2299  {
2300  maxitems = covers[i].poslen;
2301  minwords = covers[i].curlen;
2302  minI = i;
2303  }
2304  }
2305  /* if a cover was found mark it */
2306  if (minI >= 0)
2307  {
2308  covers[minI].in = 1;
2309  /* adjust the size of cover */
2310  startpos = covers[minI].startpos;
2311  endpos = covers[minI].endpos;
2312  curlen = covers[minI].curlen;
2313  /* stretch the cover if cover size is lower than max_words */
2314  if (curlen < max_words)
2315  {
2316  /* divide the stretch on both sides of cover */
2317  maxstretch = (max_words - curlen) / 2;
2318 
2319  /*
2320  * first stretch the startpos stop stretching if 1. we hit the
2321  * beginning of document 2. exceed maxstretch 3. we hit an
2322  * already marked fragment
2323  */
2324  stretch = 0;
2325  posmarker = startpos;
2326  for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
2327  {
2328  if (!NONWORDTOKEN(prs->words[i].type))
2329  {
2330  curlen++;
2331  stretch++;
2332  }
2333  posmarker = i;
2334  }
2335  /* cut back startpos till we find a non short token */
2336  for (i = posmarker; i < startpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i++)
2337  {
2338  if (!NONWORDTOKEN(prs->words[i].type))
2339  curlen--;
2340  }
2341  startpos = i;
2342  /* now stretch the endpos as much as possible */
2343  posmarker = endpos;
2344  for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
2345  {
2346  if (!NONWORDTOKEN(prs->words[i].type))
2347  curlen++;
2348  posmarker = i;
2349  }
2350  /* cut back endpos till we find a non-short token */
2351  for (i = posmarker; i > endpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i--)
2352  {
2353  if (!NONWORDTOKEN(prs->words[i].type))
2354  curlen--;
2355  }
2356  endpos = i;
2357  }
2358  covers[minI].startpos = startpos;
2359  covers[minI].endpos = endpos;
2360  covers[minI].curlen = curlen;
2361  /* Mark the chosen fragments (covers) */
2362  mark_fragment(prs, highlight, startpos, endpos);
2363  num_f++;
2364  /* exclude overlapping covers */
2365  for (i = 0; i < numcovers; i++)
2366  {
2367  if (i != minI && ((covers[i].startpos >= covers[minI].startpos && covers[i].startpos <= covers[minI].endpos) || (covers[i].endpos >= covers[minI].startpos && covers[i].endpos <= covers[minI].endpos)))
2368  covers[i].excluded = 1;
2369  }
2370  }
2371  else
2372  break;
2373  }
2374 
2375  /* show at least min_words we have not marked anything */
2376  if (num_f <= 0)
2377  {
2378  startpos = endpos = curlen = 0;
2379  for (i = 0; i < prs->curwords && curlen < min_words; i++)
2380  {
2381  if (!NONWORDTOKEN(prs->words[i].type))
2382  curlen++;
2383  endpos = i;
2384  }
2385  mark_fragment(prs, highlight, startpos, endpos);
2386  }
2387  pfree(covers);
2388 }
2389 
2390 static void
2391 mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
2392  int shortword, int min_words, int max_words)
2393 {
2394  int p = 0,
2395  q = 0;
2396  int bestb = -1,
2397  beste = -1;
2398  int bestlen = -1;
2399  int pose = 0,
2400  posb,
2401  poslen,
2402  curlen;
2403 
2404  int i;
2405 
2406  if (highlight == 0)
2407  {
2408  while (hlCover(prs, query, &p, &q))
2409  {
2410  /* find cover len in words */
2411  curlen = 0;
2412  poslen = 0;
2413  for (i = p; i <= q && curlen < max_words; i++)
2414  {
2415  if (!NONWORDTOKEN(prs->words[i].type))
2416  curlen++;
2417  if (prs->words[i].item && !prs->words[i].repeated)
2418  poslen++;
2419  pose = i;
2420  }
2421 
2422  if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
2423  {
2424  /* best already found, so try one more cover */
2425  p++;
2426  continue;
2427  }
2428 
2429  posb = p;
2430  if (curlen < max_words)
2431  { /* find good end */
2432  for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
2433  {
2434  if (i != q)
2435  {
2436  if (!NONWORDTOKEN(prs->words[i].type))
2437  curlen++;
2438  if (prs->words[i].item && !prs->words[i].repeated)
2439  poslen++;
2440  }
2441  pose = i;
2442  if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
2443  continue;
2444  if (curlen >= min_words)
2445  break;
2446  }
2447  if (curlen < min_words && i >= prs->curwords)
2448  { /* got end of text and our cover is shorter
2449  * than min_words */
2450  for (i = p - 1; i >= 0; i--)
2451  {
2452  if (!NONWORDTOKEN(prs->words[i].type))
2453  curlen++;
2454  if (prs->words[i].item && !prs->words[i].repeated)
2455  poslen++;
2456  if (curlen >= max_words)
2457  break;
2458  if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
2459  continue;
2460  if (curlen >= min_words)
2461  break;
2462  }
2463  posb = (i >= 0) ? i : 0;
2464  }
2465  }
2466  else
2467  { /* shorter cover :((( */
2468  if (i > q)
2469  i = q;
2470  for (; curlen > min_words; i--)
2471  {
2472  if (!NONWORDTOKEN(prs->words[i].type))
2473  curlen--;
2474  if (prs->words[i].item && !prs->words[i].repeated)
2475  poslen--;
2476  pose = i;
2477  if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
2478  continue;
2479  break;
2480  }
2481  }
2482 
2483  if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
2484  (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
2485  (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
2486  {
2487  bestb = posb;
2488  beste = pose;
2489  bestlen = poslen;
2490  }
2491 
2492  p++;
2493  }
2494 
2495  if (bestlen < 0)
2496  {
2497  curlen = 0;
2498  for (i = 0; i < prs->curwords && curlen < min_words; i++)
2499  {
2500  if (!NONWORDTOKEN(prs->words[i].type))
2501  curlen++;
2502  pose = i;
2503  }
2504  bestb = 0;
2505  beste = pose;
2506  }
2507  }
2508  else
2509  {
2510  bestb = 0;
2511  beste = prs->curwords - 1;
2512  }
2513 
2514  for (i = bestb; i <= beste; i++)
2515  {
2516  if (prs->words[i].item)
2517  prs->words[i].selected = 1;
2518  if (highlight == 0)
2519  {
2520  if (HLIDREPLACE(prs->words[i].type))
2521  prs->words[i].replace = 1;
2522  else if (HLIDSKIP(prs->words[i].type))
2523  prs->words[i].skip = 1;
2524  }
2525  else
2526  {
2527  if (XMLHLIDSKIP(prs->words[i].type))
2528  prs->words[i].skip = 1;
2529  }
2530 
2531  prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
2532  }
2533 
2534 }
2535 
2536 Datum
2538 {
2540  List *prsoptions = (List *) PG_GETARG_POINTER(1);
2541  TSQuery query = PG_GETARG_TSQUERY(2);
2542 
2543  /* from opt + start and end tag */
2544  int min_words = 15;
2545  int max_words = 35;
2546  int shortword = 3;
2547  int max_fragments = 0;
2548  int highlight = 0;
2549  ListCell *l;
2550 
2551  /* config */
2552  prs->startsel = NULL;
2553  prs->stopsel = NULL;
2554  foreach(l, prsoptions)
2555  {
2556  DefElem *defel = (DefElem *) lfirst(l);
2557  char *val = defGetString(defel);
2558 
2559  if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
2560  max_words = pg_atoi(val, sizeof(int32), 0);
2561  else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
2562  min_words = pg_atoi(val, sizeof(int32), 0);
2563  else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
2564  shortword = pg_atoi(val, sizeof(int32), 0);
2565  else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
2566  max_fragments = pg_atoi(val, sizeof(int32), 0);
2567  else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
2568  prs->startsel = pstrdup(val);
2569  else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
2570  prs->stopsel = pstrdup(val);
2571  else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
2572  prs->fragdelim = pstrdup(val);
2573  else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
2574  highlight = (pg_strcasecmp(val, "1") == 0 ||
2575  pg_strcasecmp(val, "on") == 0 ||
2576  pg_strcasecmp(val, "true") == 0 ||
2577  pg_strcasecmp(val, "t") == 0 ||
2578  pg_strcasecmp(val, "y") == 0 ||
2579  pg_strcasecmp(val, "yes") == 0);
2580  else
2581  ereport(ERROR,
2582  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2583  errmsg("unrecognized headline parameter: \"%s\"",
2584  defel->defname)));
2585  }
2586 
2587  if (highlight == 0)
2588  {
2589  if (min_words >= max_words)
2590  ereport(ERROR,
2591  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2592  errmsg("MinWords should be less than MaxWords")));
2593  if (min_words <= 0)
2594  ereport(ERROR,
2595  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2596  errmsg("MinWords should be positive")));
2597  if (shortword < 0)
2598  ereport(ERROR,
2599  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2600  errmsg("ShortWord should be >= 0")));
2601  if (max_fragments < 0)
2602  ereport(ERROR,
2603  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2604  errmsg("MaxFragments should be >= 0")));
2605  }
2606 
2607  if (max_fragments == 0)
2608  /* call the default headline generator */
2609  mark_hl_words(prs, query, highlight, shortword, min_words, max_words);
2610  else
2611  mark_hl_fragments(prs, query, highlight, shortword, min_words, max_words, max_fragments);
2612 
2613  if (!prs->startsel)
2614  prs->startsel = pstrdup("<b>");
2615  if (!prs->stopsel)
2616  prs->stopsel = pstrdup("</b>");
2617  if (!prs->fragdelim)
2618  prs->fragdelim = pstrdup(" ... ");
2619  prs->startsellen = strlen(prs->startsel);
2620  prs->stopsellen = strlen(prs->stopsel);
2621  prs->fragdelimlen = strlen(prs->fragdelim);
2622 
2623  PG_RETURN_POINTER(prs);
2624 }
uint16 WordEntryPos
Definition: ts_type.h:63
signed short int16
Definition: c.h:255
bool TS_execute(QueryItem *curitem, void *arg, uint32 flags, TSExecuteCallback chkcond)
Definition: tsvector_op.c:1815
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:321
static bool TParserGet(TParser *prs)
Definition: wparser_def.c:1801
static int p_isstophost(TParser *prs)
Definition: wparser_def.c:712
static const TParserStateActionItem actionTPS_InPortFirst[]
Definition: wparser_def.c:1450
#define A_POP
Definition: wparser_def.c:220
#define PG_GETARG_INT32(n)
Definition: fmgr.h:234
static const TParserStateActionItem actionTPS_InWord[]
Definition: wparser_def.c:1108
static const TParserStateActionItem actionTPS_InParseHyphenHyphen[]
Definition: wparser_def.c:1655
static const TParserStateActionItem actionTPS_InXMLEntityFirst[]
Definition: wparser_def.c:1237
static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[]
Definition: wparser_def.c:1623
static const TParserStateActionItem actionTPS_InHyphenAsciiWord[]
Definition: wparser_def.c:1597
bool wanthost
Definition: wparser_def.c:254
Datum prsd_headline(PG_FUNCTION_ARGS)
Definition: wparser_def.c:2537
struct TParser TParser
static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[]
Definition: wparser_def.c:1257
TParserState state
Definition: wparser_def.c:234
static const TParserStateActionItem actionTPS_InHostDomain[]
Definition: wparser_def.c:1435
static void TParserCopyClose(TParser *prs)
Definition: wparser_def.c:413
static int p_isascii(TParser *prs)
Definition: wparser_def.c:593
void print(const void *obj)
Definition: print.c:35
TParserState
Definition: wparser_def.c:117
#define NUMHWORD
Definition: wparser_def.c:48
static const TParserStateActionItem actionTPS_InUnsignedInt[]
Definition: wparser_def.c:1117
static void SpecialTags(TParser *prs)
Definition: wparser_def.c:664
static const TParserStateActionItem actionTPS_InNumWord[]
Definition: wparser_def.c:1079
static const TParserStateActionItem actionTPS_InEmail[]
Definition: wparser_def.c:1482
char * alias
Definition: ts_public.h:28
static const TParserStateActionItem actionTPS_InHostFirstDomain[]
Definition: wparser_def.c:1417
#define XMLENTITY
Definition: wparser_def.c:56
static const TParserStateActionItem actionTPS_InDecimalFirst[]
Definition: wparser_def.c:1176
#define HLIDSKIP(x)
Definition: wparser_def.c:2021
Datum lower(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:43
int32 curlen
Definition: wparser_def.c:2169
#define UNSIGNEDINT
Definition: wparser_def.c:55
static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[]
Definition: wparser_def.c:1589
Datum prsd_lextype(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1967
#define URLPATH
Definition: wparser_def.c:51
static const TParserStateActionItem actionTPS_InSignedIntFirst[]
Definition: wparser_def.c:1134
bool allocated
Definition: ts_utils.h:136
struct TParserPosition * prev
Definition: wparser_def.c:235
#define PG_GETARG_TSQUERY(n)
Definition: ts_type.h:247
char * pstrdup(const char *in)
Definition: mcxt.c:1077
static const TParserStateActionItem actionTPS_InURLPathStart[]
Definition: wparser_def.c:1557
#define NONWORDTOKEN(x)
Definition: wparser_def.c:2023
static int p_isignore(TParser *prs)
Definition: wparser_def.c:723
static const TParserStateActionItem actionTPS_InCommentEnd[]
Definition: wparser_def.c:1413
static void get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos, int *curlen, int *poslen, int max_words)
Definition: wparser_def.c:2175
static const TParserStateActionItem actionTPS_InTagBackSleshed[]
Definition: wparser_def.c:1370
#define PG_RETURN_INT32(x)
Definition: fmgr.h:314
#define p_iswhat(type)
Definition: wparser_def.c:541
static const TParserStateActionItem actionTPS_InCommentFirst[]
Definition: wparser_def.c:1379
int errcode(int sqlerrcode)
Definition: elog.c:575
QueryOperand * item
Definition: ts_public.h:47
static const TParserStateActionItem actionTPS_InXMLEntityEnd[]
Definition: wparser_def.c:1285
#define NOENDTOKEN(x)
Definition: wparser_def.c:2024
#define QI_VAL
Definition: ts_type.h:143
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:241
static const TParserStateActionItem actionTPS_InComment[]
Definition: wparser_def.c:1394
static const TParserStateActionItem actionTPS_InCommentLast[]
Definition: wparser_def.c:1388
char * str
Definition: wparser_def.c:242
static const TParserStateActionItem actionTPS_InTag[]
Definition: wparser_def.c:1334
static const char *const lex_descr[]
Definition: wparser_def.c:87
#define A_CLRALL
Definition: wparser_def.c:225
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
Datum upper(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:74
#define lengthof(array)
Definition: c.h:562
#define VERSIONNUMBER
Definition: wparser_def.c:41
unsigned int Oid
Definition: postgres_ext.h:31
static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[]
Definition: wparser_def.c:1686
static void mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight, int shortword, int min_words, int max_words)
Definition: wparser_def.c:2391
static const TParserStateActionItem actionTPS_InHyphenWordFirst[]
Definition: wparser_def.c:1607
static void TParserClose(TParser *prs)
Definition: wparser_def.c:386
static void mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight, int shortword, int min_words, int max_words, int max_fragments)
Definition: wparser_def.c:2220
int32 startpos
Definition: wparser_def.c:2166
TParserState state
Definition: wparser_def.c:1701
#define GETQUERY(x)
Definition: _int.h:142
WordEntryPos pos
Definition: ts_public.h:45
signed int int32
Definition: c.h:256
static const char *const tok_alias[]
Definition: wparser_def.c:60
Datum prsd_end(PG_FUNCTION_ARGS)
Definition: wparser_def.c:2007
void(* TParserSpecial)(struct TParser *)
Definition: wparser_def.c:204
static const TParserStateActionItem actionTPS_InHostFirstAN[]
Definition: wparser_def.c:1464
static const TParserStateActionItem actionTPS_InXMLEntityHexNum[]
Definition: wparser_def.c:1278
#define ASCIIWORD
Definition: wparser_def.c:34
Datum prsd_start(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1985
static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[]
Definition: wparser_def.c:1670
#define PROTOCOL
Definition: wparser_def.c:47
static XLogRecPtr endpos
int pg_strncasecmp(const char *s1, const char *s2, size_t n)
Definition: pgstrcasecmp.c:69
#define TAG_T
Definition: wparser_def.c:46
static void SpecialHyphen(TParser *prs)
Definition: wparser_def.c:696
static const TParserStateActionItem actionTPS_InURLPath[]
Definition: wparser_def.c:1561
unsigned short uint16
Definition: c.h:267
void pfree(void *pointer)
Definition: mcxt.c:950
#define NUMPARTHWORD
Definition: wparser_def.c:42
static int p_isurlchar(TParser *prs)
Definition: wparser_def.c:605
static const TParserStateActionItem actionTPS_InAsciiWord[]
Definition: wparser_def.c:1090
TParserSpecial special
Definition: wparser_def.c:214
TParserPosition * state
Definition: wparser_def.c:252
#define ERROR
Definition: elog.h:43
static const TParserStateActionItem actionTPS_InPathFirst[]
Definition: wparser_def.c:1507
const TParserStateActionItem * pushedAtAction
Definition: wparser_def.c:236
char * defGetString(DefElem *def)
Definition: define.c:49
static void mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
Definition: wparser_def.c:2139
#define SPACE
Definition: wparser_def.c:45
int32 endpos
Definition: wparser_def.c:2167
const TParserStateActionItem * action
Definition: wparser_def.c:1700
static const TParserStateActionItem actionTPS_InXMLEntityNum[]
Definition: wparser_def.c:1271
#define HLIDREPLACE(x)
Definition: wparser_def.c:2020
static TParser * TParserCopyInit(const TParser *orig)
Definition: wparser_def.c:356
static const TParserStateActionItem actionTPS_InUDecimalFirst[]
Definition: wparser_def.c:1161
int16 in
Definition: wparser_def.c:2170
char * c
struct TParserPosition TParserPosition
HeadlineWordEntry * words
Definition: ts_public.h:52
#define A_BINGO
Definition: wparser_def.c:219
#define DEFAULT_COLLATION_OID
Definition: pg_collation.h:75
int pg_database_encoding_max_length(void)
Definition: wchar.c:1833
#define NUMWORD
Definition: wparser_def.c:36
#define XMLHLIDSKIP(x)
Definition: wparser_def.c:2022
WordEntryPos * pos
Definition: ts_utils.h:138
int lenbytetoken
Definition: wparser_def.c:261
static const TParserStateActionItem actionTPS_InHost[]
Definition: wparser_def.c:1471
static const TParserStateActionItem actionTPS_Base[]
Definition: wparser_def.c:1062
static const TParserStateActionItem actionTPS_InUDecimal[]
Definition: wparser_def.c:1167
void _make_compiler_happy(void)
Definition: wparser_def.c:637
static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[]
Definition: wparser_def.c:1265
#define LASTNUM
Definition: wparser_def.c:58
#define ereport(elevel, rest)
Definition: elog.h:122
int type
Definition: wparser_def.c:263
static const TParserStateActionItem actionTPS_InTagEscapeKK[]
Definition: wparser_def.c:1363
unsigned int pg_wchar
Definition: mbprint.c:31
char * token
Definition: wparser_def.c:260
#define A_CLEAR
Definition: wparser_def.c:223
static const TParserStateActionItem actionTPS_InDecimal[]
Definition: wparser_def.c:1182
static int p_isspecial(TParser *prs)
Definition: wparser_def.c:780
static const TParserStateActionItem actionTPS_InParseHyphen[]
Definition: wparser_def.c:1646
char * descr
Definition: ts_public.h:29
#define PARTHWORD
Definition: wparser_def.c:43
#define WORD_T
Definition: wparser_def.c:35
static const TParserStateActionItem actionTPS_InFURL[]
Definition: wparser_def.c:1567
int lexid
Definition: ts_public.h:27
static const TParserStateActionItem actionTPS_InFile[]
Definition: wparser_def.c:1532
static const TParserStateActionItem actionTPS_InPort[]
Definition: wparser_def.c:1456
int lenchartoken
Definition: wparser_def.c:262
QueryItemType type
Definition: ts_type.h:204
static const TParserStateActionItem actionTPS_InFileTwiddle[]
Definition: wparser_def.c:1498
static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[]
Definition: wparser_def.c:1638
static const TParserStateActionItem actionTPS_InHostDomainSecond[]
Definition: wparser_def.c:1424
void * palloc0(Size size)
Definition: mcxt.c:878
static const TParserStateActionItem actionTPS_InMantissaFirst[]
Definition: wparser_def.c:1217
static int p_isasclet(TParser *prs)
Definition: wparser_def.c:599
uintptr_t Datum
Definition: postgres.h:372
int16 excluded
Definition: wparser_def.c:2171
int GetDatabaseEncoding(void)
Definition: mbutils.c:1015
static const TParserStateActionItem actionTPS_InSignedInt[]
Definition: wparser_def.c:1140
static const TParserStateActionItem actionTPS_InHyphenWordPart[]
Definition: wparser_def.c:1662
int32 poslen
Definition: wparser_def.c:2168
static const TParserStateActionItem actionTPS_InVerVersion[]
Definition: wparser_def.c:1191
static const TParserStateAction Actions[]
Definition: wparser_def.c:1719
#define URL_T
Definition: wparser_def.c:38
#define TPARSERSTATEACTION(state)
Definition: wparser_def.c:1711
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
Definition: mbutils.c:734
static bool hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
Definition: wparser_def.c:2068
char c
Definition: wparser_def.c:257
int pg_dsplen(const char *mbstr)
Definition: mbutils.c:778
#define PG_RETURN_VOID()
Definition: fmgr.h:309
static TParser * TParserInit(char *str, int len)
Definition: wparser_def.c:289
#define ASCIIPARTHWORD
Definition: wparser_def.c:44
static int p_ishost(TParser *prs)
Definition: wparser_def.c:729
TParserCharTest isclass
Definition: wparser_def.c:209
#define NULL
Definition: c.h:229
static const TParserStateActionItem actionTPS_InTagFirst[]
Definition: wparser_def.c:1289
#define Assert(condition)
Definition: c.h:675
#define lfirst(lc)
Definition: pg_list.h:106
static const TParserStateActionItem actionTPS_InFileNext[]
Definition: wparser_def.c:1543
static const TParserStateActionItem actionTPS_InVersion[]
Definition: wparser_def.c:1210
#define FILEPATH
Definition: wparser_def.c:52
static int p_iseq(TParser *prs, char c)
Definition: wparser_def.c:555
static const TParserStateActionItem actionTPS_InPathFirstFirst[]
Definition: wparser_def.c:1517
static const TParserStateActionItem actionTPS_InMantissaSign[]
Definition: wparser_def.c:1225
static const TParserStateActionItem actionTPS_InTagEscapeK[]
Definition: wparser_def.c:1356
static int p_isURLPath(TParser *prs)
Definition: wparser_def.c:751
bool ignore
Definition: wparser_def.c:253
int lenstr
Definition: wparser_def.c:243
static XLogRecPtr startpos
#define A_MERGE
Definition: wparser_def.c:224
int pg_mblen(const char *mbstr)
Definition: mbutils.c:771
static void SpecialVerVersion(TParser *prs)
Definition: wparser_def.c:703
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:963
int charmaxlen
Definition: wparser_def.c:251
static const TParserStateActionItem actionTPS_InHyphenNumWord[]
Definition: wparser_def.c:1630
static const TParserStateActionItem actionTPS_InTagName[]
Definition: wparser_def.c:1314
static const TParserStateActionItem actionTPS_InMantissa[]
Definition: wparser_def.c:1231
static TParserPosition * newTParserPosition(TParserPosition *prev)
Definition: wparser_def.c:272
static int p_iseqC(TParser *prs)
Definition: wparser_def.c:581
int(* TParserCharTest)(struct TParser *)
Definition: wparser_def.c:202
void * palloc(Size size)
Definition: mcxt.c:849
int errmsg(const char *fmt,...)
Definition: elog.c:797
#define SCIENTIFIC
Definition: wparser_def.c:40
static const TParserStateActionItem actionTPS_InProtocolFirst[]
Definition: wparser_def.c:1573
#define PG_INT32_MAX
Definition: c.h:340
static const TParserStateActionItem actionTPS_InCloseCommentFirst[]
Definition: wparser_def.c:1400
static int p_isneC(TParser *prs)
Definition: wparser_def.c:587
int32 size
Definition: ts_type.h:217
static void SpecialFURL(TParser *prs)
Definition: wparser_def.c:688
int i
static const TParserStateActionItem actionTPS_InSVerVersion[]
Definition: wparser_def.c:1197
static const TParserStateActionItem actionTPS_InTagCloseFirst[]
Definition: wparser_def.c:1308
#define PG_FUNCTION_ARGS
Definition: fmgr.h:158
char * defname
Definition: parsenodes.h:719
static const TParserStateActionItem actionTPS_InHyphenWord[]
Definition: wparser_def.c:1614
static const TParserStateActionItem actionTPS_InSpace[]
Definition: wparser_def.c:1149
#define A_PUSH
Definition: wparser_def.c:221
bool lc_ctype_is_c(Oid collation)
Definition: pg_locale.c:1178
#define SIGNEDINT
Definition: wparser_def.c:54
#define A_NEXT
Definition: wparser_def.c:218
QueryOperand qoperand
Definition: ts_type.h:206
#define A_RERUN
Definition: wparser_def.c:222
#define EMAIL
Definition: wparser_def.c:37
static const TParserStateActionItem actionTPS_InTagEnd[]
Definition: wparser_def.c:1375
#define DECIMAL_T
Definition: wparser_def.c:53
static const TParserStateActionItem actionTPS_InHyphenNumWordPart[]
Definition: wparser_def.c:1679
static const TParserStateActionItem actionTPS_InProtocolEnd[]
Definition: wparser_def.c:1585
Definition: pg_list.h:45
static const TParserStateActionItem actionTPS_InProtocolSecond[]
Definition: wparser_def.c:1579
TParserState tostate
Definition: wparser_def.c:212
#define HWORD
Definition: wparser_def.c:50
#define TS_EXEC_EMPTY
Definition: ts_utils.h:159
long val
Definition: informix.c:689
#define HOST
Definition: wparser_def.c:39
static const TParserStateActionItem actionTPS_InVersionFirst[]
Definition: wparser_def.c:1204
#define ASCIIHWORD
Definition: wparser_def.c:49
static const TParserStateActionItem actionTPS_InXMLBegin[]
Definition: wparser_def.c:1300
static bool checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
Definition: wparser_def.c:2033
int32 pg_atoi(const char *s, int size, int c)
Definition: numutils.c:37
static const TParserStateActionItem actionTPS_InCloseCommentLast[]
Definition: wparser_def.c:1406
static const TParserStateActionItem actionTPS_InPathSecond[]
Definition: wparser_def.c:1524
Datum prsd_nexttoken(PG_FUNCTION_ARGS)
Definition: wparser_def.c:1991
static const TParserStateActionItem actionTPS_InFileFirst[]
Definition: wparser_def.c:1488
HeadlineWordEntry * words
Definition: wparser_def.c:2028
static const TParserStateActionItem actionTPS_InURLPathFirst[]
Definition: wparser_def.c:1551
static const TParserStateActionItem actionTPS_InTagBeginEnd[]
Definition: wparser_def.c:1328
static const TParserStateActionItem actionTPS_InXMLEntity[]
Definition: wparser_def.c:1246