PostgreSQL Source Code  git master
tsvector_parser.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * tsvector_parser.c
4  * Parser for tsvector
5  *
6  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  * src/backend/utils/adt/tsvector_parser.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include "tsearch/ts_locale.h"
18 #include "tsearch/ts_utils.h"
19 
20 
/*
 * Private state of tsvector parser.  Note that tsquery also uses this code to
 * parse its input, hence the boolean flags.  oprisdelim and is_tsquery are
 * both true or both false in current usage, but we keep them separate for
 * clarity.  is_tsquery affects *only* the content of error messages.
 *
 * is_web switches off the special meaning of single quotes and backslashes
 * and makes a double quote act as a token delimiter.
 */
struct TSVectorParseStateData
{
	char	   *prsbuf;			/* next input character */
	char	   *bufstart;		/* whole string (used only for errors) */
	char	   *word;			/* buffer to hold the current word */
	int			len;			/* size in bytes allocated for 'word' */
	int			eml;			/* max bytes per character */
	bool		oprisdelim;		/* treat operator characters (as tested by
								 * ISOPERATOR) as delimiters? */
	bool		is_tsquery;		/* say "tsquery" not "tsvector" in errors? */
	bool		is_web;			/* we're in websearch_to_tsquery() */
};
38 
39 
40 /*
41  * Initializes parser for the input string. If oprisdelim is set, the
42  * following characters are treated as delimiters in addition to whitespace:
43  * ! | & ( )
44  */
46 init_tsvector_parser(char *input, int flags)
47 {
49 
50  state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
51  state->prsbuf = input;
52  state->bufstart = input;
53  state->len = 32;
54  state->word = (char *) palloc(state->len);
56  state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0;
57  state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0;
58  state->is_web = (flags & P_TSV_IS_WEB) != 0;
59 
60  return state;
61 }
62 
63 /*
64  * Reinitializes parser to parse 'input', instead of previous input.
65  */
66 void
68 {
69  state->prsbuf = input;
70 }
71 
72 /*
73  * Shuts down a tsvector parser.
74  */
75 void
77 {
78  pfree(state->word);
79  pfree(state);
80 }
81 
/*
 * Increase the size of 'word' if needed to hold one more character.
 *
 * Expects 'state' and 'curpos' (the write position inside state->word) to be
 * in scope where the macro is expanded.  The buffer is doubled as soon as no
 * more than state->eml bytes remain past curpos, which leaves room for one
 * character of maximum width plus a terminating NUL.  curpos is recomputed
 * from its saved offset after the buffer has been reallocated.
 */
#define RESIZEPRSBUF \
do { \
	int			clen = curpos - state->word; \
	if ( clen + state->eml >= state->len ) \
	{ \
		state->len *= 2; \
		state->word = (char *) repalloc(state->word, state->len); \
		curpos = state->word + clen; \
	} \
} while (0)
93 
/*
 * Fills gettoken_tsvector's output parameters, and returns true.
 *
 * Relies on gettoken_tsvector's parameters and locals (state, curpos, pos,
 * npos) being in scope.  The position array is handed over to the caller
 * when pos_ptr is given, and freed otherwise.  *poslen is stored whenever
 * pos_ptr is non-NULL, so poslen must then be non-NULL too.
 */
#define RETURN_TOKEN \
do { \
	if (pos_ptr != NULL) \
	{ \
		*pos_ptr = pos; \
		*poslen = npos; \
	} \
	else if (pos != NULL) \
		pfree(pos); \
	\
	if (strval != NULL) \
		*strval = state->word; \
	if (lenval != NULL) \
		*lenval = curpos - state->word; \
	if (endptr != NULL) \
		*endptr = state->prsbuf; \
	return true; \
} while(0)
113 
114 
/* State codes used in gettoken_tsvector */
#define WAITWORD		1		/* skipping whitespace before a token */
#define WAITENDWORD		2		/* inside an unquoted word */
#define WAITNEXTCHAR	3		/* after a backslash: next char is literal */
#define WAITENDCMPLX	4		/* inside a single-quoted word */
#define WAITPOSINFO		5		/* after a quoted word: ':' or end of token */
#define INPOSINFO		6		/* expecting a position number */
#define WAITPOSDELIM	7		/* after a position: ',', weight, or end */
#define WAITCHARCMPLX	8		/* saw a quote inside a quoted word */
124 
125 #define PRSSYNTAXERROR prssyntaxerror(state)
126 
127 static void
129 {
130  ereport(ERROR,
131  (errcode(ERRCODE_SYNTAX_ERROR),
132  state->is_tsquery ?
133  errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
134  errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
135 }
136 
137 
138 /*
139  * Get next token from string being parsed. Returns true if successful,
140  * false if end of input string is reached. On success, these output
141  * parameters are filled in:
142  *
143  * *strval pointer to token
144  * *lenval length of *strval
145  * *pos_ptr pointer to a palloc'd array of positions and weights
146  * associated with the token. If the caller is not interested
147  * in the information, NULL can be supplied. Otherwise
148  * the caller is responsible for pfreeing the array.
149  * *poslen number of elements in *pos_ptr
150  * *endptr scan resumption point
151  *
152  * Pass NULL for unwanted output parameters.
153  */
154 bool
156  char **strval, int *lenval,
157  WordEntryPos **pos_ptr, int *poslen,
158  char **endptr)
159 {
160  int oldstate = 0;
161  char *curpos = state->word;
162  int statecode = WAITWORD;
163 
164  /*
165  * pos is for collecting the comma delimited list of positions followed by
166  * the actual token.
167  */
168  WordEntryPos *pos = NULL;
169  int npos = 0; /* elements of pos used */
170  int posalen = 0; /* allocated size of pos */
171 
172  while (1)
173  {
174  if (statecode == WAITWORD)
175  {
176  if (*(state->prsbuf) == '\0')
177  return false;
178  else if (!state->is_web && t_iseq(state->prsbuf, '\''))
179  statecode = WAITENDCMPLX;
180  else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
181  {
182  statecode = WAITNEXTCHAR;
183  oldstate = WAITENDWORD;
184  }
185  else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
186  (state->is_web && t_iseq(state->prsbuf, '"')))
188  else if (!t_isspace(state->prsbuf))
189  {
190  COPYCHAR(curpos, state->prsbuf);
191  curpos += pg_mblen(state->prsbuf);
192  statecode = WAITENDWORD;
193  }
194  }
195  else if (statecode == WAITNEXTCHAR)
196  {
197  if (*(state->prsbuf) == '\0')
198  ereport(ERROR,
199  (errcode(ERRCODE_SYNTAX_ERROR),
200  errmsg("there is no escaped character: \"%s\"",
201  state->bufstart)));
202  else
203  {
204  RESIZEPRSBUF;
205  COPYCHAR(curpos, state->prsbuf);
206  curpos += pg_mblen(state->prsbuf);
207  Assert(oldstate != 0);
208  statecode = oldstate;
209  }
210  }
211  else if (statecode == WAITENDWORD)
212  {
213  if (!state->is_web && t_iseq(state->prsbuf, '\\'))
214  {
215  statecode = WAITNEXTCHAR;
216  oldstate = WAITENDWORD;
217  }
218  else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
219  (state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
220  (state->is_web && t_iseq(state->prsbuf, '"')))
221  {
222  RESIZEPRSBUF;
223  if (curpos == state->word)
225  *(curpos) = '\0';
226  RETURN_TOKEN;
227  }
228  else if (t_iseq(state->prsbuf, ':'))
229  {
230  if (curpos == state->word)
232  *(curpos) = '\0';
233  if (state->oprisdelim)
234  RETURN_TOKEN;
235  else
236  statecode = INPOSINFO;
237  }
238  else
239  {
240  RESIZEPRSBUF;
241  COPYCHAR(curpos, state->prsbuf);
242  curpos += pg_mblen(state->prsbuf);
243  }
244  }
245  else if (statecode == WAITENDCMPLX)
246  {
247  if (!state->is_web && t_iseq(state->prsbuf, '\''))
248  {
249  statecode = WAITCHARCMPLX;
250  }
251  else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
252  {
253  statecode = WAITNEXTCHAR;
254  oldstate = WAITENDCMPLX;
255  }
256  else if (*(state->prsbuf) == '\0')
258  else
259  {
260  RESIZEPRSBUF;
261  COPYCHAR(curpos, state->prsbuf);
262  curpos += pg_mblen(state->prsbuf);
263  }
264  }
265  else if (statecode == WAITCHARCMPLX)
266  {
267  if (!state->is_web && t_iseq(state->prsbuf, '\''))
268  {
269  RESIZEPRSBUF;
270  COPYCHAR(curpos, state->prsbuf);
271  curpos += pg_mblen(state->prsbuf);
272  statecode = WAITENDCMPLX;
273  }
274  else
275  {
276  RESIZEPRSBUF;
277  *(curpos) = '\0';
278  if (curpos == state->word)
280  if (state->oprisdelim)
281  {
282  /* state->prsbuf+=pg_mblen(state->prsbuf); */
283  RETURN_TOKEN;
284  }
285  else
286  statecode = WAITPOSINFO;
287  continue; /* recheck current character */
288  }
289  }
290  else if (statecode == WAITPOSINFO)
291  {
292  if (t_iseq(state->prsbuf, ':'))
293  statecode = INPOSINFO;
294  else
295  RETURN_TOKEN;
296  }
297  else if (statecode == INPOSINFO)
298  {
299  if (t_isdigit(state->prsbuf))
300  {
301  if (posalen == 0)
302  {
303  posalen = 4;
304  pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
305  npos = 0;
306  }
307  else if (npos + 1 >= posalen)
308  {
309  posalen *= 2;
310  pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
311  }
312  npos++;
313  WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
314  /* we cannot get here in tsquery, so no need for 2 errmsgs */
315  if (WEP_GETPOS(pos[npos - 1]) == 0)
316  ereport(ERROR,
317  (errcode(ERRCODE_SYNTAX_ERROR),
318  errmsg("wrong position info in tsvector: \"%s\"",
319  state->bufstart)));
320  WEP_SETWEIGHT(pos[npos - 1], 0);
321  statecode = WAITPOSDELIM;
322  }
323  else
325  }
326  else if (statecode == WAITPOSDELIM)
327  {
328  if (t_iseq(state->prsbuf, ','))
329  statecode = INPOSINFO;
330  else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
331  {
332  if (WEP_GETWEIGHT(pos[npos - 1]))
334  WEP_SETWEIGHT(pos[npos - 1], 3);
335  }
336  else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
337  {
338  if (WEP_GETWEIGHT(pos[npos - 1]))
340  WEP_SETWEIGHT(pos[npos - 1], 2);
341  }
342  else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
343  {
344  if (WEP_GETWEIGHT(pos[npos - 1]))
346  WEP_SETWEIGHT(pos[npos - 1], 1);
347  }
348  else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
349  {
350  if (WEP_GETWEIGHT(pos[npos - 1]))
352  WEP_SETWEIGHT(pos[npos - 1], 0);
353  }
354  else if (t_isspace(state->prsbuf) ||
355  *(state->prsbuf) == '\0')
356  RETURN_TOKEN;
357  else if (!t_isdigit(state->prsbuf))
359  }
360  else /* internal error */
361  elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
362  statecode);
363 
364  /* get next char */
365  state->prsbuf += pg_mblen(state->prsbuf);
366  }
367 }
#define COPYCHAR(d, s)
Definition: ts_locale.h:47
uint16 WordEntryPos
Definition: ts_type.h:63
#define WAITPOSDELIM
#define P_TSV_IS_TSQUERY
Definition: ts_utils.h:29
#define P_TSV_IS_WEB
Definition: ts_utils.h:30
struct TSVectorParseStateData * TSVectorParseState
Definition: ts_utils.h:26
bool gettoken_tsvector(TSVectorParseState state, char **strval, int *lenval, WordEntryPos **pos_ptr, int *poslen, char **endptr)
void close_tsvector_parser(TSVectorParseState state)
int errcode(int sqlerrcode)
Definition: elog.c:575
#define WEP_SETPOS(x, v)
Definition: ts_type.h:83
#define ISOPERATOR(x)
Definition: ltree.h:120
int t_isdigit(const char *ptr)
Definition: ts_locale.c:25
void pfree(void *pointer)
Definition: mcxt.c:1031
#define ERROR
Definition: elog.h:43
#define WAITCHARCMPLX
#define WEP_GETPOS(x)
Definition: ts_type.h:80
static void prssyntaxerror(TSVectorParseState state)
#define PRSSYNTAXERROR
int t_isspace(const char *ptr)
Definition: ts_locale.c:41
#define WEP_SETWEIGHT(x, v)
Definition: ts_type.h:82
int pg_database_encoding_max_length(void)
Definition: wchar.c:1833
#define t_iseq(x, c)
Definition: ts_locale.h:45
#define RETURN_TOKEN
#define ereport(elevel, rest)
Definition: elog.h:122
void reset_tsvector_parser(TSVectorParseState state, char *input)
#define P_TSV_OPR_IS_DELIM
Definition: ts_utils.h:28
#define Assert(condition)
Definition: c.h:699
#define INPOSINFO
TSVectorParseState init_tsvector_parser(char *input, int flags)
Definition: regguts.h:298
int pg_mblen(const char *mbstr)
Definition: mbutils.c:760
#define WAITNEXTCHAR
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1044
#define WAITPOSINFO
#define RESIZEPRSBUF
#define WAITENDWORD
void * palloc(Size size)
Definition: mcxt.c:924
int errmsg(const char *fmt,...)
Definition: elog.c:797
#define LIMITPOS(x)
Definition: ts_type.h:87
#define elog
Definition: elog.h:219
#define WAITWORD
#define WEP_GETWEIGHT(x)
Definition: ts_type.h:79
#define WAITENDCMPLX