PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
tsvector_parser.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * tsvector_parser.c
4  * Parser for tsvector
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  * src/backend/utils/adt/tsvector_parser.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include "tsearch/ts_locale.h"
18 #include "tsearch/ts_utils.h"
19 
20 
/*
 * Private state of tsvector parser.  Note that tsquery also uses this code to
 * parse its input, hence the boolean flags.  The two flags are both true or
 * both false in current usage, but we keep them separate for clarity.
 * is_tsquery affects *only* the content of error messages.
 *
 * 'word' is a growable scratch buffer owned by the parser: each token is
 * assembled into it (with escapes and quoting removed) and it is reused for
 * the next token.
 */
struct TSVectorParseStateData
{
	char	   *prsbuf;			/* next input character */
	char	   *bufstart;		/* whole string (used only for errors) */
	char	   *word;			/* buffer to hold the current word */
	int			len;			/* size in bytes allocated for 'word' */
	int			eml;			/* max bytes per character */
	bool		oprisdelim;		/* treat ! & | ( ) < as delimiters? */
	bool		is_tsquery;		/* say "tsquery" not "tsvector" in errors? */
};
37 
38 
39 /*
40  * Initializes parser for the input string. If oprisdelim is set, the
41  * following characters are treated as delimiters in addition to whitespace:
42  * ! | & ( )
43  */
45 init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery)
46 {
48 
49  state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
50  state->prsbuf = input;
51  state->bufstart = input;
52  state->len = 32;
53  state->word = (char *) palloc(state->len);
55  state->oprisdelim = oprisdelim;
56  state->is_tsquery = is_tsquery;
57 
58  return state;
59 }
60 
61 /*
62  * Reinitializes parser to parse 'input', instead of previous input.
63  */
64 void
66 {
67  state->prsbuf = input;
68 }
69 
70 /*
71  * Shuts down a tsvector parser.
72  */
73 void
75 {
76  pfree(state->word);
77  pfree(state);
78 }
79 
/*
 * Increase the size of 'word' if needed to hold one more character.
 *
 * Expects 'state' and 'curpos' (write position inside state->word) to be in
 * scope.  The buffer is doubled whenever fewer than eml + 1 bytes remain, so
 * there is always room for one maximum-length character plus a terminating
 * NUL.  Because repalloc may move the buffer, curpos is re-derived from its
 * offset.
 */
#define RESIZEPRSBUF \
do { \
	int			clen = curpos - state->word; \
	if ( clen + state->eml >= state->len ) \
	{ \
		state->len *= 2; \
		state->word = (char *) repalloc(state->word, state->len); \
		curpos = state->word + clen; \
	} \
} while (0)
91 
/*
 * True if x points at a single-byte character that starts a tsquery
 * operator: ! & | ( ) or '<' (the phrase operator begins with '<').
 */
#define ISOPERATOR(x) \
	( pg_mblen(x) == 1 && ( *(x) == '!' ||	\
							*(x) == '&' ||	\
							*(x) == '|' ||	\
							*(x) == '(' ||	\
							*(x) == ')' ||	\
							*(x) == '<'		\
						  ) )
101 
/*
 * Fills gettoken_tsvector's output parameters, and returns true.
 *
 * Relies on gettoken_tsvector's locals and parameters being in scope.  If the
 * caller did not ask for positions (pos_ptr == NULL) any collected position
 * array is freed here; otherwise ownership of the array passes to the caller.
 * Every output pointer, including poslen, may be NULL.
 */
#define RETURN_TOKEN \
do { \
	if (pos_ptr != NULL) \
	{ \
		*pos_ptr = pos; \
		if (poslen != NULL) \
			*poslen = npos; \
	} \
	else if (pos != NULL) \
		pfree(pos); \
	\
	if (strval != NULL) \
		*strval = state->word; \
	if (lenval != NULL) \
		*lenval = curpos - state->word; \
	if (endptr != NULL) \
		*endptr = state->prsbuf; \
	return true; \
} while(0)
121 
122 
/* State codes used in gettoken_tsvector */
#define WAITWORD		1		/* skipping whitespace before a token */
#define WAITENDWORD		2		/* inside an unquoted word */
#define WAITNEXTCHAR	3		/* after a backslash: copy next char literally */
#define WAITENDCMPLX	4		/* inside a single-quoted word */
#define WAITPOSINFO		5		/* after a quoted word: ':' or end of token */
#define INPOSINFO		6		/* expecting the first digit of a position */
#define WAITPOSDELIM	7		/* after a position: ',', weight, or end */
#define WAITCHARCMPLX	8		/* saw a quote inside a quoted word */
132 
133 #define PRSSYNTAXERROR prssyntaxerror(state)
134 
135 static void
137 {
138  ereport(ERROR,
139  (errcode(ERRCODE_SYNTAX_ERROR),
140  state->is_tsquery ?
141  errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
142  errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
143 }
144 
145 
146 /*
147  * Get next token from string being parsed. Returns true if successful,
148  * false if end of input string is reached. On success, these output
149  * parameters are filled in:
150  *
151  * *strval pointer to token
152  * *lenval length of *strval
153  * *pos_ptr pointer to a palloc'd array of positions and weights
154  * associated with the token. If the caller is not interested
155  * in the information, NULL can be supplied. Otherwise
156  * the caller is responsible for pfreeing the array.
157  * *poslen number of elements in *pos_ptr
158  * *endptr scan resumption point
159  *
160  * Pass NULL for unwanted output parameters.
161  */
162 bool
164  char **strval, int *lenval,
165  WordEntryPos **pos_ptr, int *poslen,
166  char **endptr)
167 {
168  int oldstate = 0;
169  char *curpos = state->word;
170  int statecode = WAITWORD;
171 
172  /*
173  * pos is for collecting the comma delimited list of positions followed by
174  * the actual token.
175  */
176  WordEntryPos *pos = NULL;
177  int npos = 0; /* elements of pos used */
178  int posalen = 0; /* allocated size of pos */
179 
180  while (1)
181  {
182  if (statecode == WAITWORD)
183  {
184  if (*(state->prsbuf) == '\0')
185  return false;
186  else if (t_iseq(state->prsbuf, '\''))
187  statecode = WAITENDCMPLX;
188  else if (t_iseq(state->prsbuf, '\\'))
189  {
190  statecode = WAITNEXTCHAR;
191  oldstate = WAITENDWORD;
192  }
193  else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
195  else if (!t_isspace(state->prsbuf))
196  {
197  COPYCHAR(curpos, state->prsbuf);
198  curpos += pg_mblen(state->prsbuf);
199  statecode = WAITENDWORD;
200  }
201  }
202  else if (statecode == WAITNEXTCHAR)
203  {
204  if (*(state->prsbuf) == '\0')
205  ereport(ERROR,
206  (errcode(ERRCODE_SYNTAX_ERROR),
207  errmsg("there is no escaped character: \"%s\"",
208  state->bufstart)));
209  else
210  {
211  RESIZEPRSBUF;
212  COPYCHAR(curpos, state->prsbuf);
213  curpos += pg_mblen(state->prsbuf);
214  Assert(oldstate != 0);
215  statecode = oldstate;
216  }
217  }
218  else if (statecode == WAITENDWORD)
219  {
220  if (t_iseq(state->prsbuf, '\\'))
221  {
222  statecode = WAITNEXTCHAR;
223  oldstate = WAITENDWORD;
224  }
225  else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
226  (state->oprisdelim && ISOPERATOR(state->prsbuf)))
227  {
228  RESIZEPRSBUF;
229  if (curpos == state->word)
231  *(curpos) = '\0';
232  RETURN_TOKEN;
233  }
234  else if (t_iseq(state->prsbuf, ':'))
235  {
236  if (curpos == state->word)
238  *(curpos) = '\0';
239  if (state->oprisdelim)
240  RETURN_TOKEN;
241  else
242  statecode = INPOSINFO;
243  }
244  else
245  {
246  RESIZEPRSBUF;
247  COPYCHAR(curpos, state->prsbuf);
248  curpos += pg_mblen(state->prsbuf);
249  }
250  }
251  else if (statecode == WAITENDCMPLX)
252  {
253  if (t_iseq(state->prsbuf, '\''))
254  {
255  statecode = WAITCHARCMPLX;
256  }
257  else if (t_iseq(state->prsbuf, '\\'))
258  {
259  statecode = WAITNEXTCHAR;
260  oldstate = WAITENDCMPLX;
261  }
262  else if (*(state->prsbuf) == '\0')
264  else
265  {
266  RESIZEPRSBUF;
267  COPYCHAR(curpos, state->prsbuf);
268  curpos += pg_mblen(state->prsbuf);
269  }
270  }
271  else if (statecode == WAITCHARCMPLX)
272  {
273  if (t_iseq(state->prsbuf, '\''))
274  {
275  RESIZEPRSBUF;
276  COPYCHAR(curpos, state->prsbuf);
277  curpos += pg_mblen(state->prsbuf);
278  statecode = WAITENDCMPLX;
279  }
280  else
281  {
282  RESIZEPRSBUF;
283  *(curpos) = '\0';
284  if (curpos == state->word)
286  if (state->oprisdelim)
287  {
288  /* state->prsbuf+=pg_mblen(state->prsbuf); */
289  RETURN_TOKEN;
290  }
291  else
292  statecode = WAITPOSINFO;
293  continue; /* recheck current character */
294  }
295  }
296  else if (statecode == WAITPOSINFO)
297  {
298  if (t_iseq(state->prsbuf, ':'))
299  statecode = INPOSINFO;
300  else
301  RETURN_TOKEN;
302  }
303  else if (statecode == INPOSINFO)
304  {
305  if (t_isdigit(state->prsbuf))
306  {
307  if (posalen == 0)
308  {
309  posalen = 4;
310  pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
311  npos = 0;
312  }
313  else if (npos + 1 >= posalen)
314  {
315  posalen *= 2;
316  pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
317  }
318  npos++;
319  WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
320  /* we cannot get here in tsquery, so no need for 2 errmsgs */
321  if (WEP_GETPOS(pos[npos - 1]) == 0)
322  ereport(ERROR,
323  (errcode(ERRCODE_SYNTAX_ERROR),
324  errmsg("wrong position info in tsvector: \"%s\"",
325  state->bufstart)));
326  WEP_SETWEIGHT(pos[npos - 1], 0);
327  statecode = WAITPOSDELIM;
328  }
329  else
331  }
332  else if (statecode == WAITPOSDELIM)
333  {
334  if (t_iseq(state->prsbuf, ','))
335  statecode = INPOSINFO;
336  else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
337  {
338  if (WEP_GETWEIGHT(pos[npos - 1]))
340  WEP_SETWEIGHT(pos[npos - 1], 3);
341  }
342  else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
343  {
344  if (WEP_GETWEIGHT(pos[npos - 1]))
346  WEP_SETWEIGHT(pos[npos - 1], 2);
347  }
348  else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
349  {
350  if (WEP_GETWEIGHT(pos[npos - 1]))
352  WEP_SETWEIGHT(pos[npos - 1], 1);
353  }
354  else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
355  {
356  if (WEP_GETWEIGHT(pos[npos - 1]))
358  WEP_SETWEIGHT(pos[npos - 1], 0);
359  }
360  else if (t_isspace(state->prsbuf) ||
361  *(state->prsbuf) == '\0')
362  RETURN_TOKEN;
363  else if (!t_isdigit(state->prsbuf))
365  }
366  else /* internal error */
367  elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
368  statecode);
369 
370  /* get next char */
371  state->prsbuf += pg_mblen(state->prsbuf);
372  }
373 }
#define COPYCHAR(d, s)
Definition: ts_locale.h:47
uint16 WordEntryPos
Definition: ts_type.h:63
#define WAITPOSDELIM
struct TSVectorParseStateData * TSVectorParseState
Definition: ts_utils.h:26
bool gettoken_tsvector(TSVectorParseState state, char **strval, int *lenval, WordEntryPos **pos_ptr, int *poslen, char **endptr)
void close_tsvector_parser(TSVectorParseState state)
int errcode(int sqlerrcode)
Definition: elog.c:575
#define WEP_SETPOS(x, v)
Definition: ts_type.h:83
int t_isdigit(const char *ptr)
Definition: ts_locale.c:25
void pfree(void *pointer)
Definition: mcxt.c:949
#define ERROR
Definition: elog.h:43
#define WAITCHARCMPLX
#define WEP_GETPOS(x)
Definition: ts_type.h:80
#define ISOPERATOR(x)
static void prssyntaxerror(TSVectorParseState state)
#define PRSSYNTAXERROR
int t_isspace(const char *ptr)
Definition: ts_locale.c:41
#define WEP_SETWEIGHT(x, v)
Definition: ts_type.h:82
int pg_database_encoding_max_length(void)
Definition: wchar.c:1833
#define t_iseq(x, c)
Definition: ts_locale.h:45
TSVectorParseState init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery)
#define RETURN_TOKEN
#define ereport(elevel, rest)
Definition: elog.h:122
void reset_tsvector_parser(TSVectorParseState state, char *input)
#define Assert(condition)
Definition: c.h:664
#define INPOSINFO
Definition: regguts.h:298
int pg_mblen(const char *mbstr)
Definition: mbutils.c:771
#define WAITNEXTCHAR
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:962
#define WAITPOSINFO
#define RESIZEPRSBUF
#define WAITENDWORD
void * palloc(Size size)
Definition: mcxt.c:848
int errmsg(const char *fmt,...)
Definition: elog.c:797
#define LIMITPOS(x)
Definition: ts_type.h:87
#define elog
Definition: elog.h:219
#define WAITWORD
#define WEP_GETWEIGHT(x)
Definition: ts_type.h:79
#define WAITENDCMPLX