PostgreSQL Source Code  git master
tsvector_parser.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * tsvector_parser.c
4  * Parser for tsvector
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  * src/backend/utils/adt/tsvector_parser.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include "tsearch/ts_locale.h"
18 #include "tsearch/ts_utils.h"
19 
20 
21 /*
22  * Private state of tsvector parser. Note that tsquery also uses this code to
23  * parse its input, hence the boolean flags. The oprisdelim and is_tsquery
24  * flags are both true or both false in current usage, but we keep them
25  * separate for clarity.
26  *
27  * If oprisdelim is set, the following characters are treated as delimiters
28  * (in addition to whitespace): ! | & ( )
29  *
30  * is_tsquery affects *only* the content of error messages.
31  *
32  * is_web can be true to further modify tsquery parsing.
33  *
34  * If escontext is an ErrorSaveContext node, then soft errors can be
35  * captured there rather than being thrown.
36  */
38 {
39  char *prsbuf; /* next input character */
40  char *bufstart; /* whole string (used only for errors) */
41  char *word; /* buffer to hold the current word */
42  int len; /* size in bytes allocated for 'word' */
43  int eml; /* max bytes per character */
44  bool oprisdelim; /* treat ! | * ( ) as delimiters? */
45  bool is_tsquery; /* say "tsquery" not "tsvector" in errors? */
46  bool is_web; /* we're in websearch_to_tsquery() */
47  Node *escontext; /* for soft error reporting */
48 };
49 
50 
51 /*
52  * Initializes a parser state object for the given input string.
53  * A bitmask of flags (see ts_utils.h) and an error context object
54  * can be provided as well.
55  */
57 init_tsvector_parser(char *input, int flags, Node *escontext)
58 {
60 
62  state->prsbuf = input;
63  state->bufstart = input;
64  state->len = 32;
65  state->word = (char *) palloc(state->len);
67  state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0;
68  state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0;
69  state->is_web = (flags & P_TSV_IS_WEB) != 0;
70  state->escontext = escontext;
71 
72  return state;
73 }
74 
75 /*
76  * Reinitializes parser to parse 'input', instead of previous input.
77  *
78  * Note that bufstart (the string reported in errors) is not changed.
79  */
80 void
82 {
83  state->prsbuf = input;
84 }
85 
86 /*
87  * Shuts down a tsvector parser.
88  */
89 void
91 {
92  pfree(state->word);
93  pfree(state);
94 }
95 
/*
 * Increase the size of 'word' if needed to hold one more character.
 *
 * Expects 'state' and 'curpos' (the write position inside state->word) to be
 * in scope.  The buffer is doubled whenever fewer than eml + 1 bytes remain
 * after curpos, so there is always room for one character of maximum width
 * plus a terminating NUL.  Because repalloc may move the buffer, curpos is
 * re-based onto the new allocation.
 */
#define RESIZEPRSBUF \
do { \
	int			clen = curpos - state->word; \
	if ( clen + state->eml >= state->len ) \
	{ \
		state->len *= 2; \
		state->word = (char *) repalloc(state->word, state->len); \
		curpos = state->word + clen; \
	} \
} while (0)
107 
/*
 * Fills gettoken_tsvector's output parameters, and returns true.
 *
 * Expects gettoken_tsvector's parameters and its locals 'pos', 'npos' and
 * 'curpos' to be in scope.  The position array is handed to the caller when
 * pos_ptr is supplied; otherwise it is freed here so it does not leak.
 * Every other output parameter is optional and skipped when NULL.
 */
#define RETURN_TOKEN \
do { \
	if (pos_ptr != NULL) \
	{ \
		*pos_ptr = pos; \
		*poslen = npos; \
	} \
	else if (pos != NULL) \
		pfree(pos); \
	\
	if (strval != NULL) \
		*strval = state->word; \
	if (lenval != NULL) \
		*lenval = curpos - state->word; \
	if (endptr != NULL) \
		*endptr = state->prsbuf; \
	return true; \
} while(0)
127 
128 
/*
 * State codes used in gettoken_tsvector.
 *
 * All codes are nonzero: gettoken_tsvector uses 0 as the "no saved state"
 * value for the state to resume after an escaped character.
 */
#define WAITWORD		1		/* skipping to the start of a token */
#define WAITENDWORD		2		/* inside an unquoted word */
#define WAITNEXTCHAR	3		/* previous character was a backslash */
#define WAITENDCMPLX	4		/* inside a single-quoted word */
#define WAITPOSINFO		5		/* after a quoted word; ':' may follow */
#define INPOSINFO		6		/* expecting a position number */
#define WAITPOSDELIM	7		/* after a position; weight, ',' or end */
#define WAITCHARCMPLX	8		/* saw ' inside quotes: doubled or closing? */
138 
139 #define PRSSYNTAXERROR return prssyntaxerror(state)
140 
141 static bool
143 {
144  errsave(state->escontext,
145  (errcode(ERRCODE_SYNTAX_ERROR),
146  state->is_tsquery ?
147  errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
148  errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
149  /* In soft error situation, return false as convenience for caller */
150  return false;
151 }
152 
153 
154 /*
155  * Get next token from string being parsed. Returns true if successful,
156  * false if end of input string is reached or soft error.
157  *
158  * On success, these output parameters are filled in:
159  *
160  * *strval pointer to token
161  * *lenval length of *strval
162  * *pos_ptr pointer to a palloc'd array of positions and weights
163  * associated with the token. If the caller is not interested
164  * in the information, NULL can be supplied. Otherwise
165  * the caller is responsible for pfreeing the array.
166  * *poslen number of elements in *pos_ptr
167  * *endptr scan resumption point
168  *
169  * Pass NULL for any unwanted output parameters.
170  *
171  * If state->escontext is an ErrorSaveContext, then caller must check
172  * SOFT_ERROR_OCCURRED() to determine whether a "false" result means
173  * error or normal end-of-string.
174  */
175 bool
177  char **strval, int *lenval,
178  WordEntryPos **pos_ptr, int *poslen,
179  char **endptr)
180 {
181  int oldstate = 0;
182  char *curpos = state->word;
183  int statecode = WAITWORD;
184 
185  /*
186  * pos is for collecting the comma delimited list of positions followed by
187  * the actual token.
188  */
189  WordEntryPos *pos = NULL;
190  int npos = 0; /* elements of pos used */
191  int posalen = 0; /* allocated size of pos */
192 
193  while (1)
194  {
195  if (statecode == WAITWORD)
196  {
197  if (*(state->prsbuf) == '\0')
198  return false;
199  else if (!state->is_web && t_iseq(state->prsbuf, '\''))
200  statecode = WAITENDCMPLX;
201  else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
202  {
203  statecode = WAITNEXTCHAR;
204  oldstate = WAITENDWORD;
205  }
206  else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
207  (state->is_web && t_iseq(state->prsbuf, '"')))
209  else if (!t_isspace(state->prsbuf))
210  {
211  COPYCHAR(curpos, state->prsbuf);
212  curpos += pg_mblen(state->prsbuf);
213  statecode = WAITENDWORD;
214  }
215  }
216  else if (statecode == WAITNEXTCHAR)
217  {
218  if (*(state->prsbuf) == '\0')
219  ereturn(state->escontext, false,
220  (errcode(ERRCODE_SYNTAX_ERROR),
221  errmsg("there is no escaped character: \"%s\"",
222  state->bufstart)));
223  else
224  {
225  RESIZEPRSBUF;
226  COPYCHAR(curpos, state->prsbuf);
227  curpos += pg_mblen(state->prsbuf);
228  Assert(oldstate != 0);
229  statecode = oldstate;
230  }
231  }
232  else if (statecode == WAITENDWORD)
233  {
234  if (!state->is_web && t_iseq(state->prsbuf, '\\'))
235  {
236  statecode = WAITNEXTCHAR;
237  oldstate = WAITENDWORD;
238  }
239  else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
240  (state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
241  (state->is_web && t_iseq(state->prsbuf, '"')))
242  {
243  RESIZEPRSBUF;
244  if (curpos == state->word)
246  *(curpos) = '\0';
247  RETURN_TOKEN;
248  }
249  else if (t_iseq(state->prsbuf, ':'))
250  {
251  if (curpos == state->word)
253  *(curpos) = '\0';
254  if (state->oprisdelim)
255  RETURN_TOKEN;
256  else
257  statecode = INPOSINFO;
258  }
259  else
260  {
261  RESIZEPRSBUF;
262  COPYCHAR(curpos, state->prsbuf);
263  curpos += pg_mblen(state->prsbuf);
264  }
265  }
266  else if (statecode == WAITENDCMPLX)
267  {
268  if (!state->is_web && t_iseq(state->prsbuf, '\''))
269  {
270  statecode = WAITCHARCMPLX;
271  }
272  else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
273  {
274  statecode = WAITNEXTCHAR;
275  oldstate = WAITENDCMPLX;
276  }
277  else if (*(state->prsbuf) == '\0')
279  else
280  {
281  RESIZEPRSBUF;
282  COPYCHAR(curpos, state->prsbuf);
283  curpos += pg_mblen(state->prsbuf);
284  }
285  }
286  else if (statecode == WAITCHARCMPLX)
287  {
288  if (!state->is_web && t_iseq(state->prsbuf, '\''))
289  {
290  RESIZEPRSBUF;
291  COPYCHAR(curpos, state->prsbuf);
292  curpos += pg_mblen(state->prsbuf);
293  statecode = WAITENDCMPLX;
294  }
295  else
296  {
297  RESIZEPRSBUF;
298  *(curpos) = '\0';
299  if (curpos == state->word)
301  if (state->oprisdelim)
302  {
303  /* state->prsbuf+=pg_mblen(state->prsbuf); */
304  RETURN_TOKEN;
305  }
306  else
307  statecode = WAITPOSINFO;
308  continue; /* recheck current character */
309  }
310  }
311  else if (statecode == WAITPOSINFO)
312  {
313  if (t_iseq(state->prsbuf, ':'))
314  statecode = INPOSINFO;
315  else
316  RETURN_TOKEN;
317  }
318  else if (statecode == INPOSINFO)
319  {
320  if (t_isdigit(state->prsbuf))
321  {
322  if (posalen == 0)
323  {
324  posalen = 4;
325  pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
326  npos = 0;
327  }
328  else if (npos + 1 >= posalen)
329  {
330  posalen *= 2;
331  pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
332  }
333  npos++;
334  WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
335  /* we cannot get here in tsquery, so no need for 2 errmsgs */
336  if (WEP_GETPOS(pos[npos - 1]) == 0)
337  ereturn(state->escontext, false,
338  (errcode(ERRCODE_SYNTAX_ERROR),
339  errmsg("wrong position info in tsvector: \"%s\"",
340  state->bufstart)));
341  WEP_SETWEIGHT(pos[npos - 1], 0);
342  statecode = WAITPOSDELIM;
343  }
344  else
346  }
347  else if (statecode == WAITPOSDELIM)
348  {
349  if (t_iseq(state->prsbuf, ','))
350  statecode = INPOSINFO;
351  else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
352  {
353  if (WEP_GETWEIGHT(pos[npos - 1]))
355  WEP_SETWEIGHT(pos[npos - 1], 3);
356  }
357  else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
358  {
359  if (WEP_GETWEIGHT(pos[npos - 1]))
361  WEP_SETWEIGHT(pos[npos - 1], 2);
362  }
363  else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
364  {
365  if (WEP_GETWEIGHT(pos[npos - 1]))
367  WEP_SETWEIGHT(pos[npos - 1], 1);
368  }
369  else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
370  {
371  if (WEP_GETWEIGHT(pos[npos - 1]))
373  WEP_SETWEIGHT(pos[npos - 1], 0);
374  }
375  else if (t_isspace(state->prsbuf) ||
376  *(state->prsbuf) == '\0')
377  RETURN_TOKEN;
378  else if (!t_isdigit(state->prsbuf))
380  }
381  else /* internal error */
382  elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
383  statecode);
384 
385  /* get next char */
386  state->prsbuf += pg_mblen(state->prsbuf);
387  }
388 }
int errcode(int sqlerrcode)
Definition: elog.c:859
int errmsg(const char *fmt,...)
Definition: elog.c:1072
#define ereturn(context, dummy_value,...)
Definition: elog.h:276
#define errsave(context,...)
Definition: elog.h:260
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:224
FILE * input
Assert(fmt[strlen(fmt) - 1] !='\n')
#define ISOPERATOR(x)
Definition: ltree.h:167
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1546
int pg_mblen(const char *mbstr)
Definition: mbutils.c:1023
void pfree(void *pointer)
Definition: mcxt.c:1508
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1528
void * palloc(Size size)
Definition: mcxt.c:1304
Definition: nodes.h:129
Definition: regguts.h:323
int t_isspace(const char *ptr)
Definition: ts_locale.c:50
int t_isdigit(const char *ptr)
Definition: ts_locale.c:35
#define t_iseq(x, c)
Definition: ts_locale.h:38
#define COPYCHAR(d, s)
Definition: ts_locale.h:40
#define WEP_GETPOS(x)
Definition: ts_type.h:80
#define WEP_SETPOS(x, v)
Definition: ts_type.h:83
uint16 WordEntryPos
Definition: ts_type.h:63
#define WEP_SETWEIGHT(x, v)
Definition: ts_type.h:82
#define LIMITPOS(x)
Definition: ts_type.h:87
#define WEP_GETWEIGHT(x)
Definition: ts_type.h:79
#define P_TSV_IS_TSQUERY
Definition: ts_utils.h:30
struct TSVectorParseStateData * TSVectorParseState
Definition: ts_utils.h:26
#define P_TSV_IS_WEB
Definition: ts_utils.h:31
#define P_TSV_OPR_IS_DELIM
Definition: ts_utils.h:29
void reset_tsvector_parser(TSVectorParseState state, char *input)
#define WAITNEXTCHAR
#define PRSSYNTAXERROR
void close_tsvector_parser(TSVectorParseState state)
#define WAITENDCMPLX
#define WAITENDWORD
#define WAITCHARCMPLX
#define WAITPOSINFO
static bool prssyntaxerror(TSVectorParseState state)
#define WAITWORD
#define INPOSINFO
#define RESIZEPRSBUF
bool gettoken_tsvector(TSVectorParseState state, char **strval, int *lenval, WordEntryPos **pos_ptr, int *poslen, char **endptr)
#define RETURN_TOKEN
TSVectorParseState init_tsvector_parser(char *input, int flags, Node *escontext)
#define WAITPOSDELIM