PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
tsvector_parser.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * tsvector_parser.c
4 * Parser for tsvector
5 *
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/utils/adt/tsvector_parser.c
11 *
12 *-------------------------------------------------------------------------
13 */
14
15#include "postgres.h"
16
17#include "tsearch/ts_locale.h"
18#include "tsearch/ts_utils.h"
19
20
21/*
22 * Private state of tsvector parser. Note that tsquery also uses this code to
23 * parse its input, hence the boolean flags. The oprisdelim and is_tsquery
24 * flags are both true or both false in current usage, but we keep them
25 * separate for clarity.
26 *
27 * If oprisdelim is set, the following characters are treated as delimiters
28 * (in addition to whitespace): ! | & ( )
29 *
30 * is_tsquery affects *only* the content of error messages.
31 *
32 * is_web can be true to further modify tsquery parsing.
33 *
34 * If escontext is an ErrorSaveContext node, then soft errors can be
35 * captured there rather than being thrown.
36 */
38{
39 char *prsbuf; /* current scan position (next input character) */
40 char *bufstart; /* whole string (used only for errors) */
41 char *word; /* buffer to hold the current word */
42 int len; /* size in bytes allocated for 'word' */
43 int eml; /* max bytes per character */
44 bool oprisdelim; /* treat ! | & ( ) as delimiters? */
45 bool is_tsquery; /* say "tsquery" not "tsvector" in errors? */
46 bool is_web; /* we're in websearch_to_tsquery() */
47 Node *escontext; /* for soft error reporting */
48};
49
50
51/*
52 * Initializes a parser state object for the given input string.
53 * A bitmask of flags (see ts_utils.h) and an error context object
54 * can be provided as well.
55 */
57init_tsvector_parser(char *input, int flags, Node *escontext)
58{
60
/* scanning starts at the beginning of 'input'; bufstart keeps the same
 * pointer so that error messages can quote the whole string */
62 state->prsbuf = input;
63 state->bufstart = input;
/* initial size of the word buffer; RESIZEPRSBUF doubles it on demand */
64 state->len = 32;
65 state->word = (char *) palloc(state->len);
/* decode the flag bits into the individual boolean fields */
67 state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0;
68 state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0;
69 state->is_web = (flags & P_TSV_IS_WEB) != 0;
70 state->escontext = escontext;
71
72 return state;
73}
74
75/*
76 * Reinitializes parser to parse 'input', instead of previous input.
77 *
78 * Note that bufstart (the string reported in errors) is not changed.
79 */
80void
82{
/* only the scan position moves; the word buffer, flags and bufstart stay */
83 state->prsbuf = input;
84}
85
86/*
87 * Shuts down a tsvector parser.
88 */
89void
91{
/* free the word buffer first, then the state struct that points to it */
92 pfree(state->word);
93 pfree(state);
94}
95
96/*
 * Increase the size of 'word' if needed to hold one more character.
 *
 * Expects a local 'curpos' that points into state->word.  When fewer than
 * state->eml + 1 bytes remain after curpos, the buffer is doubled with
 * repalloc() and curpos is re-based onto the (possibly moved) buffer, so a
 * full-width character plus a terminating '\0' always fit afterwards.
 */
97#define RESIZEPRSBUF \
98do { \
99 int clen = curpos - state->word; \
100 if ( clen + state->eml >= state->len ) \
101 { \
102 state->len *= 2; \
103 state->word = (char *) repalloc(state->word, state->len); \
104 curpos = state->word + clen; \
105 } \
106} while (0)
107
108/*
 * Fills gettoken_tsvector's output parameters, and returns true.
 *
 * Uses gettoken_tsvector's locals (state, curpos, pos, npos) and its output
 * parameters directly.  If pos_ptr is NULL, any collected position array is
 * pfree'd here; otherwise the array is handed to the caller.  Note that
 * *poslen is written without a NULL check whenever pos_ptr is non-NULL, so
 * callers that pass pos_ptr must pass poslen too.  *strval points at
 * state->word itself, not at a copy.
 */
109#define RETURN_TOKEN \
110do { \
111 if (pos_ptr != NULL) \
112 { \
113 *pos_ptr = pos; \
114 *poslen = npos; \
115 } \
116 else if (pos != NULL) \
117 pfree(pos); \
118 \
119 if (strval != NULL) \
120 *strval = state->word; \
121 if (lenval != NULL) \
122 *lenval = curpos - state->word; \
123 if (endptr != NULL) \
124 *endptr = state->prsbuf; \
125 return true; \
126} while(0)
127
128
129/* State codes used in gettoken_tsvector */
130#define WAITWORD 1 /* between tokens, looking for the start of a word */
131#define WAITENDWORD 2 /* inside an unquoted word */
132#define WAITNEXTCHAR 3 /* after a backslash; next character is copied as-is */
133#define WAITENDCMPLX 4 /* inside a single-quoted word */
134#define WAITPOSINFO 5 /* after a quoted word; ':' here starts position info */
135#define INPOSINFO 6 /* expecting the digits of a position */
136#define WAITPOSDELIM 7 /* after a position: ',', a weight letter, or end */
137#define WAITCHARCMPLX 8 /* just saw a quote while inside a quoted word */
138
/*
 * Report a syntax error and return prssyntaxerror()'s result from the
 * enclosing function.  Requires a variable named 'state' in scope.
 */
139#define PRSSYNTAXERROR return prssyntaxerror(state)
140
/*
 * Report a syntax error that quotes the whole input string
 * (state->bufstart).  The message says "tsquery" or "tsvector" depending on
 * state->is_tsquery, and is raised through errsave() with state->escontext.
 */
141static bool
143{
144 errsave(state->escontext,
145 (errcode(ERRCODE_SYNTAX_ERROR),
146 state->is_tsquery ?
147 errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
148 errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
149 /* In soft error situation, return false as convenience for caller */
150 return false;
151}
152
153
154/*
155 * Get next token from string being parsed. Returns true if successful,
156 * false if end of input string is reached or soft error.
157 *
158 * On success, these output parameters are filled in:
159 *
160 * *strval pointer to token
161 * *lenval length of *strval
162 * *pos_ptr pointer to a palloc'd array of positions and weights
163 * associated with the token. If the caller is not interested
164 * in the information, NULL can be supplied. Otherwise
165 * the caller is responsible for pfreeing the array.
166 * *poslen number of elements in *pos_ptr
167 * *endptr scan resumption point
168 *
169 * Pass NULL for any unwanted output parameters.
170 *
171 * If state->escontext is an ErrorSaveContext, then caller must check
172 * SOFT_ERROR_OCCURRED() to determine whether a "false" result means
173 * error or normal end-of-string.
174 */
175bool
177 char **strval, int *lenval,
178 WordEntryPos **pos_ptr, int *poslen,
179 char **endptr)
180{
181 int oldstate = 0; /* state to resume after WAITNEXTCHAR */
182 char *curpos = state->word; /* write position inside state->word */
183 int statecode = WAITWORD;
184
185 /*
186 * pos collects the comma-delimited list of positions (and weights) that
187 * follows the token itself.
188 */
189 WordEntryPos *pos = NULL;
190 int npos = 0; /* elements of pos used */
191 int posalen = 0; /* allocated size of pos */
192
193 while (1)
194 {
195 if (statecode == WAITWORD)
196 {
/*
 * Skip whitespace until a token begins.  Hitting end of input here
 * means there are no more tokens.  Quote and backslash are only
 * special when not in websearch (is_web) mode.
 */
197 if (*(state->prsbuf) == '\0')
198 return false;
199 else if (!state->is_web && t_iseq(state->prsbuf, '\''))
200 statecode = WAITENDCMPLX;
201 else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
202 {
203 statecode = WAITNEXTCHAR;
204 oldstate = WAITENDWORD;
205 }
206 else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
207 (state->is_web && t_iseq(state->prsbuf, '"')))
209 else if (!isspace((unsigned char) *state->prsbuf))
210 {
211 COPYCHAR(curpos, state->prsbuf);
212 curpos += pg_mblen(state->prsbuf);
213 statecode = WAITENDWORD;
214 }
215 }
216 else if (statecode == WAITNEXTCHAR)
217 {
/*
 * The previous character was a backslash: copy this character
 * verbatim and go back to the state recorded in oldstate.  A
 * backslash at the very end of the input is an error.
 */
218 if (*(state->prsbuf) == '\0')
219 ereturn(state->escontext, false,
220 (errcode(ERRCODE_SYNTAX_ERROR),
221 errmsg("there is no escaped character: \"%s\"",
222 state->bufstart)));
223 else
224 {
226 COPYCHAR(curpos, state->prsbuf);
227 curpos += pg_mblen(state->prsbuf);
228 Assert(oldstate != 0);
229 statecode = oldstate;
230 }
231 }
232 else if (statecode == WAITENDWORD)
233 {
/*
 * Inside an unquoted word.  Whitespace, end of input, an operator
 * (when oprisdelim) or a double quote (when is_web) ends the word;
 * ':' ends the word and introduces position info; anything else is
 * appended to the word.
 */
234 if (!state->is_web && t_iseq(state->prsbuf, '\\'))
235 {
236 statecode = WAITNEXTCHAR;
237 oldstate = WAITENDWORD;
238 }
239 else if (isspace((unsigned char) *state->prsbuf) || *(state->prsbuf) == '\0' ||
240 (state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
241 (state->is_web && t_iseq(state->prsbuf, '"')))
242 {
244 if (curpos == state->word)
246 *(curpos) = '\0';
248 }
249 else if (t_iseq(state->prsbuf, ':'))
250 {
251 if (curpos == state->word)
253 *(curpos) = '\0';
254 if (state->oprisdelim)
256 else
257 statecode = INPOSINFO;
258 }
259 else
260 {
262 COPYCHAR(curpos, state->prsbuf);
263 curpos += pg_mblen(state->prsbuf);
264 }
265 }
266 else if (statecode == WAITENDCMPLX)
267 {
/*
 * Inside a single-quoted word.  A quote may end the word (decided
 * in WAITCHARCMPLX), a backslash escapes the next character, and
 * every other character is appended.
 */
268 if (!state->is_web && t_iseq(state->prsbuf, '\''))
269 {
270 statecode = WAITCHARCMPLX;
271 }
272 else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
273 {
274 statecode = WAITNEXTCHAR;
275 oldstate = WAITENDCMPLX;
276 }
277 else if (*(state->prsbuf) == '\0')
279 else
280 {
282 COPYCHAR(curpos, state->prsbuf);
283 curpos += pg_mblen(state->prsbuf);
284 }
285 }
286 else if (statecode == WAITCHARCMPLX)
287 {
/*
 * A quote was just seen inside a quoted word.  A second quote right
 * after it is a literal quote and the quoted word continues; any
 * other character means the first quote closed the word.
 */
288 if (!state->is_web && t_iseq(state->prsbuf, '\''))
289 {
291 COPYCHAR(curpos, state->prsbuf);
292 curpos += pg_mblen(state->prsbuf);
293 statecode = WAITENDCMPLX;
294 }
295 else
296 {
298 *(curpos) = '\0';
299 if (curpos == state->word)
301 if (state->oprisdelim)
302 {
303 /* state->prsbuf+=pg_mblen(state->prsbuf); */
305 }
306 else
307 statecode = WAITPOSINFO;
308 continue; /* recheck current character */
309 }
310 }
311 else if (statecode == WAITPOSINFO)
312 {
/* after a quoted word: ':' starts the position list */
313 if (t_iseq(state->prsbuf, ':'))
314 statecode = INPOSINFO;
315 else
317 }
318 else if (statecode == INPOSINFO)
319 {
/*
 * Expect the first digit of a position.  The pos array starts with
 * room for 4 entries and is doubled as it fills up.
 */
320 if (isdigit((unsigned char) *state->prsbuf))
321 {
322 if (posalen == 0)
323 {
324 posalen = 4;
325 pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
326 npos = 0;
327 }
328 else if (npos + 1 >= posalen)
329 {
330 posalen *= 2;
331 pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
332 }
333 npos++;
/*
 * atoi() converts the whole digit run that starts here; the
 * remaining digits are stepped over one by one in WAITPOSDELIM.
 * NOTE(review): atoi() reports no errors; presumably LIMITPOS
 * bounds the value -- confirm against ts_type.h.
 */
334 WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
335 /* we cannot get here in tsquery, so no need for 2 errmsgs */
336 if (WEP_GETPOS(pos[npos - 1]) == 0)
337 ereturn(state->escontext, false,
338 (errcode(ERRCODE_SYNTAX_ERROR),
339 errmsg("wrong position info in tsvector: \"%s\"",
340 state->bufstart)));
/* weight defaults to 0 until a weight letter is seen */
341 WEP_SETWEIGHT(pos[npos - 1], 0);
342 statecode = WAITPOSDELIM;
343 }
344 else
346 }
347 else if (statecode == WAITPOSDELIM)
348 {
/*
 * After a position number: ',' introduces another position, and a
 * weight letter sets the weight of the latest position (a, A or '*'
 * = 3, b/B = 2, c/C = 1, d/D = 0).  Digits match none of the
 * branches and are simply skipped.
 */
349 if (t_iseq(state->prsbuf, ','))
350 statecode = INPOSINFO;
351 else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
352 {
353 if (WEP_GETWEIGHT(pos[npos - 1]))
355 WEP_SETWEIGHT(pos[npos - 1], 3);
356 }
357 else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
358 {
359 if (WEP_GETWEIGHT(pos[npos - 1]))
361 WEP_SETWEIGHT(pos[npos - 1], 2);
362 }
363 else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
364 {
365 if (WEP_GETWEIGHT(pos[npos - 1]))
367 WEP_SETWEIGHT(pos[npos - 1], 1);
368 }
369 else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
370 {
371 if (WEP_GETWEIGHT(pos[npos - 1]))
373 WEP_SETWEIGHT(pos[npos - 1], 0);
374 }
375 else if (isspace((unsigned char) *state->prsbuf) ||
376 *(state->prsbuf) == '\0')
378 else if (!isdigit((unsigned char) *state->prsbuf))
380 }
381 else /* internal error */
382 elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
383 statecode);
384
385 /* get next char: advance by one whole (possibly multibyte) character */
386 state->prsbuf += pg_mblen(state->prsbuf);
387 }
388}
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define ereturn(context, dummy_value,...)
Definition: elog.h:277
#define errsave(context,...)
Definition: elog.h:261
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
Assert(PointerIsAligned(start, uint64))
FILE * input
#define ISOPERATOR(x)
Definition: ltree.h:167
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1546
int pg_mblen(const char *mbstr)
Definition: mbutils.c:1023
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1544
void pfree(void *pointer)
Definition: mcxt.c:1524
void * palloc(Size size)
Definition: mcxt.c:1317
Definition: nodes.h:129
Definition: regguts.h:323
#define t_iseq(x, c)
Definition: ts_locale.h:38
#define COPYCHAR(d, s)
Definition: ts_locale.h:40
#define WEP_GETPOS(x)
Definition: ts_type.h:80
#define WEP_SETPOS(x, v)
Definition: ts_type.h:83
uint16 WordEntryPos
Definition: ts_type.h:63
#define WEP_SETWEIGHT(x, v)
Definition: ts_type.h:82
#define LIMITPOS(x)
Definition: ts_type.h:87
#define WEP_GETWEIGHT(x)
Definition: ts_type.h:79
#define P_TSV_IS_TSQUERY
Definition: ts_utils.h:30
struct TSVectorParseStateData * TSVectorParseState
Definition: ts_utils.h:26
#define P_TSV_IS_WEB
Definition: ts_utils.h:31
#define P_TSV_OPR_IS_DELIM
Definition: ts_utils.h:29
void reset_tsvector_parser(TSVectorParseState state, char *input)
#define WAITNEXTCHAR
#define PRSSYNTAXERROR
void close_tsvector_parser(TSVectorParseState state)
#define WAITENDCMPLX
#define WAITENDWORD
#define WAITCHARCMPLX
#define WAITPOSINFO
static bool prssyntaxerror(TSVectorParseState state)
#define WAITWORD
#define INPOSINFO
#define RESIZEPRSBUF
bool gettoken_tsvector(TSVectorParseState state, char **strval, int *lenval, WordEntryPos **pos_ptr, int *poslen, char **endptr)
#define RETURN_TOKEN
TSVectorParseState init_tsvector_parser(char *input, int flags, Node *escontext)
#define WAITPOSDELIM