PostgreSQL Source Code git master
Loading...
Searching...
No Matches
tsvector_parser.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * tsvector_parser.c
4 * Parser for tsvector
5 *
6 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/utils/adt/tsvector_parser.c
11 *
12 *-------------------------------------------------------------------------
13 */
14
15#include "postgres.h"
16
17#include "tsearch/ts_locale.h"
18#include "tsearch/ts_utils.h"
19
20
21/*
22 * Private state of tsvector parser. Note that tsquery also uses this code to
23 * parse its input, hence the boolean flags. The oprisdelim and is_tsquery
24 * flags are both true or both false in current usage, but we keep them
25 * separate for clarity.
26 *
27 * If oprisdelim is set, the following characters are treated as delimiters
28 * (in addition to whitespace): ! | & ( )
29 *
30 * is_tsquery affects *only* the content of error messages.
31 *
32 * is_web can be true to further modify tsquery parsing.
33 *
34 * If escontext is an ErrorSaveContext node, then soft errors can be
35 * captured there rather than being thrown.
36 */
38{
39 char *prsbuf; /* next input character */
40 char *bufstart; /* whole string (used only for errors) */
41 char *word; /* buffer to hold the current word; grown by RESIZEPRSBUF */
42 int len; /* size in bytes allocated for 'word' */
43 int eml; /* max bytes per character */
44 bool oprisdelim; /* treat ! | & ( ) as delimiters? */
45 bool is_tsquery; /* say "tsquery" not "tsvector" in errors? */
46 bool is_web; /* we're in websearch_to_tsquery() */
47 Node *escontext; /* for soft error reporting */
48};
49
50
51/*
52 * Initializes a parser state object for the given input string.
53 * A bitmask of flags (see ts_utils.h) and an error context object
54 * can be provided as well.
55 */
57init_tsvector_parser(char *input, int flags, Node *escontext)
58{
/*
 * NOTE(review): this listing does not show the return type, the declaration
 * and allocation of 'state', or the assignment of state->eml; confirm them
 * against the real source file before relying on the comments below.
 */
60
/* Scanning starts at the first byte; bufstart is kept for error messages. */
62 state->prsbuf = input;
63 state->bufstart = input;
/* Initial word buffer of 32 bytes; RESIZEPRSBUF doubles it on demand. */
64 state->len = 32;
65 state->word = (char *) palloc(state->len);
/* Unpack the flag bitmask into individual booleans. */
67 state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0;
68 state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0;
69 state->is_web = (flags & P_TSV_IS_WEB) != 0;
70 state->escontext = escontext;
71
72 return state;
73}
74
75/*
76 * Reinitializes parser to parse 'input', instead of previous input.
77 *
78 * Note that bufstart (the string reported in errors) is not changed.
79 */
80void
85
86/*
87 * Shuts down a tsvector parser.
88 */
89void
95
96/*
 * Increase the size of 'word' if needed to hold one more character.
 *
 * Relies on two names from the scope where it is expanded: 'state' (the
 * parser state) and 'curpos' (the current write position inside
 * state->word).  The buffer is doubled when the bytes already used plus one
 * maximum-width character (state->eml) would reach state->len, which leaves
 * room for the '\0' that gettoken_tsvector stores at curpos when a word ends.
 * state->word is replaced by repalloc's result, so curpos is recomputed from
 * its saved byte offset.
 */
97#define RESIZEPRSBUF \
98do { \
99 int clen = curpos - state->word; \
100 if ( clen + state->eml >= state->len ) \
101 { \
102 state->len *= 2; \
103 state->word = (char *) repalloc(state->word, state->len); \
104 curpos = state->word + clen; \
105 } \
106} while (0)
107
108/*
 * Fills gettoken_tsvector's output parameters, and returns true.
 *
 * Expands to code that uses the enclosing function's locals (state, curpos,
 * pos, npos) and its output parameters, and ends with a 'return', so it can
 * only be used inside gettoken_tsvector.
 *
 * - If pos_ptr is non-NULL the collected position array is handed to the
 *   caller; otherwise any array that was collected is pfree'd here.
 * - *poslen is written whenever pos_ptr is non-NULL, with no separate NULL
 *   check on poslen, so callers that ask for positions must pass both.
 * - *strval is set to state->word itself (not a copy), and each
 *   gettoken_tsvector call starts writing at the beginning of that buffer.
 * - *lenval is the number of bytes written to the word buffer.
 */
109#define RETURN_TOKEN \
110do { \
111 if (pos_ptr != NULL) \
112 { \
113 *pos_ptr = pos; \
114 *poslen = npos; \
115 } \
116 else if (pos != NULL) \
117 pfree(pos); \
118 \
119 if (strval != NULL) \
120 *strval = state->word; \
121 if (lenval != NULL) \
122 *lenval = curpos - state->word; \
123 if (endptr != NULL) \
124 *endptr = state->prsbuf; \
125 return true; \
126} while(0)
127
128
129/*
 * State codes used in gettoken_tsvector.
 *
 * The per-state notes summarize the tests visible in that function's
 * branches; they are a reading aid, not a specification.
 */
130#define WAITWORD 1 /* looking for the first character of a token */
131#define WAITENDWORD 2 /* copying a word until space, end, ':' or a delimiter */
132#define WAITNEXTCHAR 3 /* next character is copied as-is; end of input is an error */
133#define WAITENDCMPLX 4 /* copying text while watching for ' or \ (non-web mode) */
134#define WAITPOSINFO 5 /* checking whether ':' follows the word */
135#define INPOSINFO 6 /* expecting a digit that starts a position number */
136#define WAITPOSDELIM 7 /* after a position: ',', a weight letter, space or end */
137#define WAITCHARCMPLX 8 /* a ' here is copied into the word; anything else ends it */
138
/*
 * Expands to a 'return' statement, so using it leaves the enclosing function
 * with prssyntaxerror's result.  Requires a variable named 'state' in scope.
 */
139#define PRSSYNTAXERROR return prssyntaxerror(state)
140
/*
 * Report a syntax error via errsave() on state->escontext, quoting the whole
 * input string (state->bufstart).  The message names "tsquery" or "tsvector"
 * according to state->is_tsquery.  When errsave() returns, the result is
 * always false.
 *
 * NOTE(review): the signature line and one argument line of the errsave()
 * call are not visible in this listing.
 */
141static bool
143{
144 errsave(state->escontext,
146 state->is_tsquery ?
147 errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
148 errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
149 /* In soft error situation, return false as convenience for caller */
150 return false;
151}
152
153
154/*
155 * Get next token from string being parsed. Returns true if successful,
156 * false if end of input string is reached or soft error.
157 *
158 * On success, these output parameters are filled in:
159 *
160 * *strval pointer to token
161 * *lenval length of *strval
162 * *pos_ptr pointer to a palloc'd array of positions and weights
163 * associated with the token. If the caller is not interested
164 * in the information, NULL can be supplied. Otherwise
165 * the caller is responsible for pfreeing the array.
166 * *poslen number of elements in *pos_ptr
167 * *endptr scan resumption point
168 *
169 * Pass NULL for any unwanted output parameters.
170 *
171 * If state->escontext is an ErrorSaveContext, then caller must check
172 * SOFT_ERROR_OCCURRED() to determine whether a "false" result means
173 * error or normal end-of-string.
174 */
175bool
177 char **strval, int *lenval,
178 WordEntryPos **pos_ptr, int *poslen,
179 char **endptr)
180{
/*
 * NOTE(review): several lines of this function (mostly state transitions
 * and macro invocations) are not visible in this listing; the comments added
 * below describe only the statements that are shown.
 */
/* oldstate is asserted non-zero in the WAITNEXTCHAR branch. */
181 int oldstate = 0;
/* Write position; every call starts filling the word buffer from its start. */
182 char *curpos = state->word;
183 int statecode = WAITWORD;
184
185 /*
186 * pos is for collecting the comma delimited list of positions followed by
187 * the actual token.
188 */
189 WordEntryPos *pos = NULL;
190 int npos = 0; /* elements of pos used */
191 int posalen = 0; /* allocated size of pos */
192
193 while (1)
194 {
195 if (statecode == WAITWORD)
196 {
/* End of input before any token starts: report "no more tokens". */
197 if (*(state->prsbuf) == '\0')
198 return false;
199 else if (!state->is_web && t_iseq(state->prsbuf, '\''))
201 else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
202 {
205 }
206 else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
207 (state->is_web && t_iseq(state->prsbuf, '"')))
209 else if (!isspace((unsigned char) *state->prsbuf))
210 {
/* First character of a word: copy it (possibly multibyte) into the buffer. */
211 curpos += ts_copychar_cstr(curpos, state->prsbuf);
213 }
214 }
215 else if (statecode == WAITNEXTCHAR)
216 {
217 if (*(state->prsbuf) == '\0')
218 ereturn(state->escontext, false,
220 errmsg("there is no escaped character: \"%s\"",
221 state->bufstart)));
222 else
223 {
/* Copy the character unconditionally, whatever it is. */
225 curpos += ts_copychar_cstr(curpos, state->prsbuf);
226 Assert(oldstate != 0);
228 }
229 }
230 else if (statecode == WAITENDWORD)
231 {
232 if (!state->is_web && t_iseq(state->prsbuf, '\\'))
233 {
236 }
237 else if (isspace((unsigned char) *state->prsbuf) || *(state->prsbuf) == '\0' ||
238 (state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
239 (state->is_web && t_iseq(state->prsbuf, '"')))
240 {
/* Word ends at whitespace, end of input, or a mode-specific delimiter. */
242 if (curpos == state->word)
244 *(curpos) = '\0';
246 }
247 else if (t_iseq(state->prsbuf, ':'))
248 {
/* ':' also ends the word; the branches differ on state->oprisdelim. */
249 if (curpos == state->word)
251 *(curpos) = '\0';
252 if (state->oprisdelim)
254 else
256 }
257 else
258 {
260 curpos += ts_copychar_cstr(curpos, state->prsbuf);
261 }
262 }
263 else if (statecode == WAITENDCMPLX)
264 {
265 if (!state->is_web && t_iseq(state->prsbuf, '\''))
266 {
268 }
269 else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
270 {
273 }
274 else if (*(state->prsbuf) == '\0')
276 else
277 {
279 curpos += ts_copychar_cstr(curpos, state->prsbuf);
280 }
281 }
282 else if (statecode == WAITCHARCMPLX)
283 {
284 if (!state->is_web && t_iseq(state->prsbuf, '\''))
285 {
/* A single quote seen in this state is stored as part of the word. */
287 curpos += ts_copychar_cstr(curpos, state->prsbuf);
289 }
290 else
291 {
/* Anything else terminates the word without consuming the character. */
293 *(curpos) = '\0';
294 if (curpos == state->word)
296 if (state->oprisdelim)
297 {
298 /* state->prsbuf+=pg_mblen_cstr(state->prsbuf); */
300 }
301 else
303 continue; /* recheck current character */
304 }
305 }
306 else if (statecode == WAITPOSINFO)
307 {
308 if (t_iseq(state->prsbuf, ':'))
310 else
312 }
313 else if (statecode == INPOSINFO)
314 {
315 if (isdigit((unsigned char) *state->prsbuf))
316 {
/* The pos array starts with room for 4 entries and doubles when nearly full. */
317 if (posalen == 0)
318 {
319 posalen = 4;
321 npos = 0;
322 }
323 else if (npos + 1 >= posalen)
324 {
325 posalen *= 2;
327 }
328 npos++;
/* atoi() reads the digit run starting here; the value goes through LIMITPOS. */
329 WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
330 /* we cannot get here in tsquery, so no need for 2 errmsgs */
331 if (WEP_GETPOS(pos[npos - 1]) == 0)
332 ereturn(state->escontext, false,
334 errmsg("wrong position info in tsvector: \"%s\"",
335 state->bufstart)));
/* A new position starts with weight 0; a weight letter may change it later. */
336 WEP_SETWEIGHT(pos[npos - 1], 0);
338 }
339 else
341 }
342 else if (statecode == WAITPOSDELIM)
343 {
/*
 * Weight letters map to: A (or '*') = 3, B = 2, C = 1, D = 0.  Each
 * letter branch first tests whether the position already has a non-zero
 * weight; the action taken in that case is not visible in this listing.
 */
344 if (t_iseq(state->prsbuf, ','))
346 else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
347 {
348 if (WEP_GETWEIGHT(pos[npos - 1]))
350 WEP_SETWEIGHT(pos[npos - 1], 3);
351 }
352 else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
353 {
354 if (WEP_GETWEIGHT(pos[npos - 1]))
356 WEP_SETWEIGHT(pos[npos - 1], 2);
357 }
358 else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
359 {
360 if (WEP_GETWEIGHT(pos[npos - 1]))
362 WEP_SETWEIGHT(pos[npos - 1], 1);
363 }
364 else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
365 {
366 if (WEP_GETWEIGHT(pos[npos - 1]))
368 WEP_SETWEIGHT(pos[npos - 1], 0);
369 }
370 else if (isspace((unsigned char) *state->prsbuf) ||
371 *(state->prsbuf) == '\0')
373 else if (!isdigit((unsigned char) *state->prsbuf))
375 }
376 else /* internal error */
377 elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
378 statecode);
379
/* Advance by one whole (possibly multibyte) character, not one byte. */
380 /* get next char */
381 state->prsbuf += pg_mblen_cstr(state->prsbuf);
382 }
383}
#define Assert(condition)
Definition c.h:885
int errcode(int sqlerrcode)
Definition elog.c:874
int errmsg(const char *fmt,...)
Definition elog.c:1093
#define ereturn(context, dummy_value,...)
Definition elog.h:278
#define errsave(context,...)
Definition elog.h:262
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define palloc_object(type)
Definition fe_memutils.h:74
#define repalloc_array(pointer, type, count)
Definition fe_memutils.h:78
#define palloc_array(type, count)
Definition fe_memutils.h:76
FILE * input
#define ISOPERATOR(x)
Definition ltree.h:167
int pg_mblen_cstr(const char *mbstr)
Definition mbutils.c:1045
int pg_database_encoding_max_length(void)
Definition mbutils.c:1674
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
static int fb(int x)
Definition nodes.h:135
static int ts_copychar_cstr(void *dest, const void *src)
Definition ts_locale.h:50
#define t_iseq(x, c)
Definition ts_locale.h:38
#define WEP_GETPOS(x)
Definition ts_type.h:80
#define WEP_SETPOS(x, v)
Definition ts_type.h:83
uint16 WordEntryPos
Definition ts_type.h:63
#define WEP_SETWEIGHT(x, v)
Definition ts_type.h:82
#define LIMITPOS(x)
Definition ts_type.h:87
#define WEP_GETWEIGHT(x)
Definition ts_type.h:79
#define P_TSV_IS_TSQUERY
Definition ts_utils.h:30
#define P_TSV_IS_WEB
Definition ts_utils.h:31
#define P_TSV_OPR_IS_DELIM
Definition ts_utils.h:29
void reset_tsvector_parser(TSVectorParseState state, char *input)
#define WAITNEXTCHAR
#define PRSSYNTAXERROR
void close_tsvector_parser(TSVectorParseState state)
#define WAITENDCMPLX
#define WAITENDWORD
#define WAITCHARCMPLX
#define WAITPOSINFO
static bool prssyntaxerror(TSVectorParseState state)
#define WAITWORD
#define INPOSINFO
#define RESIZEPRSBUF
bool gettoken_tsvector(TSVectorParseState state, char **strval, int *lenval, WordEntryPos **pos_ptr, int *poslen, char **endptr)
#define RETURN_TOKEN
TSVectorParseState init_tsvector_parser(char *input, int flags, Node *escontext)
#define WAITPOSDELIM