PostgreSQL Source Code git master
tsquery_cleanup.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * tsquery_cleanup.c
4 * Cleanup query from NOT values and/or stopword
5 * Utility functions to correct work.
6 *
7 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/utils/adt/tsquery_cleanup.c
12 *
13 *-------------------------------------------------------------------------
14 */
15
16#include "postgres.h"
17
18#include "miscadmin.h"
19#include "tsearch/ts_utils.h"
20#include "varatt.h"
21
22typedef struct NODE
23{
24 struct NODE *left;
25 struct NODE *right;
28
29/*
30 * make query tree from plain view of query
31 */
32static NODE *
34{
35 NODE *node = (NODE *) palloc(sizeof(NODE));
36
37 /* since this function recurses, it could be driven to stack overflow. */
39
40 node->valnode = in;
41 node->right = node->left = NULL;
42 if (in->type == QI_OPR)
43 {
44 node->right = maketree(in + 1);
45 if (in->qoperator.oper != OP_NOT)
46 node->left = maketree(in + in->qoperator.left);
47 }
48 return node;
49}
50
51/*
52 * Internal state for plaintree and plainnode
53 */
54typedef struct
55{
57 int len; /* allocated size of ptr */
58 int cur; /* number of elements in ptr */
59} PLAINTREE;
60
61static void
63{
64 /* since this function recurses, it could be driven to stack overflow. */
66
67 if (state->cur == state->len)
68 {
69 state->len *= 2;
70 state->ptr = (QueryItem *) repalloc(state->ptr, state->len * sizeof(QueryItem));
71 }
72 memcpy(&(state->ptr[state->cur]), node->valnode, sizeof(QueryItem));
73 if (node->valnode->type == QI_VAL)
74 state->cur++;
75 else if (node->valnode->qoperator.oper == OP_NOT)
76 {
77 state->ptr[state->cur].qoperator.left = 1;
78 state->cur++;
79 plainnode(state, node->right);
80 }
81 else
82 {
83 int cur = state->cur;
84
85 state->cur++;
86 plainnode(state, node->right);
87 state->ptr[cur].qoperator.left = state->cur - cur;
88 plainnode(state, node->left);
89 }
90 pfree(node);
91}
92
93/*
94 * make plain view of tree from a NODE-tree representation
95 */
96static QueryItem *
98{
99 PLAINTREE pl;
100
101 pl.cur = 0;
102 pl.len = 16;
103 if (root && (root->valnode->type == QI_VAL || root->valnode->type == QI_OPR))
104 {
105 pl.ptr = (QueryItem *) palloc(pl.len * sizeof(QueryItem));
106 plainnode(&pl, root);
107 }
108 else
109 pl.ptr = NULL;
110 *len = pl.cur;
111 return pl.ptr;
112}
113
114static void
116{
117 /* since this function recurses, it could be driven to stack overflow. */
119
120 if (!node)
121 return;
122 if (node->left)
123 freetree(node->left);
124 if (node->right)
125 freetree(node->right);
126 pfree(node);
127}
128
129/*
130 * clean tree for ! operator.
131 * It's useful for debug, but in
132 * other case, such view is used with search in index.
133 * Operator ! always return TRUE
134 */
135static NODE *
137{
138 /* since this function recurses, it could be driven to stack overflow. */
140
141 if (node->valnode->type == QI_VAL)
142 return node;
143
144 if (node->valnode->qoperator.oper == OP_NOT)
145 {
146 freetree(node);
147 return NULL;
148 }
149
150 /* operator & or | */
151 if (node->valnode->qoperator.oper == OP_OR)
152 {
153 if ((node->left = clean_NOT_intree(node->left)) == NULL ||
154 (node->right = clean_NOT_intree(node->right)) == NULL)
155 {
156 freetree(node);
157 return NULL;
158 }
159 }
160 else
161 {
162 NODE *res = node;
163
164 Assert(node->valnode->qoperator.oper == OP_AND ||
165 node->valnode->qoperator.oper == OP_PHRASE);
166
167 node->left = clean_NOT_intree(node->left);
168 node->right = clean_NOT_intree(node->right);
169 if (node->left == NULL && node->right == NULL)
170 {
171 pfree(node);
172 res = NULL;
173 }
174 else if (node->left == NULL)
175 {
176 res = node->right;
177 pfree(node);
178 }
179 else if (node->right == NULL)
180 {
181 res = node->left;
182 pfree(node);
183 }
184 return res;
185 }
186 return node;
187}
188
189QueryItem *
191{
192 NODE *root = maketree(ptr);
193
195}
196
197
198/*
199 * Remove QI_VALSTOP (stopword) nodes from query tree.
200 *
201 * Returns NULL if the query degenerates to nothing. Input must not be NULL.
202 *
203 * When we remove a phrase operator due to removing one or both of its
204 * arguments, we might need to adjust the distance of a parent phrase
205 * operator. For example, 'a' is a stopword, so:
206 * (b <-> a) <-> c should become b <2> c
207 * b <-> (a <-> c) should become b <2> c
208 * (b <-> (a <-> a)) <-> c should become b <3> c
209 * b <-> ((a <-> a) <-> c) should become b <3> c
210 * To handle that, we define two output parameters:
211 * ladd: amount to add to a phrase distance to the left of this node
212 * radd: amount to add to a phrase distance to the right of this node
213 * We need two outputs because we could need to bubble up adjustments to two
214 * different parent phrase operators. Consider
215 * w <-> (((a <-> x) <2> (y <3> a)) <-> z)
216 * After we've removed the two a's and are considering the <2> node (which is
217 * now just x <2> y), we have an ladd distance of 1 that needs to propagate
218 * up to the topmost (leftmost) <->, and an radd distance of 3 that needs to
219 * propagate to the rightmost <->, so that we'll end up with
220 * w <2> ((x <2> y) <4> z)
221 * Near the bottom of the tree, we may have subtrees consisting only of
222 * stopwords. The distances of any phrase operators within such a subtree are
223 * summed and propagated to both ladd and radd, since we don't know which side
224 * of the lowest surviving phrase operator we are in. The rule is that any
225 * subtree that degenerates to NULL must return equal values of ladd and radd,
226 * and the parent node dealing with it should incorporate only one of those.
227 *
228 * Currently, we only implement this adjustment for adjacent phrase operators.
229 * Thus for example 'x <-> ((a <-> y) | z)' will become 'x <-> (y | z)', which
230 * isn't ideal, but there is no way to represent the really desired semantics
231 * without some redesign of the tsquery structure. Certainly it would not be
232 * any better to convert that to 'x <2> (y | z)'. Since this is such a weird
233 * corner case, let it go for now. But we can fix it in cases where the
234 * intervening non-phrase operator also gets removed, for example
235 * '((x <-> a) | a) <-> y' will become 'x <2> y'.
236 */
237static NODE *
238clean_stopword_intree(NODE *node, int *ladd, int *radd)
239{
240 /* since this function recurses, it could be driven to stack overflow. */
242
243 /* default output parameters indicate no change in parent distance */
244 *ladd = *radd = 0;
245
246 if (node->valnode->type == QI_VAL)
247 return node;
248 else if (node->valnode->type == QI_VALSTOP)
249 {
250 pfree(node);
251 return NULL;
252 }
253
254 Assert(node->valnode->type == QI_OPR);
255
256 if (node->valnode->qoperator.oper == OP_NOT)
257 {
258 /* NOT doesn't change pattern width, so just report child distances */
259 node->right = clean_stopword_intree(node->right, ladd, radd);
260 if (!node->right)
261 {
262 freetree(node);
263 return NULL;
264 }
265 }
266 else
267 {
268 NODE *res = node;
269 bool isphrase;
270 int ndistance,
271 lladd,
272 lradd,
273 rladd,
274 rradd;
275
276 /* First, recurse */
277 node->left = clean_stopword_intree(node->left, &lladd, &lradd);
278 node->right = clean_stopword_intree(node->right, &rladd, &rradd);
279
280 /* Check if current node is OP_PHRASE, get its distance */
281 isphrase = (node->valnode->qoperator.oper == OP_PHRASE);
282 ndistance = isphrase ? node->valnode->qoperator.distance : 0;
283
284 if (node->left == NULL && node->right == NULL)
285 {
286 /*
287 * When we collapse out a phrase node entirely, propagate its own
288 * distance into both *ladd and *radd; it is the responsibility of
289 * the parent node to count it only once. Also, for a phrase
290 * node, distances coming from children are summed and propagated
291 * up to parent (we assume lladd == lradd and rladd == rradd, else
292 * rule was broken at a lower level). But if this isn't a phrase
293 * node, take the larger of the two child distances; that
294 * corresponds to what TS_execute will do in non-stopword cases.
295 */
296 if (isphrase)
297 *ladd = *radd = lladd + ndistance + rladd;
298 else
299 *ladd = *radd = Max(lladd, rladd);
300 freetree(node);
301 return NULL;
302 }
303 else if (node->left == NULL)
304 {
305 /* Removing this operator and left subnode */
306 /* lladd and lradd are equal/redundant, don't count both */
307 if (isphrase)
308 {
309 /* operator's own distance must propagate to left */
310 *ladd = lladd + ndistance + rladd;
311 *radd = rradd;
312 }
313 else
314 {
315 /* at non-phrase op, just forget the left subnode entirely */
316 *ladd = rladd;
317 *radd = rradd;
318 }
319 res = node->right;
320 pfree(node);
321 }
322 else if (node->right == NULL)
323 {
324 /* Removing this operator and right subnode */
325 /* rladd and rradd are equal/redundant, don't count both */
326 if (isphrase)
327 {
328 /* operator's own distance must propagate to right */
329 *ladd = lladd;
330 *radd = lradd + ndistance + rradd;
331 }
332 else
333 {
334 /* at non-phrase op, just forget the right subnode entirely */
335 *ladd = lladd;
336 *radd = lradd;
337 }
338 res = node->left;
339 pfree(node);
340 }
341 else if (isphrase)
342 {
343 /* Absorb appropriate corrections at this level */
344 node->valnode->qoperator.distance += lradd + rladd;
345 /* Propagate up any unaccounted-for corrections */
346 *ladd = lladd;
347 *radd = rradd;
348 }
349 else
350 {
351 /* We're keeping a non-phrase operator, so ladd/radd remain 0 */
352 }
353
354 return res;
355 }
356 return node;
357}
358
359/*
360 * Number of elements in query tree
361 */
362static int32
364{
365 int32 size = 0;
366
367 if (node->valnode->type == QI_VAL)
368 {
369 size = node->valnode->qoperand.length + 1;
370 }
371 else
372 {
373 Assert(node->valnode->type == QI_OPR);
374
375 size = calcstrlen(node->right);
376 if (node->valnode->qoperator.oper != OP_NOT)
377 size += calcstrlen(node->left);
378 }
379
380 return size;
381}
382
383/*
384 * Remove QI_VALSTOP (stopword) nodes from TSQuery.
385 */
388{
389 int32 len,
390 lenstr,
391 commonlen,
392 i;
393 NODE *root;
394 int ladd,
395 radd;
396 TSQuery out;
398 char *operands;
399
400 if (in->size == 0)
401 return in;
402
403 /* eliminate stop words */
404 root = clean_stopword_intree(maketree(GETQUERY(in)), &ladd, &radd);
405 if (root == NULL)
406 {
407 if (noisy)
409 (errmsg("text-search query contains only stop words or doesn't contain lexemes, ignored")));
410 out = palloc(HDRSIZETQ);
411 out->size = 0;
413 return out;
414 }
415
416 /*
417 * Build TSQuery from plain view
418 */
419
420 lenstr = calcstrlen(root);
421 items = plaintree(root, &len);
422 commonlen = COMPUTESIZE(len, lenstr);
423
424 out = palloc(commonlen);
425 SET_VARSIZE(out, commonlen);
426 out->size = len;
427
428 memcpy(GETQUERY(out), items, len * sizeof(QueryItem));
429
430 items = GETQUERY(out);
431 operands = GETOPERAND(out);
432 for (i = 0; i < out->size; i++)
433 {
434 QueryOperand *op = (QueryOperand *) &items[i];
435
436 if (op->type != QI_VAL)
437 continue;
438
439 memcpy(operands, GETOPERAND(in) + op->distance, op->length);
440 operands[op->length] = '\0';
441 op->distance = operands - GETOPERAND(out);
442 operands += op->length + 1;
443 }
444
445 return out;
446}
#define COMPUTESIZE(size)
Definition: _int.h:155
#define GETQUERY(x)
Definition: _int.h:157
#define Max(x, y)
Definition: c.h:969
int32_t int32
Definition: c.h:498
struct cursor * cur
Definition: ecpg.c:29
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define NOTICE
Definition: elog.h:35
#define ereport(elevel,...)
Definition: elog.h:149
Assert(PointerIsAligned(start, uint64))
int i
Definition: isn.c:74
#define GETOPERAND(x)
Definition: ltree.h:165
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1544
void pfree(void *pointer)
Definition: mcxt.c:1524
void * palloc(Size size)
Definition: mcxt.c:1317
const void size_t len
tree ctl root
Definition: radixtree.h:1857
void check_stack_depth(void)
Definition: stack_depth.c:95
Definition: _int_bool.c:26
struct NODE * right
QueryItem * valnode
struct NODE * left
QueryItem * ptr
QueryItemType type
Definition: ts_type.h:158
uint32 distance
Definition: ts_type.h:172
uint32 length
Definition: ts_type.h:171
int16 distance
Definition: ts_type.h:196
uint32 left
Definition: ts_type.h:197
int32 size
Definition: ts_type.h:221
Definition: regguts.h:323
static ItemArray items
Definition: test_tidstore.c:48
#define HDRSIZETQ
Definition: ts_type.h:227
#define QI_OPR
Definition: ts_type.h:150
#define QI_VAL
Definition: ts_type.h:149
#define QI_VALSTOP
Definition: ts_type.h:151
#define OP_AND
Definition: ts_type.h:180
#define OP_PHRASE
Definition: ts_type.h:182
#define OP_OR
Definition: ts_type.h:181
#define OP_NOT
Definition: ts_type.h:179
static void freetree(NODE *node)
static NODE * clean_NOT_intree(NODE *node)
static NODE * maketree(QueryItem *in)
TSQuery cleanup_tsquery_stopwords(TSQuery in, bool noisy)
static QueryItem * plaintree(NODE *root, int *len)
static void plainnode(PLAINTREE *state, NODE *node)
struct NODE NODE
QueryItem * clean_NOT(QueryItem *ptr, int *len)
static int32 calcstrlen(NODE *node)
static NODE * clean_stopword_intree(NODE *node, int *ladd, int *radd)
QueryOperator qoperator
Definition: ts_type.h:209
QueryOperand qoperand
Definition: ts_type.h:210
QueryItemType type
Definition: ts_type.h:208
#define SET_VARSIZE(PTR, len)
Definition: varatt.h:305