PostgreSQL Source Code  git master
tsquery_cleanup.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * tsquery_cleanup.c
4  * Cleanup query from NOT values and/or stopword
5  * Utility functions to correct work.
6  *
7  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/utils/adt/tsquery_cleanup.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 #include "postgres.h"
17 
18 #include "miscadmin.h"
19 #include "tsearch/ts_utils.h"
20 #include "varatt.h"
21 
22 typedef struct NODE
23 {
24  struct NODE *left;
25  struct NODE *right;
27 } NODE;
28 
29 /*
30  * make query tree from plain view of query
31  */
32 static NODE *
34 {
35  NODE *node = (NODE *) palloc(sizeof(NODE));
36 
37  /* since this function recurses, it could be driven to stack overflow. */
39 
40  node->valnode = in;
41  node->right = node->left = NULL;
42  if (in->type == QI_OPR)
43  {
44  node->right = maketree(in + 1);
45  if (in->qoperator.oper != OP_NOT)
46  node->left = maketree(in + in->qoperator.left);
47  }
48  return node;
49 }
50 
51 /*
52  * Internal state for plaintree and plainnode
53  */
54 typedef struct
55 {
57  int len; /* allocated size of ptr */
58  int cur; /* number of elements in ptr */
59 } PLAINTREE;
60 
61 static void
63 {
64  /* since this function recurses, it could be driven to stack overflow. */
66 
67  if (state->cur == state->len)
68  {
69  state->len *= 2;
70  state->ptr = (QueryItem *) repalloc(state->ptr, state->len * sizeof(QueryItem));
71  }
72  memcpy(&(state->ptr[state->cur]), node->valnode, sizeof(QueryItem));
73  if (node->valnode->type == QI_VAL)
74  state->cur++;
75  else if (node->valnode->qoperator.oper == OP_NOT)
76  {
77  state->ptr[state->cur].qoperator.left = 1;
78  state->cur++;
79  plainnode(state, node->right);
80  }
81  else
82  {
83  int cur = state->cur;
84 
85  state->cur++;
86  plainnode(state, node->right);
87  state->ptr[cur].qoperator.left = state->cur - cur;
88  plainnode(state, node->left);
89  }
90  pfree(node);
91 }
92 
93 /*
94  * make plain view of tree from a NODE-tree representation
95  */
96 static QueryItem *
97 plaintree(NODE *root, int *len)
98 {
99  PLAINTREE pl;
100 
101  pl.cur = 0;
102  pl.len = 16;
103  if (root && (root->valnode->type == QI_VAL || root->valnode->type == QI_OPR))
104  {
105  pl.ptr = (QueryItem *) palloc(pl.len * sizeof(QueryItem));
106  plainnode(&pl, root);
107  }
108  else
109  pl.ptr = NULL;
110  *len = pl.cur;
111  return pl.ptr;
112 }
113 
114 static void
116 {
117  /* since this function recurses, it could be driven to stack overflow. */
119 
120  if (!node)
121  return;
122  if (node->left)
123  freetree(node->left);
124  if (node->right)
125  freetree(node->right);
126  pfree(node);
127 }
128 
129 /*
130  * clean tree for ! operator.
131  * It's useful for debug, but in
132  * other case, such view is used with search in index.
133  * Operator ! always return TRUE
134  */
135 static NODE *
137 {
138  /* since this function recurses, it could be driven to stack overflow. */
140 
141  if (node->valnode->type == QI_VAL)
142  return node;
143 
144  if (node->valnode->qoperator.oper == OP_NOT)
145  {
146  freetree(node);
147  return NULL;
148  }
149 
150  /* operator & or | */
151  if (node->valnode->qoperator.oper == OP_OR)
152  {
153  if ((node->left = clean_NOT_intree(node->left)) == NULL ||
154  (node->right = clean_NOT_intree(node->right)) == NULL)
155  {
156  freetree(node);
157  return NULL;
158  }
159  }
160  else
161  {
162  NODE *res = node;
163 
164  Assert(node->valnode->qoperator.oper == OP_AND ||
165  node->valnode->qoperator.oper == OP_PHRASE);
166 
167  node->left = clean_NOT_intree(node->left);
168  node->right = clean_NOT_intree(node->right);
169  if (node->left == NULL && node->right == NULL)
170  {
171  pfree(node);
172  res = NULL;
173  }
174  else if (node->left == NULL)
175  {
176  res = node->right;
177  pfree(node);
178  }
179  else if (node->right == NULL)
180  {
181  res = node->left;
182  pfree(node);
183  }
184  return res;
185  }
186  return node;
187 }
188 
189 QueryItem *
191 {
192  NODE *root = maketree(ptr);
193 
194  return plaintree(clean_NOT_intree(root), len);
195 }
196 
197 
198 /*
199  * Remove QI_VALSTOP (stopword) nodes from query tree.
200  *
201  * Returns NULL if the query degenerates to nothing. Input must not be NULL.
202  *
203  * When we remove a phrase operator due to removing one or both of its
204  * arguments, we might need to adjust the distance of a parent phrase
205  * operator. For example, 'a' is a stopword, so:
206  * (b <-> a) <-> c should become b <2> c
207  * b <-> (a <-> c) should become b <2> c
208  * (b <-> (a <-> a)) <-> c should become b <3> c
209  * b <-> ((a <-> a) <-> c) should become b <3> c
210  * To handle that, we define two output parameters:
211  * ladd: amount to add to a phrase distance to the left of this node
212  * radd: amount to add to a phrase distance to the right of this node
213  * We need two outputs because we could need to bubble up adjustments to two
214  * different parent phrase operators. Consider
215  * w <-> (((a <-> x) <2> (y <3> a)) <-> z)
216  * After we've removed the two a's and are considering the <2> node (which is
217  * now just x <2> y), we have an ladd distance of 1 that needs to propagate
218  * up to the topmost (leftmost) <->, and an radd distance of 3 that needs to
219  * propagate to the rightmost <->, so that we'll end up with
220  * w <2> ((x <2> y) <4> z)
221  * Near the bottom of the tree, we may have subtrees consisting only of
222  * stopwords. The distances of any phrase operators within such a subtree are
223  * summed and propagated to both ladd and radd, since we don't know which side
224  * of the lowest surviving phrase operator we are in. The rule is that any
225  * subtree that degenerates to NULL must return equal values of ladd and radd,
226  * and the parent node dealing with it should incorporate only one of those.
227  *
228  * Currently, we only implement this adjustment for adjacent phrase operators.
229  * Thus for example 'x <-> ((a <-> y) | z)' will become 'x <-> (y | z)', which
230  * isn't ideal, but there is no way to represent the really desired semantics
231  * without some redesign of the tsquery structure. Certainly it would not be
232  * any better to convert that to 'x <2> (y | z)'. Since this is such a weird
233  * corner case, let it go for now. But we can fix it in cases where the
234  * intervening non-phrase operator also gets removed, for example
235  * '((x <-> a) | a) <-> y' will become 'x <2> y'.
236  */
237 static NODE *
238 clean_stopword_intree(NODE *node, int *ladd, int *radd)
239 {
240  /* since this function recurses, it could be driven to stack overflow. */
242 
243  /* default output parameters indicate no change in parent distance */
244  *ladd = *radd = 0;
245 
246  if (node->valnode->type == QI_VAL)
247  return node;
248  else if (node->valnode->type == QI_VALSTOP)
249  {
250  pfree(node);
251  return NULL;
252  }
253 
254  Assert(node->valnode->type == QI_OPR);
255 
256  if (node->valnode->qoperator.oper == OP_NOT)
257  {
258  /* NOT doesn't change pattern width, so just report child distances */
259  node->right = clean_stopword_intree(node->right, ladd, radd);
260  if (!node->right)
261  {
262  freetree(node);
263  return NULL;
264  }
265  }
266  else
267  {
268  NODE *res = node;
269  bool isphrase;
270  int ndistance,
271  lladd,
272  lradd,
273  rladd,
274  rradd;
275 
276  /* First, recurse */
277  node->left = clean_stopword_intree(node->left, &lladd, &lradd);
278  node->right = clean_stopword_intree(node->right, &rladd, &rradd);
279 
280  /* Check if current node is OP_PHRASE, get its distance */
281  isphrase = (node->valnode->qoperator.oper == OP_PHRASE);
282  ndistance = isphrase ? node->valnode->qoperator.distance : 0;
283 
284  if (node->left == NULL && node->right == NULL)
285  {
286  /*
287  * When we collapse out a phrase node entirely, propagate its own
288  * distance into both *ladd and *radd; it is the responsibility of
289  * the parent node to count it only once. Also, for a phrase
290  * node, distances coming from children are summed and propagated
291  * up to parent (we assume lladd == lradd and rladd == rradd, else
292  * rule was broken at a lower level). But if this isn't a phrase
293  * node, take the larger of the two child distances; that
294  * corresponds to what TS_execute will do in non-stopword cases.
295  */
296  if (isphrase)
297  *ladd = *radd = lladd + ndistance + rladd;
298  else
299  *ladd = *radd = Max(lladd, rladd);
300  freetree(node);
301  return NULL;
302  }
303  else if (node->left == NULL)
304  {
305  /* Removing this operator and left subnode */
306  /* lladd and lradd are equal/redundant, don't count both */
307  if (isphrase)
308  {
309  /* operator's own distance must propagate to left */
310  *ladd = lladd + ndistance + rladd;
311  *radd = rradd;
312  }
313  else
314  {
315  /* at non-phrase op, just forget the left subnode entirely */
316  *ladd = rladd;
317  *radd = rradd;
318  }
319  res = node->right;
320  pfree(node);
321  }
322  else if (node->right == NULL)
323  {
324  /* Removing this operator and right subnode */
325  /* rladd and rradd are equal/redundant, don't count both */
326  if (isphrase)
327  {
328  /* operator's own distance must propagate to right */
329  *ladd = lladd;
330  *radd = lradd + ndistance + rradd;
331  }
332  else
333  {
334  /* at non-phrase op, just forget the right subnode entirely */
335  *ladd = lladd;
336  *radd = lradd;
337  }
338  res = node->left;
339  pfree(node);
340  }
341  else if (isphrase)
342  {
343  /* Absorb appropriate corrections at this level */
344  node->valnode->qoperator.distance += lradd + rladd;
345  /* Propagate up any unaccounted-for corrections */
346  *ladd = lladd;
347  *radd = rradd;
348  }
349  else
350  {
351  /* We're keeping a non-phrase operator, so ladd/radd remain 0 */
352  }
353 
354  return res;
355  }
356  return node;
357 }
358 
359 /*
360  * Number of elements in query tree
361  */
362 static int32
364 {
365  int32 size = 0;
366 
367  if (node->valnode->type == QI_VAL)
368  {
369  size = node->valnode->qoperand.length + 1;
370  }
371  else
372  {
373  Assert(node->valnode->type == QI_OPR);
374 
375  size = calcstrlen(node->right);
376  if (node->valnode->qoperator.oper != OP_NOT)
377  size += calcstrlen(node->left);
378  }
379 
380  return size;
381 }
382 
383 /*
384  * Remove QI_VALSTOP (stopword) nodes from TSQuery.
385  */
386 TSQuery
388 {
389  int32 len,
390  lenstr,
391  commonlen,
392  i;
393  NODE *root;
394  int ladd,
395  radd;
396  TSQuery out;
397  QueryItem *items;
398  char *operands;
399 
400  if (in->size == 0)
401  return in;
402 
403  /* eliminate stop words */
404  root = clean_stopword_intree(maketree(GETQUERY(in)), &ladd, &radd);
405  if (root == NULL)
406  {
407  if (noisy)
408  ereport(NOTICE,
409  (errmsg("text-search query contains only stop words or doesn't contain lexemes, ignored")));
410  out = palloc(HDRSIZETQ);
411  out->size = 0;
412  SET_VARSIZE(out, HDRSIZETQ);
413  return out;
414  }
415 
416  /*
417  * Build TSQuery from plain view
418  */
419 
420  lenstr = calcstrlen(root);
421  items = plaintree(root, &len);
422  commonlen = COMPUTESIZE(len, lenstr);
423 
424  out = palloc(commonlen);
425  SET_VARSIZE(out, commonlen);
426  out->size = len;
427 
428  memcpy(GETQUERY(out), items, len * sizeof(QueryItem));
429 
430  items = GETQUERY(out);
431  operands = GETOPERAND(out);
432  for (i = 0; i < out->size; i++)
433  {
434  QueryOperand *op = (QueryOperand *) &items[i];
435 
436  if (op->type != QI_VAL)
437  continue;
438 
439  memcpy(operands, GETOPERAND(in) + op->distance, op->length);
440  operands[op->length] = '\0';
441  op->distance = operands - GETOPERAND(out);
442  operands += op->length + 1;
443  }
444 
445  return out;
446 }
#define COMPUTESIZE(size)
Definition: _int.h:155
#define GETQUERY(x)
Definition: _int.h:157
signed int int32
Definition: c.h:483
#define Max(x, y)
Definition: c.h:987
struct cursor * cur
Definition: ecpg.c:28
int errmsg(const char *fmt,...)
Definition: elog.c:1069
#define NOTICE
Definition: elog.h:35
#define ereport(elevel,...)
Definition: elog.h:149
int i
Definition: isn.c:73
Assert(fmt[strlen(fmt) - 1] !='\n')
#define GETOPERAND(x)
Definition: ltree.h:165
void pfree(void *pointer)
Definition: mcxt.c:1456
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1476
void * palloc(Size size)
Definition: mcxt.c:1226
const void size_t len
void check_stack_depth(void)
Definition: postgres.c:3520
Definition: _int_bool.c:27
struct NODE * right
QueryItem * valnode
struct NODE * left
QueryItem * ptr
QueryItemType type
Definition: ts_type.h:158
uint32 distance
Definition: ts_type.h:172
uint32 length
Definition: ts_type.h:171
int16 distance
Definition: ts_type.h:196
uint32 left
Definition: ts_type.h:197
int32 size
Definition: ts_type.h:221
Definition: regguts.h:323
#define HDRSIZETQ
Definition: ts_type.h:227
#define QI_OPR
Definition: ts_type.h:150
#define QI_VAL
Definition: ts_type.h:149
#define QI_VALSTOP
Definition: ts_type.h:151
#define OP_AND
Definition: ts_type.h:180
#define OP_PHRASE
Definition: ts_type.h:182
#define OP_OR
Definition: ts_type.h:181
#define OP_NOT
Definition: ts_type.h:179
QueryItem * clean_NOT(QueryItem *ptr, int *len)
static void freetree(NODE *node)
static NODE * clean_stopword_intree(NODE *node, int *ladd, int *radd)
TSQuery cleanup_tsquery_stopwords(TSQuery in, bool noisy)
static NODE * clean_NOT_intree(NODE *node)
static void plainnode(PLAINTREE *state, NODE *node)
static NODE * maketree(QueryItem *in)
static QueryItem * plaintree(NODE *root, int *len)
struct NODE NODE
static int32 calcstrlen(NODE *node)
QueryOperator qoperator
Definition: ts_type.h:209
QueryOperand qoperand
Definition: ts_type.h:210
QueryItemType type
Definition: ts_type.h:208
#define SET_VARSIZE(PTR, len)
Definition: varatt.h:305