PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
to_tsany.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * to_tsany.c
4  * to_ts* function definitions
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  * src/backend/tsearch/to_tsany.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15 
16 #include "tsearch/ts_cache.h"
17 #include "tsearch/ts_utils.h"
18 #include "utils/builtins.h"
19 #include "utils/jsonapi.h"
20 
21 
/*
 * Opaque data that the *_tsquery_byid functions build on their stack and
 * that pushval_morph() receives back as a Datum pointer.
 *
 * NOTE(review): source line 24 is absent from this listing.  The
 * cross-reference index places "Oid cfg_id" there, and the code below reads
 * and writes data.cfg_id -- confirm against the original file.
 */
22 typedef struct MorphOpaque
23 {
25  int qoperator; /* query operator */
26 } MorphOpaque;
27 
/*
 * State handed to add_to_tsvector() while string values of a json(b)
 * document are visited.
 *
 * NOTE(review): source lines 30-32 (the members and the closing brace) are
 * absent from this listing.  The code below assigns state.prs and
 * state.cfgId, and the cross-reference index shows "ParsedText * prs" at
 * line 30 -- confirm the full member list against the original file.
 */
28 typedef struct TSVectorBuildState
29 {
33 
34 static void add_to_tsvector(void *_state, char *elem_value, int elem_len);
35 
36 
/*
 * NOTE(review): source lines 38 (the declarator) and 40 (the only body
 * statement) are absent from this listing.  The cross-reference index names
 * the function defined at line 38 as get_current_ts_config(PG_FUNCTION_ARGS);
 * what it returns cannot be read from the lines shown here.
 */
37 Datum
39 {
41 }
42 
43 /*
44  * to_tsvector
45  */
46 static int
47 compareWORD(const void *a, const void *b)
48 {
49  int res;
50 
51  res = tsCompareString(
52  ((const ParsedWord *) a)->word, ((const ParsedWord *) a)->len,
53  ((const ParsedWord *) b)->word, ((const ParsedWord *) b)->len,
54  false);
55 
56  if (res == 0)
57  {
58  if (((const ParsedWord *) a)->pos.pos == ((const ParsedWord *) b)->pos.pos)
59  return 0;
60 
61  res = (((const ParsedWord *) a)->pos.pos > ((const ParsedWord *) b)->pos.pos) ? 1 : -1;
62  }
63 
64  return res;
65 }
66 
/*
 * uniqueWORD
 *
 * Sort the l entries of array a with compareWORD and collapse every run of
 * entries carrying the same word into a single entry at the front of the
 * array.  Each surviving entry gets a freshly allocated pos.apos array in
 * which apos[0] is the number of stored positions and apos[1..] are the
 * positions themselves, each passed through LIMITPOS.  The word strings of
 * merged duplicates are pfree'd.
 *
 * Returns the number of distinct entries left in a[].
 *
 * NOTE(review): the declarator on source line 68 is absent from this
 * listing; the cross-reference index gives it as
 * uniqueWORD(ParsedWord *a, int32 l).
 */
67 static int
69 {
70  ParsedWord *ptr,
71  *res;
72  int tmppos;
73 
/* A single entry needs no sorting or merging: only build its apos array. */
74  if (l == 1)
75  {
76  tmppos = LIMITPOS(a->pos.pos);
77  a->alen = 2;
78  a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
79  a->pos.apos[0] = 1;
80  a->pos.apos[1] = tmppos;
81  return l;
82  }
83 
/* res trails as the last entry kept so far; ptr scans the remaining input. */
84  res = a;
85  ptr = a + 1;
86 
87  /*
88  * Sort words with its positions
89  */
90  qsort((void *) a, l, sizeof(ParsedWord), compareWORD);
91 
92  /*
93  * Initialize first word and its first position
94  */
95  tmppos = LIMITPOS(a->pos.pos);
96  a->alen = 2;
97  a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
98  a->pos.apos[0] = 1;
99  a->pos.apos[1] = tmppos;
100 
101  /*
102  * Summarize position information for each word
103  */
104  while (ptr - a < l)
105  {
106  if (!(ptr->len == res->len &&
107  strncmp(ptr->word, res->word, res->len) == 0))
108  {
109  /*
110  * Got a new word, so put it in result
111  */
112  res++;
113  res->len = ptr->len;
114  res->word = ptr->word;
115  tmppos = LIMITPOS(ptr->pos.pos);
116  res->alen = 2;
117  res->pos.apos = (uint16 *) palloc(sizeof(uint16) * res->alen);
118  res->pos.apos[0] = 1;
119  res->pos.apos[1] = tmppos;
120  }
121  else
122  {
123  /*
124  * The word already exists, so adjust position information. But
125  * before we should check size of position's array, max allowed
126  * value for position and uniqueness of position
127  */
/* The duplicate's string is no longer needed; res keeps its own copy. */
128  pfree(ptr->word);
129  if (res->pos.apos[0] < MAXNUMPOS - 1 && res->pos.apos[res->pos.apos[0]] != MAXENTRYPOS - 1 &&
130  res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos))
131  {
/* Double the array when the next slot (index apos[0] + 1) would not fit. */
132  if (res->pos.apos[0] + 1 >= res->alen)
133  {
134  res->alen *= 2;
135  res->pos.apos = (uint16 *) repalloc(res->pos.apos, sizeof(uint16) * res->alen);
136  }
/*
 * NOTE(review): this test looks redundant -- apos[0] is initialised to 1
 * and only incremented, and the enclosing condition has already rejected a
 * position equal to the last stored one.
 */
137  if (res->pos.apos[0] == 0 || res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos))
138  {
139  res->pos.apos[res->pos.apos[0] + 1] = LIMITPOS(ptr->pos.pos);
140  res->pos.apos[0]++;
141  }
142  }
143  }
144  ptr++;
145  }
146 
/* res points at the last kept entry, so the count is its index plus one. */
147  return res + 1 - a;
148 }
149 
150 /*
151  * make value of tsvector, given parsed text
152  *
153  * Note: frees prs->words and subsidiary data.
 *
 * Steps: merge duplicate words with uniqueWORD(); add up the bytes needed
 * for the lexemes and their position arrays, raising
 * ERRCODE_PROGRAM_LIMIT_EXCEEDED when that exceeds MAXSTRPOS; allocate a
 * zero-filled result sized by CALCDATASIZE; then copy each lexeme and its
 * positions into it, every position with weight 0.
 *
 * Returns the newly allocated TSVector.
 *
 * NOTE(review): the declarator on source line 156 is absent from this
 * listing; the cross-reference index gives it as
 * make_tsvector(ParsedText *prs).
154  */
155 TSVector
157 {
158  int i,
159  j,
160  lenstr = 0,
161  totallen;
162  TSVector in;
163  WordEntry *ptr;
164  char *str;
165  int stroff;
166 
167  /* Merge duplicate words */
168  if (prs->curwords > 0)
169  prs->curwords = uniqueWORD(prs->words, prs->curwords);
170 
171  /* Determine space needed */
172  for (i = 0; i < prs->curwords; i++)
173  {
174  lenstr += prs->words[i].len;
175  if (prs->words[i].alen)
176  {
/* Position data follows the lexeme: a uint16 count, then the entries. */
177  lenstr = SHORTALIGN(lenstr);
178  lenstr += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos);
179  }
180  }
181 
182  if (lenstr > MAXSTRPOS)
183  ereport(ERROR,
184  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
185  errmsg("string is too long for tsvector (%d bytes, max %d bytes)", lenstr, MAXSTRPOS)));
186 
187  totallen = CALCDATASIZE(prs->curwords, lenstr);
188  in = (TSVector) palloc0(totallen);
189  SET_VARSIZE(in, totallen);
190  in->size = prs->curwords;
191 
/* Second pass: fill the entry array and the string area in lockstep. */
192  ptr = ARRPTR(in);
193  str = STRPTR(in);
194  stroff = 0;
195  for (i = 0; i < prs->curwords; i++)
196  {
197  ptr->len = prs->words[i].len;
198  ptr->pos = stroff;
199  memcpy(str + stroff, prs->words[i].word, prs->words[i].len);
200  stroff += prs->words[i].len;
201  pfree(prs->words[i].word);
202  if (prs->words[i].alen)
203  {
204  int k = prs->words[i].pos.apos[0];
205  WordEntryPos *wptr;
206 
/* NOTE(review): apos is filled as a uint16 array, so this reads as a defensive check. */
207  if (k > 0xFFFF)
208  elog(ERROR, "positions array too long");
209 
210  ptr->haspos = 1;
211  stroff = SHORTALIGN(stroff);
/* Store the position count, then the positions located via POSDATAPTR. */
212  *(uint16 *) (str + stroff) = (uint16) k;
213  wptr = POSDATAPTR(in, ptr);
214  for (j = 0; j < k; j++)
215  {
216  WEP_SETWEIGHT(wptr[j], 0);
217  WEP_SETPOS(wptr[j], prs->words[i].pos.apos[j + 1]);
218  }
219  stroff += sizeof(uint16) + k * sizeof(WordEntryPos);
220  pfree(prs->words[i].pos.apos);
221  }
222  else
223  ptr->haspos = 0;
224  ptr++;
225  }
226 
/* prs->words may still be NULL when the caller never allocated it. */
227  if (prs->words)
228  pfree(prs->words);
229 
230  return in;
231 }
232 
/*
 * Build a tsvector from a text value using an explicitly given text search
 * configuration.
 *
 * Argument 0 is the configuration OID and argument 1 the input text.  The
 * text is parsed with parsetext() into a ParsedText whose word array is
 * pre-sized from the input length, and the result of make_tsvector() is
 * returned.
 *
 * NOTE(review): the declarator on source line 234 is absent from this
 * listing; the cross-reference index gives it as
 * to_tsvector_byid(PG_FUNCTION_ARGS).
 */
233 Datum
235 {
236  Oid cfgId = PG_GETARG_OID(0);
237  text *in = PG_GETARG_TEXT_PP(1);
238  ParsedText prs;
239  TSVector out;
240 
241  prs.lenwords = VARSIZE_ANY_EXHDR(in) / 6; /* just estimation of word's
242  * number */
243  if (prs.lenwords < 2)
244  prs.lenwords = 2;
245  prs.curwords = 0;
246  prs.pos = 0;
247  prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
248 
249  parsetext(cfgId, &prs, VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
250 
/* The input is released here, before the result is assembled. */
251  PG_FREE_IF_COPY(in, 1);
252 
253  out = make_tsvector(&prs);
254 
255  PG_RETURN_TSVECTOR(out);
256 }
257 
/*
 * One-argument variant: fetches the current text search configuration with
 * getTSCurrentConfig(true) and passes its OID together with the input text
 * to a two-argument call.
 *
 * NOTE(review): source lines 259 (the declarator) and 265 (the start of the
 * return statement) are absent from this listing.  The cross-reference index
 * names line 259 as to_tsvector(PG_FUNCTION_ARGS); presumably the call is a
 * DirectFunctionCall2 on to_tsvector_byid -- confirm against the original.
 */
258 Datum
260 {
261  text *in = PG_GETARG_TEXT_PP(0);
262  Oid cfgId;
263 
264  cfgId = getTSCurrentConfig(true);
266  ObjectIdGetDatum(cfgId),
267  PointerGetDatum(in)));
268 }
269 
/*
 * Build a tsvector from a jsonb value using an explicitly given text search
 * configuration (argument 0: configuration OID, argument 1: jsonb).
 *
 * The ParsedText starts with words = NULL; add_to_tsvector() allocates the
 * array the first time it is called.
 *
 * NOTE(review): source lines 271 (declarator), 275-276 (presumably the
 * declarations of "result" and "state") and 284 (presumably the call that
 * walks the jsonb string values with add_to_tsvector) are absent from this
 * listing.  The cross-reference index names line 271 as
 * jsonb_to_tsvector_byid(PG_FUNCTION_ARGS) -- confirm the missing lines
 * against the original file.
 */
270 Datum
272 {
273  Oid cfgId = PG_GETARG_OID(0);
274  Jsonb *jb = PG_GETARG_JSONB(1);
277  ParsedText prs;
278 
279  prs.words = NULL;
280  prs.curwords = 0;
281  state.prs = &prs;
282  state.cfgId = cfgId;
283 
285 
286  PG_FREE_IF_COPY(jb, 1);
287 
288  result = make_tsvector(&prs);
289 
290  PG_RETURN_TSVECTOR(result);
291 }
292 
/*
 * One-argument jsonb variant: fetches the current text search configuration
 * with getTSCurrentConfig(true) and passes its OID together with the jsonb
 * input to a two-argument call.
 *
 * NOTE(review): source lines 294 (the declarator) and 300 (the start of the
 * return statement) are absent from this listing.  The cross-reference index
 * names line 294 as jsonb_to_tsvector(PG_FUNCTION_ARGS); presumably the call
 * targets jsonb_to_tsvector_byid -- confirm against the original.
 */
293 Datum
295 {
296  Jsonb *jb = PG_GETARG_JSONB(0);
297  Oid cfgId;
298 
299  cfgId = getTSCurrentConfig(true);
301  ObjectIdGetDatum(cfgId),
302  JsonbGetDatum(jb)));
303 }
304 
/*
 * Build a tsvector from a json (text) value using an explicitly given text
 * search configuration (argument 0: configuration OID, argument 1: json).
 *
 * The ParsedText starts with words = NULL; add_to_tsvector() allocates the
 * array the first time it is called.
 *
 * NOTE(review): source lines 306 (declarator), 310-311 (presumably the
 * declarations of "result" and "state") and 319 (presumably the call that
 * walks the json string values with add_to_tsvector) are absent from this
 * listing.  The cross-reference index names line 306 as
 * json_to_tsvector_byid(PG_FUNCTION_ARGS) -- confirm the missing lines
 * against the original file.
 */
305 Datum
307 {
308  Oid cfgId = PG_GETARG_OID(0);
309  text *json = PG_GETARG_TEXT_P(1);
312  ParsedText prs;
313 
314  prs.words = NULL;
315  prs.curwords = 0;
316  state.prs = &prs;
317  state.cfgId = cfgId;
318 
320 
321  PG_FREE_IF_COPY(json, 1);
322 
323  result = make_tsvector(&prs);
324 
325  PG_RETURN_TSVECTOR(result);
326 }
327 
/*
 * One-argument json variant: fetches the current text search configuration
 * with getTSCurrentConfig(true) and passes its OID together with the json
 * input to a two-argument call.
 *
 * NOTE(review): source lines 329 (the declarator) and 335 (the start of the
 * return statement) are absent from this listing.  The cross-reference index
 * names line 329 as json_to_tsvector(PG_FUNCTION_ARGS); presumably the call
 * targets json_to_tsvector_byid -- confirm against the original.
 */
328 Datum
330 {
331  text *json = PG_GETARG_TEXT_P(0);
332  Oid cfgId;
333 
334  cfgId = getTSCurrentConfig(true);
336  ObjectIdGetDatum(cfgId),
337  PointerGetDatum(json)));
338 }
339 
340 /*
341  * Parse lexemes in an element of a json(b) value, add to TSVectorBuildState.
 *
 * _state is the caller's build state passed as an untyped pointer;
 * elem_value and elem_len give the text of one element.  The words found by
 * parsetext() are appended to state->prs, whose word array is allocated
 * here on the first call.
 *
 * NOTE(review): source line 346 is absent from this listing; presumably it
 * declares "state" as _state cast to TSVectorBuildState * -- confirm against
 * the original file.
342  */
343 static void
344 add_to_tsvector(void *_state, char *elem_value, int elem_len)
345 {
347  ParsedText *prs = state->prs;
348  int32 prevwords;
349 
350  if (prs->words == NULL)
351  {
352  /*
353  * First time through: initialize words array to a reasonable size.
354  * (parsetext() will realloc it bigger as needed.)
355  */
356  prs->lenwords = Max(elem_len / 6, 64);
357  prs->words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs->lenwords);
358  prs->curwords = 0;
359  prs->pos = 0;
360  }
361 
/* Remember the word count so we can tell whether this element added any. */
362  prevwords = prs->curwords;
363 
364  parsetext(state->cfgId, prs, elem_value, elem_len);
365 
366  /*
367  * If we extracted any words from this JSON element, advance pos to create
368  * an artificial break between elements. This is because we don't want
369  * phrase searches to think that the last word in this element is adjacent
370  * to the first word in the next one.
371  */
372  if (prs->curwords > prevwords)
373  prs->pos += 1;
374 }
375 
376 
377 /*
378  * to_tsquery
379  */
380 
381 
382 /*
383  * This function is used for morph parsing.
384  *
385  * The value is passed to parsetext which will call the right dictionary to
386  * lexize the word. If it turns out to be a stopword, we push a QI_VALSTOP
387  * to the stack.
388  *
389  * All words belonging to the same variant are pushed as an ANDed list,
390  * and different variants are ORed together.
391  */
392 static void
393 pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix)
394 {
395  int32 count = 0;
396  ParsedText prs;
397  uint32 variant,
398  pos = 0,
399  cntvar = 0,
400  cntpos = 0,
401  cnt = 0;
402  MorphOpaque *data = (MorphOpaque *) DatumGetPointer(opaque);
403 
404  prs.lenwords = 4;
405  prs.curwords = 0;
406  prs.pos = 0;
407  prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
408 
409  parsetext(data->cfg_id, &prs, strval, lenval);
410 
411  if (prs.curwords > 0)
412  {
413  while (count < prs.curwords)
414  {
415  /*
416  * Were any stop words removed? If so, fill empty positions with
417  * placeholders linked by an appropriate operator.
418  */
419  if (pos > 0 && pos + 1 < prs.words[count].pos.pos)
420  {
421  while (pos + 1 < prs.words[count].pos.pos)
422  {
423  /* put placeholders for each missing stop word */
424  pushStop(state);
425  if (cntpos)
426  pushOperator(state, data->qoperator, 1);
427  cntpos++;
428  pos++;
429  }
430  }
431 
432  /* save current word's position */
433  pos = prs.words[count].pos.pos;
434 
435  /* Go through all variants obtained from this token */
436  cntvar = 0;
437  while (count < prs.curwords && pos == prs.words[count].pos.pos)
438  {
439  variant = prs.words[count].nvariant;
440 
441  /* Push all words belonging to the same variant */
442  cnt = 0;
443  while (count < prs.curwords &&
444  pos == prs.words[count].pos.pos &&
445  variant == prs.words[count].nvariant)
446  {
447  pushValue(state,
448  prs.words[count].word,
449  prs.words[count].len,
450  weight,
451  ((prs.words[count].flags & TSL_PREFIX) || prefix));
452  pfree(prs.words[count].word);
453  if (cnt)
454  pushOperator(state, OP_AND, 0);
455  cnt++;
456  count++;
457  }
458 
459  if (cntvar)
460  pushOperator(state, OP_OR, 0);
461  cntvar++;
462  }
463 
464  if (cntpos)
465  {
466  /* distance may be useful */
467  pushOperator(state, data->qoperator, 1);
468  }
469 
470  cntpos++;
471  }
472 
473  pfree(prs.words);
474 
475  }
476  else
477  pushStop(state);
478 }
479 
/*
 * Parse a text value into a tsquery using an explicitly given text search
 * configuration (argument 0: configuration OID, argument 1: query text).
 * Operands are linked with OP_AND, and the last argument of parse_tsquery()
 * is false here.
 *
 * NOTE(review): source lines 481 (the declarator) and 491 (the second
 * argument of parse_tsquery, presumably pushval_morph) are absent from this
 * listing.  The cross-reference index names line 481 as
 * to_tsquery_byid(PG_FUNCTION_ARGS) -- confirm against the original file.
 */
480 Datum
482 {
483  text *in = PG_GETARG_TEXT_PP(1);
484  TSQuery query;
485  MorphOpaque data;
486 
487  data.cfg_id = PG_GETARG_OID(0);
488  data.qoperator = OP_AND;
489 
490  query = parse_tsquery(text_to_cstring(in),
492  PointerGetDatum(&data),
493  false);
494 
495  PG_RETURN_TSQUERY(query);
496 }
497 
/*
 * One-argument variant: fetches the current text search configuration with
 * getTSCurrentConfig(true) and passes its OID together with the query text
 * to a two-argument call.
 *
 * NOTE(review): source lines 499 (the declarator) and 505 (the start of the
 * return statement) are absent from this listing.  The cross-reference index
 * names line 499 as to_tsquery(PG_FUNCTION_ARGS); presumably the call
 * targets to_tsquery_byid -- confirm against the original.
 */
498 Datum
500 {
501  text *in = PG_GETARG_TEXT_PP(0);
502  Oid cfgId;
503 
504  cfgId = getTSCurrentConfig(true);
506  ObjectIdGetDatum(cfgId),
507  PointerGetDatum(in)));
508 }
509 
/*
 * Parse plain text into a tsquery using an explicitly given text search
 * configuration (argument 0: configuration OID, argument 1: text).
 * Operands are linked with OP_AND, and the last argument of parse_tsquery()
 * (named isplain in the cross-reference index) is true here.
 *
 * NOTE(review): source lines 511 (the declarator) and 521 (the second
 * argument of parse_tsquery, presumably pushval_morph) are absent from this
 * listing.  The cross-reference index names line 511 as
 * plainto_tsquery_byid(PG_FUNCTION_ARGS) -- confirm against the original.
 *
 * NOTE(review): this function returns through PG_RETURN_POINTER, while the
 * other *_tsquery_byid functions in this file use PG_RETURN_TSQUERY.
 */
510 Datum
512 {
513  text *in = PG_GETARG_TEXT_PP(1);
514  TSQuery query;
515  MorphOpaque data;
516 
517  data.cfg_id = PG_GETARG_OID(0);
518  data.qoperator = OP_AND;
519 
520  query = parse_tsquery(text_to_cstring(in),
522  PointerGetDatum(&data),
523  true);
524 
525  PG_RETURN_POINTER(query);
526 }
527 
/*
 * One-argument variant: fetches the current text search configuration with
 * getTSCurrentConfig(true) and passes its OID together with the input text
 * to a two-argument call.
 *
 * NOTE(review): source lines 529 (the declarator) and 535 (the start of the
 * return statement) are absent from this listing.  The cross-reference index
 * names line 529 as plainto_tsquery(PG_FUNCTION_ARGS); presumably the call
 * targets plainto_tsquery_byid -- confirm against the original.
 */
528 Datum
530 {
531  text *in = PG_GETARG_TEXT_PP(0);
532  Oid cfgId;
533 
534  cfgId = getTSCurrentConfig(true);
536  ObjectIdGetDatum(cfgId),
537  PointerGetDatum(in)));
538 }
539 
540 
/*
 * Parse plain text into a phrase tsquery using an explicitly given text
 * search configuration (argument 0: configuration OID, argument 1: text).
 * Operands are linked with OP_PHRASE, and the last argument of
 * parse_tsquery() (named isplain in the cross-reference index) is true here.
 *
 * NOTE(review): source lines 542 (the declarator) and 552 (the second
 * argument of parse_tsquery, presumably pushval_morph) are absent from this
 * listing.  The cross-reference index names line 542 as
 * phraseto_tsquery_byid(PG_FUNCTION_ARGS) -- confirm against the original.
 */
541 Datum
543 {
544  text *in = PG_GETARG_TEXT_PP(1);
545  TSQuery query;
546  MorphOpaque data;
547 
548  data.cfg_id = PG_GETARG_OID(0);
549  data.qoperator = OP_PHRASE;
550 
551  query = parse_tsquery(text_to_cstring(in),
553  PointerGetDatum(&data),
554  true);
555 
556  PG_RETURN_TSQUERY(query);
557 }
558 
/*
 * One-argument variant: fetches the current text search configuration with
 * getTSCurrentConfig(true) and passes its OID together with the input text
 * to a two-argument call.
 *
 * NOTE(review): source lines 560 (the declarator) and 566 (the start of the
 * return statement) are absent from this listing.  The cross-reference index
 * names line 560 as phraseto_tsquery(PG_FUNCTION_ARGS); presumably the call
 * targets phraseto_tsquery_byid -- confirm against the original.
 */
559 Datum
561 {
562  text *in = PG_GETARG_TEXT_PP(0);
563  Oid cfgId;
564 
565  cfgId = getTSCurrentConfig(true);
567  ObjectIdGetDatum(cfgId),
568  PointerGetDatum(in)));
569 }
Datum plainto_tsquery_byid(PG_FUNCTION_ARGS)
Definition: to_tsany.c:511
uint16 WordEntryPos
Definition: ts_type.h:63
signed short int16
Definition: c.h:255
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:321
void iterate_json_string_values(text *json, void *action_state, JsonIterateStringValuesAction action)
Definition: jsonfuncs.c:4837
struct TSVectorBuildState TSVectorBuildState
#define TSL_PREFIX
Definition: ts_public.h:116
TSVector make_tsvector(ParsedText *prs)
Definition: to_tsany.c:156
#define VARDATA_ANY(PTR)
Definition: postgres.h:347
Definition: jsonb.h:215
Oid getTSCurrentConfig(bool emitError)
Definition: ts_cache.c:556
#define PointerGetDatum(X)
Definition: postgres.h:562
struct MorphOpaque MorphOpaque
uint16 nvariant
Definition: ts_utils.h:65
uint32 len
Definition: ts_type.h:44
#define PG_RETURN_TSVECTOR(x)
Definition: ts_type.h:122
Datum plainto_tsquery(PG_FUNCTION_ARGS)
Definition: to_tsany.c:529
static void add_to_tsvector(void *_state, char *elem_value, int elem_len)
Definition: to_tsany.c:344
int errcode(int sqlerrcode)
Definition: elog.c:575
return result
Definition: formatting.c:1633
int32 lenwords
Definition: ts_utils.h:85
#define MAXSTRPOS
Definition: ts_type.h:50
unsigned int Oid
Definition: postgres_ext.h:31
#define OP_OR
Definition: ts_type.h:168
#define WEP_SETPOS(x, v)
Definition: ts_type.h:83
int32 curwords
Definition: ts_utils.h:86
signed int int32
Definition: c.h:256
Datum get_current_ts_config(PG_FUNCTION_ARGS)
Definition: to_tsany.c:38
#define PG_GETARG_TEXT_PP(n)
Definition: fmgr.h:273
#define POSDATAPTR(x, e)
Definition: ts_type.h:111
Datum to_tsvector_byid(PG_FUNCTION_ARGS)
Definition: to_tsany.c:234
#define PG_RETURN_TSQUERY(x)
Definition: ts_type.h:240
Datum phraseto_tsquery(PG_FUNCTION_ARGS)
Definition: to_tsany.c:560
#define OP_AND
Definition: ts_type.h:167
unsigned short uint16
Definition: c.h:267
void pfree(void *pointer)
Definition: mcxt.c:950
#define ObjectIdGetDatum(X)
Definition: postgres.h:513
#define ERROR
Definition: elog.h:43
void parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
Definition: ts_parse.c:358
Datum json_to_tsvector(PG_FUNCTION_ARGS)
Definition: to_tsany.c:329
#define MAXNUMPOS
Definition: ts_type.h:86
int qoperator
Definition: to_tsany.c:25
void pushStop(TSQueryParserState state)
Definition: tsquery.c:420
#define WEP_SETWEIGHT(x, v)
Definition: ts_type.h:82
TSQuery parse_tsquery(char *buf, PushFunction pushval, Datum opaque, bool isplain)
Definition: tsquery.c:605
#define JsonbGetDatum(p)
Definition: jsonb.h:69
#define PG_GETARG_OID(n)
Definition: fmgr.h:240
int32 size
Definition: ts_type.h:93
void pushValue(TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix)
Definition: tsquery.c:384
unsigned int uint32
Definition: c.h:268
ParsedWord * words
Definition: ts_utils.h:84
uint16 pos
Definition: ts_utils.h:68
static int compareWORD(const void *a, const void *b)
Definition: to_tsany.c:47
uint32 haspos
Definition: ts_type.h:44
#define ereport(elevel, rest)
Definition: elog.h:122
ParsedText * prs
Definition: to_tsany.c:30
#define CALCDATASIZE(x, lenstr)
Definition: hstore.h:72
void * palloc0(Size size)
Definition: mcxt.c:878
uintptr_t Datum
Definition: postgres.h:372
#define PG_RETURN_DATUM(x)
Definition: fmgr.h:313
TSVectorData * TSVector
Definition: ts_type.h:98
void pushOperator(TSQueryParserState state, int8 oper, int16 distance)
Definition: tsquery.c:335
#define Max(x, y)
Definition: c.h:801
#define NULL
Definition: c.h:229
Definition: regguts.h:298
#define OP_PHRASE
Definition: ts_type.h:169
uint16 len
Definition: ts_utils.h:64
Datum to_tsvector(PG_FUNCTION_ARGS)
Definition: to_tsany.c:259
int32 pos
Definition: ts_utils.h:87
Oid cfg_id
Definition: to_tsany.c:24
#define PG_FREE_IF_COPY(ptr, n)
Definition: fmgr.h:225
int32 tsCompareString(char *a, int lena, char *b, int lenb, bool prefix)
Definition: tsvector_op.c:1160
uint32 pos
Definition: ts_type.h:44
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:963
#define DatumGetPointer(X)
Definition: postgres.h:555
uint16 flags
Definition: ts_utils.h:77
static void word(struct vars *, int, struct state *, struct state *)
Definition: regcomp.c:1243
Datum to_tsquery_byid(PG_FUNCTION_ARGS)
Definition: to_tsany.c:481
uint32 alen
Definition: ts_utils.h:79
char * text_to_cstring(const text *t)
Definition: varlena.c:182
#define PG_GETARG_JSONB(x)
Definition: jsonb.h:70
#define VARSIZE_ANY_EXHDR(PTR)
Definition: postgres.h:340
void * palloc(Size size)
Definition: mcxt.c:849
int errmsg(const char *fmt,...)
Definition: elog.c:797
#define PG_GETARG_TEXT_P(n)
Definition: fmgr.h:300
#define STRPTR(x)
Definition: hstore.h:76
int i
Datum jsonb_to_tsvector_byid(PG_FUNCTION_ARGS)
Definition: to_tsany.c:271
Definition: c.h:439
#define PG_FUNCTION_ARGS
Definition: fmgr.h:158
#define LIMITPOS(x)
Definition: ts_type.h:87
static int uniqueWORD(ParsedWord *a, int32 l)
Definition: to_tsany.c:68
#define SET_VARSIZE(PTR, len)
Definition: postgres.h:328
#define ARRPTR(x)
Definition: cube.c:26
#define elog
Definition: elog.h:219
Datum phraseto_tsquery_byid(PG_FUNCTION_ARGS)
Definition: to_tsany.c:542
#define qsort(a, b, c, d)
Definition: port.h:443
#define SHORTALIGN(LEN)
Definition: c.h:584
#define MAXENTRYPOS
Definition: ts_type.h:85
#define PG_RETURN_OID(x)
Definition: fmgr.h:320
void iterate_jsonb_string_values(Jsonb *jb, void *state, JsonIterateStringValuesAction action)
Definition: jsonfuncs.c:4815
Datum jsonb_to_tsvector(PG_FUNCTION_ARGS)
Definition: to_tsany.c:294
static void pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix)
Definition: to_tsany.c:393
#define DirectFunctionCall2(func, arg1, arg2)
Definition: fmgr.h:586
Datum to_tsquery(PG_FUNCTION_ARGS)
Definition: to_tsany.c:499
char * word
Definition: ts_utils.h:78
Datum json_to_tsvector_byid(PG_FUNCTION_ARGS)
Definition: to_tsany.c:306