PostgreSQL Source Code  git master
tsvector.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * tsvector.c
4  * I/O functions for tsvector
5  *
6  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  * src/backend/utils/adt/tsvector.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include "libpq/pqformat.h"
18 #include "nodes/miscnodes.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_utils.h"
21 #include "utils/builtins.h"
22 #include "utils/memutils.h"
23 #include "varatt.h"
24 
25 typedef struct
26 {
27  WordEntry entry; /* must be first! */
29  int poslen; /* number of elements in pos */
30 } WordEntryIN;
31 
32 
33 /* Compare two WordEntryPos values for qsort */
34 int
35 compareWordEntryPos(const void *a, const void *b)
36 {
37  int apos = WEP_GETPOS(*(const WordEntryPos *) a);
38  int bpos = WEP_GETPOS(*(const WordEntryPos *) b);
39 
40  if (apos == bpos)
41  return 0;
42  return (apos > bpos) ? 1 : -1;
43 }
44 
45 /*
46  * Removes duplicate pos entries. If there's two entries with same pos but
47  * different weight, the higher weight is retained, so we can't use
48  * qunique here.
49  *
50  * Returns new length.
51  */
52 static int
54 {
55  WordEntryPos *ptr,
56  *res;
57 
58  if (l <= 1)
59  return l;
60 
61  qsort((void *) a, l, sizeof(WordEntryPos), compareWordEntryPos);
62 
63  res = a;
64  ptr = a + 1;
65  while (ptr - a < l)
66  {
67  if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
68  {
69  res++;
70  *res = *ptr;
71  if (res - a >= MAXNUMPOS - 1 ||
72  WEP_GETPOS(*res) == MAXENTRYPOS - 1)
73  break;
74  }
75  else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
77  ptr++;
78  }
79 
80  return res + 1 - a;
81 }
82 
83 /* Compare two WordEntryIN values for qsort */
84 static int
85 compareentry(const void *va, const void *vb, void *arg)
86 {
87  const WordEntryIN *a = (const WordEntryIN *) va;
88  const WordEntryIN *b = (const WordEntryIN *) vb;
89  char *BufferStr = (char *) arg;
90 
91  return tsCompareString(&BufferStr[a->entry.pos], a->entry.len,
92  &BufferStr[b->entry.pos], b->entry.len,
93  false);
94 }
95 
96 /*
97  * Sort an array of WordEntryIN, remove duplicates.
98  * *outbuflen receives the amount of space needed for strings and positions.
99  */
100 static int
101 uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
102 {
103  int buflen;
104  WordEntryIN *ptr,
105  *res;
106 
107  Assert(l >= 1);
108 
109  if (l > 1)
110  qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry,
111  (void *) buf);
112 
113  buflen = 0;
114  res = a;
115  ptr = a + 1;
116  while (ptr - a < l)
117  {
118  if (!(ptr->entry.len == res->entry.len &&
119  strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
120  res->entry.len) == 0))
121  {
122  /* done accumulating data into *res, count space needed */
123  buflen += res->entry.len;
124  if (res->entry.haspos)
125  {
126  res->poslen = uniquePos(res->pos, res->poslen);
127  buflen = SHORTALIGN(buflen);
128  buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
129  }
130  res++;
131  if (res != ptr)
132  memcpy(res, ptr, sizeof(WordEntryIN));
133  }
134  else if (ptr->entry.haspos)
135  {
136  if (res->entry.haspos)
137  {
138  /* append ptr's positions to res's positions */
139  int newlen = ptr->poslen + res->poslen;
140 
141  res->pos = (WordEntryPos *)
142  repalloc(res->pos, newlen * sizeof(WordEntryPos));
143  memcpy(&res->pos[res->poslen], ptr->pos,
144  ptr->poslen * sizeof(WordEntryPos));
145  res->poslen = newlen;
146  pfree(ptr->pos);
147  }
148  else
149  {
150  /* just give ptr's positions to pos */
151  res->entry.haspos = 1;
152  res->pos = ptr->pos;
153  res->poslen = ptr->poslen;
154  }
155  }
156  ptr++;
157  }
158 
159  /* count space needed for last item */
160  buflen += res->entry.len;
161  if (res->entry.haspos)
162  {
163  res->poslen = uniquePos(res->pos, res->poslen);
164  buflen = SHORTALIGN(buflen);
165  buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
166  }
167 
168  *outbuflen = buflen;
169  return res + 1 - a;
170 }
171 
172 static int
174 {
175  return compareentry(a, b, buf);
176 }
177 
178 
179 Datum
181 {
182  char *buf = PG_GETARG_CSTRING(0);
183  Node *escontext = fcinfo->context;
185  WordEntryIN *arr;
186  int totallen;
187  int arrlen; /* allocated size of arr */
188  WordEntry *inarr;
189  int len = 0;
190  TSVector in;
191  int i;
192  char *token;
193  int toklen;
194  WordEntryPos *pos;
195  int poslen;
196  char *strbuf;
197  int stroff;
198 
199  /*
200  * Tokens are appended to tmpbuf, cur is a pointer to the end of used
201  * space in tmpbuf.
202  */
203  char *tmpbuf;
204  char *cur;
205  int buflen = 256; /* allocated size of tmpbuf */
206 
207  state = init_tsvector_parser(buf, 0, escontext);
208 
209  arrlen = 64;
210  arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
211  cur = tmpbuf = (char *) palloc(buflen);
212 
213  while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
214  {
215  if (toklen >= MAXSTRLEN)
216  ereturn(escontext, (Datum) 0,
217  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
218  errmsg("word is too long (%ld bytes, max %ld bytes)",
219  (long) toklen,
220  (long) (MAXSTRLEN - 1))));
221 
222  if (cur - tmpbuf > MAXSTRPOS)
223  ereturn(escontext, (Datum) 0,
224  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
225  errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)",
226  (long) (cur - tmpbuf), (long) MAXSTRPOS)));
227 
228  /*
229  * Enlarge buffers if needed
230  */
231  if (len >= arrlen)
232  {
233  arrlen *= 2;
234  arr = (WordEntryIN *)
235  repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);
236  }
237  while ((cur - tmpbuf) + toklen >= buflen)
238  {
239  int dist = cur - tmpbuf;
240 
241  buflen *= 2;
242  tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
243  cur = tmpbuf + dist;
244  }
245  arr[len].entry.len = toklen;
246  arr[len].entry.pos = cur - tmpbuf;
247  memcpy((void *) cur, (void *) token, toklen);
248  cur += toklen;
249 
250  if (poslen != 0)
251  {
252  arr[len].entry.haspos = 1;
253  arr[len].pos = pos;
254  arr[len].poslen = poslen;
255  }
256  else
257  {
258  arr[len].entry.haspos = 0;
259  arr[len].pos = NULL;
260  arr[len].poslen = 0;
261  }
262  len++;
263  }
264 
266 
267  /* Did gettoken_tsvector fail? */
268  if (SOFT_ERROR_OCCURRED(escontext))
269  PG_RETURN_NULL();
270 
271  if (len > 0)
272  len = uniqueentry(arr, len, tmpbuf, &buflen);
273  else
274  buflen = 0;
275 
276  if (buflen > MAXSTRPOS)
277  ereturn(escontext, (Datum) 0,
278  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
279  errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS)));
280 
281  totallen = CALCDATASIZE(len, buflen);
282  in = (TSVector) palloc0(totallen);
283  SET_VARSIZE(in, totallen);
284  in->size = len;
285  inarr = ARRPTR(in);
286  strbuf = STRPTR(in);
287  stroff = 0;
288  for (i = 0; i < len; i++)
289  {
290  memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
291  arr[i].entry.pos = stroff;
292  stroff += arr[i].entry.len;
293  if (arr[i].entry.haspos)
294  {
295  /* This should be unreachable because of MAXNUMPOS restrictions */
296  if (arr[i].poslen > 0xFFFF)
297  elog(ERROR, "positions array too long");
298 
299  /* Copy number of positions */
300  stroff = SHORTALIGN(stroff);
301  *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
302  stroff += sizeof(uint16);
303 
304  /* Copy positions */
305  memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
306  stroff += arr[i].poslen * sizeof(WordEntryPos);
307 
308  pfree(arr[i].pos);
309  }
310  inarr[i] = arr[i].entry;
311  }
312 
313  Assert((strbuf + stroff - (char *) in) == totallen);
314 
315  PG_RETURN_TSVECTOR(in);
316 }
317 
318 Datum
320 {
321  TSVector out = PG_GETARG_TSVECTOR(0);
322  char *outbuf;
323  int32 i,
324  lenbuf = 0,
325  pp;
326  WordEntry *ptr = ARRPTR(out);
327  char *curbegin,
328  *curin,
329  *curout;
330 
331  lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
332  for (i = 0; i < out->size; i++)
333  {
334  lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
335  if (ptr[i].haspos)
336  lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
337  }
338 
339  curout = outbuf = (char *) palloc(lenbuf);
340  for (i = 0; i < out->size; i++)
341  {
342  curbegin = curin = STRPTR(out) + ptr->pos;
343  if (i != 0)
344  *curout++ = ' ';
345  *curout++ = '\'';
346  while (curin - curbegin < ptr->len)
347  {
348  int len = pg_mblen(curin);
349 
350  if (t_iseq(curin, '\''))
351  *curout++ = '\'';
352  else if (t_iseq(curin, '\\'))
353  *curout++ = '\\';
354 
355  while (len--)
356  *curout++ = *curin++;
357  }
358 
359  *curout++ = '\'';
360  if ((pp = POSDATALEN(out, ptr)) != 0)
361  {
362  WordEntryPos *wptr;
363 
364  *curout++ = ':';
365  wptr = POSDATAPTR(out, ptr);
366  while (pp)
367  {
368  curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
369  switch (WEP_GETWEIGHT(*wptr))
370  {
371  case 3:
372  *curout++ = 'A';
373  break;
374  case 2:
375  *curout++ = 'B';
376  break;
377  case 1:
378  *curout++ = 'C';
379  break;
380  case 0:
381  default:
382  break;
383  }
384 
385  if (pp > 1)
386  *curout++ = ',';
387  pp--;
388  wptr++;
389  }
390  }
391  ptr++;
392  }
393 
394  *curout = '\0';
395  PG_FREE_IF_COPY(out, 0);
396  PG_RETURN_CSTRING(outbuf);
397 }
398 
399 /*
400  * Binary Input / Output functions. The binary format is as follows:
401  *
402  * uint32 number of lexemes
403  *
404  * for each lexeme:
405  * lexeme text in client encoding, null-terminated
406  * uint16 number of positions
407  * for each position:
408  * uint16 WordEntryPos
409  */
410 
411 Datum
413 {
414  TSVector vec = PG_GETARG_TSVECTOR(0);
416  int i,
417  j;
418  WordEntry *weptr = ARRPTR(vec);
419 
421 
422  pq_sendint32(&buf, vec->size);
423  for (i = 0; i < vec->size; i++)
424  {
425  uint16 npos;
426 
427  /*
428  * the strings in the TSVector array are not null-terminated, so we
429  * have to send the null-terminator separately
430  */
431  pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
432  pq_sendbyte(&buf, '\0');
433 
434  npos = POSDATALEN(vec, weptr);
435  pq_sendint16(&buf, npos);
436 
437  if (npos > 0)
438  {
439  WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
440 
441  for (j = 0; j < npos; j++)
442  pq_sendint16(&buf, wepptr[j]);
443  }
444  weptr++;
445  }
446 
448 }
449 
450 Datum
452 {
454  TSVector vec;
455  int i;
456  int32 nentries;
457  int datalen; /* number of bytes used in the variable size
458  * area after fixed size TSVector header and
459  * WordEntries */
460  Size hdrlen;
461  Size len; /* allocated size of vec */
462  bool needSort = false;
463 
464  nentries = pq_getmsgint(buf, sizeof(int32));
465  if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
466  elog(ERROR, "invalid size of tsvector");
467 
468  hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
469 
470  len = hdrlen * 2; /* times two to make room for lexemes */
471  vec = (TSVector) palloc0(len);
472  vec->size = nentries;
473 
474  datalen = 0;
475  for (i = 0; i < nentries; i++)
476  {
477  const char *lexeme;
478  uint16 npos;
479  size_t lex_len;
480 
481  lexeme = pq_getmsgstring(buf);
482  npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
483 
484  /* sanity checks */
485 
486  lex_len = strlen(lexeme);
487  if (lex_len > MAXSTRLEN)
488  elog(ERROR, "invalid tsvector: lexeme too long");
489 
490  if (datalen > MAXSTRPOS)
491  elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded");
492 
493  if (npos > MAXNUMPOS)
494  elog(ERROR, "unexpected number of tsvector positions");
495 
496  /*
497  * Looks valid. Fill the WordEntry struct, and copy lexeme.
498  *
499  * But make sure the buffer is large enough first.
500  */
501  while (hdrlen + SHORTALIGN(datalen + lex_len) +
502  (npos + 1) * sizeof(WordEntryPos) >= len)
503  {
504  len *= 2;
505  vec = (TSVector) repalloc(vec, len);
506  }
507 
508  vec->entries[i].haspos = (npos > 0) ? 1 : 0;
509  vec->entries[i].len = lex_len;
510  vec->entries[i].pos = datalen;
511 
512  memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
513 
514  datalen += lex_len;
515 
516  if (i > 0 && WordEntryCMP(&vec->entries[i],
517  &vec->entries[i - 1],
518  STRPTR(vec)) <= 0)
519  needSort = true;
520 
521  /* Receive positions */
522  if (npos > 0)
523  {
524  uint16 j;
525  WordEntryPos *wepptr;
526 
527  /*
528  * Pad to 2-byte alignment if necessary. Though we used palloc0
529  * for the initial allocation, subsequent repalloc'd memory areas
530  * are not initialized to zero.
531  */
532  if (datalen != SHORTALIGN(datalen))
533  {
534  *(STRPTR(vec) + datalen) = '\0';
535  datalen = SHORTALIGN(datalen);
536  }
537 
538  memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
539 
540  wepptr = POSDATAPTR(vec, &vec->entries[i]);
541  for (j = 0; j < npos; j++)
542  {
543  wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
544  if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
545  elog(ERROR, "position information is misordered");
546  }
547 
548  datalen += (npos + 1) * sizeof(WordEntry);
549  }
550  }
551 
552  SET_VARSIZE(vec, hdrlen + datalen);
553 
554  if (needSort)
555  qsort_arg((void *) ARRPTR(vec), vec->size, sizeof(WordEntry),
556  compareentry, (void *) STRPTR(vec));
557 
558  PG_RETURN_TSVECTOR(vec);
559 }
unsigned short uint16
Definition: c.h:489
signed int int32
Definition: c.h:478
#define SHORTALIGN(LEN)
Definition: c.h:791
size_t Size
Definition: c.h:589
#define ARRPTR(x)
Definition: cube.c:25
struct cursor * cur
Definition: ecpg.c:28
int errcode(int sqlerrcode)
Definition: elog.c:858
int errmsg(const char *fmt,...)
Definition: elog.c:1069
#define ereturn(context, dummy_value,...)
Definition: elog.h:276
#define ERROR
Definition: elog.h:39
#define PG_FREE_IF_COPY(ptr, n)
Definition: fmgr.h:260
#define PG_RETURN_BYTEA_P(x)
Definition: fmgr.h:371
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276
#define PG_RETURN_CSTRING(x)
Definition: fmgr.h:362
#define PG_GETARG_CSTRING(n)
Definition: fmgr.h:277
#define PG_RETURN_NULL()
Definition: fmgr.h:345
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
#define CALCDATASIZE(x, lenstr)
Definition: hstore.h:72
#define STRPTR(x)
Definition: hstore.h:76
int b
Definition: isn.c:70
int a
Definition: isn.c:69
int j
Definition: isn.c:74
int i
Definition: isn.c:73
Assert(fmt[strlen(fmt) - 1] !='\n')
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1553
int pg_mblen(const char *mbstr)
Definition: mbutils.c:1024
void pfree(void *pointer)
Definition: mcxt.c:1436
void * palloc0(Size size)
Definition: mcxt.c:1241
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1456
void * palloc(Size size)
Definition: mcxt.c:1210
#define MaxAllocSize
Definition: memutils.h:40
#define SOFT_ERROR_OCCURRED(escontext)
Definition: miscnodes.h:52
void * arg
const void size_t len
static char * buf
Definition: pg_test_fsync.c:67
#define sprintf
Definition: port.h:240
void qsort_arg(void *base, size_t nel, size_t elsize, qsort_arg_comparator cmp, void *arg)
#define qsort(a, b, c, d)
Definition: port.h:445
uintptr_t Datum
Definition: postgres.h:64
unsigned int pq_getmsgint(StringInfo msg, int b)
Definition: pqformat.c:418
void pq_sendtext(StringInfo buf, const char *str, int slen)
Definition: pqformat.c:175
const char * pq_getmsgstring(StringInfo msg)
Definition: pqformat.c:582
void pq_begintypsend(StringInfo buf)
Definition: pqformat.c:329
bytea * pq_endtypsend(StringInfo buf)
Definition: pqformat.c:349
static void pq_sendint32(StringInfo buf, uint32 i)
Definition: pqformat.h:145
static void pq_sendbyte(StringInfo buf, uint8 byt)
Definition: pqformat.h:161
static void pq_sendint16(StringInfo buf, uint16 i)
Definition: pqformat.h:137
StringInfoData * StringInfo
Definition: stringinfo.h:44
Definition: nodes.h:129
int32 size
Definition: ts_type.h:93
WordEntry entries[FLEXIBLE_ARRAY_MEMBER]
Definition: ts_type.h:94
int poslen
Definition: tsvector.c:29
WordEntryPos * pos
Definition: tsvector.c:28
WordEntry entry
Definition: tsvector.c:27
uint32 pos
Definition: ts_type.h:46
uint32 haspos
Definition: ts_type.h:44
uint32 len
Definition: ts_type.h:45
Definition: regguts.h:318
#define t_iseq(x, c)
Definition: ts_locale.h:38
#define PG_GETARG_TSVECTOR(n)
Definition: ts_type.h:135
#define WEP_GETPOS(x)
Definition: ts_type.h:80
#define PG_RETURN_TSVECTOR(x)
Definition: ts_type.h:137
#define MAXENTRYPOS
Definition: ts_type.h:85
#define POSDATALEN(x, e)
Definition: ts_type.h:110
uint16 WordEntryPos
Definition: ts_type.h:63
#define MAXNUMPOS
Definition: ts_type.h:86
TSVectorData * TSVector
Definition: ts_type.h:98
#define WEP_SETWEIGHT(x, v)
Definition: ts_type.h:82
#define DATAHDRSIZE
Definition: ts_type.h:100
#define MAXSTRLEN
Definition: ts_type.h:49
#define POSDATAPTR(x, e)
Definition: ts_type.h:111
#define WEP_GETWEIGHT(x)
Definition: ts_type.h:79
#define MAXSTRPOS
Definition: ts_type.h:50
Datum tsvectorout(PG_FUNCTION_ARGS)
Definition: tsvector.c:319
static int uniquePos(WordEntryPos *a, int l)
Definition: tsvector.c:53
Datum tsvectorrecv(PG_FUNCTION_ARGS)
Definition: tsvector.c:451
static int compareentry(const void *va, const void *vb, void *arg)
Definition: tsvector.c:85
Datum tsvectorin(PG_FUNCTION_ARGS)
Definition: tsvector.c:180
int compareWordEntryPos(const void *a, const void *b)
Definition: tsvector.c:35
Datum tsvectorsend(PG_FUNCTION_ARGS)
Definition: tsvector.c:412
static int WordEntryCMP(WordEntry *a, WordEntry *b, char *buf)
Definition: tsvector.c:173
static int uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
Definition: tsvector.c:101
int32 tsCompareString(char *a, int lena, char *b, int lenb, bool prefix)
Definition: tsvector_op.c:1154
void close_tsvector_parser(TSVectorParseState state)
bool gettoken_tsvector(TSVectorParseState state, char **strval, int *lenval, WordEntryPos **pos_ptr, int *poslen, char **endptr)
TSVectorParseState init_tsvector_parser(char *input, int flags, Node *escontext)
#define SET_VARSIZE(PTR, len)
Definition: varatt.h:305
static StringInfoData tmpbuf
Definition: walsender.c:160