PostgreSQL Source Code  git master
tsvector.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * tsvector.c
4  * I/O functions for tsvector
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  * src/backend/utils/adt/tsvector.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include "common/int.h"
18 #include "libpq/pqformat.h"
19 #include "nodes/miscnodes.h"
20 #include "tsearch/ts_locale.h"
21 #include "tsearch/ts_utils.h"
22 #include "utils/fmgrprotos.h"
23 #include "utils/memutils.h"
24 #include "varatt.h"
25 
26 typedef struct
27 {
28  WordEntry entry; /* must be first! */
30  int poslen; /* number of elements in pos */
31 } WordEntryIN;
32 
33 
34 /* Compare two WordEntryPos values for qsort */
35 int
36 compareWordEntryPos(const void *a, const void *b)
37 {
38  int apos = WEP_GETPOS(*(const WordEntryPos *) a);
39  int bpos = WEP_GETPOS(*(const WordEntryPos *) b);
40 
41  return pg_cmp_s32(apos, bpos);
42 }
43 
44 /*
45  * Removes duplicate pos entries. If there's two entries with same pos but
46  * different weight, the higher weight is retained, so we can't use
47  * qunique here.
48  *
49  * Returns new length.
50  */
51 static int
53 {
54  WordEntryPos *ptr,
55  *res;
56 
57  if (l <= 1)
58  return l;
59 
61 
62  res = a;
63  ptr = a + 1;
64  while (ptr - a < l)
65  {
66  if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
67  {
68  res++;
69  *res = *ptr;
70  if (res - a >= MAXNUMPOS - 1 ||
71  WEP_GETPOS(*res) == MAXENTRYPOS - 1)
72  break;
73  }
74  else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
76  ptr++;
77  }
78 
79  return res + 1 - a;
80 }
81 
82 /* Compare two WordEntryIN values for qsort */
83 static int
84 compareentry(const void *va, const void *vb, void *arg)
85 {
86  const WordEntryIN *a = (const WordEntryIN *) va;
87  const WordEntryIN *b = (const WordEntryIN *) vb;
88  char *BufferStr = (char *) arg;
89 
90  return tsCompareString(&BufferStr[a->entry.pos], a->entry.len,
91  &BufferStr[b->entry.pos], b->entry.len,
92  false);
93 }
94 
95 /*
96  * Sort an array of WordEntryIN, remove duplicates.
97  * *outbuflen receives the amount of space needed for strings and positions.
98  */
99 static int
100 uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
101 {
102  int buflen;
103  WordEntryIN *ptr,
104  *res;
105 
106  Assert(l >= 1);
107 
108  if (l > 1)
109  qsort_arg(a, l, sizeof(WordEntryIN), compareentry, buf);
110 
111  buflen = 0;
112  res = a;
113  ptr = a + 1;
114  while (ptr - a < l)
115  {
116  if (!(ptr->entry.len == res->entry.len &&
117  strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
118  res->entry.len) == 0))
119  {
120  /* done accumulating data into *res, count space needed */
121  buflen += res->entry.len;
122  if (res->entry.haspos)
123  {
124  res->poslen = uniquePos(res->pos, res->poslen);
125  buflen = SHORTALIGN(buflen);
126  buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
127  }
128  res++;
129  if (res != ptr)
130  memcpy(res, ptr, sizeof(WordEntryIN));
131  }
132  else if (ptr->entry.haspos)
133  {
134  if (res->entry.haspos)
135  {
136  /* append ptr's positions to res's positions */
137  int newlen = ptr->poslen + res->poslen;
138 
139  res->pos = (WordEntryPos *)
140  repalloc(res->pos, newlen * sizeof(WordEntryPos));
141  memcpy(&res->pos[res->poslen], ptr->pos,
142  ptr->poslen * sizeof(WordEntryPos));
143  res->poslen = newlen;
144  pfree(ptr->pos);
145  }
146  else
147  {
148  /* just give ptr's positions to pos */
149  res->entry.haspos = 1;
150  res->pos = ptr->pos;
151  res->poslen = ptr->poslen;
152  }
153  }
154  ptr++;
155  }
156 
157  /* count space needed for last item */
158  buflen += res->entry.len;
159  if (res->entry.haspos)
160  {
161  res->poslen = uniquePos(res->pos, res->poslen);
162  buflen = SHORTALIGN(buflen);
163  buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
164  }
165 
166  *outbuflen = buflen;
167  return res + 1 - a;
168 }
169 
170 static int
172 {
173  return compareentry(a, b, buf);
174 }
175 
176 
177 Datum
179 {
180  char *buf = PG_GETARG_CSTRING(0);
181  Node *escontext = fcinfo->context;
183  WordEntryIN *arr;
184  int totallen;
185  int arrlen; /* allocated size of arr */
186  WordEntry *inarr;
187  int len = 0;
188  TSVector in;
189  int i;
190  char *token;
191  int toklen;
192  WordEntryPos *pos;
193  int poslen;
194  char *strbuf;
195  int stroff;
196 
197  /*
198  * Tokens are appended to tmpbuf, cur is a pointer to the end of used
199  * space in tmpbuf.
200  */
201  char *tmpbuf;
202  char *cur;
203  int buflen = 256; /* allocated size of tmpbuf */
204 
205  state = init_tsvector_parser(buf, 0, escontext);
206 
207  arrlen = 64;
208  arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
209  cur = tmpbuf = (char *) palloc(buflen);
210 
211  while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
212  {
213  if (toklen >= MAXSTRLEN)
214  ereturn(escontext, (Datum) 0,
215  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
216  errmsg("word is too long (%ld bytes, max %ld bytes)",
217  (long) toklen,
218  (long) (MAXSTRLEN - 1))));
219 
220  if (cur - tmpbuf > MAXSTRPOS)
221  ereturn(escontext, (Datum) 0,
222  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
223  errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)",
224  (long) (cur - tmpbuf), (long) MAXSTRPOS)));
225 
226  /*
227  * Enlarge buffers if needed
228  */
229  if (len >= arrlen)
230  {
231  arrlen *= 2;
232  arr = (WordEntryIN *)
233  repalloc(arr, sizeof(WordEntryIN) * arrlen);
234  }
235  while ((cur - tmpbuf) + toklen >= buflen)
236  {
237  int dist = cur - tmpbuf;
238 
239  buflen *= 2;
240  tmpbuf = (char *) repalloc(tmpbuf, buflen);
241  cur = tmpbuf + dist;
242  }
243  arr[len].entry.len = toklen;
244  arr[len].entry.pos = cur - tmpbuf;
245  memcpy(cur, token, toklen);
246  cur += toklen;
247 
248  if (poslen != 0)
249  {
250  arr[len].entry.haspos = 1;
251  arr[len].pos = pos;
252  arr[len].poslen = poslen;
253  }
254  else
255  {
256  arr[len].entry.haspos = 0;
257  arr[len].pos = NULL;
258  arr[len].poslen = 0;
259  }
260  len++;
261  }
262 
264 
265  /* Did gettoken_tsvector fail? */
266  if (SOFT_ERROR_OCCURRED(escontext))
267  PG_RETURN_NULL();
268 
269  if (len > 0)
270  len = uniqueentry(arr, len, tmpbuf, &buflen);
271  else
272  buflen = 0;
273 
274  if (buflen > MAXSTRPOS)
275  ereturn(escontext, (Datum) 0,
276  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
277  errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS)));
278 
279  totallen = CALCDATASIZE(len, buflen);
280  in = (TSVector) palloc0(totallen);
281  SET_VARSIZE(in, totallen);
282  in->size = len;
283  inarr = ARRPTR(in);
284  strbuf = STRPTR(in);
285  stroff = 0;
286  for (i = 0; i < len; i++)
287  {
288  memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
289  arr[i].entry.pos = stroff;
290  stroff += arr[i].entry.len;
291  if (arr[i].entry.haspos)
292  {
293  /* This should be unreachable because of MAXNUMPOS restrictions */
294  if (arr[i].poslen > 0xFFFF)
295  elog(ERROR, "positions array too long");
296 
297  /* Copy number of positions */
298  stroff = SHORTALIGN(stroff);
299  *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
300  stroff += sizeof(uint16);
301 
302  /* Copy positions */
303  memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
304  stroff += arr[i].poslen * sizeof(WordEntryPos);
305 
306  pfree(arr[i].pos);
307  }
308  inarr[i] = arr[i].entry;
309  }
310 
311  Assert((strbuf + stroff - (char *) in) == totallen);
312 
313  PG_RETURN_TSVECTOR(in);
314 }
315 
316 Datum
318 {
319  TSVector out = PG_GETARG_TSVECTOR(0);
320  char *outbuf;
321  int32 i,
322  lenbuf = 0,
323  pp;
324  WordEntry *ptr = ARRPTR(out);
325  char *curbegin,
326  *curin,
327  *curout;
328 
329  lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
330  for (i = 0; i < out->size; i++)
331  {
332  lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
333  if (ptr[i].haspos)
334  lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
335  }
336 
337  curout = outbuf = (char *) palloc(lenbuf);
338  for (i = 0; i < out->size; i++)
339  {
340  curbegin = curin = STRPTR(out) + ptr->pos;
341  if (i != 0)
342  *curout++ = ' ';
343  *curout++ = '\'';
344  while (curin - curbegin < ptr->len)
345  {
346  int len = pg_mblen(curin);
347 
348  if (t_iseq(curin, '\''))
349  *curout++ = '\'';
350  else if (t_iseq(curin, '\\'))
351  *curout++ = '\\';
352 
353  while (len--)
354  *curout++ = *curin++;
355  }
356 
357  *curout++ = '\'';
358  if ((pp = POSDATALEN(out, ptr)) != 0)
359  {
360  WordEntryPos *wptr;
361 
362  *curout++ = ':';
363  wptr = POSDATAPTR(out, ptr);
364  while (pp)
365  {
366  curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
367  switch (WEP_GETWEIGHT(*wptr))
368  {
369  case 3:
370  *curout++ = 'A';
371  break;
372  case 2:
373  *curout++ = 'B';
374  break;
375  case 1:
376  *curout++ = 'C';
377  break;
378  case 0:
379  default:
380  break;
381  }
382 
383  if (pp > 1)
384  *curout++ = ',';
385  pp--;
386  wptr++;
387  }
388  }
389  ptr++;
390  }
391 
392  *curout = '\0';
393  PG_FREE_IF_COPY(out, 0);
394  PG_RETURN_CSTRING(outbuf);
395 }
396 
397 /*
398  * Binary Input / Output functions. The binary format is as follows:
399  *
400  * uint32 number of lexemes
401  *
402  * for each lexeme:
403  * lexeme text in client encoding, null-terminated
404  * uint16 number of positions
405  * for each position:
406  * uint16 WordEntryPos
407  */
408 
409 Datum
411 {
412  TSVector vec = PG_GETARG_TSVECTOR(0);
414  int i,
415  j;
416  WordEntry *weptr = ARRPTR(vec);
417 
419 
420  pq_sendint32(&buf, vec->size);
421  for (i = 0; i < vec->size; i++)
422  {
423  uint16 npos;
424 
425  /*
426  * the strings in the TSVector array are not null-terminated, so we
427  * have to send the null-terminator separately
428  */
429  pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
430  pq_sendbyte(&buf, '\0');
431 
432  npos = POSDATALEN(vec, weptr);
433  pq_sendint16(&buf, npos);
434 
435  if (npos > 0)
436  {
437  WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
438 
439  for (j = 0; j < npos; j++)
440  pq_sendint16(&buf, wepptr[j]);
441  }
442  weptr++;
443  }
444 
446 }
447 
448 Datum
450 {
452  TSVector vec;
453  int i;
454  int32 nentries;
455  int datalen; /* number of bytes used in the variable size
456  * area after fixed size TSVector header and
457  * WordEntries */
458  Size hdrlen;
459  Size len; /* allocated size of vec */
460  bool needSort = false;
461 
462  nentries = pq_getmsgint(buf, sizeof(int32));
463  if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
464  elog(ERROR, "invalid size of tsvector");
465 
466  hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
467 
468  len = hdrlen * 2; /* times two to make room for lexemes */
469  vec = (TSVector) palloc0(len);
470  vec->size = nentries;
471 
472  datalen = 0;
473  for (i = 0; i < nentries; i++)
474  {
475  const char *lexeme;
476  uint16 npos;
477  size_t lex_len;
478 
479  lexeme = pq_getmsgstring(buf);
480  npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
481 
482  /* sanity checks */
483 
484  lex_len = strlen(lexeme);
485  if (lex_len > MAXSTRLEN)
486  elog(ERROR, "invalid tsvector: lexeme too long");
487 
488  if (datalen > MAXSTRPOS)
489  elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded");
490 
491  if (npos > MAXNUMPOS)
492  elog(ERROR, "unexpected number of tsvector positions");
493 
494  /*
495  * Looks valid. Fill the WordEntry struct, and copy lexeme.
496  *
497  * But make sure the buffer is large enough first.
498  */
499  while (hdrlen + SHORTALIGN(datalen + lex_len) +
500  sizeof(uint16) + npos * sizeof(WordEntryPos) >= len)
501  {
502  len *= 2;
503  vec = (TSVector) repalloc(vec, len);
504  }
505 
506  vec->entries[i].haspos = (npos > 0) ? 1 : 0;
507  vec->entries[i].len = lex_len;
508  vec->entries[i].pos = datalen;
509 
510  memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
511 
512  datalen += lex_len;
513 
514  if (i > 0 && WordEntryCMP(&vec->entries[i],
515  &vec->entries[i - 1],
516  STRPTR(vec)) <= 0)
517  needSort = true;
518 
519  /* Receive positions */
520  if (npos > 0)
521  {
522  uint16 j;
523  WordEntryPos *wepptr;
524 
525  /*
526  * Pad to 2-byte alignment if necessary. Though we used palloc0
527  * for the initial allocation, subsequent repalloc'd memory areas
528  * are not initialized to zero.
529  */
530  if (datalen != SHORTALIGN(datalen))
531  {
532  *(STRPTR(vec) + datalen) = '\0';
533  datalen = SHORTALIGN(datalen);
534  }
535 
536  memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
537 
538  wepptr = POSDATAPTR(vec, &vec->entries[i]);
539  for (j = 0; j < npos; j++)
540  {
541  wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
542  if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
543  elog(ERROR, "position information is misordered");
544  }
545 
546  datalen += sizeof(uint16) + npos * sizeof(WordEntryPos);
547  }
548  }
549 
550  SET_VARSIZE(vec, hdrlen + datalen);
551 
552  if (needSort)
553  qsort_arg(ARRPTR(vec), vec->size, sizeof(WordEntry),
554  compareentry, STRPTR(vec));
555 
556  PG_RETURN_TSVECTOR(vec);
557 }
unsigned short uint16
Definition: c.h:508
signed int int32
Definition: c.h:497
#define Assert(condition)
Definition: c.h:861
#define SHORTALIGN(LEN)
Definition: c.h:810
size_t Size
Definition: c.h:608
#define ARRPTR(x)
Definition: cube.c:25
struct cursor * cur
Definition: ecpg.c:28
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define ereturn(context, dummy_value,...)
Definition: elog.h:277
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define PG_FREE_IF_COPY(ptr, n)
Definition: fmgr.h:260
#define PG_RETURN_BYTEA_P(x)
Definition: fmgr.h:371
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276
#define PG_RETURN_CSTRING(x)
Definition: fmgr.h:362
#define PG_GETARG_CSTRING(n)
Definition: fmgr.h:277
#define PG_RETURN_NULL()
Definition: fmgr.h:345
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
#define CALCDATASIZE(x, lenstr)
Definition: hstore.h:72
#define STRPTR(x)
Definition: hstore.h:76
#define token
Definition: indent_globs.h:126
static int pg_cmp_s32(int32 a, int32 b)
Definition: int.h:598
int b
Definition: isn.c:70
int a
Definition: isn.c:69
int j
Definition: isn.c:74
int i
Definition: isn.c:73
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1546
int pg_mblen(const char *mbstr)
Definition: mbutils.c:1023
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc0(Size size)
Definition: mcxt.c:1347
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1541
void * palloc(Size size)
Definition: mcxt.c:1317
#define MaxAllocSize
Definition: memutils.h:40
#define SOFT_ERROR_OCCURRED(escontext)
Definition: miscnodes.h:52
void * arg
const void size_t len
static char * buf
Definition: pg_test_fsync.c:73
#define sprintf
Definition: port.h:240
void qsort_arg(void *base, size_t nel, size_t elsize, qsort_arg_comparator cmp, void *arg)
#define qsort(a, b, c, d)
Definition: port.h:447
uintptr_t Datum
Definition: postgres.h:64
unsigned int pq_getmsgint(StringInfo msg, int b)
Definition: pqformat.c:415
void pq_sendtext(StringInfo buf, const char *str, int slen)
Definition: pqformat.c:172
const char * pq_getmsgstring(StringInfo msg)
Definition: pqformat.c:579
void pq_begintypsend(StringInfo buf)
Definition: pqformat.c:326
bytea * pq_endtypsend(StringInfo buf)
Definition: pqformat.c:346
static void pq_sendint32(StringInfo buf, uint32 i)
Definition: pqformat.h:144
static void pq_sendbyte(StringInfo buf, uint8 byt)
Definition: pqformat.h:160
static void pq_sendint16(StringInfo buf, uint16 i)
Definition: pqformat.h:136
StringInfoData * StringInfo
Definition: stringinfo.h:54
Definition: nodes.h:129
int32 size
Definition: ts_type.h:93
WordEntry entries[FLEXIBLE_ARRAY_MEMBER]
Definition: ts_type.h:94
int poslen
Definition: tsvector.c:30
WordEntryPos * pos
Definition: tsvector.c:29
WordEntry entry
Definition: tsvector.c:28
uint32 pos
Definition: ts_type.h:46
uint32 haspos
Definition: ts_type.h:44
uint32 len
Definition: ts_type.h:45
Definition: regguts.h:323
#define t_iseq(x, c)
Definition: ts_locale.h:38
#define PG_GETARG_TSVECTOR(n)
Definition: ts_type.h:135
#define WEP_GETPOS(x)
Definition: ts_type.h:80
#define PG_RETURN_TSVECTOR(x)
Definition: ts_type.h:137
#define MAXENTRYPOS
Definition: ts_type.h:85
#define POSDATALEN(x, e)
Definition: ts_type.h:110
uint16 WordEntryPos
Definition: ts_type.h:63
#define MAXNUMPOS
Definition: ts_type.h:86
TSVectorData * TSVector
Definition: ts_type.h:98
#define WEP_SETWEIGHT(x, v)
Definition: ts_type.h:82
#define DATAHDRSIZE
Definition: ts_type.h:100
#define MAXSTRLEN
Definition: ts_type.h:49
#define POSDATAPTR(x, e)
Definition: ts_type.h:111
#define WEP_GETWEIGHT(x)
Definition: ts_type.h:79
#define MAXSTRPOS
Definition: ts_type.h:50
Datum tsvectorout(PG_FUNCTION_ARGS)
Definition: tsvector.c:317
static int uniquePos(WordEntryPos *a, int l)
Definition: tsvector.c:52
Datum tsvectorrecv(PG_FUNCTION_ARGS)
Definition: tsvector.c:449
static int compareentry(const void *va, const void *vb, void *arg)
Definition: tsvector.c:84
Datum tsvectorin(PG_FUNCTION_ARGS)
Definition: tsvector.c:178
int compareWordEntryPos(const void *a, const void *b)
Definition: tsvector.c:36
Datum tsvectorsend(PG_FUNCTION_ARGS)
Definition: tsvector.c:410
static int WordEntryCMP(WordEntry *a, WordEntry *b, char *buf)
Definition: tsvector.c:171
static int uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
Definition: tsvector.c:100
int32 tsCompareString(char *a, int lena, char *b, int lenb, bool prefix)
Definition: tsvector_op.c:1152
void close_tsvector_parser(TSVectorParseState state)
bool gettoken_tsvector(TSVectorParseState state, char **strval, int *lenval, WordEntryPos **pos_ptr, int *poslen, char **endptr)
TSVectorParseState init_tsvector_parser(char *input, int flags, Node *escontext)
#define SET_VARSIZE(PTR, len)
Definition: varatt.h:305
static StringInfoData tmpbuf
Definition: walsender.c:170