PostgreSQL Source Code  git master
tsvector.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * tsvector.c
4  * I/O functions for tsvector
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  * src/backend/utils/adt/tsvector.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include "libpq/pqformat.h"
18 #include "tsearch/ts_locale.h"
19 #include "tsearch/ts_utils.h"
20 #include "utils/builtins.h"
21 #include "utils/memutils.h"
22 
23 typedef struct
24 {
25  WordEntry entry; /* must be first! */
27  int poslen; /* number of elements in pos */
28 } WordEntryIN;
29 
30 
31 /* Compare two WordEntryPos values for qsort */
32 int
33 compareWordEntryPos(const void *a, const void *b)
34 {
35  int apos = WEP_GETPOS(*(const WordEntryPos *) a);
36  int bpos = WEP_GETPOS(*(const WordEntryPos *) b);
37 
38  if (apos == bpos)
39  return 0;
40  return (apos > bpos) ? 1 : -1;
41 }
42 
43 /*
44  * Removes duplicate pos entries. If there's two entries with same pos
45  * but different weight, the higher weight is retained.
46  *
47  * Returns new length.
48  */
49 static int
51 {
52  WordEntryPos *ptr,
53  *res;
54 
55  if (l <= 1)
56  return l;
57 
58  qsort((void *) a, l, sizeof(WordEntryPos), compareWordEntryPos);
59 
60  res = a;
61  ptr = a + 1;
62  while (ptr - a < l)
63  {
64  if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
65  {
66  res++;
67  *res = *ptr;
68  if (res - a >= MAXNUMPOS - 1 ||
69  WEP_GETPOS(*res) == MAXENTRYPOS - 1)
70  break;
71  }
72  else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
73  WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr));
74  ptr++;
75  }
76 
77  return res + 1 - a;
78 }
79 
80 /* Compare two WordEntryIN values for qsort */
81 static int
82 compareentry(const void *va, const void *vb, void *arg)
83 {
84  const WordEntryIN *a = (const WordEntryIN *) va;
85  const WordEntryIN *b = (const WordEntryIN *) vb;
86  char *BufferStr = (char *) arg;
87 
88  return tsCompareString(&BufferStr[a->entry.pos], a->entry.len,
89  &BufferStr[b->entry.pos], b->entry.len,
90  false);
91 }
92 
93 /*
94  * Sort an array of WordEntryIN, remove duplicates.
95  * *outbuflen receives the amount of space needed for strings and positions.
96  */
97 static int
98 uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
99 {
100  int buflen;
101  WordEntryIN *ptr,
102  *res;
103 
104  Assert(l >= 1);
105 
106  if (l > 1)
107  qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry,
108  (void *) buf);
109 
110  buflen = 0;
111  res = a;
112  ptr = a + 1;
113  while (ptr - a < l)
114  {
115  if (!(ptr->entry.len == res->entry.len &&
116  strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
117  res->entry.len) == 0))
118  {
119  /* done accumulating data into *res, count space needed */
120  buflen += res->entry.len;
121  if (res->entry.haspos)
122  {
123  res->poslen = uniquePos(res->pos, res->poslen);
124  buflen = SHORTALIGN(buflen);
125  buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
126  }
127  res++;
128  if (res != ptr)
129  memcpy(res, ptr, sizeof(WordEntryIN));
130  }
131  else if (ptr->entry.haspos)
132  {
133  if (res->entry.haspos)
134  {
135  /* append ptr's positions to res's positions */
136  int newlen = ptr->poslen + res->poslen;
137 
138  res->pos = (WordEntryPos *)
139  repalloc(res->pos, newlen * sizeof(WordEntryPos));
140  memcpy(&res->pos[res->poslen], ptr->pos,
141  ptr->poslen * sizeof(WordEntryPos));
142  res->poslen = newlen;
143  pfree(ptr->pos);
144  }
145  else
146  {
147  /* just give ptr's positions to pos */
148  res->entry.haspos = 1;
149  res->pos = ptr->pos;
150  res->poslen = ptr->poslen;
151  }
152  }
153  ptr++;
154  }
155 
156  /* count space needed for last item */
157  buflen += res->entry.len;
158  if (res->entry.haspos)
159  {
160  res->poslen = uniquePos(res->pos, res->poslen);
161  buflen = SHORTALIGN(buflen);
162  buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
163  }
164 
165  *outbuflen = buflen;
166  return res + 1 - a;
167 }
168 
169 static int
171 {
172  return compareentry(a, b, buf);
173 }
174 
175 
176 Datum
178 {
179  char *buf = PG_GETARG_CSTRING(0);
181  WordEntryIN *arr;
182  int totallen;
183  int arrlen; /* allocated size of arr */
184  WordEntry *inarr;
185  int len = 0;
186  TSVector in;
187  int i;
188  char *token;
189  int toklen;
190  WordEntryPos *pos;
191  int poslen;
192  char *strbuf;
193  int stroff;
194 
195  /*
196  * Tokens are appended to tmpbuf, cur is a pointer to the end of used
197  * space in tmpbuf.
198  */
199  char *tmpbuf;
200  char *cur;
201  int buflen = 256; /* allocated size of tmpbuf */
202 
203  state = init_tsvector_parser(buf, false, false);
204 
205  arrlen = 64;
206  arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
207  cur = tmpbuf = (char *) palloc(buflen);
208 
209  while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
210  {
211  if (toklen >= MAXSTRLEN)
212  ereport(ERROR,
213  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
214  errmsg("word is too long (%ld bytes, max %ld bytes)",
215  (long) toklen,
216  (long) (MAXSTRLEN - 1))));
217 
218  if (cur - tmpbuf > MAXSTRPOS)
219  ereport(ERROR,
220  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
221  errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)",
222  (long) (cur - tmpbuf), (long) MAXSTRPOS)));
223 
224  /*
225  * Enlarge buffers if needed
226  */
227  if (len >= arrlen)
228  {
229  arrlen *= 2;
230  arr = (WordEntryIN *)
231  repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);
232  }
233  while ((cur - tmpbuf) + toklen >= buflen)
234  {
235  int dist = cur - tmpbuf;
236 
237  buflen *= 2;
238  tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
239  cur = tmpbuf + dist;
240  }
241  arr[len].entry.len = toklen;
242  arr[len].entry.pos = cur - tmpbuf;
243  memcpy((void *) cur, (void *) token, toklen);
244  cur += toklen;
245 
246  if (poslen != 0)
247  {
248  arr[len].entry.haspos = 1;
249  arr[len].pos = pos;
250  arr[len].poslen = poslen;
251  }
252  else
253  {
254  arr[len].entry.haspos = 0;
255  arr[len].pos = NULL;
256  arr[len].poslen = 0;
257  }
258  len++;
259  }
260 
261  close_tsvector_parser(state);
262 
263  if (len > 0)
264  len = uniqueentry(arr, len, tmpbuf, &buflen);
265  else
266  buflen = 0;
267 
268  if (buflen > MAXSTRPOS)
269  ereport(ERROR,
270  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
271  errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS)));
272 
273  totallen = CALCDATASIZE(len, buflen);
274  in = (TSVector) palloc0(totallen);
275  SET_VARSIZE(in, totallen);
276  in->size = len;
277  inarr = ARRPTR(in);
278  strbuf = STRPTR(in);
279  stroff = 0;
280  for (i = 0; i < len; i++)
281  {
282  memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
283  arr[i].entry.pos = stroff;
284  stroff += arr[i].entry.len;
285  if (arr[i].entry.haspos)
286  {
287  if (arr[i].poslen > 0xFFFF)
288  elog(ERROR, "positions array too long");
289 
290  /* Copy number of positions */
291  stroff = SHORTALIGN(stroff);
292  *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
293  stroff += sizeof(uint16);
294 
295  /* Copy positions */
296  memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
297  stroff += arr[i].poslen * sizeof(WordEntryPos);
298 
299  pfree(arr[i].pos);
300  }
301  inarr[i] = arr[i].entry;
302  }
303 
304  Assert((strbuf + stroff - (char *) in) == totallen);
305 
306  PG_RETURN_TSVECTOR(in);
307 }
308 
309 Datum
311 {
312  TSVector out = PG_GETARG_TSVECTOR(0);
313  char *outbuf;
314  int32 i,
315  lenbuf = 0,
316  pp;
317  WordEntry *ptr = ARRPTR(out);
318  char *curbegin,
319  *curin,
320  *curout;
321 
322  lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
323  for (i = 0; i < out->size; i++)
324  {
325  lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
326  if (ptr[i].haspos)
327  lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
328  }
329 
330  curout = outbuf = (char *) palloc(lenbuf);
331  for (i = 0; i < out->size; i++)
332  {
333  curbegin = curin = STRPTR(out) + ptr->pos;
334  if (i != 0)
335  *curout++ = ' ';
336  *curout++ = '\'';
337  while (curin - curbegin < ptr->len)
338  {
339  int len = pg_mblen(curin);
340 
341  if (t_iseq(curin, '\''))
342  *curout++ = '\'';
343  else if (t_iseq(curin, '\\'))
344  *curout++ = '\\';
345 
346  while (len--)
347  *curout++ = *curin++;
348  }
349 
350  *curout++ = '\'';
351  if ((pp = POSDATALEN(out, ptr)) != 0)
352  {
353  WordEntryPos *wptr;
354 
355  *curout++ = ':';
356  wptr = POSDATAPTR(out, ptr);
357  while (pp)
358  {
359  curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
360  switch (WEP_GETWEIGHT(*wptr))
361  {
362  case 3:
363  *curout++ = 'A';
364  break;
365  case 2:
366  *curout++ = 'B';
367  break;
368  case 1:
369  *curout++ = 'C';
370  break;
371  case 0:
372  default:
373  break;
374  }
375 
376  if (pp > 1)
377  *curout++ = ',';
378  pp--;
379  wptr++;
380  }
381  }
382  ptr++;
383  }
384 
385  *curout = '\0';
386  PG_FREE_IF_COPY(out, 0);
387  PG_RETURN_CSTRING(outbuf);
388 }
389 
390 /*
391  * Binary Input / Output functions. The binary format is as follows:
392  *
393  * uint32 number of lexemes
394  *
395  * for each lexeme:
396  * lexeme text in client encoding, null-terminated
397  * uint16 number of positions
398  * for each position:
399  * uint16 WordEntryPos
400  */
401 
402 Datum
404 {
405  TSVector vec = PG_GETARG_TSVECTOR(0);
407  int i,
408  j;
409  WordEntry *weptr = ARRPTR(vec);
410 
411  pq_begintypsend(&buf);
412 
413  pq_sendint32(&buf, vec->size);
414  for (i = 0; i < vec->size; i++)
415  {
416  uint16 npos;
417 
418  /*
419  * the strings in the TSVector array are not null-terminated, so we
420  * have to send the null-terminator separately
421  */
422  pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
423  pq_sendbyte(&buf, '\0');
424 
425  npos = POSDATALEN(vec, weptr);
426  pq_sendint16(&buf, npos);
427 
428  if (npos > 0)
429  {
430  WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
431 
432  for (j = 0; j < npos; j++)
433  pq_sendint16(&buf, wepptr[j]);
434  }
435  weptr++;
436  }
437 
439 }
440 
441 Datum
443 {
445  TSVector vec;
446  int i;
447  int32 nentries;
448  int datalen; /* number of bytes used in the variable size
449  * area after fixed size TSVector header and
450  * WordEntries */
451  Size hdrlen;
452  Size len; /* allocated size of vec */
453  bool needSort = false;
454 
455  nentries = pq_getmsgint(buf, sizeof(int32));
456  if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
457  elog(ERROR, "invalid size of tsvector");
458 
459  hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
460 
461  len = hdrlen * 2; /* times two to make room for lexemes */
462  vec = (TSVector) palloc0(len);
463  vec->size = nentries;
464 
465  datalen = 0;
466  for (i = 0; i < nentries; i++)
467  {
468  const char *lexeme;
469  uint16 npos;
470  size_t lex_len;
471 
472  lexeme = pq_getmsgstring(buf);
473  npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
474 
475  /* sanity checks */
476 
477  lex_len = strlen(lexeme);
478  if (lex_len > MAXSTRLEN)
479  elog(ERROR, "invalid tsvector: lexeme too long");
480 
481  if (datalen > MAXSTRPOS)
482  elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded");
483 
484  if (npos > MAXNUMPOS)
485  elog(ERROR, "unexpected number of tsvector positions");
486 
487  /*
488  * Looks valid. Fill the WordEntry struct, and copy lexeme.
489  *
490  * But make sure the buffer is large enough first.
491  */
492  while (hdrlen + SHORTALIGN(datalen + lex_len) +
493  (npos + 1) * sizeof(WordEntryPos) >= len)
494  {
495  len *= 2;
496  vec = (TSVector) repalloc(vec, len);
497  }
498 
499  vec->entries[i].haspos = (npos > 0) ? 1 : 0;
500  vec->entries[i].len = lex_len;
501  vec->entries[i].pos = datalen;
502 
503  memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
504 
505  datalen += lex_len;
506 
507  if (i > 0 && WordEntryCMP(&vec->entries[i],
508  &vec->entries[i - 1],
509  STRPTR(vec)) <= 0)
510  needSort = true;
511 
512  /* Receive positions */
513  if (npos > 0)
514  {
515  uint16 j;
516  WordEntryPos *wepptr;
517 
518  /*
519  * Pad to 2-byte alignment if necessary. Though we used palloc0
520  * for the initial allocation, subsequent repalloc'd memory areas
521  * are not initialized to zero.
522  */
523  if (datalen != SHORTALIGN(datalen))
524  {
525  *(STRPTR(vec) + datalen) = '\0';
526  datalen = SHORTALIGN(datalen);
527  }
528 
529  memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
530 
531  wepptr = POSDATAPTR(vec, &vec->entries[i]);
532  for (j = 0; j < npos; j++)
533  {
534  wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
535  if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
536  elog(ERROR, "position information is misordered");
537  }
538 
539  datalen += (npos + 1) * sizeof(WordEntry);
540  }
541  }
542 
543  SET_VARSIZE(vec, hdrlen + datalen);
544 
545  if (needSort)
546  qsort_arg((void *) ARRPTR(vec), vec->size, sizeof(WordEntry),
547  compareentry, (void *) STRPTR(vec));
548 
549  PG_RETURN_TSVECTOR(vec);
550 }
uint16 WordEntryPos
Definition: ts_type.h:63
#define POSDATALEN(x, e)
Definition: ts_type.h:110
const char * pq_getmsgstring(StringInfo msg)
Definition: pqformat.c:581
static void pq_sendint32(StringInfo buf, int32 i)
Definition: pqformat.h:148
bool gettoken_tsvector(TSVectorParseState state, char **strval, int *lenval, WordEntryPos **pos_ptr, int *poslen, char **endptr)
void pq_begintypsend(StringInfo buf)
Definition: pqformat.c:328
StringInfoData * StringInfo
Definition: stringinfo.h:43
void close_tsvector_parser(TSVectorParseState state)
uint32 len
Definition: ts_type.h:44
#define PG_RETURN_TSVECTOR(x)
Definition: ts_type.h:122
struct cursor * cur
Definition: ecpg.c:28
int errcode(int sqlerrcode)
Definition: elog.c:575
void pq_sendtext(StringInfo buf, const char *str, int slen)
Definition: pqformat.c:174
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:241
#define PG_RETURN_BYTEA_P(x)
Definition: fmgr.h:330
static void pq_sendbyte(StringInfo buf, int8 byt)
Definition: pqformat.h:164
#define MAXSTRPOS
Definition: ts_type.h:50
int compareWordEntryPos(const void *a, const void *b)
Definition: tsvector.c:33
bytea * pq_endtypsend(StringInfo buf)
Definition: pqformat.c:348
WordEntry entry
Definition: tsvector.c:25
signed int int32
Definition: c.h:284
#define POSDATAPTR(x, e)
Definition: ts_type.h:111
unsigned short uint16
Definition: c.h:295
void pfree(void *pointer)
Definition: mcxt.c:949
#define ERROR
Definition: elog.h:43
#define MAXNUMPOS
Definition: ts_type.h:86
#define WEP_GETPOS(x)
Definition: ts_type.h:80
#define WEP_SETWEIGHT(x, v)
Definition: ts_type.h:82
static char * buf
Definition: pg_test_fsync.c:67
int pg_database_encoding_max_length(void)
Definition: wchar.c:1833
#define t_iseq(x, c)
Definition: ts_locale.h:45
TSVectorParseState init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery)
int32 size
Definition: ts_type.h:93
WordEntry entries[FLEXIBLE_ARRAY_MEMBER]
Definition: ts_type.h:94
static int compareentry(const void *va, const void *vb, void *arg)
Definition: tsvector.c:82
uint32 haspos
Definition: ts_type.h:44
#define ereport(elevel, rest)
Definition: elog.h:122
int poslen
Definition: tsvector.c:27
static int WordEntryCMP(WordEntry *a, WordEntry *b, char *buf)
Definition: tsvector.c:170
#define MaxAllocSize
Definition: memutils.h:40
void qsort_arg(void *base, size_t nel, size_t elsize, qsort_arg_comparator cmp, void *arg)
Definition: qsort_arg.c:113
static int uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
Definition: tsvector.c:98
#define CALCDATASIZE(x, lenstr)
Definition: hstore.h:72
static int uniquePos(WordEntryPos *a, int l)
Definition: tsvector.c:50
void * palloc0(Size size)
Definition: mcxt.c:877
uintptr_t Datum
Definition: postgres.h:372
TSVectorData * TSVector
Definition: ts_type.h:98
Datum tsvectorsend(PG_FUNCTION_ARGS)
Definition: tsvector.c:403
Datum tsvectorout(PG_FUNCTION_ARGS)
Definition: tsvector.c:310
#define Assert(condition)
Definition: c.h:670
Definition: regguts.h:298
#define PG_RETURN_CSTRING(x)
Definition: fmgr.h:322
size_t Size
Definition: c.h:404
#define PG_FREE_IF_COPY(ptr, n)
Definition: fmgr.h:225
int32 tsCompareString(char *a, int lena, char *b, int lenb, bool prefix)
Definition: tsvector_op.c:1160
int pg_mblen(const char *mbstr)
Definition: mbutils.c:760
uint32 pos
Definition: ts_type.h:44
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:962
static StringInfoData tmpbuf
Definition: walsender.c:162
static void pq_sendint16(StringInfo buf, int16 i)
Definition: pqformat.h:140
void * palloc(Size size)
Definition: mcxt.c:848
int errmsg(const char *fmt,...)
Definition: elog.c:797
#define STRPTR(x)
Definition: hstore.h:76
int i
WordEntryPos * pos
Definition: tsvector.c:26
void * arg
#define PG_GETARG_CSTRING(n)
Definition: fmgr.h:242
#define PG_GETARG_TSVECTOR(n)
Definition: ts_type.h:120
#define PG_FUNCTION_ARGS
Definition: fmgr.h:158
unsigned int pq_getmsgint(StringInfo msg, int b)
Definition: pqformat.c:417
#define SET_VARSIZE(PTR, len)
Definition: postgres.h:328
#define ARRPTR(x)
Definition: cube.c:26
#define elog
Definition: elog.h:219
#define WEP_GETWEIGHT(x)
Definition: ts_type.h:79
#define qsort(a, b, c, d)
Definition: port.h:408
Datum tsvectorrecv(PG_FUNCTION_ARGS)
Definition: tsvector.c:442
#define SHORTALIGN(LEN)
Definition: c.h:619
#define DATAHDRSIZE
Definition: ts_type.h:100
Datum tsvectorin(PG_FUNCTION_ARGS)
Definition: tsvector.c:177
#define MAXENTRYPOS
Definition: ts_type.h:85
#define MAXSTRLEN
Definition: ts_type.h:49