PostgreSQL Source Code git master
tsvector.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * tsvector.c
4 * I/O functions for tsvector
5 *
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/utils/adt/tsvector.c
11 *
12 *-------------------------------------------------------------------------
13 */
14
15#include "postgres.h"
16
17#include "common/int.h"
18#include "libpq/pqformat.h"
19#include "nodes/miscnodes.h"
20#include "tsearch/ts_locale.h"
21#include "tsearch/ts_utils.h"
22#include "utils/fmgrprotos.h"
23#include "utils/memutils.h"
24#include "varatt.h"
25
26typedef struct
27{
28 WordEntry entry; /* must be first! */
30 int poslen; /* number of elements in pos */
32
33
34/* Compare two WordEntryPos values for qsort */
35int
36compareWordEntryPos(const void *a, const void *b)
37{
38 int apos = WEP_GETPOS(*(const WordEntryPos *) a);
39 int bpos = WEP_GETPOS(*(const WordEntryPos *) b);
40
41 return pg_cmp_s32(apos, bpos);
42}
43
44/*
45 * Removes duplicate pos entries. If there's two entries with same pos but
46 * different weight, the higher weight is retained, so we can't use
47 * qunique here.
48 *
49 * Returns new length.
50 */
51static int
53{
54 WordEntryPos *ptr,
55 *res;
56
57 if (l <= 1)
58 return l;
59
61
62 res = a;
63 ptr = a + 1;
64 while (ptr - a < l)
65 {
66 if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
67 {
68 res++;
69 *res = *ptr;
70 if (res - a >= MAXNUMPOS - 1 ||
71 WEP_GETPOS(*res) == MAXENTRYPOS - 1)
72 break;
73 }
74 else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
76 ptr++;
77 }
78
79 return res + 1 - a;
80}
81
82/* Compare two WordEntryIN values for qsort */
83static int
84compareentry(const void *va, const void *vb, void *arg)
85{
86 const WordEntryIN *a = (const WordEntryIN *) va;
87 const WordEntryIN *b = (const WordEntryIN *) vb;
88 char *BufferStr = (char *) arg;
89
90 return tsCompareString(&BufferStr[a->entry.pos], a->entry.len,
91 &BufferStr[b->entry.pos], b->entry.len,
92 false);
93}
94
95/*
96 * Sort an array of WordEntryIN, remove duplicates.
97 * *outbuflen receives the amount of space needed for strings and positions.
98 */
99static int
100uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
101{
102 int buflen;
103 WordEntryIN *ptr,
104 *res;
105
106 Assert(l >= 1);
107
108 if (l > 1)
109 qsort_arg(a, l, sizeof(WordEntryIN), compareentry, buf);
110
111 buflen = 0;
112 res = a;
113 ptr = a + 1;
114 while (ptr - a < l)
115 {
116 if (!(ptr->entry.len == res->entry.len &&
117 strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
118 res->entry.len) == 0))
119 {
120 /* done accumulating data into *res, count space needed */
121 buflen += res->entry.len;
122 if (res->entry.haspos)
123 {
124 res->poslen = uniquePos(res->pos, res->poslen);
125 buflen = SHORTALIGN(buflen);
126 buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
127 }
128 res++;
129 if (res != ptr)
130 memcpy(res, ptr, sizeof(WordEntryIN));
131 }
132 else if (ptr->entry.haspos)
133 {
134 if (res->entry.haspos)
135 {
136 /* append ptr's positions to res's positions */
137 int newlen = ptr->poslen + res->poslen;
138
139 res->pos = (WordEntryPos *)
140 repalloc(res->pos, newlen * sizeof(WordEntryPos));
141 memcpy(&res->pos[res->poslen], ptr->pos,
142 ptr->poslen * sizeof(WordEntryPos));
143 res->poslen = newlen;
144 pfree(ptr->pos);
145 }
146 else
147 {
148 /* just give ptr's positions to pos */
149 res->entry.haspos = 1;
150 res->pos = ptr->pos;
151 res->poslen = ptr->poslen;
152 }
153 }
154 ptr++;
155 }
156
157 /* count space needed for last item */
158 buflen += res->entry.len;
159 if (res->entry.haspos)
160 {
161 res->poslen = uniquePos(res->pos, res->poslen);
162 buflen = SHORTALIGN(buflen);
163 buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
164 }
165
166 *outbuflen = buflen;
167 return res + 1 - a;
168}
169
170static int
172{
173 return compareentry(a, b, buf);
174}
175
176
177Datum
179{
180 char *buf = PG_GETARG_CSTRING(0);
181 Node *escontext = fcinfo->context;
183 WordEntryIN *arr;
184 int totallen;
185 int arrlen; /* allocated size of arr */
186 WordEntry *inarr;
187 int len = 0;
188 TSVector in;
189 int i;
190 char *token;
191 int toklen;
192 WordEntryPos *pos;
193 int poslen;
194 char *strbuf;
195 int stroff;
196
197 /*
198 * Tokens are appended to tmpbuf, cur is a pointer to the end of used
199 * space in tmpbuf.
200 */
201 char *tmpbuf;
202 char *cur;
203 int buflen = 256; /* allocated size of tmpbuf */
204
205 state = init_tsvector_parser(buf, 0, escontext);
206
207 arrlen = 64;
208 arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
209 cur = tmpbuf = (char *) palloc(buflen);
210
211 while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
212 {
213 if (toklen >= MAXSTRLEN)
214 ereturn(escontext, (Datum) 0,
215 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
216 errmsg("word is too long (%ld bytes, max %ld bytes)",
217 (long) toklen,
218 (long) (MAXSTRLEN - 1))));
219
220 if (cur - tmpbuf > MAXSTRPOS)
221 ereturn(escontext, (Datum) 0,
222 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
223 errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)",
224 (long) (cur - tmpbuf), (long) MAXSTRPOS)));
225
226 /*
227 * Enlarge buffers if needed
228 */
229 if (len >= arrlen)
230 {
231 arrlen *= 2;
232 arr = (WordEntryIN *)
233 repalloc(arr, sizeof(WordEntryIN) * arrlen);
234 }
235 while ((cur - tmpbuf) + toklen >= buflen)
236 {
237 int dist = cur - tmpbuf;
238
239 buflen *= 2;
240 tmpbuf = (char *) repalloc(tmpbuf, buflen);
241 cur = tmpbuf + dist;
242 }
243 arr[len].entry.len = toklen;
244 arr[len].entry.pos = cur - tmpbuf;
245 memcpy(cur, token, toklen);
246 cur += toklen;
247
248 if (poslen != 0)
249 {
250 arr[len].entry.haspos = 1;
251 arr[len].pos = pos;
252 arr[len].poslen = poslen;
253 }
254 else
255 {
256 arr[len].entry.haspos = 0;
257 arr[len].pos = NULL;
258 arr[len].poslen = 0;
259 }
260 len++;
261 }
262
264
265 /* Did gettoken_tsvector fail? */
266 if (SOFT_ERROR_OCCURRED(escontext))
268
269 if (len > 0)
270 len = uniqueentry(arr, len, tmpbuf, &buflen);
271 else
272 buflen = 0;
273
274 if (buflen > MAXSTRPOS)
275 ereturn(escontext, (Datum) 0,
276 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
277 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS)));
278
279 totallen = CALCDATASIZE(len, buflen);
280 in = (TSVector) palloc0(totallen);
281 SET_VARSIZE(in, totallen);
282 in->size = len;
283 inarr = ARRPTR(in);
284 strbuf = STRPTR(in);
285 stroff = 0;
286 for (i = 0; i < len; i++)
287 {
288 memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
289 arr[i].entry.pos = stroff;
290 stroff += arr[i].entry.len;
291 if (arr[i].entry.haspos)
292 {
293 /* This should be unreachable because of MAXNUMPOS restrictions */
294 if (arr[i].poslen > 0xFFFF)
295 elog(ERROR, "positions array too long");
296
297 /* Copy number of positions */
298 stroff = SHORTALIGN(stroff);
299 *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
300 stroff += sizeof(uint16);
301
302 /* Copy positions */
303 memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
304 stroff += arr[i].poslen * sizeof(WordEntryPos);
305
306 pfree(arr[i].pos);
307 }
308 inarr[i] = arr[i].entry;
309 }
310
311 Assert((strbuf + stroff - (char *) in) == totallen);
312
314}
315
316Datum
318{
320 char *outbuf;
321 int32 i,
322 lenbuf = 0,
323 pp;
324 WordEntry *ptr = ARRPTR(out);
325 char *curbegin,
326 *curin,
327 *curout;
328
329 lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
330 for (i = 0; i < out->size; i++)
331 {
332 lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
333 if (ptr[i].haspos)
334 lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
335 }
336
337 curout = outbuf = (char *) palloc(lenbuf);
338 for (i = 0; i < out->size; i++)
339 {
340 curbegin = curin = STRPTR(out) + ptr->pos;
341 if (i != 0)
342 *curout++ = ' ';
343 *curout++ = '\'';
344 while (curin - curbegin < ptr->len)
345 {
346 int len = pg_mblen(curin);
347
348 if (t_iseq(curin, '\''))
349 *curout++ = '\'';
350 else if (t_iseq(curin, '\\'))
351 *curout++ = '\\';
352
353 while (len--)
354 *curout++ = *curin++;
355 }
356
357 *curout++ = '\'';
358 if ((pp = POSDATALEN(out, ptr)) != 0)
359 {
360 WordEntryPos *wptr;
361
362 *curout++ = ':';
363 wptr = POSDATAPTR(out, ptr);
364 while (pp)
365 {
366 curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
367 switch (WEP_GETWEIGHT(*wptr))
368 {
369 case 3:
370 *curout++ = 'A';
371 break;
372 case 2:
373 *curout++ = 'B';
374 break;
375 case 1:
376 *curout++ = 'C';
377 break;
378 case 0:
379 default:
380 break;
381 }
382
383 if (pp > 1)
384 *curout++ = ',';
385 pp--;
386 wptr++;
387 }
388 }
389 ptr++;
390 }
391
392 *curout = '\0';
393 PG_FREE_IF_COPY(out, 0);
394 PG_RETURN_CSTRING(outbuf);
395}
396
397/*
398 * Binary Input / Output functions. The binary format is as follows:
399 *
400 * uint32 number of lexemes
401 *
402 * for each lexeme:
403 * lexeme text in client encoding, null-terminated
404 * uint16 number of positions
405 * for each position:
406 * uint16 WordEntryPos
407 */
408
409Datum
411{
414 int i,
415 j;
416 WordEntry *weptr = ARRPTR(vec);
417
419
420 pq_sendint32(&buf, vec->size);
421 for (i = 0; i < vec->size; i++)
422 {
423 uint16 npos;
424
425 /*
426 * the strings in the TSVector array are not null-terminated, so we
427 * have to send the null-terminator separately
428 */
429 pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
430 pq_sendbyte(&buf, '\0');
431
432 npos = POSDATALEN(vec, weptr);
433 pq_sendint16(&buf, npos);
434
435 if (npos > 0)
436 {
437 WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
438
439 for (j = 0; j < npos; j++)
440 pq_sendint16(&buf, wepptr[j]);
441 }
442 weptr++;
443 }
444
446}
447
448Datum
450{
452 TSVector vec;
453 int i;
454 int32 nentries;
455 int datalen; /* number of bytes used in the variable size
456 * area after fixed size TSVector header and
457 * WordEntries */
458 Size hdrlen;
459 Size len; /* allocated size of vec */
460 bool needSort = false;
461
462 nentries = pq_getmsgint(buf, sizeof(int32));
463 if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
464 elog(ERROR, "invalid size of tsvector");
465
466 hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
467
468 len = hdrlen * 2; /* times two to make room for lexemes */
469 vec = (TSVector) palloc0(len);
470 vec->size = nentries;
471
472 datalen = 0;
473 for (i = 0; i < nentries; i++)
474 {
475 const char *lexeme;
476 uint16 npos;
477 size_t lex_len;
478
479 lexeme = pq_getmsgstring(buf);
480 npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
481
482 /* sanity checks */
483
484 lex_len = strlen(lexeme);
485 if (lex_len > MAXSTRLEN)
486 elog(ERROR, "invalid tsvector: lexeme too long");
487
488 if (datalen > MAXSTRPOS)
489 elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded");
490
491 if (npos > MAXNUMPOS)
492 elog(ERROR, "unexpected number of tsvector positions");
493
494 /*
495 * Looks valid. Fill the WordEntry struct, and copy lexeme.
496 *
497 * But make sure the buffer is large enough first.
498 */
499 while (hdrlen + SHORTALIGN(datalen + lex_len) +
500 sizeof(uint16) + npos * sizeof(WordEntryPos) >= len)
501 {
502 len *= 2;
503 vec = (TSVector) repalloc(vec, len);
504 }
505
506 vec->entries[i].haspos = (npos > 0) ? 1 : 0;
507 vec->entries[i].len = lex_len;
508 vec->entries[i].pos = datalen;
509
510 memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
511
512 datalen += lex_len;
513
514 if (i > 0 && WordEntryCMP(&vec->entries[i],
515 &vec->entries[i - 1],
516 STRPTR(vec)) <= 0)
517 needSort = true;
518
519 /* Receive positions */
520 if (npos > 0)
521 {
522 uint16 j;
523 WordEntryPos *wepptr;
524
525 /*
526 * Pad to 2-byte alignment if necessary. Though we used palloc0
527 * for the initial allocation, subsequent repalloc'd memory areas
528 * are not initialized to zero.
529 */
530 if (datalen != SHORTALIGN(datalen))
531 {
532 *(STRPTR(vec) + datalen) = '\0';
533 datalen = SHORTALIGN(datalen);
534 }
535
536 memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
537
538 wepptr = POSDATAPTR(vec, &vec->entries[i]);
539 for (j = 0; j < npos; j++)
540 {
541 wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
542 if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
543 elog(ERROR, "position information is misordered");
544 }
545
546 datalen += sizeof(uint16) + npos * sizeof(WordEntryPos);
547 }
548 }
549
550 SET_VARSIZE(vec, hdrlen + datalen);
551
552 if (needSort)
553 qsort_arg(ARRPTR(vec), vec->size, sizeof(WordEntry),
554 compareentry, STRPTR(vec));
555
557}
#define Assert(condition)
Definition: c.h:815
#define SHORTALIGN(LEN)
Definition: c.h:764
int32_t int32
Definition: c.h:484
uint16_t uint16
Definition: c.h:487
size_t Size
Definition: c.h:562
#define ARRPTR(x)
Definition: cube.c:25
struct cursor * cur
Definition: ecpg.c:29
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define ereturn(context, dummy_value,...)
Definition: elog.h:277
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define MaxAllocSize
Definition: fe_memutils.h:22
#define PG_FREE_IF_COPY(ptr, n)
Definition: fmgr.h:260
#define PG_RETURN_BYTEA_P(x)
Definition: fmgr.h:371
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276
#define PG_RETURN_CSTRING(x)
Definition: fmgr.h:362
#define PG_GETARG_CSTRING(n)
Definition: fmgr.h:277
#define PG_RETURN_NULL()
Definition: fmgr.h:345
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
#define CALCDATASIZE(x, lenstr)
Definition: hstore.h:72
#define STRPTR(x)
Definition: hstore.h:76
#define token
Definition: indent_globs.h:126
static int pg_cmp_s32(int32 a, int32 b)
Definition: int.h:646
int b
Definition: isn.c:69
int a
Definition: isn.c:68
int j
Definition: isn.c:73
int i
Definition: isn.c:72
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1546
int pg_mblen(const char *mbstr)
Definition: mbutils.c:1023
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1541
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc0(Size size)
Definition: mcxt.c:1347
void * palloc(Size size)
Definition: mcxt.c:1317
#define SOFT_ERROR_OCCURRED(escontext)
Definition: miscnodes.h:53
void * arg
const void size_t len
static char * buf
Definition: pg_test_fsync.c:72
#define sprintf
Definition: port.h:240
void qsort_arg(void *base, size_t nel, size_t elsize, qsort_arg_comparator cmp, void *arg)
#define qsort(a, b, c, d)
Definition: port.h:474
uintptr_t Datum
Definition: postgres.h:69
unsigned int pq_getmsgint(StringInfo msg, int b)
Definition: pqformat.c:415
void pq_sendtext(StringInfo buf, const char *str, int slen)
Definition: pqformat.c:172
const char * pq_getmsgstring(StringInfo msg)
Definition: pqformat.c:579
void pq_begintypsend(StringInfo buf)
Definition: pqformat.c:326
bytea * pq_endtypsend(StringInfo buf)
Definition: pqformat.c:346
static void pq_sendint32(StringInfo buf, uint32 i)
Definition: pqformat.h:144
static void pq_sendbyte(StringInfo buf, uint8 byt)
Definition: pqformat.h:160
static void pq_sendint16(StringInfo buf, uint16 i)
Definition: pqformat.h:136
StringInfoData * StringInfo
Definition: stringinfo.h:54
Definition: nodes.h:129
int32 size
Definition: ts_type.h:93
WordEntry entries[FLEXIBLE_ARRAY_MEMBER]
Definition: ts_type.h:94
int poslen
Definition: tsvector.c:30
WordEntryPos * pos
Definition: tsvector.c:29
WordEntry entry
Definition: tsvector.c:28
uint32 pos
Definition: ts_type.h:46
uint32 haspos
Definition: ts_type.h:44
uint32 len
Definition: ts_type.h:45
Definition: regguts.h:323
#define t_iseq(x, c)
Definition: ts_locale.h:38
#define PG_GETARG_TSVECTOR(n)
Definition: ts_type.h:135
#define WEP_GETPOS(x)
Definition: ts_type.h:80
#define PG_RETURN_TSVECTOR(x)
Definition: ts_type.h:137
#define MAXENTRYPOS
Definition: ts_type.h:85
#define POSDATALEN(x, e)
Definition: ts_type.h:110
uint16 WordEntryPos
Definition: ts_type.h:63
#define MAXNUMPOS
Definition: ts_type.h:86
TSVectorData * TSVector
Definition: ts_type.h:98
#define WEP_SETWEIGHT(x, v)
Definition: ts_type.h:82
#define DATAHDRSIZE
Definition: ts_type.h:100
#define MAXSTRLEN
Definition: ts_type.h:49
#define POSDATAPTR(x, e)
Definition: ts_type.h:111
#define WEP_GETWEIGHT(x)
Definition: ts_type.h:79
#define MAXSTRPOS
Definition: ts_type.h:50
Datum tsvectorout(PG_FUNCTION_ARGS)
Definition: tsvector.c:317
static int uniquePos(WordEntryPos *a, int l)
Definition: tsvector.c:52
Datum tsvectorrecv(PG_FUNCTION_ARGS)
Definition: tsvector.c:449
static int compareentry(const void *va, const void *vb, void *arg)
Definition: tsvector.c:84
Datum tsvectorin(PG_FUNCTION_ARGS)
Definition: tsvector.c:178
int compareWordEntryPos(const void *a, const void *b)
Definition: tsvector.c:36
Datum tsvectorsend(PG_FUNCTION_ARGS)
Definition: tsvector.c:410
static int WordEntryCMP(WordEntry *a, WordEntry *b, char *buf)
Definition: tsvector.c:171
static int uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
Definition: tsvector.c:100
int32 tsCompareString(char *a, int lena, char *b, int lenb, bool prefix)
Definition: tsvector_op.c:1152
void close_tsvector_parser(TSVectorParseState state)
bool gettoken_tsvector(TSVectorParseState state, char **strval, int *lenval, WordEntryPos **pos_ptr, int *poslen, char **endptr)
TSVectorParseState init_tsvector_parser(char *input, int flags, Node *escontext)
#define SET_VARSIZE(PTR, len)
Definition: varatt.h:305
static StringInfoData tmpbuf
Definition: walsender.c:170