PostgreSQL Source Code  git master
trgm_op.c
Go to the documentation of this file.
1 /*
2  * contrib/pg_trgm/trgm_op.c
3  */
4 #include "postgres.h"
5 
6 #include <ctype.h>
7 
8 #include "trgm.h"
9 
10 #include "catalog/pg_type.h"
11 #include "tsearch/ts_locale.h"
12 #include "utils/lsyscache.h"
13 #include "utils/memutils.h"
14 #include "utils/pg_crc.h"
15 
17 
18 /* GUC variables */
19 double similarity_threshold = 0.3f;
22 
23 void _PG_init(void);
24 
41 
42 /* Trigram with position */
43 typedef struct
44 {
46  int index;
47 } pos_trgm;
48 
49 /* Trigram bound type */
50 typedef uint8 TrgmBound;
51 #define TRGM_BOUND_LEFT 0x01 /* trigram is left bound of word */
52 #define TRGM_BOUND_RIGHT 0x02 /* trigram is right bound of word */
53 
54 /* Word similarity flags */
55 #define WORD_SIMILARITY_CHECK_ONLY 0x01 /* only check existence of similar
56  * search pattern in text */
57 #define WORD_SIMILARITY_STRICT 0x02 /* force bounds of extent to match
58  * word bounds */
59 
60 /*
61  * Module load callback
62  */
63 void
64 _PG_init(void)
65 {
66  /* Define custom GUC variables. */
67  DefineCustomRealVariable("pg_trgm.similarity_threshold",
68  "Sets the threshold used by the %% operator.",
69  "Valid range is 0.0 .. 1.0.",
71  0.3,
72  0.0,
73  1.0,
75  0,
76  NULL,
77  NULL,
78  NULL);
79  DefineCustomRealVariable("pg_trgm.word_similarity_threshold",
80  "Sets the threshold used by the <%% operator.",
81  "Valid range is 0.0 .. 1.0.",
83  0.6,
84  0.0,
85  1.0,
87  0,
88  NULL,
89  NULL,
90  NULL);
91  DefineCustomRealVariable("pg_trgm.strict_word_similarity_threshold",
92  "Sets the threshold used by the <<%% operator.",
93  "Valid range is 0.0 .. 1.0.",
95  0.5,
96  0.0,
97  1.0,
99  0,
100  NULL,
101  NULL,
102  NULL);
103 }
104 
105 /*
106  * Deprecated function.
107  * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
108  */
109 Datum
111 {
112  float4 nlimit = PG_GETARG_FLOAT4(0);
113  char *nlimit_str;
114  Oid func_out_oid;
115  bool is_varlena;
116 
117  getTypeOutputInfo(FLOAT4OID, &func_out_oid, &is_varlena);
118 
119  nlimit_str = OidOutputFunctionCall(func_out_oid, Float4GetDatum(nlimit));
120 
121  SetConfigOption("pg_trgm.similarity_threshold", nlimit_str,
123 
125 }
126 
127 
128 /*
129  * Get similarity threshold for given index scan strategy number.
130  */
131 double
133 {
134  switch (strategy)
135  {
137  return similarity_threshold;
142  default:
143  elog(ERROR, "unrecognized strategy number: %d", strategy);
144  break;
145  }
146 
147  return 0.0; /* keep compiler quiet */
148 }
149 
150 /*
151  * Deprecated function.
152  * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
153  */
154 Datum
156 {
158 }
159 
160 static int
161 comp_trgm(const void *a, const void *b)
162 {
163  return CMPTRGM(a, b);
164 }
165 
166 static int
167 unique_array(trgm *a, int len)
168 {
169  trgm *curend,
170  *tmp;
171 
172  curend = tmp = a;
173  while (tmp - a < len)
174  if (CMPTRGM(tmp, curend))
175  {
176  curend++;
177  CPTRGM(curend, tmp);
178  tmp++;
179  }
180  else
181  tmp++;
182 
183  return curend + 1 - a;
184 }
185 
186 /*
187  * Finds first word in string, returns pointer to the word,
188  * endword points to the character after word
189  */
190 static char *
191 find_word(char *str, int lenstr, char **endword, int *charlen)
192 {
193  char *beginword = str;
194 
195  while (beginword - str < lenstr && !ISWORDCHR(beginword))
196  beginword += pg_mblen(beginword);
197 
198  if (beginword - str >= lenstr)
199  return NULL;
200 
201  *endword = beginword;
202  *charlen = 0;
203  while (*endword - str < lenstr && ISWORDCHR(*endword))
204  {
205  *endword += pg_mblen(*endword);
206  (*charlen)++;
207  }
208 
209  return beginword;
210 }
211 
212 /*
213  * Reduce a trigram (three possibly multi-byte characters) to a trgm,
214  * which is always exactly three bytes. If we have three single-byte
215  * characters, we just use them as-is; otherwise we form a hash value.
216  */
217 void
218 compact_trigram(trgm *tptr, char *str, int bytelen)
219 {
220  if (bytelen == 3)
221  {
222  CPTRGM(tptr, str);
223  }
224  else
225  {
226  pg_crc32 crc;
227 
228  INIT_LEGACY_CRC32(crc);
229  COMP_LEGACY_CRC32(crc, str, bytelen);
230  FIN_LEGACY_CRC32(crc);
231 
232  /*
233  * use only 3 upper bytes from crc, hope, it's good enough hashing
234  */
235  CPTRGM(tptr, &crc);
236  }
237 }
238 
239 /*
240  * Adds trigrams from words (already padded).
241  */
242 static trgm *
243 make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
244 {
245  char *ptr = str;
246 
247  if (charlen < 3)
248  return tptr;
249 
250  if (bytelen > charlen)
251  {
252  /* Find multibyte character boundaries and apply compact_trigram */
253  int lenfirst = pg_mblen(str),
254  lenmiddle = pg_mblen(str + lenfirst),
255  lenlast = pg_mblen(str + lenfirst + lenmiddle);
256 
257  while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
258  {
259  compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
260 
261  ptr += lenfirst;
262  tptr++;
263 
264  lenfirst = lenmiddle;
265  lenmiddle = lenlast;
266  lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
267  }
268  }
269  else
270  {
271  /* Fast path when there are no multibyte characters */
272  Assert(bytelen == charlen);
273 
274  while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
275  {
276  CPTRGM(tptr, ptr);
277  ptr++;
278  tptr++;
279  }
280  }
281 
282  return tptr;
283 }
284 
285 /*
286  * Make array of trigrams without sorting and removing duplicate items.
287  *
288  * trg: where to return the array of trigrams.
289  * str: source string, of length slen bytes.
290  * bounds: where to return bounds of trigrams (if needed).
291  *
292  * Returns length of the generated array.
293  */
294 static int
295 generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
296 {
297  trgm *tptr;
298  char *buf;
299  int charlen,
300  bytelen;
301  char *bword,
302  *eword;
303 
304  if (slen + LPADDING + RPADDING < 3 || slen == 0)
305  return 0;
306 
307  tptr = trg;
308 
309  /* Allocate a buffer for case-folded, blank-padded words */
310  buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
311 
312  if (LPADDING > 0)
313  {
314  *buf = ' ';
315  if (LPADDING > 1)
316  *(buf + 1) = ' ';
317  }
318 
319  eword = str;
320  while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
321  {
322 #ifdef IGNORECASE
323  bword = lowerstr_with_len(bword, eword - bword);
324  bytelen = strlen(bword);
325 #else
326  bytelen = eword - bword;
327 #endif
328 
329  memcpy(buf + LPADDING, bword, bytelen);
330 
331 #ifdef IGNORECASE
332  pfree(bword);
333 #endif
334 
335  buf[LPADDING + bytelen] = ' ';
336  buf[LPADDING + bytelen + 1] = ' ';
337 
338  /* Calculate trigrams marking their bounds if needed */
339  if (bounds)
340  bounds[tptr - trg] |= TRGM_BOUND_LEFT;
341  tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
342  charlen + LPADDING + RPADDING);
343  if (bounds)
344  bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
345  }
346 
347  pfree(buf);
348 
349  return tptr - trg;
350 }
351 
352 /*
353  * Guard against possible overflow in the palloc requests below. (We
354  * don't worry about the additive constants, since palloc can detect
355  * requests that are a little above MaxAllocSize --- we just need to
356  * prevent integer overflow in the multiplications.)
357  */
358 static void
360 {
361  if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
363  ereport(ERROR,
364  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
365  errmsg("out of memory")));
366 }
367 
368 /*
369  * Make array of trigrams with sorting and removing duplicate items.
370  *
371  * str: source string, of length slen bytes.
372  *
373  * Returns the sorted array of unique trigrams.
374  */
375 TRGM *
376 generate_trgm(char *str, int slen)
377 {
378  TRGM *trg;
379  int len;
380 
381  protect_out_of_mem(slen);
382 
383  trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
384  trg->flag = ARRKEY;
385 
386  len = generate_trgm_only(GETARR(trg), str, slen, NULL);
387  SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
388 
389  if (len == 0)
390  return trg;
391 
392  /*
393  * Make trigrams unique.
394  */
395  if (len > 1)
396  {
397  qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
398  len = unique_array(GETARR(trg), len);
399  }
400 
401  SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
402 
403  return trg;
404 }
405 
406 /*
407  * Make array of positional trigrams from two trigram arrays trg1 and trg2.
408  *
409  * trg1: trigram array of search pattern, of length len1. trg1 is required
410  * word which positions don't matter and replaced with -1.
411  * trg2: trigram array of text, of length len2. trg2 is haystack where we
412  * search and have to store its positions.
413  *
414  * Returns concatenated trigram array.
415  */
416 static pos_trgm *
417 make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2)
418 {
419  pos_trgm *result;
420  int i,
421  len = len1 + len2;
422 
423  result = (pos_trgm *) palloc(sizeof(pos_trgm) * len);
424 
425  for (i = 0; i < len1; i++)
426  {
427  memcpy(&result[i].trg, &trg1[i], sizeof(trgm));
428  result[i].index = -1;
429  }
430 
431  for (i = 0; i < len2; i++)
432  {
433  memcpy(&result[i + len1].trg, &trg2[i], sizeof(trgm));
434  result[i + len1].index = i;
435  }
436 
437  return result;
438 }
439 
440 /*
441  * Compare position trigrams: compare trigrams first and position second.
442  */
443 static int
444 comp_ptrgm(const void *v1, const void *v2)
445 {
446  const pos_trgm *p1 = (const pos_trgm *) v1;
447  const pos_trgm *p2 = (const pos_trgm *) v2;
448  int cmp;
449 
450  cmp = CMPTRGM(p1->trg, p2->trg);
451  if (cmp != 0)
452  return cmp;
453 
454  if (p1->index < p2->index)
455  return -1;
456  else if (p1->index == p2->index)
457  return 0;
458  else
459  return 1;
460 }
461 
462 /*
463  * Iterative search function which calculates maximum similarity with word in
464  * the string. But maximum similarity is calculated only if check_only == false.
465  *
466  * trg2indexes: array which stores indexes of the array "found".
467  * found: array which stores true of false values.
468  * ulen1: count of unique trigrams of array "trg1".
469  * len2: length of array "trg2" and array "trg2indexes".
470  * len: length of the array "found".
471  * lags: set of boolean flags parametrizing similarity calculation.
472  * bounds: whether each trigram is left/right bound of word.
473  *
474  * Returns word similarity.
475  */
476 static float4
477 iterate_word_similarity(int *trg2indexes,
478  bool *found,
479  int ulen1,
480  int len2,
481  int len,
482  uint8 flags,
483  TrgmBound *bounds)
484 {
485  int *lastpos,
486  i,
487  ulen2 = 0,
488  count = 0,
489  upper = -1,
490  lower;
491  float4 smlr_cur,
492  smlr_max = 0.0f;
493  double threshold;
494 
495  Assert(bounds || !(flags & WORD_SIMILARITY_STRICT));
496 
497  /* Select appropriate threshold */
498  threshold = (flags & WORD_SIMILARITY_STRICT) ?
501 
502  /*
503  * Consider first trigram as initial lower bount for strict word
504  * similarity, or initialize it later with first trigram present for plain
505  * word similarity.
506  */
507  lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1;
508 
509  /* Memorise last position of each trigram */
510  lastpos = (int *) palloc(sizeof(int) * len);
511  memset(lastpos, -1, sizeof(int) * len);
512 
513  for (i = 0; i < len2; i++)
514  {
515  /* Get index of next trigram */
516  int trgindex = trg2indexes[i];
517 
518  /* Update last position of this trigram */
519  if (lower >= 0 || found[trgindex])
520  {
521  if (lastpos[trgindex] < 0)
522  {
523  ulen2++;
524  if (found[trgindex])
525  count++;
526  }
527  lastpos[trgindex] = i;
528  }
529 
530  /*
531  * Adjust upper bound if trigram is upper bound of word for strict
532  * word similarity, or if trigram is present in required substring for
533  * plain word similarity
534  */
535  if ((flags & WORD_SIMILARITY_STRICT) ? (bounds[i] & TRGM_BOUND_RIGHT)
536  : found[trgindex])
537  {
538  int prev_lower,
539  tmp_ulen2,
540  tmp_lower,
541  tmp_count;
542 
543  upper = i;
544  if (lower == -1)
545  {
546  lower = i;
547  ulen2 = 1;
548  }
549 
550  smlr_cur = CALCSML(count, ulen1, ulen2);
551 
552  /* Also try to adjust lower bound for greater similarity */
553  tmp_count = count;
554  tmp_ulen2 = ulen2;
555  prev_lower = lower;
556  for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
557  {
558  float smlr_tmp;
559  int tmp_trgindex;
560 
561  /*
562  * Adjust lower bound only if trigram is lower bound of word
563  * for strict word similarity, or consider every trigram as
564  * lower bound for plain word similarity.
565  */
566  if (!(flags & WORD_SIMILARITY_STRICT)
567  || (bounds[tmp_lower] & TRGM_BOUND_LEFT))
568  {
569  smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
570  if (smlr_tmp > smlr_cur)
571  {
572  smlr_cur = smlr_tmp;
573  ulen2 = tmp_ulen2;
574  lower = tmp_lower;
575  count = tmp_count;
576  }
577 
578  /*
579  * If we only check that word similarity is greater than
580  * threshold we do not need to calculate a maximum
581  * similarity.
582  */
583  if ((flags & WORD_SIMILARITY_CHECK_ONLY)
584  && smlr_cur >= threshold)
585  break;
586  }
587 
588  tmp_trgindex = trg2indexes[tmp_lower];
589  if (lastpos[tmp_trgindex] == tmp_lower)
590  {
591  tmp_ulen2--;
592  if (found[tmp_trgindex])
593  tmp_count--;
594  }
595  }
596 
597  smlr_max = Max(smlr_max, smlr_cur);
598 
599  /*
600  * if we only check that word similarity is greater than threshold
601  * we do not need to calculate a maximum similarity.
602  */
603  if ((flags & WORD_SIMILARITY_CHECK_ONLY) && smlr_max >= threshold)
604  break;
605 
606  for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
607  {
608  int tmp_trgindex;
609 
610  tmp_trgindex = trg2indexes[tmp_lower];
611  if (lastpos[tmp_trgindex] == tmp_lower)
612  lastpos[tmp_trgindex] = -1;
613  }
614  }
615  }
616 
617  pfree(lastpos);
618 
619  return smlr_max;
620 }
621 
622 /*
623  * Calculate word similarity.
624  * This function prepare two arrays: "trg2indexes" and "found". Then this arrays
625  * are used to calculate word similarity using iterate_word_similarity().
626  *
627  * "trg2indexes" is array which stores indexes of the array "found".
628  * In other words:
629  * trg2indexes[j] = i;
630  * found[i] = true (or false);
631  * If found[i] == true then there is trigram trg2[j] in array "trg1".
632  * If found[i] == false then there is not trigram trg2[j] in array "trg1".
633  *
634  * str1: search pattern string, of length slen1 bytes.
635  * str2: text in which we are looking for a word, of length slen2 bytes.
636  * flags: set of boolean flags parametrizing similarity calculation.
637  *
638  * Returns word similarity.
639  */
640 static float4
641 calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
642  uint8 flags)
643 {
644  bool *found;
645  pos_trgm *ptrg;
646  trgm *trg1;
647  trgm *trg2;
648  int len1,
649  len2,
650  len,
651  i,
652  j,
653  ulen1;
654  int *trg2indexes;
655  float4 result;
656  TrgmBound *bounds;
657 
658  protect_out_of_mem(slen1 + slen2);
659 
660  /* Make positional trigrams */
661  trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
662  trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);
663  if (flags & WORD_SIMILARITY_STRICT)
664  bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3);
665  else
666  bounds = NULL;
667 
668  len1 = generate_trgm_only(trg1, str1, slen1, NULL);
669  len2 = generate_trgm_only(trg2, str2, slen2, bounds);
670 
671  ptrg = make_positional_trgm(trg1, len1, trg2, len2);
672  len = len1 + len2;
673  qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);
674 
675  pfree(trg1);
676  pfree(trg2);
677 
678  /*
679  * Merge positional trigrams array: enumerate each trigram and find its
680  * presence in required word.
681  */
682  trg2indexes = (int *) palloc(sizeof(int) * len2);
683  found = (bool *) palloc0(sizeof(bool) * len);
684 
685  ulen1 = 0;
686  j = 0;
687  for (i = 0; i < len; i++)
688  {
689  if (i > 0)
690  {
691  int cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg);
692 
693  if (cmp != 0)
694  {
695  if (found[j])
696  ulen1++;
697  j++;
698  }
699  }
700 
701  if (ptrg[i].index >= 0)
702  {
703  trg2indexes[ptrg[i].index] = j;
704  }
705  else
706  {
707  found[j] = true;
708  }
709  }
710  if (found[j])
711  ulen1++;
712 
713  /* Run iterative procedure to find maximum similarity with word */
714  result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
715  flags, bounds);
716 
717  pfree(trg2indexes);
718  pfree(found);
719  pfree(ptrg);
720 
721  return result;
722 }
723 
724 
725 /*
726  * Extract the next non-wildcard part of a search string, i.e. a word bounded
727  * by '_' or '%' meta-characters, non-word characters or string end.
728  *
729  * str: source string, of length lenstr bytes (need not be null-terminated)
730  * buf: where to return the substring (must be long enough)
731  * *bytelen: receives byte length of the found substring
732  * *charlen: receives character length of the found substring
733  *
734  * Returns pointer to end+1 of the found substring in the source string.
735  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
736  *
737  * If the found word is bounded by non-word characters or string boundaries
738  * then this function will include corresponding padding spaces into buf.
739  */
740 static const char *
741 get_wildcard_part(const char *str, int lenstr,
742  char *buf, int *bytelen, int *charlen)
743 {
744  const char *beginword = str;
745  const char *endword;
746  char *s = buf;
747  bool in_leading_wildcard_meta = false;
748  bool in_trailing_wildcard_meta = false;
749  bool in_escape = false;
750  int clen;
751 
752  /*
753  * Find the first word character, remembering whether preceding character
754  * was wildcard meta-character. Note that the in_escape state persists
755  * from this loop to the next one, since we may exit at a word character
756  * that is in_escape.
757  */
758  while (beginword - str < lenstr)
759  {
760  if (in_escape)
761  {
762  if (ISWORDCHR(beginword))
763  break;
764  in_escape = false;
765  in_leading_wildcard_meta = false;
766  }
767  else
768  {
769  if (ISESCAPECHAR(beginword))
770  in_escape = true;
771  else if (ISWILDCARDCHAR(beginword))
772  in_leading_wildcard_meta = true;
773  else if (ISWORDCHR(beginword))
774  break;
775  else
776  in_leading_wildcard_meta = false;
777  }
778  beginword += pg_mblen(beginword);
779  }
780 
781  /*
782  * Handle string end.
783  */
784  if (beginword - str >= lenstr)
785  return NULL;
786 
787  /*
788  * Add left padding spaces if preceding character wasn't wildcard
789  * meta-character.
790  */
791  *charlen = 0;
792  if (!in_leading_wildcard_meta)
793  {
794  if (LPADDING > 0)
795  {
796  *s++ = ' ';
797  (*charlen)++;
798  if (LPADDING > 1)
799  {
800  *s++ = ' ';
801  (*charlen)++;
802  }
803  }
804  }
805 
806  /*
807  * Copy data into buf until wildcard meta-character, non-word character or
808  * string boundary. Strip escapes during copy.
809  */
810  endword = beginword;
811  while (endword - str < lenstr)
812  {
813  clen = pg_mblen(endword);
814  if (in_escape)
815  {
816  if (ISWORDCHR(endword))
817  {
818  memcpy(s, endword, clen);
819  (*charlen)++;
820  s += clen;
821  }
822  else
823  {
824  /*
825  * Back up endword to the escape character when stopping at an
826  * escaped char, so that subsequent get_wildcard_part will
827  * restart from the escape character. We assume here that
828  * escape chars are single-byte.
829  */
830  endword--;
831  break;
832  }
833  in_escape = false;
834  }
835  else
836  {
837  if (ISESCAPECHAR(endword))
838  in_escape = true;
839  else if (ISWILDCARDCHAR(endword))
840  {
841  in_trailing_wildcard_meta = true;
842  break;
843  }
844  else if (ISWORDCHR(endword))
845  {
846  memcpy(s, endword, clen);
847  (*charlen)++;
848  s += clen;
849  }
850  else
851  break;
852  }
853  endword += clen;
854  }
855 
856  /*
857  * Add right padding spaces if next character isn't wildcard
858  * meta-character.
859  */
860  if (!in_trailing_wildcard_meta)
861  {
862  if (RPADDING > 0)
863  {
864  *s++ = ' ';
865  (*charlen)++;
866  if (RPADDING > 1)
867  {
868  *s++ = ' ';
869  (*charlen)++;
870  }
871  }
872  }
873 
874  *bytelen = s - buf;
875  return endword;
876 }
877 
878 /*
879  * Generates trigrams for wildcard search string.
880  *
881  * Returns array of trigrams that must occur in any string that matches the
882  * wildcard string. For example, given pattern "a%bcd%" the trigrams
883  * " a", "bcd" would be extracted.
884  */
885 TRGM *
886 generate_wildcard_trgm(const char *str, int slen)
887 {
888  TRGM *trg;
889  char *buf,
890  *buf2;
891  trgm *tptr;
892  int len,
893  charlen,
894  bytelen;
895  const char *eword;
896 
897  protect_out_of_mem(slen);
898 
899  trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
900  trg->flag = ARRKEY;
901  SET_VARSIZE(trg, TRGMHDRSIZE);
902 
903  if (slen + LPADDING + RPADDING < 3 || slen == 0)
904  return trg;
905 
906  tptr = GETARR(trg);
907 
908  /* Allocate a buffer for blank-padded, but not yet case-folded, words */
909  buf = palloc(sizeof(char) * (slen + 4));
910 
911  /*
912  * Extract trigrams from each substring extracted by get_wildcard_part.
913  */
914  eword = str;
915  while ((eword = get_wildcard_part(eword, slen - (eword - str),
916  buf, &bytelen, &charlen)) != NULL)
917  {
918 #ifdef IGNORECASE
919  buf2 = lowerstr_with_len(buf, bytelen);
920  bytelen = strlen(buf2);
921 #else
922  buf2 = buf;
923 #endif
924 
925  /*
926  * count trigrams
927  */
928  tptr = make_trigrams(tptr, buf2, bytelen, charlen);
929 
930 #ifdef IGNORECASE
931  pfree(buf2);
932 #endif
933  }
934 
935  pfree(buf);
936 
937  if ((len = tptr - GETARR(trg)) == 0)
938  return trg;
939 
940  /*
941  * Make trigrams unique.
942  */
943  if (len > 1)
944  {
945  qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
946  len = unique_array(GETARR(trg), len);
947  }
948 
949  SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
950 
951  return trg;
952 }
953 
954 uint32
956 {
957  uint32 val = 0;
958 
959  val |= *(((unsigned char *) ptr));
960  val <<= 8;
961  val |= *(((unsigned char *) ptr) + 1);
962  val <<= 8;
963  val |= *(((unsigned char *) ptr) + 2);
964 
965  return val;
966 }
967 
968 Datum
970 {
971  text *in = PG_GETARG_TEXT_PP(0);
972  TRGM *trg;
973  Datum *d;
974  ArrayType *a;
975  trgm *ptr;
976  int i;
977 
979  d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg)));
980 
981  for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
982  {
983  text *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length() * 3));
984 
986  {
987  snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));
988  SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
989  }
990  else
991  {
992  SET_VARSIZE(item, VARHDRSZ + 3);
993  CPTRGM(VARDATA(item), ptr);
994  }
995  d[i] = PointerGetDatum(item);
996  }
997 
998  a = construct_array(
999  d,
1000  ARRNELEM(trg),
1001  TEXTOID,
1002  -1,
1003  false,
1004  'i'
1005  );
1006 
1007  for (i = 0; i < ARRNELEM(trg); i++)
1008  pfree(DatumGetPointer(d[i]));
1009 
1010  pfree(d);
1011  pfree(trg);
1012  PG_FREE_IF_COPY(in, 0);
1013 
1014  PG_RETURN_POINTER(a);
1015 }
1016 
1017 float4
1018 cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
1019 {
1020  trgm *ptr1,
1021  *ptr2;
1022  int count = 0;
1023  int len1,
1024  len2;
1025 
1026  ptr1 = GETARR(trg1);
1027  ptr2 = GETARR(trg2);
1028 
1029  len1 = ARRNELEM(trg1);
1030  len2 = ARRNELEM(trg2);
1031 
1032  /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
1033  if (len1 <= 0 || len2 <= 0)
1034  return (float4) 0.0;
1035 
1036  while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
1037  {
1038  int res = CMPTRGM(ptr1, ptr2);
1039 
1040  if (res < 0)
1041  ptr1++;
1042  else if (res > 0)
1043  ptr2++;
1044  else
1045  {
1046  ptr1++;
1047  ptr2++;
1048  count++;
1049  }
1050  }
1051 
1052  /*
1053  * If inexact then len2 is equal to count, because we don't know actual
1054  * length of second string in inexact search and we can assume that count
1055  * is a lower bound of len2.
1056  */
1057  return CALCSML(count, len1, inexact ? count : len2);
1058 }
1059 
1060 
1061 /*
1062  * Returns whether trg2 contains all trigrams in trg1.
1063  * This relies on the trigram arrays being sorted.
1064  */
1065 bool
1067 {
1068  trgm *ptr1,
1069  *ptr2;
1070  int len1,
1071  len2;
1072 
1073  ptr1 = GETARR(trg1);
1074  ptr2 = GETARR(trg2);
1075 
1076  len1 = ARRNELEM(trg1);
1077  len2 = ARRNELEM(trg2);
1078 
1079  while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
1080  {
1081  int res = CMPTRGM(ptr1, ptr2);
1082 
1083  if (res < 0)
1084  return false;
1085  else if (res > 0)
1086  ptr2++;
1087  else
1088  {
1089  ptr1++;
1090  ptr2++;
1091  }
1092  }
1093  if (ptr1 - GETARR(trg1) < len1)
1094  return false;
1095  else
1096  return true;
1097 }
1098 
1099 /*
1100  * Return a palloc'd boolean array showing, for each trigram in "query",
1101  * whether it is present in the trigram array "key".
1102  * This relies on the "key" array being sorted, but "query" need not be.
1103  */
1104 bool *
1106 {
1107  bool *result;
1108  trgm *ptrq = GETARR(query),
1109  *ptrk = GETARR(key);
1110  int lenq = ARRNELEM(query),
1111  lenk = ARRNELEM(key),
1112  i;
1113 
1114  result = (bool *) palloc0(lenq * sizeof(bool));
1115 
1116  /* for each query trigram, do a binary search in the key array */
1117  for (i = 0; i < lenq; i++)
1118  {
1119  int lo = 0;
1120  int hi = lenk;
1121 
1122  while (lo < hi)
1123  {
1124  int mid = (lo + hi) / 2;
1125  int res = CMPTRGM(ptrq, ptrk + mid);
1126 
1127  if (res < 0)
1128  hi = mid;
1129  else if (res > 0)
1130  lo = mid + 1;
1131  else
1132  {
1133  result[i] = true;
1134  break;
1135  }
1136  }
1137  ptrq++;
1138  }
1139 
1140  return result;
1141 }
1142 
1143 Datum
1145 {
1146  text *in1 = PG_GETARG_TEXT_PP(0);
1147  text *in2 = PG_GETARG_TEXT_PP(1);
1148  TRGM *trg1,
1149  *trg2;
1150  float4 res;
1151 
1152  trg1 = generate_trgm(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1));
1153  trg2 = generate_trgm(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2));
1154 
1155  res = cnt_sml(trg1, trg2, false);
1156 
1157  pfree(trg1);
1158  pfree(trg2);
1159  PG_FREE_IF_COPY(in1, 0);
1160  PG_FREE_IF_COPY(in2, 1);
1161 
1162  PG_RETURN_FLOAT4(res);
1163 }
1164 
1165 Datum
1167 {
1168  text *in1 = PG_GETARG_TEXT_PP(0);
1169  text *in2 = PG_GETARG_TEXT_PP(1);
1170  float4 res;
1171 
1173  VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1174  0);
1175 
1176  PG_FREE_IF_COPY(in1, 0);
1177  PG_FREE_IF_COPY(in2, 1);
1178  PG_RETURN_FLOAT4(res);
1179 }
1180 
1181 Datum
1183 {
1184  text *in1 = PG_GETARG_TEXT_PP(0);
1185  text *in2 = PG_GETARG_TEXT_PP(1);
1186  float4 res;
1187 
1189  VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1191 
1192  PG_FREE_IF_COPY(in1, 0);
1193  PG_FREE_IF_COPY(in2, 1);
1194  PG_RETURN_FLOAT4(res);
1195 }
1196 
1197 Datum
1199 {
1201  PG_GETARG_DATUM(0),
1202  PG_GETARG_DATUM(1)));
1203 
1204  PG_RETURN_FLOAT4(1.0 - res);
1205 }
1206 
1207 Datum
1209 {
1211  PG_GETARG_DATUM(0),
1212  PG_GETARG_DATUM(1)));
1213 
1215 }
1216 
1217 Datum
1219 {
1220  text *in1 = PG_GETARG_TEXT_PP(0);
1221  text *in2 = PG_GETARG_TEXT_PP(1);
1222  float4 res;
1223 
1225  VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1227 
1228  PG_FREE_IF_COPY(in1, 0);
1229  PG_FREE_IF_COPY(in2, 1);
1231 }
1232 
1233 Datum
1235 {
1236  text *in1 = PG_GETARG_TEXT_PP(0);
1237  text *in2 = PG_GETARG_TEXT_PP(1);
1238  float4 res;
1239 
1241  VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1243 
1244  PG_FREE_IF_COPY(in1, 0);
1245  PG_FREE_IF_COPY(in2, 1);
1247 }
1248 
1249 Datum
1251 {
1252  text *in1 = PG_GETARG_TEXT_PP(0);
1253  text *in2 = PG_GETARG_TEXT_PP(1);
1254  float4 res;
1255 
1257  VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1258  0);
1259 
1260  PG_FREE_IF_COPY(in1, 0);
1261  PG_FREE_IF_COPY(in2, 1);
1262  PG_RETURN_FLOAT4(1.0 - res);
1263 }
1264 
1265 Datum
1267 {
1268  text *in1 = PG_GETARG_TEXT_PP(0);
1269  text *in2 = PG_GETARG_TEXT_PP(1);
1270  float4 res;
1271 
1273  VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1274  0);
1275 
1276  PG_FREE_IF_COPY(in1, 0);
1277  PG_FREE_IF_COPY(in2, 1);
1278  PG_RETURN_FLOAT4(1.0 - res);
1279 }
1280 
1281 Datum
1283 {
1284  text *in1 = PG_GETARG_TEXT_PP(0);
1285  text *in2 = PG_GETARG_TEXT_PP(1);
1286  float4 res;
1287 
1289  VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1291 
1292  PG_FREE_IF_COPY(in1, 0);
1293  PG_FREE_IF_COPY(in2, 1);
1295 }
1296 
1297 Datum
1299 {
1300  text *in1 = PG_GETARG_TEXT_PP(0);
1301  text *in2 = PG_GETARG_TEXT_PP(1);
1302  float4 res;
1303 
1305  VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1307 
1308  PG_FREE_IF_COPY(in1, 0);
1309  PG_FREE_IF_COPY(in2, 1);
1311 }
1312 
1313 Datum
1315 {
1316  text *in1 = PG_GETARG_TEXT_PP(0);
1317  text *in2 = PG_GETARG_TEXT_PP(1);
1318  float4 res;
1319 
1321  VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1323 
1324  PG_FREE_IF_COPY(in1, 0);
1325  PG_FREE_IF_COPY(in2, 1);
1326  PG_RETURN_FLOAT4(1.0 - res);
1327 }
1328 
1329 Datum
1331 {
1332  text *in1 = PG_GETARG_TEXT_PP(0);
1333  text *in2 = PG_GETARG_TEXT_PP(1);
1334  float4 res;
1335 
1337  VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1339 
1340  PG_FREE_IF_COPY(in1, 0);
1341  PG_FREE_IF_COPY(in2, 1);
1342  PG_RETURN_FLOAT4(1.0 - res);
1343 }
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:326
uint32 trgm2int(trgm *ptr)
Definition: trgm_op.c:955
#define INIT_LEGACY_CRC32(crc)
Definition: pg_crc.h:79
#define ISWILDCARDCHAR(x)
Definition: trgm.h:63
#define PG_RETURN_FLOAT4(x)
Definition: fmgr.h:330
#define CMPTRGM(a, b)
Definition: trgm.h:45
void getTypeOutputInfo(Oid type, Oid *typOutput, bool *typIsVarlena)
Definition: lsyscache.c:2650
#define VARDATA_ANY(PTR)
Definition: postgres.h:348
#define VARDATA(PTR)
Definition: postgres.h:302
PG_FUNCTION_INFO_V1(set_limit)
double index_strategy_get_limit(StrategyNumber strategy)
Definition: trgm_op.c:132
Datum strict_word_similarity_dist_op(PG_FUNCTION_ARGS)
Definition: trgm_op.c:1314
#define SimilarityStrategyNumber
Definition: trgm.h:30
Datum lower(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:44
static float4 iterate_word_similarity(int *trg2indexes, bool *found, int ulen1, int len2, int len, uint8 flags, TrgmBound *bounds)
Definition: trgm_op.c:477
Datum word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
Definition: trgm_op.c:1266
int index
Definition: trgm_op.c:46
#define PointerGetDatum(X)
Definition: postgres.h:541
#define PG_GETARG_DATUM(n)
Definition: fmgr.h:238
#define VARHDRSZ
Definition: c.h:522
#define StrictWordSimilarityStrategyNumber
Definition: trgm.h:38
char * lowerstr_with_len(const char *str, int len)
Definition: ts_locale.c:241
unsigned char uint8
Definition: c.h:323
ArrayType * construct_array(Datum *elems, int nelems, Oid elmtype, int elmlen, bool elmbyval, char elmalign)
Definition: arrayfuncs.c:3279
#define ISWORDCHR(c)
Definition: trgm.h:54
#define ARRKEY
Definition: trgm.h:97
uint16 StrategyNumber
Definition: stratnum.h:22
int errcode(int sqlerrcode)
Definition: elog.c:575
Datum similarity_op(PG_FUNCTION_ARGS)
Definition: trgm_op.c:1208
static void protect_out_of_mem(int slen)
Definition: trgm_op.c:359
static trgm * make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
Definition: trgm_op.c:243
void _PG_init(void)
Definition: trgm_op.c:64
int snprintf(char *str, size_t count, const char *fmt,...) pg_attribute_printf(3
#define ARRNELEM(x)
Definition: trgm.h:108
TRGM * generate_trgm(char *str, int slen)
Definition: trgm_op.c:376
Datum upper(PG_FUNCTION_ARGS)
Definition: oracle_compat.c:75
unsigned int Oid
Definition: postgres_ext.h:31
static char * find_word(char *str, int lenstr, char **endword, int *charlen)
Definition: trgm_op.c:191
void DefineCustomRealVariable(const char *name, const char *short_desc, const char *long_desc, double *valueAddr, double bootValue, double minValue, double maxValue, GucContext context, int flags, GucRealCheckHook check_hook, GucRealAssignHook assign_hook, GucShowHook show_hook)
Definition: guc.c:8062
Datum strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)
Definition: trgm_op.c:1298
#define FIN_LEGACY_CRC32(crc)
Definition: pg_crc.h:80
#define TRGM_BOUND_RIGHT
Definition: trgm_op.c:52
#define WORD_SIMILARITY_CHECK_ONLY
Definition: trgm_op.c:55
Datum strict_word_similarity(PG_FUNCTION_ARGS)
Definition: trgm_op.c:1182
#define PG_GETARG_TEXT_PP(n)
Definition: fmgr.h:278
Definition: type.h:89
#define GETARR(x)
Definition: trgm.h:107
bool * trgm_presence_map(TRGM *query, TRGM *key)
Definition: trgm_op.c:1105
void pfree(void *pointer)
Definition: mcxt.c:1031
#define ERROR
Definition: elog.h:43
double strict_word_similarity_threshold
Definition: trgm_op.c:21
#define TRGM_BOUND_LEFT
Definition: trgm_op.c:51
Definition: trgm.h:66
static int comp_ptrgm(const void *v1, const void *v2)
Definition: trgm_op.c:444
#define ISPRINTABLETRGM(t)
Definition: trgm.h:60
Datum Float4GetDatum(float4 X)
Definition: fmgr.c:1889
void SetConfigOption(const char *name, const char *value, GucContext context, GucSource source)
Definition: guc.c:6917
static char * buf
Definition: pg_test_fsync.c:67
int pg_database_encoding_max_length(void)
Definition: wchar.c:1833
Datum show_trgm(PG_FUNCTION_ARGS)
Definition: trgm_op.c:969
unsigned int uint32
Definition: c.h:325
Datum word_similarity(PG_FUNCTION_ARGS)
Definition: trgm_op.c:1166
#define TRGMHDRSIZE
Definition: trgm.h:73
#define PG_GETARG_FLOAT4(n)
Definition: fmgr.h:250
#define ereport(elevel, rest)
Definition: elog.h:122
TRGM * generate_wildcard_trgm(const char *str, int slen)
Definition: trgm_op.c:886
PG_MODULE_MAGIC
Definition: trgm_op.c:16
#define MaxAllocSize
Definition: memutils.h:40
float float4
Definition: c.h:457
void * palloc0(Size size)
Definition: mcxt.c:955
#define PG_RETURN_BOOL(x)
Definition: fmgr.h:324
uintptr_t Datum
Definition: postgres.h:367
uint8 flag
Definition: trgm.h:69
#define WordSimilarityStrategyNumber
Definition: trgm.h:36
Datum set_limit(PG_FUNCTION_ARGS)
Definition: trgm_op.c:110
Datum strict_word_similarity_op(PG_FUNCTION_ARGS)
Definition: trgm_op.c:1282
char trgm[3]
Definition: trgm.h:41
#define Max(x, y)
Definition: c.h:851
#define CALCGTSIZE(flag)
Definition: hstore_gist.c:52
#define Assert(condition)
Definition: c.h:699
Datum similarity(PG_FUNCTION_ARGS)
Definition: trgm_op.c:1144
Datum similarity_dist(PG_FUNCTION_ARGS)
Definition: trgm_op.c:1198
#define DatumGetFloat4(X)
Definition: postgres.h:665
#define ISESCAPECHAR(x)
Definition: trgm.h:62
Datum word_similarity_op(PG_FUNCTION_ARGS)
Definition: trgm_op.c:1218
size_t Size
Definition: c.h:433
Datum show_limit(PG_FUNCTION_ARGS)
Definition: trgm_op.c:155
Datum word_similarity_dist_op(PG_FUNCTION_ARGS)
Definition: trgm_op.c:1250
#define PG_FREE_IF_COPY(ptr, n)
Definition: fmgr.h:230
#define RPADDING
Definition: trgm.h:17
int pg_mblen(const char *mbstr)
Definition: mbutils.c:760
static int unique_array(trgm *a, int len)
Definition: trgm_op.c:167
Datum word_similarity_commutator_op(PG_FUNCTION_ARGS)
Definition: trgm_op.c:1234
float4 cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
Definition: trgm_op.c:1018
#define DatumGetPointer(X)
Definition: postgres.h:534
#define WORD_SIMILARITY_STRICT
Definition: trgm_op.c:57
uint8 TrgmBound
Definition: trgm_op.c:50
trgm trg
Definition: trgm_op.c:45
char * OidOutputFunctionCall(Oid functionId, Datum val)
Definition: fmgr.c:1833
#define VARSIZE_ANY_EXHDR(PTR)
Definition: postgres.h:341
void * palloc(Size size)
Definition: mcxt.c:924
#define LPADDING
Definition: trgm.h:16
int errmsg(const char *fmt,...)
Definition: elog.c:797
double word_similarity_threshold
Definition: trgm_op.c:20
#define COMP_LEGACY_CRC32(crc, data, len)
Definition: pg_crc.h:81
int i
Datum strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
Definition: trgm_op.c:1330
static const char * get_wildcard_part(const char *str, int lenstr, char *buf, int *bytelen, int *charlen)
Definition: trgm_op.c:741
Definition: c.h:516
#define PG_FUNCTION_ARGS
Definition: fmgr.h:163
#define SET_VARSIZE(PTR, len)
Definition: postgres.h:329
uint32 pg_crc32
Definition: pg_crc.h:37
#define elog
Definition: elog.h:219
#define qsort(a, b, c, d)
Definition: port.h:421
double similarity_threshold
Definition: trgm_op.c:19
bool trgm_contained_by(TRGM *trg1, TRGM *trg2)
Definition: trgm_op.c:1066
static int generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
Definition: trgm_op.c:295
long val
Definition: informix.c:689
#define DirectFunctionCall2(func, arg1, arg2)
Definition: fmgr.h:592
static pos_trgm * make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2)
Definition: trgm_op.c:417
#define CPTRGM(a, b)
Definition: trgm.h:47
static float4 calc_word_similarity(char *str1, int slen1, char *str2, int slen2, uint8 flags)
Definition: trgm_op.c:641
static int cmp(const chr *x, const chr *y, size_t len)
Definition: regc_locale.c:742
void compact_trigram(trgm *tptr, char *str, int bytelen)
Definition: trgm_op.c:218
#define CALCSML(count, len1, len2)
Definition: trgm.h:117
static int comp_trgm(const void *a, const void *b)
Definition: trgm_op.c:161