PostgreSQL Source Code  git master
trgm_gist.c
Go to the documentation of this file.
1 /*
2  * contrib/pg_trgm/trgm_gist.c
3  */
4 #include "postgres.h"
5 
6 #include "access/reloptions.h"
7 #include "access/stratnum.h"
8 #include "fmgr.h"
9 #include "port/pg_bitutils.h"
10 #include "trgm.h"
11 
12 /* gist_trgm_ops opclass options */
13 typedef struct
14 {
15  int32 vl_len_; /* varlena header (do not touch directly!) */
16  int siglen; /* signature length in bytes */
18 
19 #define LTREE_GET_ASIGLEN() (PG_HAS_OPCLASS_OPTIONS() ? \
20  ((TrgmGistOptions *) PG_GET_OPCLASS_OPTIONS())->siglen : \
21  SIGLEN_DEFAULT)
22 
23 typedef struct
24 {
25  /* most recent inputs to gtrgm_consistent */
28  /* extracted trigrams for query */
30  /* if a regex operator, the extracted graph */
32 
33  /*
34  * The "query" and "trigrams" are stored in the same palloc block as this
35  * cache struct, at MAXALIGN'ed offsets. The graph however isn't.
36  */
38 
39 #define GETENTRY(vec,pos) ((TRGM *) DatumGetPointer((vec)->vector[(pos)].key))
40 
41 
53 
54 
55 Datum
57 {
58  elog(ERROR, "not implemented");
59  PG_RETURN_DATUM(0);
60 }
61 
62 Datum
64 {
65  elog(ERROR, "not implemented");
66  PG_RETURN_DATUM(0);
67 }
68 
69 static TRGM *
70 gtrgm_alloc(bool isalltrue, int siglen, BITVECP sign)
71 {
72  int flag = SIGNKEY | (isalltrue ? ALLISTRUE : 0);
73  int size = CALCGTSIZE(flag, siglen);
74  TRGM *res = palloc(size);
75 
76  SET_VARSIZE(res, size);
77  res->flag = flag;
78 
79  if (!isalltrue)
80  {
81  if (sign)
82  memcpy(GETSIGN(res), sign, siglen);
83  else
84  memset(GETSIGN(res), 0, siglen);
85  }
86 
87  return res;
88 }
89 
90 static void
91 makesign(BITVECP sign, TRGM *a, int siglen)
92 {
93  int32 k,
94  len = ARRNELEM(a);
95  trgm *ptr = GETARR(a);
96  int32 tmp = 0;
97 
98  MemSet((void *) sign, 0, siglen);
99  SETBIT(sign, SIGLENBIT(siglen)); /* set last unused bit */
100  for (k = 0; k < len; k++)
101  {
102  CPTRGM(((char *) &tmp), ptr + k);
103  HASH(sign, tmp, siglen);
104  }
105 }
106 
107 Datum
109 {
110  GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
111  int siglen = LTREE_GET_ASIGLEN();
112  GISTENTRY *retval = entry;
113 
114  if (entry->leafkey)
115  { /* trgm */
116  TRGM *res;
117  text *val = DatumGetTextPP(entry->key);
118 
119  res = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val));
120  retval = (GISTENTRY *) palloc(sizeof(GISTENTRY));
121  gistentryinit(*retval, PointerGetDatum(res),
122  entry->rel, entry->page,
123  entry->offset, false);
124  }
125  else if (ISSIGNKEY(DatumGetPointer(entry->key)) &&
126  !ISALLTRUE(DatumGetPointer(entry->key)))
127  {
128  int32 i;
129  TRGM *res;
131 
132  LOOPBYTE(siglen)
133  {
134  if ((sign[i] & 0xff) != 0xff)
135  PG_RETURN_POINTER(retval);
136  }
137 
138  res = gtrgm_alloc(true, siglen, sign);
139  retval = (GISTENTRY *) palloc(sizeof(GISTENTRY));
140  gistentryinit(*retval, PointerGetDatum(res),
141  entry->rel, entry->page,
142  entry->offset, false);
143  }
144  PG_RETURN_POINTER(retval);
145 }
146 
147 Datum
149 {
150  GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
151  GISTENTRY *retval;
152  text *key;
153 
154  key = DatumGetTextPP(entry->key);
155 
156  if (key != (text *) DatumGetPointer(entry->key))
157  {
158  /* need to pass back the decompressed item */
159  retval = palloc(sizeof(GISTENTRY));
160  gistentryinit(*retval, PointerGetDatum(key),
161  entry->rel, entry->page, entry->offset, entry->leafkey);
162  PG_RETURN_POINTER(retval);
163  }
164  else
165  {
166  /* we can return the entry as-is */
167  PG_RETURN_POINTER(entry);
168  }
169 }
170 
171 static int32
173 {
174  int32 count = 0;
175  int32 k,
176  len = ARRNELEM(qtrg);
177  trgm *ptr = GETARR(qtrg);
178  int32 tmp = 0;
179 
180  for (k = 0; k < len; k++)
181  {
182  CPTRGM(((char *) &tmp), ptr + k);
183  count += GETBIT(sign, HASHVAL(tmp, siglen));
184  }
185 
186  return count;
187 }
188 
189 Datum
191 {
192  GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
193  text *query = PG_GETARG_TEXT_P(1);
195 
196  /* Oid subtype = PG_GETARG_OID(3); */
197  bool *recheck = (bool *) PG_GETARG_POINTER(4);
198  int siglen = LTREE_GET_ASIGLEN();
199  TRGM *key = (TRGM *) DatumGetPointer(entry->key);
200  TRGM *qtrg;
201  bool res;
202  Size querysize = VARSIZE(query);
203  gtrgm_consistent_cache *cache;
204  double nlimit;
205 
206  /*
207  * We keep the extracted trigrams in cache, because trigram extraction is
208  * relatively CPU-expensive. When trying to reuse a cached value, check
209  * strategy number not just query itself, because trigram extraction
210  * depends on strategy.
211  *
212  * The cached structure is a single palloc chunk containing the
213  * gtrgm_consistent_cache header, then the input query (4-byte length
214  * word, uncompressed, starting at a MAXALIGN boundary), then the TRGM
215  * value (also starting at a MAXALIGN boundary). However we don't try to
216  * include the regex graph (if any) in that struct. (XXX currently, this
217  * approach can leak regex graphs across index rescans. Not clear if
218  * that's worth fixing.)
219  */
220  cache = (gtrgm_consistent_cache *) fcinfo->flinfo->fn_extra;
221  if (cache == NULL ||
222  cache->strategy != strategy ||
223  VARSIZE(cache->query) != querysize ||
224  memcmp((char *) cache->query, (char *) query, querysize) != 0)
225  {
226  gtrgm_consistent_cache *newcache;
227  TrgmPackedGraph *graph = NULL;
228  Size qtrgsize;
229 
230  switch (strategy)
231  {
235  qtrg = generate_trgm(VARDATA(query),
236  querysize - VARHDRSZ);
237  break;
238  case ILikeStrategyNumber:
239 #ifndef IGNORECASE
240  elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
241 #endif
242  /* FALL THRU */
243  case LikeStrategyNumber:
244  qtrg = generate_wildcard_trgm(VARDATA(query),
245  querysize - VARHDRSZ);
246  break;
248 #ifndef IGNORECASE
249  elog(ERROR, "cannot handle ~* with case-sensitive trigrams");
250 #endif
251  /* FALL THRU */
253  qtrg = createTrgmNFA(query, PG_GET_COLLATION(),
254  &graph, fcinfo->flinfo->fn_mcxt);
255  /* just in case an empty array is returned ... */
256  if (qtrg && ARRNELEM(qtrg) <= 0)
257  {
258  pfree(qtrg);
259  qtrg = NULL;
260  }
261  break;
262  default:
263  elog(ERROR, "unrecognized strategy number: %d", strategy);
264  qtrg = NULL; /* keep compiler quiet */
265  break;
266  }
267 
268  qtrgsize = qtrg ? VARSIZE(qtrg) : 0;
269 
270  newcache = (gtrgm_consistent_cache *)
271  MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
273  MAXALIGN(querysize) +
274  qtrgsize);
275 
276  newcache->strategy = strategy;
277  newcache->query = (text *)
278  ((char *) newcache + MAXALIGN(sizeof(gtrgm_consistent_cache)));
279  memcpy((char *) newcache->query, (char *) query, querysize);
280  if (qtrg)
281  {
282  newcache->trigrams = (TRGM *)
283  ((char *) newcache->query + MAXALIGN(querysize));
284  memcpy((char *) newcache->trigrams, (char *) qtrg, qtrgsize);
285  /* release qtrg in case it was made in fn_mcxt */
286  pfree(qtrg);
287  }
288  else
289  newcache->trigrams = NULL;
290  newcache->graph = graph;
291 
292  if (cache)
293  pfree(cache);
294  fcinfo->flinfo->fn_extra = (void *) newcache;
295  cache = newcache;
296  }
297 
298  qtrg = cache->trigrams;
299 
300  switch (strategy)
301  {
305 
306  /*
307  * Similarity search is exact. (Strict) word similarity search is
308  * inexact
309  */
310  *recheck = (strategy != SimilarityStrategyNumber);
311 
312  nlimit = index_strategy_get_limit(strategy);
313 
314  if (GIST_LEAF(entry))
315  { /* all leafs contains orig trgm */
316  double tmpsml = cnt_sml(qtrg, key, *recheck);
317 
318  res = (tmpsml >= nlimit);
319  }
320  else if (ISALLTRUE(key))
321  { /* non-leaf contains signature */
322  res = true;
323  }
324  else
325  { /* non-leaf contains signature */
326  int32 count = cnt_sml_sign_common(qtrg, GETSIGN(key), siglen);
327  int32 len = ARRNELEM(qtrg);
328 
329  if (len == 0)
330  res = false;
331  else
332  res = (((((float8) count) / ((float8) len))) >= nlimit);
333  }
334  break;
335  case ILikeStrategyNumber:
336 #ifndef IGNORECASE
337  elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
338 #endif
339  /* FALL THRU */
340  case LikeStrategyNumber:
341  /* Wildcard search is inexact */
342  *recheck = true;
343 
344  /*
345  * Check if all the extracted trigrams can be present in child
346  * nodes.
347  */
348  if (GIST_LEAF(entry))
349  { /* all leafs contains orig trgm */
350  res = trgm_contained_by(qtrg, key);
351  }
352  else if (ISALLTRUE(key))
353  { /* non-leaf contains signature */
354  res = true;
355  }
356  else
357  { /* non-leaf contains signature */
358  int32 k,
359  tmp = 0,
360  len = ARRNELEM(qtrg);
361  trgm *ptr = GETARR(qtrg);
362  BITVECP sign = GETSIGN(key);
363 
364  res = true;
365  for (k = 0; k < len; k++)
366  {
367  CPTRGM(((char *) &tmp), ptr + k);
368  if (!GETBIT(sign, HASHVAL(tmp, siglen)))
369  {
370  res = false;
371  break;
372  }
373  }
374  }
375  break;
377 #ifndef IGNORECASE
378  elog(ERROR, "cannot handle ~* with case-sensitive trigrams");
379 #endif
380  /* FALL THRU */
382  /* Regexp search is inexact */
383  *recheck = true;
384 
385  /* Check regex match as much as we can with available info */
386  if (qtrg)
387  {
388  if (GIST_LEAF(entry))
389  { /* all leafs contains orig trgm */
390  bool *check;
391 
392  check = trgm_presence_map(qtrg, key);
393  res = trigramsMatchGraph(cache->graph, check);
394  pfree(check);
395  }
396  else if (ISALLTRUE(key))
397  { /* non-leaf contains signature */
398  res = true;
399  }
400  else
401  { /* non-leaf contains signature */
402  int32 k,
403  tmp = 0,
404  len = ARRNELEM(qtrg);
405  trgm *ptr = GETARR(qtrg);
406  BITVECP sign = GETSIGN(key);
407  bool *check;
408 
409  /*
410  * GETBIT() tests may give false positives, due to limited
411  * size of the sign array. But since trigramsMatchGraph()
412  * implements a monotone boolean function, false positives
413  * in the check array can't lead to false negative answer.
414  * So we can apply trigramsMatchGraph despite uncertainty,
415  * and that usefully improves the quality of the search.
416  */
417  check = (bool *) palloc(len * sizeof(bool));
418  for (k = 0; k < len; k++)
419  {
420  CPTRGM(((char *) &tmp), ptr + k);
421  check[k] = GETBIT(sign, HASHVAL(tmp, siglen));
422  }
423  res = trigramsMatchGraph(cache->graph, check);
424  pfree(check);
425  }
426  }
427  else
428  {
429  /* trigram-free query must be rechecked everywhere */
430  res = true;
431  }
432  break;
433  default:
434  elog(ERROR, "unrecognized strategy number: %d", strategy);
435  res = false; /* keep compiler quiet */
436  break;
437  }
438 
439  PG_RETURN_BOOL(res);
440 }
441 
442 Datum
444 {
445  GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
446  text *query = PG_GETARG_TEXT_P(1);
448 
449  /* Oid subtype = PG_GETARG_OID(3); */
450  bool *recheck = (bool *) PG_GETARG_POINTER(4);
451  int siglen = LTREE_GET_ASIGLEN();
452  TRGM *key = (TRGM *) DatumGetPointer(entry->key);
453  TRGM *qtrg;
454  float8 res;
455  Size querysize = VARSIZE(query);
456  char *cache = (char *) fcinfo->flinfo->fn_extra;
457 
458  /*
459  * Cache the generated trigrams across multiple calls with the same query.
460  */
461  if (cache == NULL ||
462  VARSIZE(cache) != querysize ||
463  memcmp(cache, query, querysize) != 0)
464  {
465  char *newcache;
466 
467  qtrg = generate_trgm(VARDATA(query), querysize - VARHDRSZ);
468 
469  newcache = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
470  MAXALIGN(querysize) +
471  VARSIZE(qtrg));
472 
473  memcpy(newcache, query, querysize);
474  memcpy(newcache + MAXALIGN(querysize), qtrg, VARSIZE(qtrg));
475 
476  if (cache)
477  pfree(cache);
478  fcinfo->flinfo->fn_extra = newcache;
479  cache = newcache;
480  }
481 
482  qtrg = (TRGM *) (cache + MAXALIGN(querysize));
483 
484  switch (strategy)
485  {
489  /* Only plain trigram distance is exact */
490  *recheck = (strategy != DistanceStrategyNumber);
491  if (GIST_LEAF(entry))
492  { /* all leafs contains orig trgm */
493 
494  /*
495  * Prevent gcc optimizing the sml variable using volatile
496  * keyword. Otherwise res can differ from the
497  * word_similarity_dist_op() function.
498  */
499  float4 volatile sml = cnt_sml(qtrg, key, *recheck);
500 
501  res = 1.0 - sml;
502  }
503  else if (ISALLTRUE(key))
504  { /* all leafs contains orig trgm */
505  res = 0.0;
506  }
507  else
508  { /* non-leaf contains signature */
509  int32 count = cnt_sml_sign_common(qtrg, GETSIGN(key), siglen);
510  int32 len = ARRNELEM(qtrg);
511 
512  res = (len == 0) ? -1.0 : 1.0 - ((float8) count) / ((float8) len);
513  }
514  break;
515  default:
516  elog(ERROR, "unrecognized strategy number: %d", strategy);
517  res = 0; /* keep compiler quiet */
518  break;
519  }
520 
521  PG_RETURN_FLOAT8(res);
522 }
523 
524 static int32
525 unionkey(BITVECP sbase, TRGM *add, int siglen)
526 {
527  int32 i;
528 
529  if (ISSIGNKEY(add))
530  {
531  BITVECP sadd = GETSIGN(add);
532 
533  if (ISALLTRUE(add))
534  return 1;
535 
536  LOOPBYTE(siglen)
537  sbase[i] |= sadd[i];
538  }
539  else
540  {
541  trgm *ptr = GETARR(add);
542  int32 tmp = 0;
543 
544  for (i = 0; i < ARRNELEM(add); i++)
545  {
546  CPTRGM(((char *) &tmp), ptr + i);
547  HASH(sbase, tmp, siglen);
548  }
549  }
550  return 0;
551 }
552 
553 
554 Datum
556 {
558  int32 len = entryvec->n;
559  int *size = (int *) PG_GETARG_POINTER(1);
560  int siglen = LTREE_GET_ASIGLEN();
561  int32 i;
562  TRGM *result = gtrgm_alloc(false, siglen, NULL);
563  BITVECP base = GETSIGN(result);
564 
565  for (i = 0; i < len; i++)
566  {
567  if (unionkey(base, GETENTRY(entryvec, i), siglen))
568  {
569  result->flag = ALLISTRUE;
570  SET_VARSIZE(result, CALCGTSIZE(ALLISTRUE, siglen));
571  break;
572  }
573  }
574 
575  *size = VARSIZE(result);
576 
577  PG_RETURN_POINTER(result);
578 }
579 
580 Datum
582 {
583  TRGM *a = (TRGM *) PG_GETARG_POINTER(0);
584  TRGM *b = (TRGM *) PG_GETARG_POINTER(1);
585  bool *result = (bool *) PG_GETARG_POINTER(2);
586  int siglen = LTREE_GET_ASIGLEN();
587 
588  if (ISSIGNKEY(a))
589  { /* then b also ISSIGNKEY */
590  if (ISALLTRUE(a) && ISALLTRUE(b))
591  *result = true;
592  else if (ISALLTRUE(a))
593  *result = false;
594  else if (ISALLTRUE(b))
595  *result = false;
596  else
597  {
598  int32 i;
599  BITVECP sa = GETSIGN(a),
600  sb = GETSIGN(b);
601 
602  *result = true;
603  LOOPBYTE(siglen)
604  {
605  if (sa[i] != sb[i])
606  {
607  *result = false;
608  break;
609  }
610  }
611  }
612  }
613  else
614  { /* a and b ISARRKEY */
615  int32 lena = ARRNELEM(a),
616  lenb = ARRNELEM(b);
617 
618  if (lena != lenb)
619  *result = false;
620  else
621  {
622  trgm *ptra = GETARR(a),
623  *ptrb = GETARR(b);
624  int32 i;
625 
626  *result = true;
627  for (i = 0; i < lena; i++)
628  if (CMPTRGM(ptra + i, ptrb + i))
629  {
630  *result = false;
631  break;
632  }
633  }
634  }
635 
636  PG_RETURN_POINTER(result);
637 }
638 
639 static int32
640 sizebitvec(BITVECP sign, int siglen)
641 {
642  return pg_popcount(sign, siglen);
643 }
644 
645 static int
646 hemdistsign(BITVECP a, BITVECP b, int siglen)
647 {
648  int i,
649  diff,
650  dist = 0;
651 
652  LOOPBYTE(siglen)
653  {
654  diff = (unsigned char) (a[i] ^ b[i]);
655  /* Using the popcount functions here isn't likely to win */
656  dist += pg_number_of_ones[diff];
657  }
658  return dist;
659 }
660 
661 static int
662 hemdist(TRGM *a, TRGM *b, int siglen)
663 {
664  if (ISALLTRUE(a))
665  {
666  if (ISALLTRUE(b))
667  return 0;
668  else
669  return SIGLENBIT(siglen) - sizebitvec(GETSIGN(b), siglen);
670  }
671  else if (ISALLTRUE(b))
672  return SIGLENBIT(siglen) - sizebitvec(GETSIGN(a), siglen);
673 
674  return hemdistsign(GETSIGN(a), GETSIGN(b), siglen);
675 }
676 
677 Datum
679 {
680  GISTENTRY *origentry = (GISTENTRY *) PG_GETARG_POINTER(0); /* always ISSIGNKEY */
681  GISTENTRY *newentry = (GISTENTRY *) PG_GETARG_POINTER(1);
682  float *penalty = (float *) PG_GETARG_POINTER(2);
683  int siglen = LTREE_GET_ASIGLEN();
684  TRGM *origval = (TRGM *) DatumGetPointer(origentry->key);
685  TRGM *newval = (TRGM *) DatumGetPointer(newentry->key);
686  BITVECP orig = GETSIGN(origval);
687 
688  *penalty = 0.0;
689 
690  if (ISARRKEY(newval))
691  {
692  char *cache = (char *) fcinfo->flinfo->fn_extra;
693  TRGM *cachedVal = (TRGM *) (cache + MAXALIGN(siglen));
694  Size newvalsize = VARSIZE(newval);
695  BITVECP sign;
696 
697  /*
698  * Cache the sign data across multiple calls with the same newval.
699  */
700  if (cache == NULL ||
701  VARSIZE(cachedVal) != newvalsize ||
702  memcmp(cachedVal, newval, newvalsize) != 0)
703  {
704  char *newcache;
705 
706  newcache = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
707  MAXALIGN(siglen) +
708  newvalsize);
709 
710  makesign((BITVECP) newcache, newval, siglen);
711 
712  cachedVal = (TRGM *) (newcache + MAXALIGN(siglen));
713  memcpy(cachedVal, newval, newvalsize);
714 
715  if (cache)
716  pfree(cache);
717  fcinfo->flinfo->fn_extra = newcache;
718  cache = newcache;
719  }
720 
721  sign = (BITVECP) cache;
722 
723  if (ISALLTRUE(origval))
724  *penalty = ((float) (SIGLENBIT(siglen) - sizebitvec(sign, siglen))) / (float) (SIGLENBIT(siglen) + 1);
725  else
726  *penalty = hemdistsign(sign, orig, siglen);
727  }
728  else
729  *penalty = hemdist(origval, newval, siglen);
730  PG_RETURN_POINTER(penalty);
731 }
732 
733 typedef struct
734 {
735  bool allistrue;
737 } CACHESIGN;
738 
739 static void
740 fillcache(CACHESIGN *item, TRGM *key, BITVECP sign, int siglen)
741 {
742  item->allistrue = false;
743  item->sign = sign;
744  if (ISARRKEY(key))
745  makesign(item->sign, key, siglen);
746  else if (ISALLTRUE(key))
747  item->allistrue = true;
748  else
749  memcpy((void *) item->sign, (void *) GETSIGN(key), siglen);
750 }
751 
752 #define WISH_F(a,b,c) (double)( -(double)(((a)-(b))*((a)-(b))*((a)-(b)))*(c) )
753 typedef struct
754 {
755  OffsetNumber pos;
756  int32 cost;
757 } SPLITCOST;
758 
759 static int
760 comparecost(const void *a, const void *b)
761 {
762  if (((const SPLITCOST *) a)->cost == ((const SPLITCOST *) b)->cost)
763  return 0;
764  else
765  return (((const SPLITCOST *) a)->cost > ((const SPLITCOST *) b)->cost) ? 1 : -1;
766 }
767 
768 
769 static int
770 hemdistcache(CACHESIGN *a, CACHESIGN *b, int siglen)
771 {
772  if (a->allistrue)
773  {
774  if (b->allistrue)
775  return 0;
776  else
777  return SIGLENBIT(siglen) - sizebitvec(b->sign, siglen);
778  }
779  else if (b->allistrue)
780  return SIGLENBIT(siglen) - sizebitvec(a->sign, siglen);
781 
782  return hemdistsign(a->sign, b->sign, siglen);
783 }
784 
785 Datum
787 {
789  OffsetNumber maxoff = entryvec->n - 2;
791  int siglen = LTREE_GET_ASIGLEN();
792  OffsetNumber k,
793  j;
794  TRGM *datum_l,
795  *datum_r;
796  BITVECP union_l,
797  union_r;
798  int32 size_alpha,
799  size_beta;
800  int32 size_waste,
801  waste = -1;
802  int32 nbytes;
803  OffsetNumber seed_1 = 0,
804  seed_2 = 0;
805  OffsetNumber *left,
806  *right;
807  BITVECP ptr;
808  int i;
809  CACHESIGN *cache;
810  char *cache_sign;
811  SPLITCOST *costvector;
812 
813  /* cache the sign data for each existing item */
814  cache = (CACHESIGN *) palloc(sizeof(CACHESIGN) * (maxoff + 2));
815  cache_sign = palloc(siglen * (maxoff + 2));
816 
817  for (k = FirstOffsetNumber; k <= maxoff; k = OffsetNumberNext(k))
818  fillcache(&cache[k], GETENTRY(entryvec, k), &cache_sign[siglen * k],
819  siglen);
820 
821  /* now find the two furthest-apart items */
822  for (k = FirstOffsetNumber; k < maxoff; k = OffsetNumberNext(k))
823  {
824  for (j = OffsetNumberNext(k); j <= maxoff; j = OffsetNumberNext(j))
825  {
826  size_waste = hemdistcache(&(cache[j]), &(cache[k]), siglen);
827  if (size_waste > waste)
828  {
829  waste = size_waste;
830  seed_1 = k;
831  seed_2 = j;
832  }
833  }
834  }
835 
836  /* just in case we didn't make a selection ... */
837  if (seed_1 == 0 || seed_2 == 0)
838  {
839  seed_1 = 1;
840  seed_2 = 2;
841  }
842 
843  /* initialize the result vectors */
844  nbytes = (maxoff + 2) * sizeof(OffsetNumber);
845  v->spl_left = left = (OffsetNumber *) palloc(nbytes);
846  v->spl_right = right = (OffsetNumber *) palloc(nbytes);
847  v->spl_nleft = 0;
848  v->spl_nright = 0;
849 
850  /* form initial .. */
851  datum_l = gtrgm_alloc(cache[seed_1].allistrue, siglen, cache[seed_1].sign);
852  datum_r = gtrgm_alloc(cache[seed_2].allistrue, siglen, cache[seed_2].sign);
853 
854  union_l = GETSIGN(datum_l);
855  union_r = GETSIGN(datum_r);
856  maxoff = OffsetNumberNext(maxoff);
857  fillcache(&cache[maxoff], GETENTRY(entryvec, maxoff),
858  &cache_sign[siglen * maxoff], siglen);
859 
860  /* sort before ... */
861  costvector = (SPLITCOST *) palloc(sizeof(SPLITCOST) * maxoff);
862  for (j = FirstOffsetNumber; j <= maxoff; j = OffsetNumberNext(j))
863  {
864  costvector[j - 1].pos = j;
865  size_alpha = hemdistcache(&(cache[seed_1]), &(cache[j]), siglen);
866  size_beta = hemdistcache(&(cache[seed_2]), &(cache[j]), siglen);
867  costvector[j - 1].cost = abs(size_alpha - size_beta);
868  }
869  qsort((void *) costvector, maxoff, sizeof(SPLITCOST), comparecost);
870 
871  for (k = 0; k < maxoff; k++)
872  {
873  j = costvector[k].pos;
874  if (j == seed_1)
875  {
876  *left++ = j;
877  v->spl_nleft++;
878  continue;
879  }
880  else if (j == seed_2)
881  {
882  *right++ = j;
883  v->spl_nright++;
884  continue;
885  }
886 
887  if (ISALLTRUE(datum_l) || cache[j].allistrue)
888  {
889  if (ISALLTRUE(datum_l) && cache[j].allistrue)
890  size_alpha = 0;
891  else
892  size_alpha = SIGLENBIT(siglen) -
893  sizebitvec((cache[j].allistrue) ? GETSIGN(datum_l) :
894  GETSIGN(cache[j].sign),
895  siglen);
896  }
897  else
898  size_alpha = hemdistsign(cache[j].sign, GETSIGN(datum_l), siglen);
899 
900  if (ISALLTRUE(datum_r) || cache[j].allistrue)
901  {
902  if (ISALLTRUE(datum_r) && cache[j].allistrue)
903  size_beta = 0;
904  else
905  size_beta = SIGLENBIT(siglen) -
906  sizebitvec((cache[j].allistrue) ? GETSIGN(datum_r) :
907  GETSIGN(cache[j].sign),
908  siglen);
909  }
910  else
911  size_beta = hemdistsign(cache[j].sign, GETSIGN(datum_r), siglen);
912 
913  if (size_alpha < size_beta + WISH_F(v->spl_nleft, v->spl_nright, 0.1))
914  {
915  if (ISALLTRUE(datum_l) || cache[j].allistrue)
916  {
917  if (!ISALLTRUE(datum_l))
918  MemSet((void *) GETSIGN(datum_l), 0xff, siglen);
919  }
920  else
921  {
922  ptr = cache[j].sign;
923  LOOPBYTE(siglen)
924  union_l[i] |= ptr[i];
925  }
926  *left++ = j;
927  v->spl_nleft++;
928  }
929  else
930  {
931  if (ISALLTRUE(datum_r) || cache[j].allistrue)
932  {
933  if (!ISALLTRUE(datum_r))
934  MemSet((void *) GETSIGN(datum_r), 0xff, siglen);
935  }
936  else
937  {
938  ptr = cache[j].sign;
939  LOOPBYTE(siglen)
940  union_r[i] |= ptr[i];
941  }
942  *right++ = j;
943  v->spl_nright++;
944  }
945  }
946 
947  *right = *left = FirstOffsetNumber;
948  v->spl_ldatum = PointerGetDatum(datum_l);
949  v->spl_rdatum = PointerGetDatum(datum_r);
950 
952 }
953 
954 Datum
956 {
958 
959  init_local_reloptions(relopts, sizeof(TrgmGistOptions));
960  add_local_int_reloption(relopts, "siglen",
961  "signature length in bytes",
963  offsetof(TrgmGistOptions, siglen));
964 
965  PG_RETURN_VOID();
966 }
#define GIST_LEAF(entry)
Definition: gist.h:162
Relation rel
Definition: gist.h:153
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:360
OffsetNumber pos
Definition: hstore_gist.c:346
#define SETBIT(x, i)
Definition: blutils.c:32
bool trigramsMatchGraph(TrgmPackedGraph *graph, bool *check)
Definition: trgm_regexp.c:636
#define StrictWordDistanceStrategyNumber
Definition: trgm.h:39
static int32 sizebitvec(BITVECP sign, int siglen)
Definition: trgm_gist.c:640
void init_local_reloptions(local_relopts *opts, Size relopt_struct_size)
Definition: reloptions.c:710
#define CMPTRGM(a, b)
Definition: trgm.h:45
#define LOOPBYTE(siglen)
Definition: hstore_gist.c:32
#define VARDATA_ANY(PTR)
Definition: postgres.h:348
#define VARDATA(PTR)
Definition: postgres.h:302
#define RegExpStrategyNumber
Definition: trgm.h:34
TRGM * generate_wildcard_trgm(const char *str, int slen)
Definition: trgm_op.c:866
#define SimilarityStrategyNumber
Definition: trgm.h:30
#define VARSIZE(PTR)
Definition: postgres.h:303
#define ISSIGNKEY(x)
Definition: trgm.h:101
#define GETSIGN(x)
Definition: hstore_gist.c:61
#define PointerGetDatum(X)
Definition: postgres.h:556
#define DistanceStrategyNumber
Definition: trgm.h:31
#define VARHDRSZ
Definition: c.h:568
PG_FUNCTION_INFO_V1(gtrgm_in)
TRGM * createTrgmNFA(text *text_re, Oid collation, TrgmPackedGraph **graph, MemoryContext rcontext)
Definition: trgm_regexp.c:521
#define StrictWordSimilarityStrategyNumber
Definition: trgm.h:38
static int32 unionkey(BITVECP sbase, TRGM *add, int siglen)
Definition: trgm_gist.c:525
#define RegExpICaseStrategyNumber
Definition: trgm.h:35
#define DatumGetTextPP(X)
Definition: fmgr.h:291
#define SIGLEN_MAX
Definition: hstore_gist.c:23
#define WordDistanceStrategyNumber
Definition: trgm.h:37
#define PG_RETURN_FLOAT8(x)
Definition: fmgr.h:365
OffsetNumber * spl_left
Definition: gist.h:134
Datum spl_rdatum
Definition: gist.h:141
Datum gtrgm_union(PG_FUNCTION_ARGS)
Definition: trgm_gist.c:555
int32 n
Definition: gist.h:227
static int comparecost(const void *a, const void *b)
Definition: trgm_gist.c:760
uint16 StrategyNumber
Definition: stratnum.h:22
float4 cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
Definition: trgm_op.c:996
#define ALLISTRUE
Definition: hstore_gist.c:54
#define MemSet(start, val, len)
Definition: c.h:949
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276
#define ARRNELEM(x)
Definition: trgm.h:107
bool * trgm_presence_map(TRGM *query, TRGM *key)
Definition: trgm_op.c:1083
int spl_nleft
Definition: gist.h:135
Datum gtrgm_same(PG_FUNCTION_ARGS)
Definition: trgm_gist.c:581
void add_local_int_reloption(local_relopts *relopts, const char *name, const char *desc, int default_val, int min_val, int max_val, int offset)
Definition: reloptions.c:894
#define PG_GET_COLLATION()
Definition: fmgr.h:198
signed int int32
Definition: c.h:362
uint64 pg_popcount(const char *buf, int bytes)
Definition: pg_bitutils.c:282
int32 cost
Definition: hstore_gist.c:347
#define WISH_F(a, b, c)
Definition: trgm_gist.c:752
uint16 OffsetNumber
Definition: off.h:24
Page page
Definition: gist.h:154
#define ISARRKEY(x)
Definition: trgm.h:100
#define CALCGTSIZE(flag, siglen)
Definition: hstore_gist.c:59
#define GETARR(x)
Definition: trgm.h:106
#define HASH(sign, val, siglen)
Definition: hstore_gist.c:45
Datum gtrgm_distance(PG_FUNCTION_ARGS)
Definition: trgm_gist.c:443
void pfree(void *pointer)
Definition: mcxt.c:1057
#define GETBIT(x, i)
Definition: blutils.c:33
#define ERROR
Definition: elog.h:43
double float8
Definition: c.h:498
TrgmPackedGraph * graph
Definition: trgm_gist.c:31
Definition: trgm.h:66
int spl_nright
Definition: gist.h:140
Datum gtrgm_options(PG_FUNCTION_ARGS)
Definition: trgm_gist.c:955
Datum gtrgm_consistent(PG_FUNCTION_ARGS)
Definition: trgm_gist.c:190
static void fillcache(CACHESIGN *item, TRGM *key, BITVECP sign, int siglen)
Definition: trgm_gist.c:740
char sign
Definition: informix.c:668
StrategyNumber strategy
Definition: trgm_gist.c:26
Datum key
Definition: gist.h:152
#define LikeStrategyNumber
Definition: trgm.h:32
#define FirstOffsetNumber
Definition: off.h:27
char * flag(int b)
Definition: test-ctype.c:33
double index_strategy_get_limit(StrategyNumber strategy)
Definition: trgm_op.c:132
bool trgm_contained_by(TRGM *trg1, TRGM *trg2)
Definition: trgm_op.c:1044
#define SIGLEN_DEFAULT
Definition: hstore_gist.c:22
static int hemdistcache(CACHESIGN *a, CACHESIGN *b, int siglen)
Definition: trgm_gist.c:770
bool leafkey
Definition: gist.h:156
static int hemdist(TRGM *a, TRGM *b, int siglen)
Definition: trgm_gist.c:662
Datum gtrgm_out(PG_FUNCTION_ARGS)
Definition: trgm_gist.c:63
float float4
Definition: c.h:497
#define SIGNKEY
Definition: trgm.h:97
#define PG_RETURN_BOOL(x)
Definition: fmgr.h:358
uintptr_t Datum
Definition: postgres.h:367
#define GETENTRY(vec, pos)
Definition: trgm_gist.c:39
#define PG_RETURN_DATUM(x)
Definition: fmgr.h:352
static int hemdistsign(BITVECP a, BITVECP b, int siglen)
Definition: trgm_gist.c:646
uint8 flag
Definition: trgm.h:69
Datum gtrgm_penalty(PG_FUNCTION_ARGS)
Definition: trgm_gist.c:678
Datum gtrgm_picksplit(PG_FUNCTION_ARGS)
Definition: trgm_gist.c:786
#define ISALLTRUE(x)
Definition: hstore_gist.c:56
#define WordSimilarityStrategyNumber
Definition: trgm.h:36
#define HASHVAL(val, siglen)
Definition: hstore_gist.c:44
#define PG_RETURN_VOID()
Definition: fmgr.h:348
char trgm[3]
Definition: trgm.h:41
Datum spl_ldatum
Definition: gist.h:136
#define SIGLENBIT(siglen)
Definition: hstore_gist.c:24
#define gistentryinit(e, k, r, pg, o, l)
Definition: gist.h:236
Datum gtrgm_decompress(PG_FUNCTION_ARGS)
Definition: trgm_gist.c:148
TRGM * generate_trgm(char *str, int slen)
Definition: trgm_op.c:356
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
size_t Size
Definition: c.h:473
#define newval
OffsetNumber * spl_right
Definition: gist.h:139
#define MAXALIGN(LEN)
Definition: c.h:698
#define PG_GETARG_UINT16(n)
Definition: fmgr.h:272
#define DatumGetPointer(X)
Definition: postgres.h:549
PGDLLIMPORT const uint8 pg_number_of_ones[256]
Definition: pg_bitutils.c:87
#define LTREE_GET_ASIGLEN()
Definition: trgm_gist.c:19
#define VARSIZE_ANY_EXHDR(PTR)
Definition: postgres.h:341
void * palloc(Size size)
Definition: mcxt.c:950
#define PG_GETARG_TEXT_P(n)
Definition: fmgr.h:335
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:797
#define elog(elevel,...)
Definition: elog.h:214
int i
static int32 cnt_sml_sign_common(TRGM *qtrg, BITVECP sign, int siglen)
Definition: trgm_gist.c:172
Definition: c.h:562
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
#define SET_VARSIZE(PTR, len)
Definition: postgres.h:329
char * BITVECP
Definition: hstore_gist.c:30
#define ILikeStrategyNumber
Definition: trgm.h:33
bool allistrue
Definition: trgm_gist.c:735
static void makesign(BITVECP sign, TRGM *a, int siglen)
Definition: trgm_gist.c:91
BITVECP sign
Definition: trgm_gist.c:736
#define qsort(a, b, c, d)
Definition: port.h:475
Datum gtrgm_in(PG_FUNCTION_ARGS)
Definition: trgm_gist.c:56
Datum gtrgm_compress(PG_FUNCTION_ARGS)
Definition: trgm_gist.c:108
OffsetNumber offset
Definition: gist.h:155
static TRGM * gtrgm_alloc(bool isalltrue, int siglen, BITVECP sign)
Definition: trgm_gist.c:70
long val
Definition: informix.c:664
#define offsetof(type, field)
Definition: c.h:668
#define CPTRGM(a, b)
Definition: trgm.h:47