PostgreSQL Source Code  git master
trgm_regexp.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * trgm_regexp.c
4  * Regular expression matching using trigrams.
5  *
6  * The general idea of trigram index support for a regular expression (regex)
7  * search is to transform the regex into a logical expression on trigrams.
8  * For example:
9  *
10  * (ab|cd)efg => ((abe & bef) | (cde & def)) & efg
11  *
12  * If a string matches the regex, then it must match the logical expression on
13  * trigrams. The opposite is not necessarily true, however: a string that
14  * matches the logical expression might not match the original regex. Such
15  * false positives are removed via recheck, by running the regular regex match
16  * operator on the retrieved heap tuple.
17  *
18  * Since the trigram expression involves both AND and OR operators, we can't
19  * expect the core index machinery to evaluate it completely. Instead, the
20  * result of regex analysis is a list of trigrams to be sought in the index,
21  * plus a simplified graph that is used by trigramsMatchGraph() to determine
22  * whether a particular indexed value matches the expression.
23  *
24  * Converting a regex to a trigram expression is based on analysis of an
25  * automaton corresponding to the regex. The algorithm consists of four
26  * stages:
27  *
28  * 1) Compile the regexp to NFA form. This is handled by the PostgreSQL
29  * regexp library, which provides accessors for its opaque regex_t struct
30  * to expose the NFA state graph and the "colors" (sets of equivalent
31  * characters) used as state transition labels.
32  *
33  * 2) Transform the original NFA into an expanded graph, where arcs
34  * are labeled with trigrams that must be present in order to move from
35  * one state to another via the arcs. The trigrams used in this stage
36  * consist of colors, not characters, as in the original NFA.
37  *
38  * 3) Expand the color trigrams into regular trigrams consisting of
39  * characters. If too many distinct trigrams are produced, trigrams are
40  * eliminated and the graph is simplified until it's simple enough.
41  *
42  * 4) Finally, the resulting graph is packed into a TrgmPackedGraph struct,
43  * and returned to the caller.
44  *
45  * 1) Compile the regexp to NFA form
46  * ---------------------------------
47  * The automaton returned by the regexp compiler is a graph where vertices
48  * are "states" and arcs are labeled with colors. Each color represents
49  * a set of characters, so that all characters assigned to the same color
50  * are interchangeable, so far as matching the regexp is concerned. There
51  * are two special states: "initial" and "final". A state can have multiple
52  * outgoing arcs labeled with the same color, which makes the automaton
53  * non-deterministic, because it can be in many states simultaneously.
54  *
55  * Note that this NFA is already lossy compared to the original regexp,
56  * since it ignores some regex features such as lookahead constraints and
57  * backref matching. This is OK for our purposes since it's still the case
58  * that only strings matching the NFA can possibly satisfy the regexp.
59  *
60  * 2) Transform the original NFA into an expanded graph
61  * ----------------------------------------------------
62  * In the 2nd stage, the automaton is transformed into a graph based on the
63  * original NFA. Each state in the expanded graph represents a state from
64  * the original NFA, plus a prefix identifying the last two characters
65  * (colors, to be precise) seen before entering the state. There can be
66  * multiple states in the expanded graph for each state in the original NFA,
67  * depending on what characters can precede it. A prefix position can be
68  * "unknown" if it's uncertain what the preceding character was, or "blank"
69  * if the character was a non-word character (we don't need to distinguish
70  * which non-word character it was, so just think of all of them as blanks).
71  *
72  * For convenience in description, call an expanded-state identifier
73  * (two prefix colors plus a state number from the original NFA) an
74  * "enter key".
75  *
76  * Each arc of the expanded graph is labeled with a trigram that must be
77  * present in the string to match. We can construct this from an out-arc of
78  * the underlying NFA state by combining the expanded state's prefix with the
79  * color label of the underlying out-arc, if neither prefix position is
80  * "unknown". But note that some of the colors in the trigram might be
81  * "blank". This is OK since we want to generate word-boundary trigrams as
82  * the regular trigram machinery would, if we know that some word characters
83  * must be adjacent to a word boundary in all strings matching the NFA.
84  *
85  * The expanded graph can also have fewer states than the original NFA,
86  * because we don't bother to make a separate state entry unless the state
87  * is reachable by a valid arc. When an enter key is reachable from a state
88  * of the expanded graph, but we do not know a complete trigram associated
89  * with that transition, we cannot make a valid arc; instead we insert the
90  * enter key into the enterKeys list of the source state. This effectively
91  * means that the two expanded states are not reliably distinguishable based
92  * on examining trigrams.
93  *
94  * So the expanded graph resembles the original NFA, but the arcs are
95  * labeled with trigrams instead of individual characters, and there may be
96  * more or fewer states. It is a lossy representation of the original NFA:
97  * any string that matches the original regexp must match the expanded graph,
98  * but the reverse is not true.
99  *
100  * We build the expanded graph through a breadth-first traversal of states
101  * reachable from the initial state. At each reachable state, we identify the
102  * states reachable from it without traversing a predictable trigram, and add
103  * those states' enter keys to the current state. Then we generate all
104  * out-arcs leading out of this collection of states that have predictable
105  * trigrams, adding their target states to the queue of states to examine.
106  *
107  * When building the graph, if the number of states or arcs exceed pre-defined
108  * limits, we give up and simply mark any states not yet processed as final
109  * states. Roughly speaking, that means that we make use of some portion from
110  * the beginning of the regexp. Also, any colors that have too many member
111  * characters are treated as "unknown", so that we can't derive trigrams
112  * from them.
113  *
114  * 3) Expand the color trigrams into regular trigrams
115  * --------------------------------------------------
116  * The trigrams in the expanded graph are "color trigrams", consisting
117  * of three consecutive colors that must be present in the string. But for
118  * search, we need regular trigrams consisting of characters. In the 3rd
119  * stage, the color trigrams are expanded into regular trigrams. Since each
120  * color can represent many characters, the total number of regular trigrams
121  * after expansion could be very large. Because searching the index for
122  * thousands of trigrams would be slow, and would likely produce so many
123  * false positives that we would have to traverse a large fraction of the
124  * index, the graph is simplified further in a lossy fashion by removing
125  * color trigrams. When a color trigram is removed, the states connected by
126  * any arcs labeled with that trigram are merged.
127  *
128  * Trigrams do not all have equivalent value for searching: some of them are
129  * more frequent and some of them are less frequent. Ideally, we would like
130  * to know the distribution of trigrams, but we don't. But because of padding
131  * we know for sure that the empty character is more frequent than others,
132  * so we can penalize trigrams according to presence of whitespace. The
133  * penalty assigned to each color trigram is the number of simple trigrams
134  * it would produce, times the penalties[] multiplier associated with its
135  * whitespace content. (The penalties[] constants were calculated by analysis
136  * of some real-life text.) We eliminate color trigrams starting with the
137  * highest-penalty one, until we get to a total penalty of no more than
138  * WISH_TRGM_PENALTY. However, we cannot remove a color trigram if that would
139  * lead to merging the initial and final states, so we may not be able to
140  * reach WISH_TRGM_PENALTY. It's still okay so long as we have no more than
141  * MAX_TRGM_COUNT simple trigrams in total, otherwise we fail.
142  *
143  * 4) Pack the graph into a compact representation
144  * -----------------------------------------------
145  * The 2nd and 3rd stages might have eliminated or merged many of the states
146  * and trigrams created earlier, so in this final stage, the graph is
147  * compacted and packed into a simpler struct that contains only the
148  * information needed to evaluate it.
149  *
150  * ALGORITHM EXAMPLE:
151  *
152  * Consider the example regex "ab[cd]". This regex is transformed into the
153  * following NFA (for simplicity we show colors as their single members):
154  *
155  * 4#
156  * c/
157  * a b /
158  * 1* --- 2 ---- 3
159  * \
160  * d\
161  * 5#
162  *
163  * We use * to mark initial state and # to mark final state. It's not depicted,
164  * but states 1, 4, 5 have self-referencing arcs for all possible characters,
165  * because this pattern can match to any part of a string.
166  *
167  * As the result of stage 2 we will have the following graph:
168  *
169  * abc abd
170  * 2# <---- 1* ----> 3#
171  *
172  * The process for generating this graph is:
173  * 1) Create state 1 with enter key (UNKNOWN, UNKNOWN, 1).
174  * 2) Add key (UNKNOWN, "a", 2) to state 1.
175  * 3) Add key ("a", "b", 3) to state 1.
176  * 4) Create new state 2 with enter key ("b", "c", 4). Add an arc
177  * from state 1 to state 2 with label trigram "abc".
178  * 5) Mark state 2 final because state 4 of source NFA is marked as final.
179  * 6) Create new state 3 with enter key ("b", "d", 5). Add an arc
180  * from state 1 to state 3 with label trigram "abd".
181  * 7) Mark state 3 final because state 5 of source NFA is marked as final.
182  *
183  *
184  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
185  * Portions Copyright (c) 1994, Regents of the University of California
186  *
187  * IDENTIFICATION
188  * contrib/pg_trgm/trgm_regexp.c
189  *
190  *-------------------------------------------------------------------------
191  */
192 #include "postgres.h"
193 
194 #include "regex/regexport.h"
195 #include "trgm.h"
196 #include "tsearch/ts_locale.h"
197 #include "utils/hsearch.h"
198 #include "utils/memutils.h"
199 
200 /*
201  * Uncomment (or use -DTRGM_REGEXP_DEBUG) to print debug info,
202  * for exploring and debugging the algorithm implementation.
203  * This produces three graph files in /tmp, in Graphviz .gv format.
204  * Some progress information is also printed to postmaster stderr.
205  */
206 /* #define TRGM_REGEXP_DEBUG */
207 
208 /*
209  * These parameters are used to limit the amount of work done.
210  * Otherwise regex processing could be too slow and memory-consuming.
211  *
212  * MAX_EXPANDED_STATES - How many states we allow in expanded graph
213  * MAX_EXPANDED_ARCS - How many arcs we allow in expanded graph
214  * MAX_TRGM_COUNT - How many simple trigrams we allow to be extracted
215  * WISH_TRGM_PENALTY - Maximum desired sum of color trigram penalties
216  * COLOR_COUNT_LIMIT - Maximum number of characters per color
217  */
218 #define MAX_EXPANDED_STATES 128
219 #define MAX_EXPANDED_ARCS 1024
220 #define MAX_TRGM_COUNT 256
221 #define WISH_TRGM_PENALTY 16
222 #define COLOR_COUNT_LIMIT 256
223 
224 /*
225  * Penalty multipliers for trigram counts depending on whitespace contents.
226  * Numbers based on analysis of real-life texts.
227  */
228 static const float4 penalties[8] = {
229  1.0f, /* "aaa" */
230  3.5f, /* "aa " */
231  0.0f, /* "a a" (impossible) */
232  0.0f, /* "a " (impossible) */
233  4.2f, /* " aa" */
234  2.1f, /* " a " */
235  25.0f, /* " a" */
236  0.0f /* " " (impossible) */
237 };
238 
239 /* Struct representing a single pg_wchar, converted back to multibyte form */
240 typedef struct
241 {
243 } trgm_mb_char;
244 
245 /*
246  * Attributes of NFA colors:
247  *
248  * expandable - we know the character expansion of this color
249  * containsNonWord - color contains non-word characters
250  * (which will not be extracted into trigrams)
251  * wordCharsCount - count of word characters in color
252  * wordChars - array of this color's word characters
253  * (which can be extracted into trigrams)
254  *
255  * When expandable is false, the other attributes don't matter; we just
256  * assume this color represents unknown character(s).
257  */
258 typedef struct
259 {
264 } TrgmColorInfo;
265 
266 /*
267  * A "prefix" is information about the colors of the last two characters read
268  * before reaching a specific NFA state. These colors can have special values
269  * COLOR_UNKNOWN and COLOR_BLANK. COLOR_UNKNOWN means that we have no
270  * information, for example because we read some character of an unexpandable
271  * color. COLOR_BLANK means that we read a non-word character.
272  *
273  * We call a prefix ambiguous if at least one of its colors is unknown. It's
274  * fully ambiguous if both are unknown, partially ambiguous if only the first
275  * is unknown. (The case of first color known, second unknown is not valid.)
276  *
277  * Wholly- or partly-blank prefixes are mostly handled the same as regular
278  * color prefixes. This allows us to generate appropriate partly-blank
279  * trigrams when the NFA requires word character(s) to appear adjacent to
280  * non-word character(s).
281  */
282 typedef int TrgmColor;
283 
284 /* We assume that colors returned by the regexp engine cannot be these: */
285 #define COLOR_UNKNOWN (-3)
286 #define COLOR_BLANK (-4)
287 
288 typedef struct
289 {
290  TrgmColor colors[2];
291 } TrgmPrefix;
292 
293 /*
294  * Color-trigram data type. Note that some elements of the trigram can be
295  * COLOR_BLANK, but we don't allow COLOR_UNKNOWN.
296  */
297 typedef struct
298 {
299  TrgmColor colors[3];
300 } ColorTrgm;
301 
302 /*
303  * Key identifying a state of our expanded graph: color prefix, and number
304  * of the corresponding state in the underlying regex NFA. The color prefix
305  * shows how we reached the regex state (to the extent that we know it).
306  */
307 typedef struct
308 {
310  int nstate;
311 } TrgmStateKey;
312 
313 /*
314  * One state of the expanded graph.
315  *
316  * stateKey - ID of this state
317  * arcs - outgoing arcs of this state (List of TrgmArc)
318  * enterKeys - enter keys reachable from this state without reading any
319  * predictable trigram (List of TrgmStateKey)
320  * flags - flag bits
321  * snumber - number of this state (initially assigned as -1, -2, etc,
322  * for debugging purposes only; then at the packaging stage,
323  * surviving states are renumbered with positive numbers)
324  * parent - parent state, if this state has been merged into another
325  * tentFlags - flags this state would acquire via planned merges
326  * tentParent - planned parent state, if considering a merge
327  */
328 #define TSTATE_INIT 0x01 /* flag indicating this state is initial */
329 #define TSTATE_FIN 0x02 /* flag indicating this state is final */
330 
331 typedef struct TrgmState
332 {
333  TrgmStateKey stateKey; /* hashtable key: must be first field */
336  int flags;
337  int snumber;
338  struct TrgmState *parent;
341 } TrgmState;
342 
343 /*
344  * One arc in the expanded graph.
345  */
346 typedef struct
347 {
348  ColorTrgm ctrgm; /* trigram needed to traverse arc */
349  TrgmState *target; /* next state */
350 } TrgmArc;
351 
352 /*
353  * Information about arc of specific color trigram (used in stage 3)
354  *
355  * Contains pointers to the source and target states.
356  */
357 typedef struct
358 {
361 } TrgmArcInfo;
362 
363 /*
364  * Information about color trigram (used in stage 3)
365  *
366  * ctrgm - trigram itself
367  * cnumber - number of this trigram (used in the packaging stage)
368  * count - number of simple trigrams created from this color trigram
369  * expanded - indicates this color trigram is expanded into simple trigrams
370  * arcs - list of all arcs labeled with this color trigram.
371  */
372 typedef struct
373 {
375  int cnumber;
376  int count;
378  bool expanded;
380 } ColorTrgmInfo;
381 
382 /*
383  * Data structure representing all the data we need during regex processing.
384  *
385  * regex - compiled regex
386  * colorInfo - extracted information about regex's colors
387  * ncolors - number of colors in colorInfo[]
388  * states - hashtable of TrgmStates (states of expanded graph)
389  * initState - pointer to initial state of expanded graph
390  * queue - queue of to-be-processed TrgmStates
391  * keysQueue - queue of to-be-processed TrgmStateKeys
392  * arcsCount - total number of arcs of expanded graph (for resource
393  * limiting)
394  * overflowed - we have exceeded resource limit for transformation
395  * colorTrgms - array of all color trigrams present in graph
396  * colorTrgmsCount - count of those color trigrams
397  * totalTrgmCount - total count of extracted simple trigrams
398  */
399 typedef struct
400 {
401  /* Source regexp, and color information extracted from it (stage 1) */
404  int ncolors;
405 
406  /* Expanded graph (stage 2) */
409  int nstates;
410 
411  /* Workspace for stage 2 */
416 
417  /* Information about distinct color trigrams in the graph (stage 3) */
421 } TrgmNFA;
422 
423 /*
424  * Final, compact representation of expanded graph.
425  */
426 typedef struct
427 {
428  int targetState; /* index of target state (zero-based) */
429  int colorTrgm; /* index of color trigram for transition */
430 } TrgmPackedArc;
431 
432 typedef struct
433 {
434  int arcsCount; /* number of out-arcs for this state */
435  TrgmPackedArc *arcs; /* array of arcsCount packed arcs */
437 
438 /* "typedef struct TrgmPackedGraph TrgmPackedGraph" appears in trgm.h */
440 {
441  /*
442  * colorTrigramsCount and colorTrigramGroups contain information about how
443  * trigrams are grouped into color trigrams. "colorTrigramsCount" is the
444  * count of color trigrams and "colorTrigramGroups" contains number of
445  * simple trigrams for each color trigram. The array of simple trigrams
446  * (stored separately from this struct) is ordered so that the simple
447  * trigrams for each color trigram are consecutive, and they're in order
448  * by color trigram number.
449  */
451  int *colorTrigramGroups; /* array of size colorTrigramsCount */
452 
453  /*
454  * The states of the simplified NFA. State number 0 is always initial
455  * state and state number 1 is always final state.
456  */
458  TrgmPackedState *states; /* array of size statesCount */
459 
460  /* Temporary work space for trigramsMatchGraph() */
461  bool *colorTrigramsActive; /* array of size colorTrigramsCount */
462  bool *statesActive; /* array of size statesCount */
463  int *statesQueue; /* array of size statesCount */
464 };
465 
466 /*
467  * Temporary structure for representing an arc during packaging.
468  */
469 typedef struct
470 {
475 
476 
477 /* prototypes for private functions */
478 static TRGM *createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
479  MemoryContext rcontext);
480 static void RE_compile(regex_t *regex, text *text_re,
481  int cflags, Oid collation);
482 static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA);
483 static bool convertPgWchar(pg_wchar c, trgm_mb_char *result);
484 static void transformGraph(TrgmNFA *trgmNFA);
485 static void processState(TrgmNFA *trgmNFA, TrgmState *state);
486 static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key);
487 static void addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key);
488 static void addArcs(TrgmNFA *trgmNFA, TrgmState *state);
489 static void addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key,
490  TrgmColor co, TrgmStateKey *destKey);
491 static bool validArcLabel(TrgmStateKey *key, TrgmColor co);
492 static TrgmState *getState(TrgmNFA *trgmNFA, TrgmStateKey *key);
493 static bool prefixContains(TrgmPrefix *prefix1, TrgmPrefix *prefix2);
494 static bool selectColorTrigrams(TrgmNFA *trgmNFA);
495 static TRGM *expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext);
496 static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3]);
497 static void mergeStates(TrgmState *state1, TrgmState *state2);
498 static int colorTrgmInfoCmp(const void *p1, const void *p2);
499 static int colorTrgmInfoPenaltyCmp(const void *p1, const void *p2);
500 static TrgmPackedGraph *packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext);
501 static int packArcInfoCmp(const void *a1, const void *a2);
502 
503 #ifdef TRGM_REGEXP_DEBUG
504 static void printSourceNFA(regex_t *regex, TrgmColorInfo *colors, int ncolors);
505 static void printTrgmNFA(TrgmNFA *trgmNFA);
506 static void printTrgmColor(StringInfo buf, TrgmColor co);
507 static void printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams);
508 #endif
509 
510 
511 /*
512  * Main entry point to process a regular expression.
513  *
514  * Returns an array of trigrams required by the regular expression, or NULL if
515  * the regular expression was too complex to analyze. In addition, a packed
516  * graph representation of the regex is returned into *graph. The results
517  * must be allocated in rcontext (which might or might not be the current
518  * context).
519  */
520 TRGM *
521 createTrgmNFA(text *text_re, Oid collation,
522  TrgmPackedGraph **graph, MemoryContext rcontext)
523 {
524  TRGM *trg;
525  regex_t regex;
526  MemoryContext tmpcontext;
527  MemoryContext oldcontext;
528 
529  /*
530  * This processing generates a great deal of cruft, which we'd like to
531  * clean up before returning (since this function may be called in a
532  * query-lifespan memory context). Make a temp context we can work in so
533  * that cleanup is easy.
534  */
536  "createTrgmNFA temporary context",
538  oldcontext = MemoryContextSwitchTo(tmpcontext);
539 
540  /*
541  * Stage 1: Compile the regexp into a NFA, using the regexp library.
542  */
543 #ifdef IGNORECASE
544  RE_compile(&regex, text_re,
545  REG_ADVANCED | REG_NOSUB | REG_ICASE, collation);
546 #else
547  RE_compile(&regex, text_re,
548  REG_ADVANCED | REG_NOSUB, collation);
549 #endif
550 
551  /*
552  * Since the regexp library allocates its internal data structures with
553  * malloc, we need to use a PG_TRY block to ensure that pg_regfree() gets
554  * done even if there's an error.
555  */
556  PG_TRY();
557  {
558  trg = createTrgmNFAInternal(&regex, graph, rcontext);
559  }
560  PG_FINALLY();
561  {
562  pg_regfree(&regex);
563  }
564  PG_END_TRY();
565 
566  /* Clean up all the cruft we created */
567  MemoryContextSwitchTo(oldcontext);
568  MemoryContextDelete(tmpcontext);
569 
570  return trg;
571 }
572 
573 /*
574  * Body of createTrgmNFA, exclusive of regex compilation/freeing.
575  */
576 static TRGM *
578  MemoryContext rcontext)
579 {
580  TRGM *trg;
581  TrgmNFA trgmNFA;
582 
583  trgmNFA.regex = regex;
584 
585  /* Collect color information from the regex */
586  getColorInfo(regex, &trgmNFA);
587 
588 #ifdef TRGM_REGEXP_DEBUG
589  printSourceNFA(regex, trgmNFA.colorInfo, trgmNFA.ncolors);
590 #endif
591 
592  /*
593  * Stage 2: Create an expanded graph from the source NFA.
594  */
595  transformGraph(&trgmNFA);
596 
597 #ifdef TRGM_REGEXP_DEBUG
598  printTrgmNFA(&trgmNFA);
599 #endif
600 
601  /*
602  * Fail if we were unable to make a nontrivial graph, ie it is possible to
603  * get from the initial state to the final state without reading any
604  * predictable trigram.
605  */
606  if (trgmNFA.initState->flags & TSTATE_FIN)
607  return NULL;
608 
609  /*
610  * Stage 3: Select color trigrams to expand. Fail if too many trigrams.
611  */
612  if (!selectColorTrigrams(&trgmNFA))
613  return NULL;
614 
615  /*
616  * Stage 4: Expand color trigrams and pack graph into final
617  * representation.
618  */
619  trg = expandColorTrigrams(&trgmNFA, rcontext);
620 
621  *graph = packGraph(&trgmNFA, rcontext);
622 
623 #ifdef TRGM_REGEXP_DEBUG
624  printTrgmPackedGraph(*graph, trg);
625 #endif
626 
627  return trg;
628 }
629 
630 /*
631  * Main entry point for evaluating a graph during index scanning.
632  *
633  * The check[] array is indexed by trigram number (in the array of simple
634  * trigrams returned by createTrgmNFA), and holds true for those trigrams
635  * that are present in the index entry being checked.
636  */
637 bool
639 {
640  int i,
641  j,
642  k,
643  queueIn,
644  queueOut;
645 
646  /*
647  * Reset temporary working areas.
648  */
649  memset(graph->colorTrigramsActive, 0,
650  sizeof(bool) * graph->colorTrigramsCount);
651  memset(graph->statesActive, 0, sizeof(bool) * graph->statesCount);
652 
653  /*
654  * Check which color trigrams were matched. A match for any simple
655  * trigram associated with a color trigram counts as a match of the color
656  * trigram.
657  */
658  j = 0;
659  for (i = 0; i < graph->colorTrigramsCount; i++)
660  {
661  int cnt = graph->colorTrigramGroups[i];
662 
663  for (k = j; k < j + cnt; k++)
664  {
665  if (check[k])
666  {
667  /*
668  * Found one matched trigram in the group. Can skip the rest
669  * of them and go to the next group.
670  */
671  graph->colorTrigramsActive[i] = true;
672  break;
673  }
674  }
675  j = j + cnt;
676  }
677 
678  /*
679  * Initialize the statesQueue to hold just the initial state. Note:
680  * statesQueue has room for statesCount entries, which is certainly enough
681  * since no state will be put in the queue more than once. The
682  * statesActive array marks which states have been queued.
683  */
684  graph->statesActive[0] = true;
685  graph->statesQueue[0] = 0;
686  queueIn = 0;
687  queueOut = 1;
688 
689  /* Process queued states as long as there are any. */
690  while (queueIn < queueOut)
691  {
692  int stateno = graph->statesQueue[queueIn++];
693  TrgmPackedState *state = &graph->states[stateno];
694  int cnt = state->arcsCount;
695 
696  /* Loop over state's out-arcs */
697  for (i = 0; i < cnt; i++)
698  {
699  TrgmPackedArc *arc = &state->arcs[i];
700 
701  /*
702  * If corresponding color trigram is present then activate the
703  * corresponding state. We're done if that's the final state,
704  * otherwise queue the state if it's not been queued already.
705  */
706  if (graph->colorTrigramsActive[arc->colorTrgm])
707  {
708  int nextstate = arc->targetState;
709 
710  if (nextstate == 1)
711  return true; /* success: final state is reachable */
712 
713  if (!graph->statesActive[nextstate])
714  {
715  graph->statesActive[nextstate] = true;
716  graph->statesQueue[queueOut++] = nextstate;
717  }
718  }
719  }
720  }
721 
722  /* Queue is empty, so match fails. */
723  return false;
724 }
725 
726 /*
727  * Compile regex string into struct at *regex.
728  * NB: pg_regfree must be applied to regex if this completes successfully.
729  */
730 static void
731 RE_compile(regex_t *regex, text *text_re, int cflags, Oid collation)
732 {
733  int text_re_len = VARSIZE_ANY_EXHDR(text_re);
734  char *text_re_val = VARDATA_ANY(text_re);
735  pg_wchar *pattern;
736  int pattern_len;
737  int regcomp_result;
738  char errMsg[100];
739 
740  /* Convert pattern string to wide characters */
741  pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar));
742  pattern_len = pg_mb2wchar_with_len(text_re_val,
743  pattern,
744  text_re_len);
745 
746  /* Compile regex */
747  regcomp_result = pg_regcomp(regex,
748  pattern,
749  pattern_len,
750  cflags,
751  collation);
752 
753  pfree(pattern);
754 
755  if (regcomp_result != REG_OKAY)
756  {
757  /* re didn't compile (no need for pg_regfree, if so) */
758  pg_regerror(regcomp_result, regex, errMsg, sizeof(errMsg));
759  ereport(ERROR,
760  (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
761  errmsg("invalid regular expression: %s", errMsg)));
762  }
763 }
764 
765 
766 /*---------------------
767  * Subroutines for pre-processing the color map (stage 1).
768  *---------------------
769  */
770 
771 /*
772  * Fill TrgmColorInfo structure for each color using regex export functions.
773  */
774 static void
775 getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
776 {
777  int colorsCount = pg_reg_getnumcolors(regex);
778  int i;
779 
780  trgmNFA->ncolors = colorsCount;
781  trgmNFA->colorInfo = (TrgmColorInfo *)
782  palloc0(colorsCount * sizeof(TrgmColorInfo));
783 
784  /*
785  * Loop over colors, filling TrgmColorInfo about each. Note we include
786  * WHITE (0) even though we know it'll be reported as non-expandable.
787  */
788  for (i = 0; i < colorsCount; i++)
789  {
790  TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[i];
791  int charsCount = pg_reg_getnumcharacters(regex, i);
792  pg_wchar *chars;
793  int j;
794 
795  if (charsCount < 0 || charsCount > COLOR_COUNT_LIMIT)
796  {
797  /* Non expandable, or too large to work with */
798  colorInfo->expandable = false;
799  continue;
800  }
801 
802  colorInfo->expandable = true;
803  colorInfo->containsNonWord = false;
804  colorInfo->wordChars = (trgm_mb_char *)
805  palloc(sizeof(trgm_mb_char) * charsCount);
806  colorInfo->wordCharsCount = 0;
807 
808  /* Extract all the chars in this color */
809  chars = (pg_wchar *) palloc(sizeof(pg_wchar) * charsCount);
810  pg_reg_getcharacters(regex, i, chars, charsCount);
811 
812  /*
813  * Convert characters back to multibyte form, and save only those that
814  * are word characters. Set "containsNonWord" if any non-word
815  * character. (Note: it'd probably be nicer to keep the chars in
816  * pg_wchar format for now, but ISWORDCHR wants to see multibyte.)
817  */
818  for (j = 0; j < charsCount; j++)
819  {
820  trgm_mb_char c;
821 
822  if (!convertPgWchar(chars[j], &c))
823  continue; /* ok to ignore it altogether */
824  if (ISWORDCHR(c.bytes))
825  colorInfo->wordChars[colorInfo->wordCharsCount++] = c;
826  else
827  colorInfo->containsNonWord = true;
828  }
829 
830  pfree(chars);
831  }
832 }
833 
834 /*
835  * Convert pg_wchar to multibyte format.
836  * Returns false if the character should be ignored completely.
837  */
838 static bool
840 {
841  /* "s" has enough space for a multibyte character and a trailing NUL */
842  char s[MAX_MULTIBYTE_CHAR_LEN + 1];
843 
844  /*
845  * We can ignore the NUL character, since it can never appear in a PG text
846  * string. This avoids the need for various special cases when
847  * reconstructing trigrams.
848  */
849  if (c == 0)
850  return false;
851 
852  /* Do the conversion, making sure the result is NUL-terminated */
853  memset(s, 0, sizeof(s));
854  pg_wchar2mb_with_len(&c, s, 1);
855 
856  /*
857  * In IGNORECASE mode, we can ignore uppercase characters. We assume that
858  * the regex engine generated both uppercase and lowercase equivalents
859  * within each color, since we used the REG_ICASE option; so there's no
860  * need to process the uppercase version.
861  *
862  * XXX this code is dependent on the assumption that lowerstr() works the
863  * same as the regex engine's internal case folding machinery. Might be
864  * wiser to expose pg_wc_tolower and test whether c == pg_wc_tolower(c).
865  * On the other hand, the trigrams in the index were created using
866  * lowerstr(), so we're probably screwed if there's any incompatibility
867  * anyway.
868  */
869 #ifdef IGNORECASE
870  {
871  char *lowerCased = lowerstr(s);
872 
873  if (strcmp(lowerCased, s) != 0)
874  {
875  pfree(lowerCased);
876  return false;
877  }
878  pfree(lowerCased);
879  }
880 #endif
881 
882  /* Fill result with exactly MAX_MULTIBYTE_CHAR_LEN bytes */
883  memcpy(result->bytes, s, MAX_MULTIBYTE_CHAR_LEN);
884  return true;
885 }
886 
887 
888 /*---------------------
889  * Subroutines for expanding original NFA graph into a trigram graph (stage 2).
890  *---------------------
891  */
892 
893 /*
894  * Transform the graph, given a regex and extracted color information.
895  *
896  * We create and process a queue of expanded-graph states until all the states
897  * are processed.
898  *
899  * This algorithm may be stopped due to resource limitation. In this case we
900  * force every unprocessed branch to immediately finish with matching (this
901  * can give us false positives but no false negatives) by marking all
902  * unprocessed states as final.
903  */
904 static void
906 {
907  HASHCTL hashCtl;
908  TrgmStateKey initkey;
909  TrgmState *initstate;
910 
911  /* Initialize this stage's workspace in trgmNFA struct */
912  trgmNFA->queue = NIL;
913  trgmNFA->keysQueue = NIL;
914  trgmNFA->arcsCount = 0;
915  trgmNFA->overflowed = false;
916 
917  /* Create hashtable for states */
918  hashCtl.keysize = sizeof(TrgmStateKey);
919  hashCtl.entrysize = sizeof(TrgmState);
920  hashCtl.hcxt = CurrentMemoryContext;
921  trgmNFA->states = hash_create("Trigram NFA",
922  1024,
923  &hashCtl,
925  trgmNFA->nstates = 0;
926 
927  /* Create initial state: ambiguous prefix, NFA's initial state */
928  MemSet(&initkey, 0, sizeof(initkey));
929  initkey.prefix.colors[0] = COLOR_UNKNOWN;
930  initkey.prefix.colors[1] = COLOR_UNKNOWN;
931  initkey.nstate = pg_reg_getinitialstate(trgmNFA->regex);
932 
933  initstate = getState(trgmNFA, &initkey);
934  initstate->flags |= TSTATE_INIT;
935  trgmNFA->initState = initstate;
936 
937  /*
938  * Recursively build the expanded graph by processing queue of states
939  * (breadth-first search). getState already put initstate in the queue.
940  */
941  while (trgmNFA->queue != NIL)
942  {
943  TrgmState *state = (TrgmState *) linitial(trgmNFA->queue);
944 
945  trgmNFA->queue = list_delete_first(trgmNFA->queue);
946 
947  /*
948  * If we overflowed then just mark state as final. Otherwise do
949  * actual processing.
950  */
951  if (trgmNFA->overflowed)
952  state->flags |= TSTATE_FIN;
953  else
954  processState(trgmNFA, state);
955 
956  /* Did we overflow? */
957  if (trgmNFA->arcsCount > MAX_EXPANDED_ARCS ||
959  trgmNFA->overflowed = true;
960  }
961 }
962 
963 /*
964  * Process one state: add enter keys and then add outgoing arcs.
965  */
966 static void
968 {
969  /* keysQueue should be NIL already, but make sure */
970  trgmNFA->keysQueue = NIL;
971 
972  /*
973  * Add state's own key, and then process all keys added to keysQueue until
974  * queue is empty. But we can quit if the state gets marked final.
975  */
976  addKey(trgmNFA, state, &state->stateKey);
977  while (trgmNFA->keysQueue != NIL && !(state->flags & TSTATE_FIN))
978  {
980 
981  trgmNFA->keysQueue = list_delete_first(trgmNFA->keysQueue);
982  addKey(trgmNFA, state, key);
983  }
984 
985  /*
986  * Add outgoing arcs only if state isn't final (we have no interest in
987  * outgoing arcs if we already match)
988  */
989  if (!(state->flags & TSTATE_FIN))
990  addArcs(trgmNFA, state);
991 }
992 
993 /*
994  * Add the given enter key into the state's enterKeys list, and determine
995  * whether this should result in any further enter keys being added.
996  * If so, add those keys to keysQueue so that processState will handle them.
997  *
998  * If the enter key is for the NFA's final state, mark state as TSTATE_FIN.
999  * This situation means that we can reach the final state from this expanded
1000  * state without reading any predictable trigram, so we must consider this
1001  * state as an accepting one.
1002  *
1003  * The given key could be a duplicate of one already in enterKeys, or be
1004  * redundant with some enterKeys. So we check that before doing anything.
1005  *
1006  * Note that we don't generate any actual arcs here. addArcs will do that
1007  * later, after we have identified all the enter keys for this state.
1008  */
1009 static void
1011 {
1012  regex_arc_t *arcs;
1013  TrgmStateKey destKey;
1014  ListCell *cell;
1015  int i,
1016  arcsCount;
1017 
1018  /*
1019  * Ensure any pad bytes in destKey are zero, since it may get used as a
1020  * hashtable key by getState.
1021  */
1022  MemSet(&destKey, 0, sizeof(destKey));
1023 
1024  /*
1025  * Compare key to each existing enter key of the state to check for
1026  * redundancy. We can drop either old key(s) or the new key if we find
1027  * redundancy.
1028  */
1029  foreach(cell, state->enterKeys)
1030  {
1031  TrgmStateKey *existingKey = (TrgmStateKey *) lfirst(cell);
1032 
1033  if (existingKey->nstate == key->nstate)
1034  {
1035  if (prefixContains(&existingKey->prefix, &key->prefix))
1036  {
1037  /* This old key already covers the new key. Nothing to do */
1038  return;
1039  }
1040  if (prefixContains(&key->prefix, &existingKey->prefix))
1041  {
1042  /*
1043  * The new key covers this old key. Remove the old key, it's
1044  * no longer needed once we add this key to the list.
1045  */
1046  state->enterKeys = foreach_delete_current(state->enterKeys,
1047  cell);
1048  }
1049  }
1050  }
1051 
1052  /* No redundancy, so add this key to the state's list */
1053  state->enterKeys = lappend(state->enterKeys, key);
1054 
1055  /* If state is now known final, mark it and we're done */
1056  if (key->nstate == pg_reg_getfinalstate(trgmNFA->regex))
1057  {
1058  state->flags |= TSTATE_FIN;
1059  return;
1060  }
1061 
1062  /*
1063  * Loop through all outgoing arcs of the corresponding state in the
1064  * original NFA.
1065  */
1066  arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate);
1067  arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount);
1068  pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount);
1069 
1070  for (i = 0; i < arcsCount; i++)
1071  {
1072  regex_arc_t *arc = &arcs[i];
1073 
1074  if (pg_reg_colorisbegin(trgmNFA->regex, arc->co))
1075  {
1076  /*
1077  * Start of line/string (^). Trigram extraction treats start of
1078  * line same as start of word: double space prefix is added.
1079  * Hence, make an enter key showing we can reach the arc
1080  * destination with all-blank prefix.
1081  */
1082  destKey.prefix.colors[0] = COLOR_BLANK;
1083  destKey.prefix.colors[1] = COLOR_BLANK;
1084  destKey.nstate = arc->to;
1085 
1086  /* Add enter key to this state */
1087  addKeyToQueue(trgmNFA, &destKey);
1088  }
1089  else if (pg_reg_colorisend(trgmNFA->regex, arc->co))
1090  {
1091  /*
1092  * End of line/string ($). We must consider this arc as a
1093  * transition that doesn't read anything. The reason for adding
1094  * this enter key to the state is that if the arc leads to the
1095  * NFA's final state, we must mark this expanded state as final.
1096  */
1097  destKey.prefix.colors[0] = COLOR_UNKNOWN;
1098  destKey.prefix.colors[1] = COLOR_UNKNOWN;
1099  destKey.nstate = arc->to;
1100 
1101  /* Add enter key to this state */
1102  addKeyToQueue(trgmNFA, &destKey);
1103  }
1104  else if (arc->co >= 0)
1105  {
1106  /* Regular color (including WHITE) */
1107  TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[arc->co];
1108 
1109  if (colorInfo->expandable)
1110  {
1111  if (colorInfo->containsNonWord &&
1112  !validArcLabel(key, COLOR_BLANK))
1113  {
1114  /*
1115  * We can reach the arc destination after reading a
1116  * non-word character, but the prefix is not something
1117  * that addArc will accept with COLOR_BLANK, so no trigram
1118  * arc can get made for this transition. We must make an
1119  * enter key to show that the arc destination is
1120  * reachable. Set it up with an all-blank prefix, since
1121  * that corresponds to what the trigram extraction code
1122  * will do at a word starting boundary.
1123  */
1124  destKey.prefix.colors[0] = COLOR_BLANK;
1125  destKey.prefix.colors[1] = COLOR_BLANK;
1126  destKey.nstate = arc->to;
1127  addKeyToQueue(trgmNFA, &destKey);
1128  }
1129 
1130  if (colorInfo->wordCharsCount > 0 &&
1131  !validArcLabel(key, arc->co))
1132  {
1133  /*
1134  * We can reach the arc destination after reading a word
1135  * character, but the prefix is not something that addArc
1136  * will accept, so no trigram arc can get made for this
1137  * transition. We must make an enter key to show that the
1138  * arc destination is reachable. The prefix for the enter
1139  * key should reflect the info we have for this arc.
1140  */
1141  destKey.prefix.colors[0] = key->prefix.colors[1];
1142  destKey.prefix.colors[1] = arc->co;
1143  destKey.nstate = arc->to;
1144  addKeyToQueue(trgmNFA, &destKey);
1145  }
1146  }
1147  else
1148  {
1149  /*
1150  * Unexpandable color. Add enter key with ambiguous prefix,
1151  * showing we can reach the destination from this state, but
1152  * the preceding colors will be uncertain. (We do not set the
1153  * first prefix color to key->prefix.colors[1], because a
1154  * prefix of known followed by unknown is invalid.)
1155  */
1156  destKey.prefix.colors[0] = COLOR_UNKNOWN;
1157  destKey.prefix.colors[1] = COLOR_UNKNOWN;
1158  destKey.nstate = arc->to;
1159  addKeyToQueue(trgmNFA, &destKey);
1160  }
1161  }
1162  else
1163  {
1164  /* RAINBOW: treat as unexpandable color */
1165  destKey.prefix.colors[0] = COLOR_UNKNOWN;
1166  destKey.prefix.colors[1] = COLOR_UNKNOWN;
1167  destKey.nstate = arc->to;
1168  addKeyToQueue(trgmNFA, &destKey);
1169  }
1170  }
1171 
1172  pfree(arcs);
1173 }
1174 
1175 /*
1176  * Add copy of given key to keysQueue for later processing.
1177  */
1178 static void
1180 {
1181  TrgmStateKey *keyCopy = (TrgmStateKey *) palloc(sizeof(TrgmStateKey));
1182 
1183  memcpy(keyCopy, key, sizeof(TrgmStateKey));
1184  trgmNFA->keysQueue = lappend(trgmNFA->keysQueue, keyCopy);
1185 }
1186 
1187 /*
1188  * Add outgoing arcs from given state, whose enter keys are all now known.
1189  */
1190 static void
1192 {
1193  TrgmStateKey destKey;
1194  ListCell *cell;
1195  regex_arc_t *arcs;
1196  int arcsCount,
1197  i;
1198 
1199  /*
1200  * Ensure any pad bytes in destKey are zero, since it may get used as a
1201  * hashtable key by getState.
1202  */
1203  MemSet(&destKey, 0, sizeof(destKey));
1204 
1205  /*
1206  * Iterate over enter keys associated with this expanded-graph state. This
1207  * includes both the state's own stateKey, and any enter keys we added to
1208  * it during addKey (which represent expanded-graph states that are not
1209  * distinguishable from this one by means of trigrams). For each such
1210  * enter key, examine all the out-arcs of the key's underlying NFA state,
1211  * and try to make a trigram arc leading to where the out-arc leads.
1212  * (addArc will deal with whether the arc is valid or not.)
1213  */
1214  foreach(cell, state->enterKeys)
1215  {
1216  TrgmStateKey *key = (TrgmStateKey *) lfirst(cell);
1217 
1218  arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate);
1219  arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount);
1220  pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount);
1221 
1222  for (i = 0; i < arcsCount; i++)
1223  {
1224  regex_arc_t *arc = &arcs[i];
1225  TrgmColorInfo *colorInfo;
1226 
1227  /*
1228  * Ignore non-expandable colors; addKey already handled the case.
1229  *
1230  * We need no special check for WHITE or begin/end pseudocolors
1231  * here. We don't need to do any processing for them, and they
1232  * will be marked non-expandable since the regex engine will have
1233  * reported them that way. We do have to watch out for RAINBOW,
1234  * which has a negative color number.
1235  */
1236  if (arc->co < 0)
1237  continue;
1238  Assert(arc->co < trgmNFA->ncolors);
1239 
1240  colorInfo = &trgmNFA->colorInfo[arc->co];
1241  if (!colorInfo->expandable)
1242  continue;
1243 
1244  if (colorInfo->containsNonWord)
1245  {
1246  /*
1247  * Color includes non-word character(s).
1248  *
1249  * Generate an arc, treating this transition as occurring on
1250  * BLANK. This allows word-ending trigrams to be manufactured
1251  * if possible.
1252  */
1253  destKey.prefix.colors[0] = key->prefix.colors[1];
1254  destKey.prefix.colors[1] = COLOR_BLANK;
1255  destKey.nstate = arc->to;
1256 
1257  addArc(trgmNFA, state, key, COLOR_BLANK, &destKey);
1258  }
1259 
1260  if (colorInfo->wordCharsCount > 0)
1261  {
1262  /*
1263  * Color includes word character(s).
1264  *
1265  * Generate an arc. Color is pushed into prefix of target
1266  * state.
1267  */
1268  destKey.prefix.colors[0] = key->prefix.colors[1];
1269  destKey.prefix.colors[1] = arc->co;
1270  destKey.nstate = arc->to;
1271 
1272  addArc(trgmNFA, state, key, arc->co, &destKey);
1273  }
1274  }
1275 
1276  pfree(arcs);
1277  }
1278 }
1279 
1280 /*
1281  * Generate an out-arc of the expanded graph, if it's valid and not redundant.
1282  *
1283  * state: expanded-graph state we want to add an out-arc to
1284  * key: provides prefix colors (key->nstate is not used)
1285  * co: transition color
1286  * destKey: identifier for destination state of expanded graph
1287  */
1288 static void
1290  TrgmColor co, TrgmStateKey *destKey)
1291 {
1292  TrgmArc *arc;
1293  ListCell *cell;
1294 
1295  /* Do nothing if this wouldn't be a valid arc label trigram */
1296  if (!validArcLabel(key, co))
1297  return;
1298 
1299  /*
1300  * Check if we are going to reach key which is covered by a key which is
1301  * already listed in this state. If so arc is useless: the NFA can bypass
1302  * it through a path that doesn't require any predictable trigram, so
1303  * whether the arc's trigram is present or not doesn't really matter.
1304  */
1305  foreach(cell, state->enterKeys)
1306  {
1307  TrgmStateKey *existingKey = (TrgmStateKey *) lfirst(cell);
1308 
1309  if (existingKey->nstate == destKey->nstate &&
1310  prefixContains(&existingKey->prefix, &destKey->prefix))
1311  return;
1312  }
1313 
1314  /* Checks were successful, add new arc */
1315  arc = (TrgmArc *) palloc(sizeof(TrgmArc));
1316  arc->target = getState(trgmNFA, destKey);
1317  arc->ctrgm.colors[0] = key->prefix.colors[0];
1318  arc->ctrgm.colors[1] = key->prefix.colors[1];
1319  arc->ctrgm.colors[2] = co;
1320 
1321  state->arcs = lappend(state->arcs, arc);
1322  trgmNFA->arcsCount++;
1323 }
1324 
1325 /*
1326  * Can we make a valid trigram arc label from the given prefix and arc color?
1327  *
1328  * This is split out so that tests in addKey and addArc will stay in sync.
1329  */
1330 static bool
1332 {
1333  /*
1334  * We have to know full trigram in order to add outgoing arc. So we can't
1335  * do it if prefix is ambiguous.
1336  */
1337  if (key->prefix.colors[0] == COLOR_UNKNOWN)
1338  return false;
1339 
1340  /* If key->prefix.colors[0] isn't unknown, its second color isn't either */
1341  Assert(key->prefix.colors[1] != COLOR_UNKNOWN);
1342  /* And we should not be called with an unknown arc color anytime */
1343  Assert(co != COLOR_UNKNOWN);
1344 
1345  /*
1346  * We don't bother with making arcs representing three non-word
1347  * characters, since that's useless for trigram extraction.
1348  */
1349  if (key->prefix.colors[0] == COLOR_BLANK &&
1350  key->prefix.colors[1] == COLOR_BLANK &&
1351  co == COLOR_BLANK)
1352  return false;
1353 
1354  /*
1355  * We also reject nonblank-blank-anything. The nonblank-blank-nonblank
1356  * case doesn't correspond to any trigram the trigram extraction code
1357  * would make. The nonblank-blank-blank case is also not possible with
1358  * RPADDING = 1. (Note that in many cases we'd fail to generate such a
1359  * trigram even if it were valid, for example processing "foo bar" will
1360  * not result in considering the trigram "o ". So if you want to support
1361  * RPADDING = 2, there's more to do than just twiddle this test.)
1362  */
1363  if (key->prefix.colors[0] != COLOR_BLANK &&
1364  key->prefix.colors[1] == COLOR_BLANK)
1365  return false;
1366 
1367  /*
1368  * Other combinations involving blank are valid, in particular we assume
1369  * blank-blank-nonblank is valid, which presumes that LPADDING is 2.
1370  *
1371  * Note: Using again the example "foo bar", we will not consider the
1372  * trigram " b", though this trigram would be found by the trigram
1373  * extraction code. Since we will find " ba", it doesn't seem worth
1374  * trying to hack the algorithm to generate the additional trigram.
1375  */
1376 
1377  /* arc label is valid */
1378  return true;
1379 }
1380 
1381 /*
1382  * Get state of expanded graph for given state key,
1383  * and queue the state for processing if it didn't already exist.
1384  */
1385 static TrgmState *
1387 {
1388  TrgmState *state;
1389  bool found;
1390 
1391  state = (TrgmState *) hash_search(trgmNFA->states, key, HASH_ENTER,
1392  &found);
1393  if (!found)
1394  {
1395  /* New state: initialize and queue it */
1396  state->arcs = NIL;
1397  state->enterKeys = NIL;
1398  state->flags = 0;
1399  /* states are initially given negative numbers */
1400  state->snumber = -(++trgmNFA->nstates);
1401  state->parent = NULL;
1402  state->tentFlags = 0;
1403  state->tentParent = NULL;
1404 
1405  trgmNFA->queue = lappend(trgmNFA->queue, state);
1406  }
1407  return state;
1408 }
1409 
1410 /*
1411  * Check if prefix1 "contains" prefix2.
1412  *
1413  * "contains" means that any exact prefix (with no ambiguity) that satisfies
1414  * prefix2 also satisfies prefix1.
1415  */
1416 static bool
1418 {
1419  if (prefix1->colors[1] == COLOR_UNKNOWN)
1420  {
1421  /* Fully ambiguous prefix contains everything */
1422  return true;
1423  }
1424  else if (prefix1->colors[0] == COLOR_UNKNOWN)
1425  {
1426  /*
1427  * Prefix with only first unknown color contains every prefix with
1428  * same second color.
1429  */
1430  if (prefix1->colors[1] == prefix2->colors[1])
1431  return true;
1432  else
1433  return false;
1434  }
1435  else
1436  {
1437  /* Exact prefix contains only the exact same prefix */
1438  if (prefix1->colors[0] == prefix2->colors[0] &&
1439  prefix1->colors[1] == prefix2->colors[1])
1440  return true;
1441  else
1442  return false;
1443  }
1444 }
1445 
1446 
1447 /*---------------------
1448  * Subroutines for expanding color trigrams into regular trigrams (stage 3).
1449  *---------------------
1450  */
1451 
1452 /*
1453  * Get vector of all color trigrams in graph and select which of them
1454  * to expand into simple trigrams.
1455  *
1456  * Returns true if OK, false if exhausted resource limits.
1457  */
1458 static bool
1460 {
1461  HASH_SEQ_STATUS scan_status;
1462  int arcsCount = trgmNFA->arcsCount,
1463  i;
1464  TrgmState *state;
1465  ColorTrgmInfo *colorTrgms;
1466  int64 totalTrgmCount;
1467  float4 totalTrgmPenalty;
1468  int cnumber;
1469 
1470  /* Collect color trigrams from all arcs */
1471  colorTrgms = (ColorTrgmInfo *) palloc0(sizeof(ColorTrgmInfo) * arcsCount);
1472  trgmNFA->colorTrgms = colorTrgms;
1473 
1474  i = 0;
1475  hash_seq_init(&scan_status, trgmNFA->states);
1476  while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
1477  {
1478  ListCell *cell;
1479 
1480  foreach(cell, state->arcs)
1481  {
1482  TrgmArc *arc = (TrgmArc *) lfirst(cell);
1483  TrgmArcInfo *arcInfo = (TrgmArcInfo *) palloc(sizeof(TrgmArcInfo));
1484  ColorTrgmInfo *trgmInfo = &colorTrgms[i];
1485 
1486  arcInfo->source = state;
1487  arcInfo->target = arc->target;
1488  trgmInfo->ctrgm = arc->ctrgm;
1489  trgmInfo->cnumber = -1;
1490  /* count and penalty will be set below */
1491  trgmInfo->expanded = true;
1492  trgmInfo->arcs = list_make1(arcInfo);
1493  i++;
1494  }
1495  }
1496  Assert(i == arcsCount);
1497 
1498  /* Remove duplicates, merging their arcs lists */
1499  if (arcsCount >= 2)
1500  {
1501  ColorTrgmInfo *p1,
1502  *p2;
1503 
1504  /* Sort trigrams to ease duplicate detection */
1505  qsort(colorTrgms, arcsCount, sizeof(ColorTrgmInfo), colorTrgmInfoCmp);
1506 
1507  /* p1 is probe point, p2 is last known non-duplicate. */
1508  p2 = colorTrgms;
1509  for (p1 = colorTrgms + 1; p1 < colorTrgms + arcsCount; p1++)
1510  {
1511  if (colorTrgmInfoCmp(p1, p2) > 0)
1512  {
1513  p2++;
1514  *p2 = *p1;
1515  }
1516  else
1517  {
1518  p2->arcs = list_concat(p2->arcs, p1->arcs);
1519  }
1520  }
1521  trgmNFA->colorTrgmsCount = (p2 - colorTrgms) + 1;
1522  }
1523  else
1524  {
1525  trgmNFA->colorTrgmsCount = arcsCount;
1526  }
1527 
1528  /*
1529  * Count number of simple trigrams generated by each color trigram, and
1530  * also compute a penalty value, which is the number of simple trigrams
1531  * times a multiplier that depends on its whitespace content.
1532  *
1533  * Note: per-color-trigram counts cannot overflow an int so long as
1534  * COLOR_COUNT_LIMIT is not more than the cube root of INT_MAX, ie about
1535  * 1290. However, the grand total totalTrgmCount might conceivably
1536  * overflow an int, so we use int64 for that within this routine. Also,
1537  * penalties are calculated in float4 arithmetic to avoid any overflow
1538  * worries.
1539  */
1540  totalTrgmCount = 0;
1541  totalTrgmPenalty = 0.0f;
1542  for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1543  {
1544  ColorTrgmInfo *trgmInfo = &colorTrgms[i];
1545  int j,
1546  count = 1,
1547  typeIndex = 0;
1548 
1549  for (j = 0; j < 3; j++)
1550  {
1551  TrgmColor c = trgmInfo->ctrgm.colors[j];
1552 
1553  typeIndex *= 2;
1554  if (c == COLOR_BLANK)
1555  typeIndex++;
1556  else
1557  count *= trgmNFA->colorInfo[c].wordCharsCount;
1558  }
1559  trgmInfo->count = count;
1560  totalTrgmCount += count;
1561  trgmInfo->penalty = penalties[typeIndex] * (float4) count;
1562  totalTrgmPenalty += trgmInfo->penalty;
1563  }
1564 
1565  /* Sort color trigrams in descending order of their penalties */
1566  qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
1568 
1569  /*
1570  * Remove color trigrams from the graph so long as total penalty of color
1571  * trigrams exceeds WISH_TRGM_PENALTY. (If we fail to get down to
1572  * WISH_TRGM_PENALTY, it's OK so long as total count is no more than
1573  * MAX_TRGM_COUNT.) We prefer to remove color trigrams with higher
1574  * penalty, since those are the most promising for reducing the total
1575  * penalty. When removing a color trigram we have to merge states
1576  * connected by arcs labeled with that trigram. It's necessary to not
1577  * merge initial and final states, because our graph becomes useless if
1578  * that happens; so we cannot always remove the trigram we'd prefer to.
1579  */
1580  for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1581  {
1582  ColorTrgmInfo *trgmInfo = &colorTrgms[i];
1583  bool canRemove = true;
1584  ListCell *cell;
1585 
1586  /* Done if we've reached the target */
1587  if (totalTrgmPenalty <= WISH_TRGM_PENALTY)
1588  break;
1589 
1590 #ifdef TRGM_REGEXP_DEBUG
1591  fprintf(stderr, "considering ctrgm %d %d %d, penalty %f, %d arcs\n",
1592  trgmInfo->ctrgm.colors[0],
1593  trgmInfo->ctrgm.colors[1],
1594  trgmInfo->ctrgm.colors[2],
1595  trgmInfo->penalty,
1596  list_length(trgmInfo->arcs));
1597 #endif
1598 
1599  /*
1600  * Does any arc of this color trigram connect initial and final
1601  * states? If so we can't remove it.
1602  */
1603  foreach(cell, trgmInfo->arcs)
1604  {
1605  TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
1606  TrgmState *source = arcInfo->source,
1607  *target = arcInfo->target;
1608  int source_flags,
1609  target_flags;
1610 
1611 #ifdef TRGM_REGEXP_DEBUG
1612  fprintf(stderr, "examining arc to s%d (%x) from s%d (%x)\n",
1613  -target->snumber, target->flags,
1614  -source->snumber, source->flags);
1615 #endif
1616 
1617  /* examine parent states, if any merging has already happened */
1618  while (source->parent)
1619  source = source->parent;
1620  while (target->parent)
1621  target = target->parent;
1622 
1623 #ifdef TRGM_REGEXP_DEBUG
1624  fprintf(stderr, " ... after completed merges: to s%d (%x) from s%d (%x)\n",
1625  -target->snumber, target->flags,
1626  -source->snumber, source->flags);
1627 #endif
1628 
1629  /* we must also consider merges we are planning right now */
1630  source_flags = source->flags | source->tentFlags;
1631  while (source->tentParent)
1632  {
1633  source = source->tentParent;
1634  source_flags |= source->flags | source->tentFlags;
1635  }
1636  target_flags = target->flags | target->tentFlags;
1637  while (target->tentParent)
1638  {
1639  target = target->tentParent;
1640  target_flags |= target->flags | target->tentFlags;
1641  }
1642 
1643 #ifdef TRGM_REGEXP_DEBUG
1644  fprintf(stderr, " ... after tentative merges: to s%d (%x) from s%d (%x)\n",
1645  -target->snumber, target_flags,
1646  -source->snumber, source_flags);
1647 #endif
1648 
1649  /* would fully-merged state have both INIT and FIN set? */
1650  if (((source_flags | target_flags) & (TSTATE_INIT | TSTATE_FIN)) ==
1651  (TSTATE_INIT | TSTATE_FIN))
1652  {
1653  canRemove = false;
1654  break;
1655  }
1656 
1657  /* ok so far, so remember planned merge */
1658  if (source != target)
1659  {
1660 #ifdef TRGM_REGEXP_DEBUG
1661  fprintf(stderr, " ... tentatively merging s%d into s%d\n",
1662  -target->snumber, -source->snumber);
1663 #endif
1664  target->tentParent = source;
1665  source->tentFlags |= target_flags;
1666  }
1667  }
1668 
1669  /*
1670  * We must reset all the tentFlags/tentParent fields before
1671  * continuing. tentFlags could only have become set in states that
1672  * are the source or parent or tentative parent of one of the current
1673  * arcs; likewise tentParent could only have become set in states that
1674  * are the target or parent or tentative parent of one of the current
1675  * arcs. There might be some overlap between those sets, but if we
1676  * clear tentFlags in target states as well as source states, we
1677  * should be okay even if we visit a state as target before visiting
1678  * it as a source.
1679  */
1680  foreach(cell, trgmInfo->arcs)
1681  {
1682  TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
1683  TrgmState *source = arcInfo->source,
1684  *target = arcInfo->target;
1685  TrgmState *ttarget;
1686 
1687  /* no need to touch previously-merged states */
1688  while (source->parent)
1689  source = source->parent;
1690  while (target->parent)
1691  target = target->parent;
1692 
1693  while (source)
1694  {
1695  source->tentFlags = 0;
1696  source = source->tentParent;
1697  }
1698 
1699  while ((ttarget = target->tentParent) != NULL)
1700  {
1701  target->tentParent = NULL;
1702  target->tentFlags = 0; /* in case it was also a source */
1703  target = ttarget;
1704  }
1705  }
1706 
1707  /* Now, move on if we can't drop this trigram */
1708  if (!canRemove)
1709  {
1710 #ifdef TRGM_REGEXP_DEBUG
1711  fprintf(stderr, " ... not ok to merge\n");
1712 #endif
1713  continue;
1714  }
1715 
1716  /* OK, merge states linked by each arc labeled by the trigram */
1717  foreach(cell, trgmInfo->arcs)
1718  {
1719  TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
1720  TrgmState *source = arcInfo->source,
1721  *target = arcInfo->target;
1722 
1723  while (source->parent)
1724  source = source->parent;
1725  while (target->parent)
1726  target = target->parent;
1727  if (source != target)
1728  {
1729 #ifdef TRGM_REGEXP_DEBUG
1730  fprintf(stderr, "merging s%d into s%d\n",
1731  -target->snumber, -source->snumber);
1732 #endif
1733  mergeStates(source, target);
1734  /* Assert we didn't merge initial and final states */
1735  Assert((source->flags & (TSTATE_INIT | TSTATE_FIN)) !=
1736  (TSTATE_INIT | TSTATE_FIN));
1737  }
1738  }
1739 
1740  /* Mark trigram unexpanded, and update totals */
1741  trgmInfo->expanded = false;
1742  totalTrgmCount -= trgmInfo->count;
1743  totalTrgmPenalty -= trgmInfo->penalty;
1744  }
1745 
1746  /* Did we succeed in fitting into MAX_TRGM_COUNT? */
1747  if (totalTrgmCount > MAX_TRGM_COUNT)
1748  return false;
1749 
1750  trgmNFA->totalTrgmCount = (int) totalTrgmCount;
1751 
1752  /*
1753  * Sort color trigrams by colors (will be useful for bsearch in packGraph)
1754  * and enumerate the color trigrams that are expanded.
1755  */
1756  cnumber = 0;
1757  qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
1759  for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1760  {
1761  if (colorTrgms[i].expanded)
1762  {
1763  colorTrgms[i].cnumber = cnumber;
1764  cnumber++;
1765  }
1766  }
1767 
1768  return true;
1769 }
1770 
1771 /*
1772  * Expand selected color trigrams into regular trigrams.
1773  *
1774  * Returns the TRGM array to be passed to the index machinery.
1775  * The array must be allocated in rcontext.
1776  */
1777 static TRGM *
1779 {
1780  TRGM *trg;
1781  trgm *p;
1782  int i;
1783  TrgmColorInfo blankColor;
1784  trgm_mb_char blankChar;
1785 
1786  /* Set up "blank" color structure containing a single zero character */
1787  memset(blankChar.bytes, 0, sizeof(blankChar.bytes));
1788  blankColor.wordCharsCount = 1;
1789  blankColor.wordChars = &blankChar;
1790 
1791  /* Construct the trgm array */
1792  trg = (TRGM *)
1793  MemoryContextAllocZero(rcontext,
1794  TRGMHDRSIZE +
1795  trgmNFA->totalTrgmCount * sizeof(trgm));
1796  trg->flag = ARRKEY;
1797  SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, trgmNFA->totalTrgmCount));
1798  p = GETARR(trg);
1799  for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1800  {
1801  ColorTrgmInfo *colorTrgm = &trgmNFA->colorTrgms[i];
1802  TrgmColorInfo *c[3];
1803  trgm_mb_char s[3];
1804  int j,
1805  i1,
1806  i2,
1807  i3;
1808 
1809  /* Ignore any unexpanded trigrams ... */
1810  if (!colorTrgm->expanded)
1811  continue;
1812 
1813  /* Get colors, substituting the dummy struct for COLOR_BLANK */
1814  for (j = 0; j < 3; j++)
1815  {
1816  if (colorTrgm->ctrgm.colors[j] != COLOR_BLANK)
1817  c[j] = &trgmNFA->colorInfo[colorTrgm->ctrgm.colors[j]];
1818  else
1819  c[j] = &blankColor;
1820  }
1821 
1822  /* Iterate over all possible combinations of colors' characters */
1823  for (i1 = 0; i1 < c[0]->wordCharsCount; i1++)
1824  {
1825  s[0] = c[0]->wordChars[i1];
1826  for (i2 = 0; i2 < c[1]->wordCharsCount; i2++)
1827  {
1828  s[1] = c[1]->wordChars[i2];
1829  for (i3 = 0; i3 < c[2]->wordCharsCount; i3++)
1830  {
1831  s[2] = c[2]->wordChars[i3];
1832  fillTrgm(p, s);
1833  p++;
1834  }
1835  }
1836  }
1837  }
1838 
1839  return trg;
1840 }
1841 
1842 /*
1843  * Convert trigram into trgm datatype.
1844  */
1845 static void
1847 {
1848  char str[3 * MAX_MULTIBYTE_CHAR_LEN],
1849  *p;
1850  int i,
1851  j;
1852 
1853  /* Write multibyte string into "str" (we don't need null termination) */
1854  p = str;
1855 
1856  for (i = 0; i < 3; i++)
1857  {
1858  if (s[i].bytes[0] != 0)
1859  {
1860  for (j = 0; j < MAX_MULTIBYTE_CHAR_LEN && s[i].bytes[j]; j++)
1861  *p++ = s[i].bytes[j];
1862  }
1863  else
1864  {
1865  /* Emit a space in place of COLOR_BLANK */
1866  *p++ = ' ';
1867  }
1868  }
1869 
1870  /* Convert "str" to a standard trigram (possibly hashing it) */
1871  compact_trigram(ptrgm, str, p - str);
1872 }
1873 
1874 /*
1875  * Merge two states of graph.
1876  */
1877 static void
1879 {
1880  Assert(state1 != state2);
1881  Assert(!state1->parent);
1882  Assert(!state2->parent);
1883 
1884  /* state1 absorbs state2's flags */
1885  state1->flags |= state2->flags;
1886 
1887  /* state2, and indirectly all its children, become children of state1 */
1888  state2->parent = state1;
1889 }
1890 
1891 /*
1892  * Compare function for sorting of color trigrams by their colors.
1893  */
1894 static int
1895 colorTrgmInfoCmp(const void *p1, const void *p2)
1896 {
1897  const ColorTrgmInfo *c1 = (const ColorTrgmInfo *) p1;
1898  const ColorTrgmInfo *c2 = (const ColorTrgmInfo *) p2;
1899 
1900  return memcmp(&c1->ctrgm, &c2->ctrgm, sizeof(ColorTrgm));
1901 }
1902 
1903 /*
1904  * Compare function for sorting color trigrams in descending order of
1905  * their penalty fields.
1906  */
1907 static int
1908 colorTrgmInfoPenaltyCmp(const void *p1, const void *p2)
1909 {
1910  float4 penalty1 = ((const ColorTrgmInfo *) p1)->penalty;
1911  float4 penalty2 = ((const ColorTrgmInfo *) p2)->penalty;
1912 
1913  if (penalty1 < penalty2)
1914  return 1;
1915  else if (penalty1 == penalty2)
1916  return 0;
1917  else
1918  return -1;
1919 }
1920 
1921 
1922 /*---------------------
1923  * Subroutines for packing the graph into final representation (stage 4).
1924  *---------------------
1925  */
1926 
1927 /*
1928  * Pack expanded graph into final representation.
1929  *
1930  * The result data must be allocated in rcontext.
1931  */
1932 static TrgmPackedGraph *
1933 packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
1934 {
1935  int snumber = 2,
1936  arcIndex,
1937  arcsCount;
1938  HASH_SEQ_STATUS scan_status;
1939  TrgmState *state;
1941  *p1,
1942  *p2;
1943  TrgmPackedArc *packedArcs;
1944  TrgmPackedGraph *result;
1945  int i,
1946  j;
1947 
1948  /* Enumerate surviving states, giving init and fin reserved numbers */
1949  hash_seq_init(&scan_status, trgmNFA->states);
1950  while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
1951  {
1952  while (state->parent)
1953  state = state->parent;
1954 
1955  if (state->snumber < 0)
1956  {
1957  if (state->flags & TSTATE_INIT)
1958  state->snumber = 0;
1959  else if (state->flags & TSTATE_FIN)
1960  state->snumber = 1;
1961  else
1962  {
1963  state->snumber = snumber;
1964  snumber++;
1965  }
1966  }
1967  }
1968 
1969  /* Collect array of all arcs */
1970  arcs = (TrgmPackArcInfo *)
1971  palloc(sizeof(TrgmPackArcInfo) * trgmNFA->arcsCount);
1972  arcIndex = 0;
1973  hash_seq_init(&scan_status, trgmNFA->states);
1974  while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
1975  {
1976  TrgmState *source = state;
1977  ListCell *cell;
1978 
1979  while (source->parent)
1980  source = source->parent;
1981 
1982  foreach(cell, state->arcs)
1983  {
1984  TrgmArc *arc = (TrgmArc *) lfirst(cell);
1985  TrgmState *target = arc->target;
1986 
1987  while (target->parent)
1988  target = target->parent;
1989 
1990  if (source->snumber != target->snumber)
1991  {
1992  ColorTrgmInfo *ctrgm;
1993 
1994  ctrgm = (ColorTrgmInfo *) bsearch(&arc->ctrgm,
1995  trgmNFA->colorTrgms,
1996  trgmNFA->colorTrgmsCount,
1997  sizeof(ColorTrgmInfo),
1999  Assert(ctrgm != NULL);
2000  Assert(ctrgm->expanded);
2001 
2002  arcs[arcIndex].sourceState = source->snumber;
2003  arcs[arcIndex].targetState = target->snumber;
2004  arcs[arcIndex].colorTrgm = ctrgm->cnumber;
2005  arcIndex++;
2006  }
2007  }
2008  }
2009 
2010  /* Sort arcs to ease duplicate detection */
2011  qsort(arcs, arcIndex, sizeof(TrgmPackArcInfo), packArcInfoCmp);
2012 
2013  /* We could have duplicates because states were merged. Remove them. */
2014  /* p1 is probe point, p2 is last known non-duplicate. */
2015  p2 = arcs;
2016  for (p1 = arcs + 1; p1 < arcs + arcIndex; p1++)
2017  {
2018  if (packArcInfoCmp(p1, p2) > 0)
2019  {
2020  p2++;
2021  *p2 = *p1;
2022  }
2023  }
2024  arcsCount = (p2 - arcs) + 1;
2025 
2026  /* Create packed representation */
2027  result = (TrgmPackedGraph *)
2028  MemoryContextAlloc(rcontext, sizeof(TrgmPackedGraph));
2029 
2030  /* Pack color trigrams information */
2031  result->colorTrigramsCount = 0;
2032  for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
2033  {
2034  if (trgmNFA->colorTrgms[i].expanded)
2035  result->colorTrigramsCount++;
2036  }
2037  result->colorTrigramGroups = (int *)
2038  MemoryContextAlloc(rcontext, sizeof(int) * result->colorTrigramsCount);
2039  j = 0;
2040  for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
2041  {
2042  if (trgmNFA->colorTrgms[i].expanded)
2043  {
2044  result->colorTrigramGroups[j] = trgmNFA->colorTrgms[i].count;
2045  j++;
2046  }
2047  }
2048 
2049  /* Pack states and arcs information */
2050  result->statesCount = snumber;
2051  result->states = (TrgmPackedState *)
2052  MemoryContextAlloc(rcontext, snumber * sizeof(TrgmPackedState));
2053  packedArcs = (TrgmPackedArc *)
2054  MemoryContextAlloc(rcontext, arcsCount * sizeof(TrgmPackedArc));
2055  j = 0;
2056  for (i = 0; i < snumber; i++)
2057  {
2058  int cnt = 0;
2059 
2060  result->states[i].arcs = &packedArcs[j];
2061  while (j < arcsCount && arcs[j].sourceState == i)
2062  {
2063  packedArcs[j].targetState = arcs[j].targetState;
2064  packedArcs[j].colorTrgm = arcs[j].colorTrgm;
2065  cnt++;
2066  j++;
2067  }
2068  result->states[i].arcsCount = cnt;
2069  }
2070 
2071  /* Allocate working memory for trigramsMatchGraph() */
2072  result->colorTrigramsActive = (bool *)
2073  MemoryContextAlloc(rcontext, sizeof(bool) * result->colorTrigramsCount);
2074  result->statesActive = (bool *)
2075  MemoryContextAlloc(rcontext, sizeof(bool) * result->statesCount);
2076  result->statesQueue = (int *)
2077  MemoryContextAlloc(rcontext, sizeof(int) * result->statesCount);
2078 
2079  return result;
2080 }
2081 
2082 /*
2083  * Comparison function for sorting TrgmPackArcInfos.
2084  *
2085  * Compares arcs in following order: sourceState, colorTrgm, targetState.
2086  */
2087 static int
2088 packArcInfoCmp(const void *a1, const void *a2)
2089 {
2090  const TrgmPackArcInfo *p1 = (const TrgmPackArcInfo *) a1;
2091  const TrgmPackArcInfo *p2 = (const TrgmPackArcInfo *) a2;
2092 
2093  if (p1->sourceState < p2->sourceState)
2094  return -1;
2095  if (p1->sourceState > p2->sourceState)
2096  return 1;
2097  if (p1->colorTrgm < p2->colorTrgm)
2098  return -1;
2099  if (p1->colorTrgm > p2->colorTrgm)
2100  return 1;
2101  if (p1->targetState < p2->targetState)
2102  return -1;
2103  if (p1->targetState > p2->targetState)
2104  return 1;
2105  return 0;
2106 }
2107 
2108 
2109 /*---------------------
2110  * Debugging functions
2111  *
2112  * These are designed to emit GraphViz files.
2113  *---------------------
2114  */
2115 
2116 #ifdef TRGM_REGEXP_DEBUG
2117 
2118 /*
2119  * Print initial NFA, in regexp library's representation
2120  */
2121 static void
2122 printSourceNFA(regex_t *regex, TrgmColorInfo *colors, int ncolors)
2123 {
2125  int nstates = pg_reg_getnumstates(regex);
2126  int state;
2127  int i;
2128 
2129  initStringInfo(&buf);
2130 
2131  appendStringInfoString(&buf, "\ndigraph sourceNFA {\n");
2132 
2133  for (state = 0; state < nstates; state++)
2134  {
2135  regex_arc_t *arcs;
2136  int i,
2137  arcsCount;
2138 
2139  appendStringInfo(&buf, "s%d", state);
2140  if (pg_reg_getfinalstate(regex) == state)
2141  appendStringInfoString(&buf, " [shape = doublecircle]");
2142  appendStringInfoString(&buf, ";\n");
2143 
2144  arcsCount = pg_reg_getnumoutarcs(regex, state);
2145  arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount);
2146  pg_reg_getoutarcs(regex, state, arcs, arcsCount);
2147 
2148  for (i = 0; i < arcsCount; i++)
2149  {
2150  appendStringInfo(&buf, " s%d -> s%d [label = \"%d\"];\n",
2151  state, arcs[i].to, arcs[i].co);
2152  }
2153 
2154  pfree(arcs);
2155  }
2156 
2157  appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
2158  appendStringInfo(&buf, " initial -> s%d;\n",
2159  pg_reg_getinitialstate(regex));
2160 
2161  /* Print colors */
2162  appendStringInfoString(&buf, " { rank = sink;\n");
2163  appendStringInfoString(&buf, " Colors [shape = none, margin=0, label=<\n");
2164 
2165  for (i = 0; i < ncolors; i++)
2166  {
2167  TrgmColorInfo *color = &colors[i];
2168  int j;
2169 
2170  appendStringInfo(&buf, "<br/>Color %d: ", i);
2171  if (color->expandable)
2172  {
2173  for (j = 0; j < color->wordCharsCount; j++)
2174  {
2175  char s[MAX_MULTIBYTE_CHAR_LEN + 1];
2176 
2177  memcpy(s, color->wordChars[j].bytes, MAX_MULTIBYTE_CHAR_LEN);
2178  s[MAX_MULTIBYTE_CHAR_LEN] = '\0';
2179  appendStringInfoString(&buf, s);
2180  }
2181  }
2182  else
2183  appendStringInfoString(&buf, "not expandable");
2184  appendStringInfoChar(&buf, '\n');
2185  }
2186 
2187  appendStringInfoString(&buf, " >];\n");
2188  appendStringInfoString(&buf, " }\n");
2189  appendStringInfoString(&buf, "}\n");
2190 
2191  {
2192  /* dot -Tpng -o /tmp/source.png < /tmp/source.gv */
2193  FILE *fp = fopen("/tmp/source.gv", "w");
2194 
2195  fprintf(fp, "%s", buf.data);
2196  fclose(fp);
2197  }
2198 
2199  pfree(buf.data);
2200 }
2201 
2202 /*
2203  * Print expanded graph.
2204  */
2205 static void
2206 printTrgmNFA(TrgmNFA *trgmNFA)
2207 {
2209  HASH_SEQ_STATUS scan_status;
2210  TrgmState *state;
2211  TrgmState *initstate = NULL;
2212 
2213  initStringInfo(&buf);
2214 
2215  appendStringInfoString(&buf, "\ndigraph transformedNFA {\n");
2216 
2217  hash_seq_init(&scan_status, trgmNFA->states);
2218  while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
2219  {
2220  ListCell *cell;
2221 
2222  appendStringInfo(&buf, "s%d", -state->snumber);
2223  if (state->flags & TSTATE_FIN)
2224  appendStringInfoString(&buf, " [shape = doublecircle]");
2225  if (state->flags & TSTATE_INIT)
2226  initstate = state;
2227  appendStringInfo(&buf, " [label = \"%d\"]", state->stateKey.nstate);
2228  appendStringInfoString(&buf, ";\n");
2229 
2230  foreach(cell, state->arcs)
2231  {
2232  TrgmArc *arc = (TrgmArc *) lfirst(cell);
2233 
2234  appendStringInfo(&buf, " s%d -> s%d [label = \"",
2235  -state->snumber, -arc->target->snumber);
2236  printTrgmColor(&buf, arc->ctrgm.colors[0]);
2237  appendStringInfoChar(&buf, ' ');
2238  printTrgmColor(&buf, arc->ctrgm.colors[1]);
2239  appendStringInfoChar(&buf, ' ');
2240  printTrgmColor(&buf, arc->ctrgm.colors[2]);
2241  appendStringInfoString(&buf, "\"];\n");
2242  }
2243  }
2244 
2245  if (initstate)
2246  {
2247  appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
2248  appendStringInfo(&buf, " initial -> s%d;\n", -initstate->snumber);
2249  }
2250 
2251  appendStringInfoString(&buf, "}\n");
2252 
2253  {
2254  /* dot -Tpng -o /tmp/transformed.png < /tmp/transformed.gv */
2255  FILE *fp = fopen("/tmp/transformed.gv", "w");
2256 
2257  fprintf(fp, "%s", buf.data);
2258  fclose(fp);
2259  }
2260 
2261  pfree(buf.data);
2262 }
2263 
2264 /*
2265  * Print a TrgmColor readably.
2266  */
2267 static void
2268 printTrgmColor(StringInfo buf, TrgmColor co)
2269 {
2270  if (co == COLOR_UNKNOWN)
2271  appendStringInfoChar(buf, 'u');
2272  else if (co == COLOR_BLANK)
2273  appendStringInfoChar(buf, 'b');
2274  else
2275  appendStringInfo(buf, "%d", (int) co);
2276 }
2277 
2278 /*
2279  * Print final packed representation of trigram-based expanded graph.
2280  */
2281 static void
2282 printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams)
2283 {
2285  trgm *p;
2286  int i;
2287 
2288  initStringInfo(&buf);
2289 
2290  appendStringInfoString(&buf, "\ndigraph packedGraph {\n");
2291 
2292  for (i = 0; i < packedGraph->statesCount; i++)
2293  {
2294  TrgmPackedState *state = &packedGraph->states[i];
2295  int j;
2296 
2297  appendStringInfo(&buf, " s%d", i);
2298  if (i == 1)
2299  appendStringInfoString(&buf, " [shape = doublecircle]");
2300 
2301  appendStringInfo(&buf, " [label = <s%d>];\n", i);
2302 
2303  for (j = 0; j < state->arcsCount; j++)
2304  {
2305  TrgmPackedArc *arc = &state->arcs[j];
2306 
2307  appendStringInfo(&buf, " s%d -> s%d [label = \"trigram %d\"];\n",
2308  i, arc->targetState, arc->colorTrgm);
2309  }
2310  }
2311 
2312  appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
2313  appendStringInfo(&buf, " initial -> s%d;\n", 0);
2314 
2315  /* Print trigrams */
2316  appendStringInfoString(&buf, " { rank = sink;\n");
2317  appendStringInfoString(&buf, " Trigrams [shape = none, margin=0, label=<\n");
2318 
2319  p = GETARR(trigrams);
2320  for (i = 0; i < packedGraph->colorTrigramsCount; i++)
2321  {
2322  int count = packedGraph->colorTrigramGroups[i];
2323  int j;
2324 
2325  appendStringInfo(&buf, "<br/>Trigram %d: ", i);
2326 
2327  for (j = 0; j < count; j++)
2328  {
2329  if (j > 0)
2330  appendStringInfoString(&buf, ", ");
2331 
2332  /*
2333  * XXX This representation is nice only for all-ASCII trigrams.
2334  */
2335  appendStringInfo(&buf, "\"%c%c%c\"", (*p)[0], (*p)[1], (*p)[2]);
2336  p++;
2337  }
2338  }
2339 
2340  appendStringInfoString(&buf, " >];\n");
2341  appendStringInfoString(&buf, " }\n");
2342  appendStringInfoString(&buf, "}\n");
2343 
2344  {
2345  /* dot -Tpng -o /tmp/packed.png < /tmp/packed.gv */
2346  FILE *fp = fopen("/tmp/packed.gv", "w");
2347 
2348  fprintf(fp, "%s", buf.data);
2349  fclose(fp);
2350  }
2351 
2352  pfree(buf.data);
2353 }
2354 
2355 #endif /* TRGM_REGEXP_DEBUG */
#define MAX_EXPANDED_ARCS
Definition: trgm_regexp.c:219
#define NIL
Definition: pg_list.h:65
static void mergeStates(TrgmState *state1, TrgmState *state2)
Definition: trgm_regexp.c:1878
bool * colorTrigramsActive
Definition: trgm_regexp.c:461
int pg_reg_getnumstates(const regex_t *regex)
Definition: regexport.c:36
void pg_reg_getoutarcs(const regex_t *regex, int st, regex_arc_t *arcs, int arcs_len)
Definition: regexport.c:155
struct TrgmState TrgmState
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:218
#define AllocSetContextCreate
Definition: memutils.h:173
static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
Definition: trgm_regexp.c:775
ColorTrgm ctrgm
Definition: trgm_regexp.c:348
#define VARDATA_ANY(PTR)
Definition: postgres.h:361
static bool prefixContains(TrgmPrefix *prefix1, TrgmPrefix *prefix2)
Definition: trgm_regexp.c:1417
#define HASH_CONTEXT
Definition: hsearch.h:102
#define HASH_ELEM
Definition: hsearch.h:95
struct TrgmState * tentParent
Definition: trgm_regexp.c:340
MemoryContext hcxt
Definition: hsearch.h:86
static void addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key, TrgmColor co, TrgmStateKey *destKey)
Definition: trgm_regexp.c:1289
int pg_reg_getfinalstate(const regex_t *regex)
Definition: regexport.c:64
TrgmColorInfo * colorInfo
Definition: trgm_regexp.c:403
static TRGM * createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph, MemoryContext rcontext)
Definition: trgm_regexp.c:577
int pg_reg_getnumcharacters(const regex_t *regex, int co)
Definition: regexport.c:230
regex_t * regex
Definition: trgm_regexp.c:402
float4 penalty
Definition: trgm_regexp.c:377
Definition: regguts.h:290
def bytes(source, encoding='ascii', errors='strict')
short color
Definition: regguts.h:150
TrgmColor colors[3]
Definition: trgm_regexp.c:299
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:109
HTAB * states
Definition: trgm_regexp.c:407
static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key)
Definition: trgm_regexp.c:1010
Size entrysize
Definition: hsearch.h:76
void pg_reg_getcharacters(const regex_t *regex, int co, pg_wchar *chars, int chars_len)
Definition: regexport.c:266
#define ISWORDCHR(c)
Definition: trgm.h:55
#define WISH_TRGM_PENALTY
Definition: trgm_regexp.c:221
#define ARRKEY
Definition: trgm.h:97
List * list_concat(List *list1, const List *list2)
Definition: list.c:530
int errcode(int sqlerrcode)
Definition: elog.c:698
#define MemSet(start, val, len)
Definition: c.h:1008
long hash_get_num_entries(HTAB *hashp)
Definition: dynahash.c:1382
int pg_reg_colorisbegin(const regex_t *regex, int co)
Definition: regexport.c:191
TrgmPackedState * states
Definition: trgm_regexp.c:458
TrgmState * initState
Definition: trgm_regexp.c:408
#define TSTATE_FIN
Definition: trgm_regexp.c:329
char * lowerstr(const char *str)
Definition: ts_locale.c:244
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:954
unsigned int Oid
Definition: postgres_ext.h:31
#define TSTATE_INIT
Definition: trgm_regexp.c:328
int totalTrgmCount
Definition: trgm_regexp.c:420
#define fprintf
Definition: port.h:221
bool overflowed
Definition: trgm_regexp.c:415
#define REG_ICASE
Definition: regex.h:108
int pg_regcomp(regex_t *re, const chr *string, size_t len, int flags, Oid collation)
Definition: regcomp.c:328
int pg_reg_getnumcolors(const regex_t *regex)
Definition: regexport.c:174
int nstates
Definition: trgm_regexp.c:409
#define CALCGTSIZE(flag, siglen)
Definition: hstore_gist.c:59
#define foreach_delete_current(lst, cell)
Definition: pg_list.h:369
bool containsNonWord
Definition: trgm_regexp.c:261
static bool convertPgWchar(pg_wchar c, trgm_mb_char *result)
Definition: trgm_regexp.c:839
#define list_make1(x1)
Definition: pg_list.h:206
TrgmPackedArc * arcs
Definition: trgm_regexp.c:435
static bool selectColorTrigrams(TrgmNFA *trgmNFA)
Definition: trgm_regexp.c:1459
static int packArcInfoCmp(const void *a1, const void *a2)
Definition: trgm_regexp.c:2088
#define GETARR(x)
Definition: trgm.h:107
static const FormData_pg_attribute a2
Definition: heap.c:167
Definition: dynahash.c:219
bool * statesActive
Definition: trgm_regexp.c:462
void pfree(void *pointer)
Definition: mcxt.c:1169
#define REG_OKAY
Definition: regex.h:139
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition: stringinfo.c:91
#define linitial(l)
Definition: pg_list.h:174
TrgmColor colors[2]
Definition: trgm_regexp.c:290
#define ERROR
Definition: elog.h:46
Definition: trgm.h:67
int pg_reg_getinitialstate(const regex_t *regex)
Definition: regexport.c:50
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:195
void appendStringInfoString(StringInfo str, const char *s)
Definition: stringinfo.c:176
char * c
static char * buf
Definition: pg_test_fsync.c:68
List * enterKeys
Definition: trgm_regexp.c:335
TrgmState * target
Definition: trgm_regexp.c:360
ColorTrgmInfo * colorTrgms
Definition: trgm_regexp.c:418
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:349
size_t pg_regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
Definition: regerror.c:60
bool trigramsMatchGraph(TrgmPackedGraph *graph, bool *check)
Definition: trgm_regexp.c:638
MemoryContext CurrentMemoryContext
Definition: mcxt.c:42
#define COLOR_COUNT_LIMIT
Definition: trgm_regexp.c:222
#define REG_ADVANCED
Definition: regex.h:105
TrgmState * target
Definition: trgm_regexp.c:349
List * arcs
Definition: trgm_regexp.c:334
#define TRGMHDRSIZE
Definition: trgm.h:74
unsigned int pg_wchar
Definition: mbprint.c:31
TrgmState * source
Definition: trgm_regexp.c:359
List * lappend(List *list, void *datum)
Definition: list.c:336
#define MAX_MULTIBYTE_CHAR_LEN
Definition: pg_wchar.h:30
int arcsCount
Definition: trgm_regexp.c:414
void appendStringInfoChar(StringInfo str, char ch)
Definition: stringinfo.c:188
void initStringInfo(StringInfo str)
Definition: stringinfo.c:59
#define COLOR_BLANK
Definition: trgm_regexp.c:286
float float4
Definition: c.h:564
List * queue
Definition: trgm_regexp.c:412
#define HASH_BLOBS
Definition: hsearch.h:97
#define PG_FINALLY()
Definition: elog.h:330
static TrgmPackedGraph * packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
Definition: trgm_regexp.c:1933
void * palloc0(Size size)
Definition: mcxt.c:1093
TRGM * createTrgmNFA(text *text_re, Oid collation, TrgmPackedGraph **graph, MemoryContext rcontext)
Definition: trgm_regexp.c:521
static void addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key)
Definition: trgm_regexp.c:1179
Size keysize
Definition: hsearch.h:75
uint8 flag
Definition: trgm.h:70
int pg_reg_colorisend(const regex_t *regex, int co)
Definition: regexport.c:208
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:906
static int colorTrgmInfoPenaltyCmp(const void *p1, const void *p2)
Definition: trgm_regexp.c:1908
int colorTrgmsCount
Definition: trgm_regexp.c:419
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
Definition: mbutils.c:929
void compact_trigram(trgm *tptr, char *str, int bytelen)
Definition: trgm_op.c:198
#define ereport(elevel,...)
Definition: elog.h:157
int pg_reg_getnumoutarcs(const regex_t *regex, int st)
Definition: regexport.c:134
char trgm[3]
Definition: trgm.h:42
#define MAX_TRGM_COUNT
Definition: trgm_regexp.c:220
int tentFlags
Definition: trgm_regexp.c:339
#define Assert(condition)
Definition: c.h:804
#define lfirst(lc)
Definition: pg_list.h:169
Definition: regguts.h:317
char bytes[MAX_MULTIBYTE_CHAR_LEN]
Definition: trgm_regexp.c:242
TrgmStateKey stateKey
Definition: trgm_regexp.c:333
static TRGM * expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext)
Definition: trgm_regexp.c:1778
static void RE_compile(regex_t *regex, text *text_re, int cflags, Oid collation)
Definition: trgm_regexp.c:731
static rewind_source * source
Definition: pg_rewind.c:79
static bool validArcLabel(TrgmStateKey *key, TrgmColor co)
Definition: trgm_regexp.c:1331
static int list_length(const List *l)
Definition: pg_list.h:149
static TrgmState * getState(TrgmNFA *trgmNFA, TrgmStateKey *key)
Definition: trgm_regexp.c:1386
#define REG_NOSUB
Definition: regex.h:109
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1436
static void addArcs(TrgmNFA *trgmNFA, TrgmState *state)
Definition: trgm_regexp.c:1191
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1426
TrgmPrefix prefix
Definition: trgm_regexp.c:309
static const float4 penalties[8]
Definition: trgm_regexp.c:228
#define MAX_EXPANDED_STATES
Definition: trgm_regexp.c:218
static int colorTrgmInfoCmp(const void *p1, const void *p2)
Definition: trgm_regexp.c:1895
struct TrgmState * parent
Definition: trgm_regexp.c:338
#define COLOR_UNKNOWN
Definition: trgm_regexp.c:285
static void transformGraph(TrgmNFA *trgmNFA)
Definition: trgm_regexp.c:905
#define VARSIZE_ANY_EXHDR(PTR)
Definition: postgres.h:354
void * palloc(Size size)
Definition: mcxt.c:1062
int errmsg(const char *fmt,...)
Definition: elog.c:909
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:863
int i
List * keysQueue
Definition: trgm_regexp.c:413
ColorTrgm ctrgm
Definition: trgm_regexp.c:374
Definition: c.h:621
#define SET_VARSIZE(PTR, len)
Definition: postgres.h:342
static void processState(TrgmNFA *trgmNFA, TrgmState *state)
Definition: trgm_regexp.c:967
int pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
Definition: mbutils.c:951
static const FormData_pg_attribute a1
Definition: heap.c:153
#define qsort(a, b, c, d)
Definition: port.h:505
trgm_mb_char * wordChars
Definition: trgm_regexp.c:263
#define PG_TRY()
Definition: elog.h:313
void pg_regfree(regex_t *re)
Definition: regfree.c:49
Definition: pg_list.h:50
static char chars[TZ_MAX_CHARS]
Definition: zic.c:399
static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3])
Definition: trgm_regexp.c:1846
Definition: regex.h:55
#define PG_END_TRY()
Definition: elog.h:338
List * list_delete_first(List *list)
Definition: list.c:875
int * colorTrigramGroups
Definition: trgm_regexp.c:451
int ncolors
Definition: trgm_regexp.c:404
int TrgmColor
Definition: trgm_regexp.c:282