PostgreSQL Source Code  git master
trgm_regexp.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * trgm_regexp.c
4  * Regular expression matching using trigrams.
5  *
6  * The general idea of trigram index support for a regular expression (regex)
7  * search is to transform the regex into a logical expression on trigrams.
8  * For example:
9  *
10  * (ab|cd)efg => ((abe & bef) | (cde & def)) & efg
11  *
12  * If a string matches the regex, then it must match the logical expression on
13  * trigrams. The opposite is not necessarily true, however: a string that
14  * matches the logical expression might not match the original regex. Such
15  * false positives are removed via recheck, by running the regular regex match
16  * operator on the retrieved heap tuple.
17  *
18  * Since the trigram expression involves both AND and OR operators, we can't
19  * expect the core index machinery to evaluate it completely. Instead, the
20  * result of regex analysis is a list of trigrams to be sought in the index,
21  * plus a simplified graph that is used by trigramsMatchGraph() to determine
22  * whether a particular indexed value matches the expression.
23  *
24  * Converting a regex to a trigram expression is based on analysis of an
25  * automaton corresponding to the regex. The algorithm consists of four
26  * stages:
27  *
28  * 1) Compile the regexp to NFA form. This is handled by the PostgreSQL
29  * regexp library, which provides accessors for its opaque regex_t struct
30  * to expose the NFA state graph and the "colors" (sets of equivalent
31  * characters) used as state transition labels.
32  *
33  * 2) Transform the original NFA into an expanded graph, where arcs
34  * are labeled with trigrams that must be present in order to move from
35  * one state to another via the arcs. The trigrams used in this stage
36  * consist of colors, not characters, as in the original NFA.
37  *
38  * 3) Expand the color trigrams into regular trigrams consisting of
39  * characters. If too many distinct trigrams are produced, trigrams are
40  * eliminated and the graph is simplified until it's simple enough.
41  *
42  * 4) Finally, the resulting graph is packed into a TrgmPackedGraph struct,
43  * and returned to the caller.
44  *
45  * 1) Compile the regexp to NFA form
46  * ---------------------------------
47  * The automaton returned by the regexp compiler is a graph where vertices
48  * are "states" and arcs are labeled with colors. Each color represents
49  * a set of characters, so that all characters assigned to the same color
50  * are interchangeable, so far as matching the regexp is concerned. There
51  * are two special states: "initial" and "final". A state can have multiple
52  * outgoing arcs labeled with the same color, which makes the automaton
53  * non-deterministic, because it can be in many states simultaneously.
54  *
55  * Note that this NFA is already lossy compared to the original regexp,
56  * since it ignores some regex features such as lookahead constraints and
57  * backref matching. This is OK for our purposes since it's still the case
58  * that only strings matching the NFA can possibly satisfy the regexp.
59  *
60  * 2) Transform the original NFA into an expanded graph
61  * ----------------------------------------------------
62  * In the 2nd stage, the automaton is transformed into a graph based on the
63  * original NFA. Each state in the expanded graph represents a state from
64  * the original NFA, plus a prefix identifying the last two characters
65  * (colors, to be precise) seen before entering the state. There can be
66  * multiple states in the expanded graph for each state in the original NFA,
67  * depending on what characters can precede it. A prefix position can be
68  * "unknown" if it's uncertain what the preceding character was, or "blank"
69  * if the character was a non-word character (we don't need to distinguish
70  * which non-word character it was, so just think of all of them as blanks).
71  *
72  * For convenience in description, call an expanded-state identifier
73  * (two prefix colors plus a state number from the original NFA) an
74  * "enter key".
75  *
76  * Each arc of the expanded graph is labelled with a trigram that must be
77  * present in the string to match. We can construct this from an out-arc of
78  * the underlying NFA state by combining the expanded state's prefix with the
79  * color label of the underlying out-arc, if neither prefix position is
80  * "unknown". But note that some of the colors in the trigram might be
81  * "blank". This is OK since we want to generate word-boundary trigrams as
82  * the regular trigram machinery would, if we know that some word characters
83  * must be adjacent to a word boundary in all strings matching the NFA.
84  *
85  * The expanded graph can also have fewer states than the original NFA,
86  * because we don't bother to make a separate state entry unless the state
87  * is reachable by a valid arc. When an enter key is reachable from a state
88  * of the expanded graph, but we do not know a complete trigram associated
89  * with that transition, we cannot make a valid arc; instead we insert the
90  * enter key into the enterKeys list of the source state. This effectively
91  * means that the two expanded states are not reliably distinguishable based
92  * on examining trigrams.
93  *
94  * So the expanded graph resembles the original NFA, but the arcs are
95  * labeled with trigrams instead of individual characters, and there may be
96  * more or fewer states. It is a lossy representation of the original NFA:
97  * any string that matches the original regexp must match the expanded graph,
98  * but the reverse is not true.
99  *
100  * We build the expanded graph through a breadth-first traversal of states
101  * reachable from the initial state. At each reachable state, we identify the
102  * states reachable from it without traversing a predictable trigram, and add
103  * those states' enter keys to the current state. Then we generate all
104  * out-arcs leading out of this collection of states that have predictable
105  * trigrams, adding their target states to the queue of states to examine.
106  *
107  * When building the graph, if the number of states or arcs exceed pre-defined
108  * limits, we give up and simply mark any states not yet processed as final
109  * states. Roughly speaking, that means that we make use of some portion from
110  * the beginning of the regexp. Also, any colors that have too many member
111  * characters are treated as "unknown", so that we can't derive trigrams
112  * from them.
113  *
114  * 3) Expand the color trigrams into regular trigrams
115  * --------------------------------------------------
116  * The trigrams in the expanded graph are "color trigrams", consisting
117  * of three consecutive colors that must be present in the string. But for
118  * search, we need regular trigrams consisting of characters. In the 3rd
119  * stage, the color trigrams are expanded into regular trigrams. Since each
120  * color can represent many characters, the total number of regular trigrams
121  * after expansion could be very large. Because searching the index for
122  * thousands of trigrams would be slow, and would likely produce so many
123  * false positives that we would have to traverse a large fraction of the
124  * index, the graph is simplified further in a lossy fashion by removing
125  * color trigrams. When a color trigram is removed, the states connected by
126  * any arcs labelled with that trigram are merged.
127  *
128  * Trigrams do not all have equivalent value for searching: some of them are
129  * more frequent and some of them are less frequent. Ideally, we would like
130  * to know the distribution of trigrams, but we don't. But because of padding
131  * we know for sure that the empty character is more frequent than others,
132  * so we can penalize trigrams according to presence of whitespace. The
133  * penalty assigned to each color trigram is the number of simple trigrams
134  * it would produce, times the penalties[] multiplier associated with its
135  * whitespace content. (The penalties[] constants were calculated by analysis
136  * of some real-life text.) We eliminate color trigrams starting with the
137  * highest-penalty one, until we get to a total penalty of no more than
138  * WISH_TRGM_PENALTY. However, we cannot remove a color trigram if that would
139  * lead to merging the initial and final states, so we may not be able to
140  * reach WISH_TRGM_PENALTY. It's still okay so long as we have no more than
141  * MAX_TRGM_COUNT simple trigrams in total, otherwise we fail.
142  *
143  * 4) Pack the graph into a compact representation
144  * -----------------------------------------------
145  * The 2nd and 3rd stages might have eliminated or merged many of the states
146  * and trigrams created earlier, so in this final stage, the graph is
147  * compacted and packed into a simpler struct that contains only the
148  * information needed to evaluate it.
149  *
150  * ALGORITHM EXAMPLE:
151  *
152  * Consider the example regex "ab[cd]". This regex is transformed into the
153  * following NFA (for simplicity we show colors as their single members):
154  *
155  * 4#
156  * c/
157  * a b /
158  * 1* --- 2 ---- 3
159  * \
160  * d\
161  * 5#
162  *
163  * We use * to mark initial state and # to mark final state. It's not depicted,
164  * but states 1, 4, 5 have self-referencing arcs for all possible characters,
165  * because this pattern can match to any part of a string.
166  *
167  * As the result of stage 2 we will have the following graph:
168  *
169  * abc abd
170  * 2# <---- 1* ----> 3#
171  *
172  * The process for generating this graph is:
173  * 1) Create state 1 with enter key (UNKNOWN, UNKNOWN, 1).
174  * 2) Add key (UNKNOWN, "a", 2) to state 1.
175  * 3) Add key ("a", "b", 3) to state 1.
176  * 4) Create new state 2 with enter key ("b", "c", 4). Add an arc
177  * from state 1 to state 2 with label trigram "abc".
178  * 5) Mark state 2 final because state 4 of source NFA is marked as final.
179  * 6) Create new state 3 with enter key ("b", "d", 5). Add an arc
180  * from state 1 to state 3 with label trigram "abd".
181  * 7) Mark state 3 final because state 5 of source NFA is marked as final.
182  *
183  *
184  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
185  * Portions Copyright (c) 1994, Regents of the University of California
186  *
187  * IDENTIFICATION
188  * contrib/pg_trgm/trgm_regexp.c
189  *
190  *-------------------------------------------------------------------------
191  */
192 #include "postgres.h"
193 
194 #include "trgm.h"
195 
196 #include "regex/regexport.h"
197 #include "tsearch/ts_locale.h"
198 #include "utils/hsearch.h"
199 #include "utils/memutils.h"
200 
201 
202 /*
203  * Uncomment (or use -DTRGM_REGEXP_DEBUG) to print debug info,
204  * for exploring and debugging the algorithm implementation.
205  * This produces three graph files in /tmp, in Graphviz .gv format.
206  * Some progress information is also printed to postmaster stderr.
207  */
208 /* #define TRGM_REGEXP_DEBUG */
209 
210 /*
211  * These parameters are used to limit the amount of work done.
212  * Otherwise regex processing could be too slow and memory-consuming.
213  *
214  * MAX_EXPANDED_STATES - How many states we allow in expanded graph
215  * MAX_EXPANDED_ARCS - How many arcs we allow in expanded graph
216  * MAX_TRGM_COUNT - How many simple trigrams we allow to be extracted
217  * WISH_TRGM_PENALTY - Maximum desired sum of color trigram penalties
218  * COLOR_COUNT_LIMIT - Maximum number of characters per color
219  */
220 #define MAX_EXPANDED_STATES 128
221 #define MAX_EXPANDED_ARCS 1024
222 #define MAX_TRGM_COUNT 256
223 #define WISH_TRGM_PENALTY 16
224 #define COLOR_COUNT_LIMIT 256
225 
226 /*
227  * Penalty multipliers for trigram counts depending on whitespace contents.
228  * Numbers based on analysis of real-life texts.
229  */
230 static const float4 penalties[8] = {
231  1.0f, /* "aaa" */
232  3.5f, /* "aa " */
233  0.0f, /* "a a" (impossible) */
234  0.0f, /* "a " (impossible) */
235  4.2f, /* " aa" */
236  2.1f, /* " a " */
237  25.0f, /* " a" */
238  0.0f /* " " (impossible) */
239 };
240 
241 /* Struct representing a single pg_wchar, converted back to multibyte form */
242 typedef struct
243 {
245 } trgm_mb_char;
246 
247 /*
248  * Attributes of NFA colors:
249  *
250  * expandable - we know the character expansion of this color
251  * containsNonWord - color contains non-word characters
252  * (which will not be extracted into trigrams)
253  * wordCharsCount - count of word characters in color
254  * wordChars - array of this color's word characters
255  * (which can be extracted into trigrams)
256  *
257  * When expandable is false, the other attributes don't matter; we just
258  * assume this color represents unknown character(s).
259  */
260 typedef struct
261 {
266 } TrgmColorInfo;
267 
268 /*
269  * A "prefix" is information about the colors of the last two characters read
270  * before reaching a specific NFA state. These colors can have special values
271  * COLOR_UNKNOWN and COLOR_BLANK. COLOR_UNKNOWN means that we have no
272  * information, for example because we read some character of an unexpandable
273  * color. COLOR_BLANK means that we read a non-word character.
274  *
275  * We call a prefix ambiguous if at least one of its colors is unknown. It's
276  * fully ambiguous if both are unknown, partially ambiguous if only the first
277  * is unknown. (The case of first color known, second unknown is not valid.)
278  *
279  * Wholly- or partly-blank prefixes are mostly handled the same as regular
280  * color prefixes. This allows us to generate appropriate partly-blank
281  * trigrams when the NFA requires word character(s) to appear adjacent to
282  * non-word character(s).
283  */
284 typedef int TrgmColor;
285 
286 /* We assume that colors returned by the regexp engine cannot be these: */
287 #define COLOR_UNKNOWN (-1)
288 #define COLOR_BLANK (-2)
289 
290 typedef struct
291 {
292  TrgmColor colors[2];
293 } TrgmPrefix;
294 
295 /*
296  * Color-trigram data type. Note that some elements of the trigram can be
297  * COLOR_BLANK, but we don't allow COLOR_UNKNOWN.
298  */
299 typedef struct
300 {
301  TrgmColor colors[3];
302 } ColorTrgm;
303 
304 /*
305  * Key identifying a state of our expanded graph: color prefix, and number
306  * of the corresponding state in the underlying regex NFA. The color prefix
307  * shows how we reached the regex state (to the extent that we know it).
308  */
309 typedef struct
310 {
312  int nstate;
313 } TrgmStateKey;
314 
315 /*
316  * One state of the expanded graph.
317  *
318  * stateKey - ID of this state
319  * arcs - outgoing arcs of this state (List of TrgmArc)
320  * enterKeys - enter keys reachable from this state without reading any
321  * predictable trigram (List of TrgmStateKey)
322  * flags - flag bits
323  * snumber - number of this state (initially assigned as -1, -2, etc,
324  * for debugging purposes only; then at the packaging stage,
325  * surviving states are renumbered with positive numbers)
326  * parent - parent state, if this state has been merged into another
327  * tentFlags - flags this state would acquire via planned merges
328  * tentParent - planned parent state, if considering a merge
329  */
330 #define TSTATE_INIT 0x01 /* flag indicating this state is initial */
331 #define TSTATE_FIN 0x02 /* flag indicating this state is final */
332 
333 typedef struct TrgmState
334 {
335  TrgmStateKey stateKey; /* hashtable key: must be first field */
338  int flags;
339  int snumber;
340  struct TrgmState *parent;
343 } TrgmState;
344 
345 /*
346  * One arc in the expanded graph.
347  */
348 typedef struct
349 {
350  ColorTrgm ctrgm; /* trigram needed to traverse arc */
351  TrgmState *target; /* next state */
352 } TrgmArc;
353 
354 /*
355  * Information about arc of specific color trigram (used in stage 3)
356  *
357  * Contains pointers to the source and target states.
358  */
359 typedef struct
360 {
363 } TrgmArcInfo;
364 
365 /*
366  * Information about color trigram (used in stage 3)
367  *
368  * ctrgm - trigram itself
369  * cnumber - number of this trigram (used in the packaging stage)
370  * count - number of simple trigrams created from this color trigram
371  * expanded - indicates this color trigram is expanded into simple trigrams
372  * arcs - list of all arcs labeled with this color trigram.
373  */
374 typedef struct
375 {
377  int cnumber;
378  int count;
380  bool expanded;
382 } ColorTrgmInfo;
383 
384 /*
385  * Data structure representing all the data we need during regex processing.
386  *
387  * regex - compiled regex
388  * colorInfo - extracted information about regex's colors
389  * ncolors - number of colors in colorInfo[]
390  * states - hashtable of TrgmStates (states of expanded graph)
391  * initState - pointer to initial state of expanded graph
392  * queue - queue of to-be-processed TrgmStates
393  * keysQueue - queue of to-be-processed TrgmStateKeys
394  * arcsCount - total number of arcs of expanded graph (for resource
395  * limiting)
396  * overflowed - we have exceeded resource limit for transformation
397  * colorTrgms - array of all color trigrams present in graph
398  * colorTrgmsCount - count of those color trigrams
399  * totalTrgmCount - total count of extracted simple trigrams
400  */
401 typedef struct
402 {
403  /* Source regexp, and color information extracted from it (stage 1) */
406  int ncolors;
407 
408  /* Expanded graph (stage 2) */
411  int nstates;
412 
413  /* Workspace for stage 2 */
418 
419  /* Information about distinct color trigrams in the graph (stage 3) */
423 } TrgmNFA;
424 
425 /*
426  * Final, compact representation of expanded graph.
427  */
428 typedef struct
429 {
430  int targetState; /* index of target state (zero-based) */
431  int colorTrgm; /* index of color trigram for transition */
432 } TrgmPackedArc;
433 
434 typedef struct
435 {
436  int arcsCount; /* number of out-arcs for this state */
437  TrgmPackedArc *arcs; /* array of arcsCount packed arcs */
439 
440 /* "typedef struct TrgmPackedGraph TrgmPackedGraph" appears in trgm.h */
442 {
443  /*
444  * colorTrigramsCount and colorTrigramGroups contain information about how
445  * trigrams are grouped into color trigrams. "colorTrigramsCount" is the
446  * count of color trigrams and "colorTrigramGroups" contains number of
447  * simple trigrams for each color trigram. The array of simple trigrams
448  * (stored separately from this struct) is ordered so that the simple
449  * trigrams for each color trigram are consecutive, and they're in order
450  * by color trigram number.
451  */
453  int *colorTrigramGroups; /* array of size colorTrigramsCount */
454 
455  /*
456  * The states of the simplified NFA. State number 0 is always initial
457  * state and state number 1 is always final state.
458  */
460  TrgmPackedState *states; /* array of size statesCount */
461 
462  /* Temporary work space for trigramsMatchGraph() */
463  bool *colorTrigramsActive; /* array of size colorTrigramsCount */
464  bool *statesActive; /* array of size statesCount */
465  int *statesQueue; /* array of size statesCount */
466 };
467 
468 /*
469  * Temporary structure for representing an arc during packaging.
470  */
471 typedef struct
472 {
477 
478 
479 /* prototypes for private functions */
480 static TRGM *createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
481  MemoryContext rcontext);
482 static void RE_compile(regex_t *regex, text *text_re,
483  int cflags, Oid collation);
484 static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA);
485 static bool convertPgWchar(pg_wchar c, trgm_mb_char *result);
486 static void transformGraph(TrgmNFA *trgmNFA);
487 static void processState(TrgmNFA *trgmNFA, TrgmState *state);
488 static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key);
489 static void addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key);
490 static void addArcs(TrgmNFA *trgmNFA, TrgmState *state);
491 static void addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key,
492  TrgmColor co, TrgmStateKey *destKey);
493 static bool validArcLabel(TrgmStateKey *key, TrgmColor co);
494 static TrgmState *getState(TrgmNFA *trgmNFA, TrgmStateKey *key);
495 static bool prefixContains(TrgmPrefix *prefix1, TrgmPrefix *prefix2);
496 static bool selectColorTrigrams(TrgmNFA *trgmNFA);
497 static TRGM *expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext);
498 static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3]);
499 static void mergeStates(TrgmState *state1, TrgmState *state2);
500 static int colorTrgmInfoCmp(const void *p1, const void *p2);
501 static int colorTrgmInfoPenaltyCmp(const void *p1, const void *p2);
502 static TrgmPackedGraph *packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext);
503 static int packArcInfoCmp(const void *a1, const void *a2);
504 
505 #ifdef TRGM_REGEXP_DEBUG
506 static void printSourceNFA(regex_t *regex, TrgmColorInfo *colors, int ncolors);
507 static void printTrgmNFA(TrgmNFA *trgmNFA);
508 static void printTrgmColor(StringInfo buf, TrgmColor co);
509 static void printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams);
510 #endif
511 
512 
513 /*
514  * Main entry point to process a regular expression.
515  *
516  * Returns an array of trigrams required by the regular expression, or NULL if
517  * the regular expression was too complex to analyze. In addition, a packed
518  * graph representation of the regex is returned into *graph. The results
519  * must be allocated in rcontext (which might or might not be the current
520  * context).
521  */
522 TRGM *
523 createTrgmNFA(text *text_re, Oid collation,
524  TrgmPackedGraph **graph, MemoryContext rcontext)
525 {
526  TRGM *trg;
527  regex_t regex;
528  MemoryContext tmpcontext;
529  MemoryContext oldcontext;
530 
531  /*
532  * This processing generates a great deal of cruft, which we'd like to
533  * clean up before returning (since this function may be called in a
534  * query-lifespan memory context). Make a temp context we can work in so
535  * that cleanup is easy.
536  */
538  "createTrgmNFA temporary context",
540  oldcontext = MemoryContextSwitchTo(tmpcontext);
541 
542  /*
543  * Stage 1: Compile the regexp into a NFA, using the regexp library.
544  */
545 #ifdef IGNORECASE
546  RE_compile(&regex, text_re, REG_ADVANCED | REG_ICASE, collation);
547 #else
548  RE_compile(&regex, text_re, REG_ADVANCED, collation);
549 #endif
550 
551  /*
552  * Since the regexp library allocates its internal data structures with
553  * malloc, we need to use a PG_TRY block to ensure that pg_regfree() gets
554  * done even if there's an error.
555  */
556  PG_TRY();
557  {
558  trg = createTrgmNFAInternal(&regex, graph, rcontext);
559  }
560  PG_CATCH();
561  {
562  pg_regfree(&regex);
563  PG_RE_THROW();
564  }
565  PG_END_TRY();
566 
567  pg_regfree(&regex);
568 
569  /* Clean up all the cruft we created */
570  MemoryContextSwitchTo(oldcontext);
571  MemoryContextDelete(tmpcontext);
572 
573  return trg;
574 }
575 
576 /*
577  * Body of createTrgmNFA, exclusive of regex compilation/freeing.
578  */
579 static TRGM *
581  MemoryContext rcontext)
582 {
583  TRGM *trg;
584  TrgmNFA trgmNFA;
585 
586  trgmNFA.regex = regex;
587 
588  /* Collect color information from the regex */
589  getColorInfo(regex, &trgmNFA);
590 
591 #ifdef TRGM_REGEXP_DEBUG
592  printSourceNFA(regex, trgmNFA.colorInfo, trgmNFA.ncolors);
593 #endif
594 
595  /*
596  * Stage 2: Create an expanded graph from the source NFA.
597  */
598  transformGraph(&trgmNFA);
599 
600 #ifdef TRGM_REGEXP_DEBUG
601  printTrgmNFA(&trgmNFA);
602 #endif
603 
604  /*
605  * Fail if we were unable to make a nontrivial graph, ie it is possible to
606  * get from the initial state to the final state without reading any
607  * predictable trigram.
608  */
609  if (trgmNFA.initState->flags & TSTATE_FIN)
610  return NULL;
611 
612  /*
613  * Stage 3: Select color trigrams to expand. Fail if too many trigrams.
614  */
615  if (!selectColorTrigrams(&trgmNFA))
616  return NULL;
617 
618  /*
619  * Stage 4: Expand color trigrams and pack graph into final
620  * representation.
621  */
622  trg = expandColorTrigrams(&trgmNFA, rcontext);
623 
624  *graph = packGraph(&trgmNFA, rcontext);
625 
626 #ifdef TRGM_REGEXP_DEBUG
627  printTrgmPackedGraph(*graph, trg);
628 #endif
629 
630  return trg;
631 }
632 
633 /*
634  * Main entry point for evaluating a graph during index scanning.
635  *
636  * The check[] array is indexed by trigram number (in the array of simple
637  * trigrams returned by createTrgmNFA), and holds true for those trigrams
638  * that are present in the index entry being checked.
639  */
640 bool
642 {
643  int i,
644  j,
645  k,
646  queueIn,
647  queueOut;
648 
649  /*
650  * Reset temporary working areas.
651  */
652  memset(graph->colorTrigramsActive, 0,
653  sizeof(bool) * graph->colorTrigramsCount);
654  memset(graph->statesActive, 0, sizeof(bool) * graph->statesCount);
655 
656  /*
657  * Check which color trigrams were matched. A match for any simple
658  * trigram associated with a color trigram counts as a match of the color
659  * trigram.
660  */
661  j = 0;
662  for (i = 0; i < graph->colorTrigramsCount; i++)
663  {
664  int cnt = graph->colorTrigramGroups[i];
665 
666  for (k = j; k < j + cnt; k++)
667  {
668  if (check[k])
669  {
670  /*
671  * Found one matched trigram in the group. Can skip the rest
672  * of them and go to the next group.
673  */
674  graph->colorTrigramsActive[i] = true;
675  break;
676  }
677  }
678  j = j + cnt;
679  }
680 
681  /*
682  * Initialize the statesQueue to hold just the initial state. Note:
683  * statesQueue has room for statesCount entries, which is certainly enough
684  * since no state will be put in the queue more than once. The
685  * statesActive array marks which states have been queued.
686  */
687  graph->statesActive[0] = true;
688  graph->statesQueue[0] = 0;
689  queueIn = 0;
690  queueOut = 1;
691 
692  /* Process queued states as long as there are any. */
693  while (queueIn < queueOut)
694  {
695  int stateno = graph->statesQueue[queueIn++];
696  TrgmPackedState *state = &graph->states[stateno];
697  int cnt = state->arcsCount;
698 
699  /* Loop over state's out-arcs */
700  for (i = 0; i < cnt; i++)
701  {
702  TrgmPackedArc *arc = &state->arcs[i];
703 
704  /*
705  * If corresponding color trigram is present then activate the
706  * corresponding state. We're done if that's the final state,
707  * otherwise queue the state if it's not been queued already.
708  */
709  if (graph->colorTrigramsActive[arc->colorTrgm])
710  {
711  int nextstate = arc->targetState;
712 
713  if (nextstate == 1)
714  return true; /* success: final state is reachable */
715 
716  if (!graph->statesActive[nextstate])
717  {
718  graph->statesActive[nextstate] = true;
719  graph->statesQueue[queueOut++] = nextstate;
720  }
721  }
722  }
723  }
724 
725  /* Queue is empty, so match fails. */
726  return false;
727 }
728 
729 /*
730  * Compile regex string into struct at *regex.
731  * NB: pg_regfree must be applied to regex if this completes successfully.
732  */
733 static void
734 RE_compile(regex_t *regex, text *text_re, int cflags, Oid collation)
735 {
736  int text_re_len = VARSIZE_ANY_EXHDR(text_re);
737  char *text_re_val = VARDATA_ANY(text_re);
738  pg_wchar *pattern;
739  int pattern_len;
740  int regcomp_result;
741  char errMsg[100];
742 
743  /* Convert pattern string to wide characters */
744  pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar));
745  pattern_len = pg_mb2wchar_with_len(text_re_val,
746  pattern,
747  text_re_len);
748 
749  /* Compile regex */
750  regcomp_result = pg_regcomp(regex,
751  pattern,
752  pattern_len,
753  cflags,
754  collation);
755 
756  pfree(pattern);
757 
758  if (regcomp_result != REG_OKAY)
759  {
760  /* re didn't compile (no need for pg_regfree, if so) */
761  pg_regerror(regcomp_result, regex, errMsg, sizeof(errMsg));
762  ereport(ERROR,
763  (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
764  errmsg("invalid regular expression: %s", errMsg)));
765  }
766 }
767 
768 
769 /*---------------------
770  * Subroutines for pre-processing the color map (stage 1).
771  *---------------------
772  */
773 
774 /*
775  * Fill TrgmColorInfo structure for each color using regex export functions.
776  */
777 static void
778 getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
779 {
780  int colorsCount = pg_reg_getnumcolors(regex);
781  int i;
782 
783  trgmNFA->ncolors = colorsCount;
784  trgmNFA->colorInfo = (TrgmColorInfo *)
785  palloc0(colorsCount * sizeof(TrgmColorInfo));
786 
787  /*
788  * Loop over colors, filling TrgmColorInfo about each.
789  */
790  for (i = 0; i < colorsCount; i++)
791  {
792  TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[i];
793  int charsCount = pg_reg_getnumcharacters(regex, i);
794  pg_wchar *chars;
795  int j;
796 
797  if (charsCount < 0 || charsCount > COLOR_COUNT_LIMIT)
798  {
799  /* Non expandable, or too large to work with */
800  colorInfo->expandable = false;
801  continue;
802  }
803 
804  colorInfo->expandable = true;
805  colorInfo->containsNonWord = false;
806  colorInfo->wordChars = (trgm_mb_char *)
807  palloc(sizeof(trgm_mb_char) * charsCount);
808  colorInfo->wordCharsCount = 0;
809 
810  /* Extract all the chars in this color */
811  chars = (pg_wchar *) palloc(sizeof(pg_wchar) * charsCount);
812  pg_reg_getcharacters(regex, i, chars, charsCount);
813 
814  /*
815  * Convert characters back to multibyte form, and save only those that
816  * are word characters. Set "containsNonWord" if any non-word
817  * character. (Note: it'd probably be nicer to keep the chars in
818  * pg_wchar format for now, but ISWORDCHR wants to see multibyte.)
819  */
820  for (j = 0; j < charsCount; j++)
821  {
822  trgm_mb_char c;
823 
824  if (!convertPgWchar(chars[j], &c))
825  continue; /* ok to ignore it altogether */
826  if (ISWORDCHR(c.bytes))
827  colorInfo->wordChars[colorInfo->wordCharsCount++] = c;
828  else
829  colorInfo->containsNonWord = true;
830  }
831 
832  pfree(chars);
833  }
834 }
835 
836 /*
837  * Convert pg_wchar to multibyte format.
838  * Returns false if the character should be ignored completely.
839  */
840 static bool
842 {
843  /* "s" has enough space for a multibyte character and a trailing NUL */
844  char s[MAX_MULTIBYTE_CHAR_LEN + 1];
845 
846  /*
847  * We can ignore the NUL character, since it can never appear in a PG text
848  * string. This avoids the need for various special cases when
849  * reconstructing trigrams.
850  */
851  if (c == 0)
852  return false;
853 
854  /* Do the conversion, making sure the result is NUL-terminated */
855  memset(s, 0, sizeof(s));
856  pg_wchar2mb_with_len(&c, s, 1);
857 
858  /*
859  * In IGNORECASE mode, we can ignore uppercase characters. We assume that
860  * the regex engine generated both uppercase and lowercase equivalents
861  * within each color, since we used the REG_ICASE option; so there's no
862  * need to process the uppercase version.
863  *
864  * XXX this code is dependent on the assumption that lowerstr() works the
865  * same as the regex engine's internal case folding machinery. Might be
866  * wiser to expose pg_wc_tolower and test whether c == pg_wc_tolower(c).
867  * On the other hand, the trigrams in the index were created using
868  * lowerstr(), so we're probably screwed if there's any incompatibility
869  * anyway.
870  */
871 #ifdef IGNORECASE
872  {
873  char *lowerCased = lowerstr(s);
874 
875  if (strcmp(lowerCased, s) != 0)
876  {
877  pfree(lowerCased);
878  return false;
879  }
880  pfree(lowerCased);
881  }
882 #endif
883 
884  /* Fill result with exactly MAX_MULTIBYTE_CHAR_LEN bytes */
885  memcpy(result->bytes, s, MAX_MULTIBYTE_CHAR_LEN);
886  return true;
887 }
888 
889 
890 /*---------------------
891  * Subroutines for expanding original NFA graph into a trigram graph (stage 2).
892  *---------------------
893  */
894 
895 /*
896  * Transform the graph, given a regex and extracted color information.
897  *
898  * We create and process a queue of expanded-graph states until all the states
899  * are processed.
900  *
901  * This algorithm may be stopped due to resource limitation. In this case we
902  * force every unprocessed branch to immediately finish with matching (this
903  * can give us false positives but no false negatives) by marking all
904  * unprocessed states as final.
905  */
906 static void
908 {
909  HASHCTL hashCtl;
910  TrgmStateKey initkey;
911  TrgmState *initstate;
912 
913  /* Initialize this stage's workspace in trgmNFA struct */
914  trgmNFA->queue = NIL;
915  trgmNFA->keysQueue = NIL;
916  trgmNFA->arcsCount = 0;
917  trgmNFA->overflowed = false;
918 
919  /* Create hashtable for states */
920  hashCtl.keysize = sizeof(TrgmStateKey);
921  hashCtl.entrysize = sizeof(TrgmState);
922  hashCtl.hcxt = CurrentMemoryContext;
923  trgmNFA->states = hash_create("Trigram NFA",
924  1024,
925  &hashCtl,
927  trgmNFA->nstates = 0;
928 
929  /* Create initial state: ambiguous prefix, NFA's initial state */
930  MemSet(&initkey, 0, sizeof(initkey));
931  initkey.prefix.colors[0] = COLOR_UNKNOWN;
932  initkey.prefix.colors[1] = COLOR_UNKNOWN;
933  initkey.nstate = pg_reg_getinitialstate(trgmNFA->regex);
934 
935  initstate = getState(trgmNFA, &initkey);
936  initstate->flags |= TSTATE_INIT;
937  trgmNFA->initState = initstate;
938 
939  /*
940  * Recursively build the expanded graph by processing queue of states
941  * (breadth-first search). getState already put initstate in the queue.
942  */
943  while (trgmNFA->queue != NIL)
944  {
945  TrgmState *state = (TrgmState *) linitial(trgmNFA->queue);
946 
947  trgmNFA->queue = list_delete_first(trgmNFA->queue);
948 
949  /*
950  * If we overflowed then just mark state as final. Otherwise do
951  * actual processing.
952  */
953  if (trgmNFA->overflowed)
954  state->flags |= TSTATE_FIN;
955  else
956  processState(trgmNFA, state);
957 
958  /* Did we overflow? */
959  if (trgmNFA->arcsCount > MAX_EXPANDED_ARCS ||
961  trgmNFA->overflowed = true;
962  }
963 }
964 
965 /*
966  * Process one state: add enter keys and then add outgoing arcs.
967  */
968 static void
970 {
971  /* keysQueue should be NIL already, but make sure */
972  trgmNFA->keysQueue = NIL;
973 
974  /*
975  * Add state's own key, and then process all keys added to keysQueue until
976  * queue is empty. But we can quit if the state gets marked final.
977  */
978  addKey(trgmNFA, state, &state->stateKey);
979  while (trgmNFA->keysQueue != NIL && !(state->flags & TSTATE_FIN))
980  {
982 
983  trgmNFA->keysQueue = list_delete_first(trgmNFA->keysQueue);
984  addKey(trgmNFA, state, key);
985  }
986 
987  /*
988  * Add outgoing arcs only if state isn't final (we have no interest in
989  * outgoing arcs if we already match)
990  */
991  if (!(state->flags & TSTATE_FIN))
992  addArcs(trgmNFA, state);
993 }
994 
995 /*
996  * Add the given enter key into the state's enterKeys list, and determine
997  * whether this should result in any further enter keys being added.
998  * If so, add those keys to keysQueue so that processState will handle them.
999  *
1000  * If the enter key is for the NFA's final state, mark state as TSTATE_FIN.
1001  * This situation means that we can reach the final state from this expanded
1002  * state without reading any predictable trigram, so we must consider this
1003  * state as an accepting one.
1004  *
1005  * The given key could be a duplicate of one already in enterKeys, or be
1006  * redundant with some enterKeys. So we check that before doing anything.
1007  *
1008  * Note that we don't generate any actual arcs here. addArcs will do that
1009  * later, after we have identified all the enter keys for this state.
1010  */
1011 static void
1013 {
1014  regex_arc_t *arcs;
1015  TrgmStateKey destKey;
1016  ListCell *cell;
1017  int i,
1018  arcsCount;
1019 
1020  /*
1021  * Ensure any pad bytes in destKey are zero, since it may get used as a
1022  * hashtable key by getState.
1023  */
1024  MemSet(&destKey, 0, sizeof(destKey));
1025 
1026  /*
1027  * Compare key to each existing enter key of the state to check for
1028  * redundancy. We can drop either old key(s) or the new key if we find
1029  * redundancy.
1030  */
1031  foreach(cell, state->enterKeys)
1032  {
1033  TrgmStateKey *existingKey = (TrgmStateKey *) lfirst(cell);
1034 
1035  if (existingKey->nstate == key->nstate)
1036  {
1037  if (prefixContains(&existingKey->prefix, &key->prefix))
1038  {
1039  /* This old key already covers the new key. Nothing to do */
1040  return;
1041  }
1042  if (prefixContains(&key->prefix, &existingKey->prefix))
1043  {
1044  /*
1045  * The new key covers this old key. Remove the old key, it's
1046  * no longer needed once we add this key to the list.
1047  */
1048  state->enterKeys = foreach_delete_current(state->enterKeys,
1049  cell);
1050  }
1051  }
1052  }
1053 
1054  /* No redundancy, so add this key to the state's list */
1055  state->enterKeys = lappend(state->enterKeys, key);
1056 
1057  /* If state is now known final, mark it and we're done */
1058  if (key->nstate == pg_reg_getfinalstate(trgmNFA->regex))
1059  {
1060  state->flags |= TSTATE_FIN;
1061  return;
1062  }
1063 
1064  /*
1065  * Loop through all outgoing arcs of the corresponding state in the
1066  * original NFA.
1067  */
1068  arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate);
1069  arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount);
1070  pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount);
1071 
1072  for (i = 0; i < arcsCount; i++)
1073  {
1074  regex_arc_t *arc = &arcs[i];
1075 
1076  if (pg_reg_colorisbegin(trgmNFA->regex, arc->co))
1077  {
1078  /*
1079  * Start of line/string (^). Trigram extraction treats start of
1080  * line same as start of word: double space prefix is added.
1081  * Hence, make an enter key showing we can reach the arc
1082  * destination with all-blank prefix.
1083  */
1084  destKey.prefix.colors[0] = COLOR_BLANK;
1085  destKey.prefix.colors[1] = COLOR_BLANK;
1086  destKey.nstate = arc->to;
1087 
1088  /* Add enter key to this state */
1089  addKeyToQueue(trgmNFA, &destKey);
1090  }
1091  else if (pg_reg_colorisend(trgmNFA->regex, arc->co))
1092  {
1093  /*
1094  * End of line/string ($). We must consider this arc as a
1095  * transition that doesn't read anything. The reason for adding
1096  * this enter key to the state is that if the arc leads to the
1097  * NFA's final state, we must mark this expanded state as final.
1098  */
1099  destKey.prefix.colors[0] = COLOR_UNKNOWN;
1100  destKey.prefix.colors[1] = COLOR_UNKNOWN;
1101  destKey.nstate = arc->to;
1102 
1103  /* Add enter key to this state */
1104  addKeyToQueue(trgmNFA, &destKey);
1105  }
1106  else
1107  {
1108  /* Regular color */
1109  TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[arc->co];
1110 
1111  if (colorInfo->expandable)
1112  {
1113  if (colorInfo->containsNonWord &&
1114  !validArcLabel(key, COLOR_BLANK))
1115  {
1116  /*
1117  * We can reach the arc destination after reading a
1118  * non-word character, but the prefix is not something
1119  * that addArc will accept with COLOR_BLANK, so no trigram
1120  * arc can get made for this transition. We must make an
1121  * enter key to show that the arc destination is
1122  * reachable. Set it up with an all-blank prefix, since
1123  * that corresponds to what the trigram extraction code
1124  * will do at a word starting boundary.
1125  */
1126  destKey.prefix.colors[0] = COLOR_BLANK;
1127  destKey.prefix.colors[1] = COLOR_BLANK;
1128  destKey.nstate = arc->to;
1129  addKeyToQueue(trgmNFA, &destKey);
1130  }
1131 
1132  if (colorInfo->wordCharsCount > 0 &&
1133  !validArcLabel(key, arc->co))
1134  {
1135  /*
1136  * We can reach the arc destination after reading a word
1137  * character, but the prefix is not something that addArc
1138  * will accept, so no trigram arc can get made for this
1139  * transition. We must make an enter key to show that the
1140  * arc destination is reachable. The prefix for the enter
1141  * key should reflect the info we have for this arc.
1142  */
1143  destKey.prefix.colors[0] = key->prefix.colors[1];
1144  destKey.prefix.colors[1] = arc->co;
1145  destKey.nstate = arc->to;
1146  addKeyToQueue(trgmNFA, &destKey);
1147  }
1148  }
1149  else
1150  {
1151  /*
1152  * Unexpandable color. Add enter key with ambiguous prefix,
1153  * showing we can reach the destination from this state, but
1154  * the preceding colors will be uncertain. (We do not set the
1155  * first prefix color to key->prefix.colors[1], because a
1156  * prefix of known followed by unknown is invalid.)
1157  */
1158  destKey.prefix.colors[0] = COLOR_UNKNOWN;
1159  destKey.prefix.colors[1] = COLOR_UNKNOWN;
1160  destKey.nstate = arc->to;
1161  addKeyToQueue(trgmNFA, &destKey);
1162  }
1163  }
1164  }
1165 
1166  pfree(arcs);
1167 }
1168 
1169 /*
1170  * Add copy of given key to keysQueue for later processing.
1171  */
1172 static void
1174 {
1175  TrgmStateKey *keyCopy = (TrgmStateKey *) palloc(sizeof(TrgmStateKey));
1176 
1177  memcpy(keyCopy, key, sizeof(TrgmStateKey));
1178  trgmNFA->keysQueue = lappend(trgmNFA->keysQueue, keyCopy);
1179 }
1180 
1181 /*
1182  * Add outgoing arcs from given state, whose enter keys are all now known.
1183  */
1184 static void
1186 {
1187  TrgmStateKey destKey;
1188  ListCell *cell;
1189  regex_arc_t *arcs;
1190  int arcsCount,
1191  i;
1192 
1193  /*
1194  * Ensure any pad bytes in destKey are zero, since it may get used as a
1195  * hashtable key by getState.
1196  */
1197  MemSet(&destKey, 0, sizeof(destKey));
1198 
1199  /*
1200  * Iterate over enter keys associated with this expanded-graph state. This
1201  * includes both the state's own stateKey, and any enter keys we added to
1202  * it during addKey (which represent expanded-graph states that are not
1203  * distinguishable from this one by means of trigrams). For each such
1204  * enter key, examine all the out-arcs of the key's underlying NFA state,
1205  * and try to make a trigram arc leading to where the out-arc leads.
1206  * (addArc will deal with whether the arc is valid or not.)
1207  */
1208  foreach(cell, state->enterKeys)
1209  {
1210  TrgmStateKey *key = (TrgmStateKey *) lfirst(cell);
1211 
1212  arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate);
1213  arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount);
1214  pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount);
1215 
1216  for (i = 0; i < arcsCount; i++)
1217  {
1218  regex_arc_t *arc = &arcs[i];
1219  TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[arc->co];
1220 
1221  /*
1222  * Ignore non-expandable colors; addKey already handled the case.
1223  *
1224  * We need no special check for begin/end pseudocolors here. We
1225  * don't need to do any processing for them, and they will be
1226  * marked non-expandable since the regex engine will have reported
1227  * them that way.
1228  */
1229  if (!colorInfo->expandable)
1230  continue;
1231 
1232  if (colorInfo->containsNonWord)
1233  {
1234  /*
1235  * Color includes non-word character(s).
1236  *
1237  * Generate an arc, treating this transition as occurring on
1238  * BLANK. This allows word-ending trigrams to be manufactured
1239  * if possible.
1240  */
1241  destKey.prefix.colors[0] = key->prefix.colors[1];
1242  destKey.prefix.colors[1] = COLOR_BLANK;
1243  destKey.nstate = arc->to;
1244 
1245  addArc(trgmNFA, state, key, COLOR_BLANK, &destKey);
1246  }
1247 
1248  if (colorInfo->wordCharsCount > 0)
1249  {
1250  /*
1251  * Color includes word character(s).
1252  *
1253  * Generate an arc. Color is pushed into prefix of target
1254  * state.
1255  */
1256  destKey.prefix.colors[0] = key->prefix.colors[1];
1257  destKey.prefix.colors[1] = arc->co;
1258  destKey.nstate = arc->to;
1259 
1260  addArc(trgmNFA, state, key, arc->co, &destKey);
1261  }
1262  }
1263 
1264  pfree(arcs);
1265  }
1266 }
1267 
1268 /*
1269  * Generate an out-arc of the expanded graph, if it's valid and not redundant.
1270  *
1271  * state: expanded-graph state we want to add an out-arc to
1272  * key: provides prefix colors (key->nstate is not used)
1273  * co: transition color
1274  * destKey: identifier for destination state of expanded graph
1275  */
1276 static void
1278  TrgmColor co, TrgmStateKey *destKey)
1279 {
1280  TrgmArc *arc;
1281  ListCell *cell;
1282 
1283  /* Do nothing if this wouldn't be a valid arc label trigram */
1284  if (!validArcLabel(key, co))
1285  return;
1286 
1287  /*
1288  * Check if we are going to reach key which is covered by a key which is
1289  * already listed in this state. If so arc is useless: the NFA can bypass
1290  * it through a path that doesn't require any predictable trigram, so
1291  * whether the arc's trigram is present or not doesn't really matter.
1292  */
1293  foreach(cell, state->enterKeys)
1294  {
1295  TrgmStateKey *existingKey = (TrgmStateKey *) lfirst(cell);
1296 
1297  if (existingKey->nstate == destKey->nstate &&
1298  prefixContains(&existingKey->prefix, &destKey->prefix))
1299  return;
1300  }
1301 
1302  /* Checks were successful, add new arc */
1303  arc = (TrgmArc *) palloc(sizeof(TrgmArc));
1304  arc->target = getState(trgmNFA, destKey);
1305  arc->ctrgm.colors[0] = key->prefix.colors[0];
1306  arc->ctrgm.colors[1] = key->prefix.colors[1];
1307  arc->ctrgm.colors[2] = co;
1308 
1309  state->arcs = lappend(state->arcs, arc);
1310  trgmNFA->arcsCount++;
1311 }
1312 
1313 /*
1314  * Can we make a valid trigram arc label from the given prefix and arc color?
1315  *
1316  * This is split out so that tests in addKey and addArc will stay in sync.
1317  */
1318 static bool
1320 {
1321  /*
1322  * We have to know full trigram in order to add outgoing arc. So we can't
1323  * do it if prefix is ambiguous.
1324  */
1325  if (key->prefix.colors[0] == COLOR_UNKNOWN)
1326  return false;
1327 
1328  /* If key->prefix.colors[0] isn't unknown, its second color isn't either */
1329  Assert(key->prefix.colors[1] != COLOR_UNKNOWN);
1330  /* And we should not be called with an unknown arc color anytime */
1331  Assert(co != COLOR_UNKNOWN);
1332 
1333  /*
1334  * We don't bother with making arcs representing three non-word
1335  * characters, since that's useless for trigram extraction.
1336  */
1337  if (key->prefix.colors[0] == COLOR_BLANK &&
1338  key->prefix.colors[1] == COLOR_BLANK &&
1339  co == COLOR_BLANK)
1340  return false;
1341 
1342  /*
1343  * We also reject nonblank-blank-anything. The nonblank-blank-nonblank
1344  * case doesn't correspond to any trigram the trigram extraction code
1345  * would make. The nonblank-blank-blank case is also not possible with
1346  * RPADDING = 1. (Note that in many cases we'd fail to generate such a
1347  * trigram even if it were valid, for example processing "foo bar" will
1348  * not result in considering the trigram "o ". So if you want to support
1349  * RPADDING = 2, there's more to do than just twiddle this test.)
1350  */
1351  if (key->prefix.colors[0] != COLOR_BLANK &&
1352  key->prefix.colors[1] == COLOR_BLANK)
1353  return false;
1354 
1355  /*
1356  * Other combinations involving blank are valid, in particular we assume
1357  * blank-blank-nonblank is valid, which presumes that LPADDING is 2.
1358  *
1359  * Note: Using again the example "foo bar", we will not consider the
1360  * trigram " b", though this trigram would be found by the trigram
1361  * extraction code. Since we will find " ba", it doesn't seem worth
1362  * trying to hack the algorithm to generate the additional trigram.
1363  */
1364 
1365  /* arc label is valid */
1366  return true;
1367 }
1368 
1369 /*
1370  * Get state of expanded graph for given state key,
1371  * and queue the state for processing if it didn't already exist.
1372  */
1373 static TrgmState *
1375 {
1376  TrgmState *state;
1377  bool found;
1378 
1379  state = (TrgmState *) hash_search(trgmNFA->states, key, HASH_ENTER,
1380  &found);
1381  if (!found)
1382  {
1383  /* New state: initialize and queue it */
1384  state->arcs = NIL;
1385  state->enterKeys = NIL;
1386  state->flags = 0;
1387  /* states are initially given negative numbers */
1388  state->snumber = -(++trgmNFA->nstates);
1389  state->parent = NULL;
1390  state->tentFlags = 0;
1391  state->tentParent = NULL;
1392 
1393  trgmNFA->queue = lappend(trgmNFA->queue, state);
1394  }
1395  return state;
1396 }
1397 
1398 /*
1399  * Check if prefix1 "contains" prefix2.
1400  *
1401  * "contains" means that any exact prefix (with no ambiguity) that satisfies
1402  * prefix2 also satisfies prefix1.
1403  */
1404 static bool
1406 {
1407  if (prefix1->colors[1] == COLOR_UNKNOWN)
1408  {
1409  /* Fully ambiguous prefix contains everything */
1410  return true;
1411  }
1412  else if (prefix1->colors[0] == COLOR_UNKNOWN)
1413  {
1414  /*
1415  * Prefix with only first unknown color contains every prefix with
1416  * same second color.
1417  */
1418  if (prefix1->colors[1] == prefix2->colors[1])
1419  return true;
1420  else
1421  return false;
1422  }
1423  else
1424  {
1425  /* Exact prefix contains only the exact same prefix */
1426  if (prefix1->colors[0] == prefix2->colors[0] &&
1427  prefix1->colors[1] == prefix2->colors[1])
1428  return true;
1429  else
1430  return false;
1431  }
1432 }
1433 
1434 
1435 /*---------------------
1436  * Subroutines for expanding color trigrams into regular trigrams (stage 3).
1437  *---------------------
1438  */
1439 
1440 /*
1441  * Get vector of all color trigrams in graph and select which of them
1442  * to expand into simple trigrams.
1443  *
1444  * Returns true if OK, false if exhausted resource limits.
1445  */
1446 static bool
1448 {
1449  HASH_SEQ_STATUS scan_status;
1450  int arcsCount = trgmNFA->arcsCount,
1451  i;
1452  TrgmState *state;
1453  ColorTrgmInfo *colorTrgms;
1454  int64 totalTrgmCount;
1455  float4 totalTrgmPenalty;
1456  int cnumber;
1457 
1458  /* Collect color trigrams from all arcs */
1459  colorTrgms = (ColorTrgmInfo *) palloc0(sizeof(ColorTrgmInfo) * arcsCount);
1460  trgmNFA->colorTrgms = colorTrgms;
1461 
1462  i = 0;
1463  hash_seq_init(&scan_status, trgmNFA->states);
1464  while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
1465  {
1466  ListCell *cell;
1467 
1468  foreach(cell, state->arcs)
1469  {
1470  TrgmArc *arc = (TrgmArc *) lfirst(cell);
1471  TrgmArcInfo *arcInfo = (TrgmArcInfo *) palloc(sizeof(TrgmArcInfo));
1472  ColorTrgmInfo *trgmInfo = &colorTrgms[i];
1473 
1474  arcInfo->source = state;
1475  arcInfo->target = arc->target;
1476  trgmInfo->ctrgm = arc->ctrgm;
1477  trgmInfo->cnumber = -1;
1478  /* count and penalty will be set below */
1479  trgmInfo->expanded = true;
1480  trgmInfo->arcs = list_make1(arcInfo);
1481  i++;
1482  }
1483  }
1484  Assert(i == arcsCount);
1485 
1486  /* Remove duplicates, merging their arcs lists */
1487  if (arcsCount >= 2)
1488  {
1489  ColorTrgmInfo *p1,
1490  *p2;
1491 
1492  /* Sort trigrams to ease duplicate detection */
1493  qsort(colorTrgms, arcsCount, sizeof(ColorTrgmInfo), colorTrgmInfoCmp);
1494 
1495  /* p1 is probe point, p2 is last known non-duplicate. */
1496  p2 = colorTrgms;
1497  for (p1 = colorTrgms + 1; p1 < colorTrgms + arcsCount; p1++)
1498  {
1499  if (colorTrgmInfoCmp(p1, p2) > 0)
1500  {
1501  p2++;
1502  *p2 = *p1;
1503  }
1504  else
1505  {
1506  p2->arcs = list_concat(p2->arcs, p1->arcs);
1507  }
1508  }
1509  trgmNFA->colorTrgmsCount = (p2 - colorTrgms) + 1;
1510  }
1511  else
1512  {
1513  trgmNFA->colorTrgmsCount = arcsCount;
1514  }
1515 
1516  /*
1517  * Count number of simple trigrams generated by each color trigram, and
1518  * also compute a penalty value, which is the number of simple trigrams
1519  * times a multiplier that depends on its whitespace content.
1520  *
1521  * Note: per-color-trigram counts cannot overflow an int so long as
1522  * COLOR_COUNT_LIMIT is not more than the cube root of INT_MAX, ie about
1523  * 1290. However, the grand total totalTrgmCount might conceivably
1524  * overflow an int, so we use int64 for that within this routine. Also,
1525  * penalties are calculated in float4 arithmetic to avoid any overflow
1526  * worries.
1527  */
1528  totalTrgmCount = 0;
1529  totalTrgmPenalty = 0.0f;
1530  for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1531  {
1532  ColorTrgmInfo *trgmInfo = &colorTrgms[i];
1533  int j,
1534  count = 1,
1535  typeIndex = 0;
1536 
1537  for (j = 0; j < 3; j++)
1538  {
1539  TrgmColor c = trgmInfo->ctrgm.colors[j];
1540 
1541  typeIndex *= 2;
1542  if (c == COLOR_BLANK)
1543  typeIndex++;
1544  else
1545  count *= trgmNFA->colorInfo[c].wordCharsCount;
1546  }
1547  trgmInfo->count = count;
1548  totalTrgmCount += count;
1549  trgmInfo->penalty = penalties[typeIndex] * (float4) count;
1550  totalTrgmPenalty += trgmInfo->penalty;
1551  }
1552 
1553  /* Sort color trigrams in descending order of their penalties */
1554  qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
1556 
1557  /*
1558  * Remove color trigrams from the graph so long as total penalty of color
1559  * trigrams exceeds WISH_TRGM_PENALTY. (If we fail to get down to
1560  * WISH_TRGM_PENALTY, it's OK so long as total count is no more than
1561  * MAX_TRGM_COUNT.) We prefer to remove color trigrams with higher
1562  * penalty, since those are the most promising for reducing the total
1563  * penalty. When removing a color trigram we have to merge states
1564  * connected by arcs labeled with that trigram. It's necessary to not
1565  * merge initial and final states, because our graph becomes useless if
1566  * that happens; so we cannot always remove the trigram we'd prefer to.
1567  */
1568  for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1569  {
1570  ColorTrgmInfo *trgmInfo = &colorTrgms[i];
1571  bool canRemove = true;
1572  ListCell *cell;
1573 
1574  /* Done if we've reached the target */
1575  if (totalTrgmPenalty <= WISH_TRGM_PENALTY)
1576  break;
1577 
1578 #ifdef TRGM_REGEXP_DEBUG
1579  fprintf(stderr, "considering ctrgm %d %d %d, penalty %f, %d arcs\n",
1580  trgmInfo->ctrgm.colors[0],
1581  trgmInfo->ctrgm.colors[1],
1582  trgmInfo->ctrgm.colors[2],
1583  trgmInfo->penalty,
1584  list_length(trgmInfo->arcs));
1585 #endif
1586 
1587  /*
1588  * Does any arc of this color trigram connect initial and final
1589  * states? If so we can't remove it.
1590  */
1591  foreach(cell, trgmInfo->arcs)
1592  {
1593  TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
1594  TrgmState *source = arcInfo->source,
1595  *target = arcInfo->target;
1596  int source_flags,
1597  target_flags;
1598 
1599 #ifdef TRGM_REGEXP_DEBUG
1600  fprintf(stderr, "examining arc to s%d (%x) from s%d (%x)\n",
1601  -target->snumber, target->flags,
1602  -source->snumber, source->flags);
1603 #endif
1604 
1605  /* examine parent states, if any merging has already happened */
1606  while (source->parent)
1607  source = source->parent;
1608  while (target->parent)
1609  target = target->parent;
1610 
1611 #ifdef TRGM_REGEXP_DEBUG
1612  fprintf(stderr, " ... after completed merges: to s%d (%x) from s%d (%x)\n",
1613  -target->snumber, target->flags,
1614  -source->snumber, source->flags);
1615 #endif
1616 
1617  /* we must also consider merges we are planning right now */
1618  source_flags = source->flags | source->tentFlags;
1619  while (source->tentParent)
1620  {
1621  source = source->tentParent;
1622  source_flags |= source->flags | source->tentFlags;
1623  }
1624  target_flags = target->flags | target->tentFlags;
1625  while (target->tentParent)
1626  {
1627  target = target->tentParent;
1628  target_flags |= target->flags | target->tentFlags;
1629  }
1630 
1631 #ifdef TRGM_REGEXP_DEBUG
1632  fprintf(stderr, " ... after tentative merges: to s%d (%x) from s%d (%x)\n",
1633  -target->snumber, target_flags,
1634  -source->snumber, source_flags);
1635 #endif
1636 
1637  /* would fully-merged state have both INIT and FIN set? */
1638  if (((source_flags | target_flags) & (TSTATE_INIT | TSTATE_FIN)) ==
1639  (TSTATE_INIT | TSTATE_FIN))
1640  {
1641  canRemove = false;
1642  break;
1643  }
1644 
1645  /* ok so far, so remember planned merge */
1646  if (source != target)
1647  {
1648 #ifdef TRGM_REGEXP_DEBUG
1649  fprintf(stderr, " ... tentatively merging s%d into s%d\n",
1650  -target->snumber, -source->snumber);
1651 #endif
1652  target->tentParent = source;
1653  source->tentFlags |= target_flags;
1654  }
1655  }
1656 
1657  /*
1658  * We must reset all the tentFlags/tentParent fields before
1659  * continuing. tentFlags could only have become set in states that
1660  * are the source or parent or tentative parent of one of the current
1661  * arcs; likewise tentParent could only have become set in states that
1662  * are the target or parent or tentative parent of one of the current
1663  * arcs. There might be some overlap between those sets, but if we
1664  * clear tentFlags in target states as well as source states, we
1665  * should be okay even if we visit a state as target before visiting
1666  * it as a source.
1667  */
1668  foreach(cell, trgmInfo->arcs)
1669  {
1670  TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
1671  TrgmState *source = arcInfo->source,
1672  *target = arcInfo->target;
1673  TrgmState *ttarget;
1674 
1675  /* no need to touch previously-merged states */
1676  while (source->parent)
1677  source = source->parent;
1678  while (target->parent)
1679  target = target->parent;
1680 
1681  while (source)
1682  {
1683  source->tentFlags = 0;
1684  source = source->tentParent;
1685  }
1686 
1687  while ((ttarget = target->tentParent) != NULL)
1688  {
1689  target->tentParent = NULL;
1690  target->tentFlags = 0; /* in case it was also a source */
1691  target = ttarget;
1692  }
1693  }
1694 
1695  /* Now, move on if we can't drop this trigram */
1696  if (!canRemove)
1697  {
1698 #ifdef TRGM_REGEXP_DEBUG
1699  fprintf(stderr, " ... not ok to merge\n");
1700 #endif
1701  continue;
1702  }
1703 
1704  /* OK, merge states linked by each arc labeled by the trigram */
1705  foreach(cell, trgmInfo->arcs)
1706  {
1707  TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
1708  TrgmState *source = arcInfo->source,
1709  *target = arcInfo->target;
1710 
1711  while (source->parent)
1712  source = source->parent;
1713  while (target->parent)
1714  target = target->parent;
1715  if (source != target)
1716  {
1717 #ifdef TRGM_REGEXP_DEBUG
1718  fprintf(stderr, "merging s%d into s%d\n",
1719  -target->snumber, -source->snumber);
1720 #endif
1721  mergeStates(source, target);
1722  /* Assert we didn't merge initial and final states */
1723  Assert((source->flags & (TSTATE_INIT | TSTATE_FIN)) !=
1724  (TSTATE_INIT | TSTATE_FIN));
1725  }
1726  }
1727 
1728  /* Mark trigram unexpanded, and update totals */
1729  trgmInfo->expanded = false;
1730  totalTrgmCount -= trgmInfo->count;
1731  totalTrgmPenalty -= trgmInfo->penalty;
1732  }
1733 
1734  /* Did we succeed in fitting into MAX_TRGM_COUNT? */
1735  if (totalTrgmCount > MAX_TRGM_COUNT)
1736  return false;
1737 
1738  trgmNFA->totalTrgmCount = (int) totalTrgmCount;
1739 
1740  /*
1741  * Sort color trigrams by colors (will be useful for bsearch in packGraph)
1742  * and enumerate the color trigrams that are expanded.
1743  */
1744  cnumber = 0;
1745  qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
1747  for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1748  {
1749  if (colorTrgms[i].expanded)
1750  {
1751  colorTrgms[i].cnumber = cnumber;
1752  cnumber++;
1753  }
1754  }
1755 
1756  return true;
1757 }
1758 
1759 /*
1760  * Expand selected color trigrams into regular trigrams.
1761  *
1762  * Returns the TRGM array to be passed to the index machinery.
1763  * The array must be allocated in rcontext.
1764  */
1765 static TRGM *
1767 {
1768  TRGM *trg;
1769  trgm *p;
1770  int i;
1771  TrgmColorInfo blankColor;
1772  trgm_mb_char blankChar;
1773 
1774  /* Set up "blank" color structure containing a single zero character */
1775  memset(blankChar.bytes, 0, sizeof(blankChar.bytes));
1776  blankColor.wordCharsCount = 1;
1777  blankColor.wordChars = &blankChar;
1778 
1779  /* Construct the trgm array */
1780  trg = (TRGM *)
1781  MemoryContextAllocZero(rcontext,
1782  TRGMHDRSIZE +
1783  trgmNFA->totalTrgmCount * sizeof(trgm));
1784  trg->flag = ARRKEY;
1785  SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, trgmNFA->totalTrgmCount));
1786  p = GETARR(trg);
1787  for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1788  {
1789  ColorTrgmInfo *colorTrgm = &trgmNFA->colorTrgms[i];
1790  TrgmColorInfo *c[3];
1791  trgm_mb_char s[3];
1792  int j,
1793  i1,
1794  i2,
1795  i3;
1796 
1797  /* Ignore any unexpanded trigrams ... */
1798  if (!colorTrgm->expanded)
1799  continue;
1800 
1801  /* Get colors, substituting the dummy struct for COLOR_BLANK */
1802  for (j = 0; j < 3; j++)
1803  {
1804  if (colorTrgm->ctrgm.colors[j] != COLOR_BLANK)
1805  c[j] = &trgmNFA->colorInfo[colorTrgm->ctrgm.colors[j]];
1806  else
1807  c[j] = &blankColor;
1808  }
1809 
1810  /* Iterate over all possible combinations of colors' characters */
1811  for (i1 = 0; i1 < c[0]->wordCharsCount; i1++)
1812  {
1813  s[0] = c[0]->wordChars[i1];
1814  for (i2 = 0; i2 < c[1]->wordCharsCount; i2++)
1815  {
1816  s[1] = c[1]->wordChars[i2];
1817  for (i3 = 0; i3 < c[2]->wordCharsCount; i3++)
1818  {
1819  s[2] = c[2]->wordChars[i3];
1820  fillTrgm(p, s);
1821  p++;
1822  }
1823  }
1824  }
1825  }
1826 
1827  return trg;
1828 }
1829 
1830 /*
1831  * Convert trigram into trgm datatype.
1832  */
1833 static void
1835 {
1836  char str[3 * MAX_MULTIBYTE_CHAR_LEN],
1837  *p;
1838  int i,
1839  j;
1840 
1841  /* Write multibyte string into "str" (we don't need null termination) */
1842  p = str;
1843 
1844  for (i = 0; i < 3; i++)
1845  {
1846  if (s[i].bytes[0] != 0)
1847  {
1848  for (j = 0; j < MAX_MULTIBYTE_CHAR_LEN && s[i].bytes[j]; j++)
1849  *p++ = s[i].bytes[j];
1850  }
1851  else
1852  {
1853  /* Emit a space in place of COLOR_BLANK */
1854  *p++ = ' ';
1855  }
1856  }
1857 
1858  /* Convert "str" to a standard trigram (possibly hashing it) */
1859  compact_trigram(ptrgm, str, p - str);
1860 }
1861 
1862 /*
1863  * Merge two states of graph.
1864  */
1865 static void
1867 {
1868  Assert(state1 != state2);
1869  Assert(!state1->parent);
1870  Assert(!state2->parent);
1871 
1872  /* state1 absorbs state2's flags */
1873  state1->flags |= state2->flags;
1874 
1875  /* state2, and indirectly all its children, become children of state1 */
1876  state2->parent = state1;
1877 }
1878 
1879 /*
1880  * Compare function for sorting of color trigrams by their colors.
1881  */
1882 static int
1883 colorTrgmInfoCmp(const void *p1, const void *p2)
1884 {
1885  const ColorTrgmInfo *c1 = (const ColorTrgmInfo *) p1;
1886  const ColorTrgmInfo *c2 = (const ColorTrgmInfo *) p2;
1887 
1888  return memcmp(&c1->ctrgm, &c2->ctrgm, sizeof(ColorTrgm));
1889 }
1890 
1891 /*
1892  * Compare function for sorting color trigrams in descending order of
1893  * their penalty fields.
1894  */
1895 static int
1896 colorTrgmInfoPenaltyCmp(const void *p1, const void *p2)
1897 {
1898  float4 penalty1 = ((const ColorTrgmInfo *) p1)->penalty;
1899  float4 penalty2 = ((const ColorTrgmInfo *) p2)->penalty;
1900 
1901  if (penalty1 < penalty2)
1902  return 1;
1903  else if (penalty1 == penalty2)
1904  return 0;
1905  else
1906  return -1;
1907 }
1908 
1909 
1910 /*---------------------
1911  * Subroutines for packing the graph into final representation (stage 4).
1912  *---------------------
1913  */
1914 
1915 /*
1916  * Pack expanded graph into final representation.
1917  *
1918  * The result data must be allocated in rcontext.
1919  */
1920 static TrgmPackedGraph *
1921 packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
1922 {
1923  int snumber = 2,
1924  arcIndex,
1925  arcsCount;
1926  HASH_SEQ_STATUS scan_status;
1927  TrgmState *state;
1929  *p1,
1930  *p2;
1931  TrgmPackedArc *packedArcs;
1932  TrgmPackedGraph *result;
1933  int i,
1934  j;
1935 
1936  /* Enumerate surviving states, giving init and fin reserved numbers */
1937  hash_seq_init(&scan_status, trgmNFA->states);
1938  while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
1939  {
1940  while (state->parent)
1941  state = state->parent;
1942 
1943  if (state->snumber < 0)
1944  {
1945  if (state->flags & TSTATE_INIT)
1946  state->snumber = 0;
1947  else if (state->flags & TSTATE_FIN)
1948  state->snumber = 1;
1949  else
1950  {
1951  state->snumber = snumber;
1952  snumber++;
1953  }
1954  }
1955  }
1956 
1957  /* Collect array of all arcs */
1958  arcs = (TrgmPackArcInfo *)
1959  palloc(sizeof(TrgmPackArcInfo) * trgmNFA->arcsCount);
1960  arcIndex = 0;
1961  hash_seq_init(&scan_status, trgmNFA->states);
1962  while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
1963  {
1964  TrgmState *source = state;
1965  ListCell *cell;
1966 
1967  while (source->parent)
1968  source = source->parent;
1969 
1970  foreach(cell, state->arcs)
1971  {
1972  TrgmArc *arc = (TrgmArc *) lfirst(cell);
1973  TrgmState *target = arc->target;
1974 
1975  while (target->parent)
1976  target = target->parent;
1977 
1978  if (source->snumber != target->snumber)
1979  {
1980  ColorTrgmInfo *ctrgm;
1981 
1982  ctrgm = (ColorTrgmInfo *) bsearch(&arc->ctrgm,
1983  trgmNFA->colorTrgms,
1984  trgmNFA->colorTrgmsCount,
1985  sizeof(ColorTrgmInfo),
1987  Assert(ctrgm != NULL);
1988  Assert(ctrgm->expanded);
1989 
1990  arcs[arcIndex].sourceState = source->snumber;
1991  arcs[arcIndex].targetState = target->snumber;
1992  arcs[arcIndex].colorTrgm = ctrgm->cnumber;
1993  arcIndex++;
1994  }
1995  }
1996  }
1997 
1998  /* Sort arcs to ease duplicate detection */
1999  qsort(arcs, arcIndex, sizeof(TrgmPackArcInfo), packArcInfoCmp);
2000 
2001  /* We could have duplicates because states were merged. Remove them. */
2002  /* p1 is probe point, p2 is last known non-duplicate. */
2003  p2 = arcs;
2004  for (p1 = arcs + 1; p1 < arcs + arcIndex; p1++)
2005  {
2006  if (packArcInfoCmp(p1, p2) > 0)
2007  {
2008  p2++;
2009  *p2 = *p1;
2010  }
2011  }
2012  arcsCount = (p2 - arcs) + 1;
2013 
2014  /* Create packed representation */
2015  result = (TrgmPackedGraph *)
2016  MemoryContextAlloc(rcontext, sizeof(TrgmPackedGraph));
2017 
2018  /* Pack color trigrams information */
2019  result->colorTrigramsCount = 0;
2020  for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
2021  {
2022  if (trgmNFA->colorTrgms[i].expanded)
2023  result->colorTrigramsCount++;
2024  }
2025  result->colorTrigramGroups = (int *)
2026  MemoryContextAlloc(rcontext, sizeof(int) * result->colorTrigramsCount);
2027  j = 0;
2028  for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
2029  {
2030  if (trgmNFA->colorTrgms[i].expanded)
2031  {
2032  result->colorTrigramGroups[j] = trgmNFA->colorTrgms[i].count;
2033  j++;
2034  }
2035  }
2036 
2037  /* Pack states and arcs information */
2038  result->statesCount = snumber;
2039  result->states = (TrgmPackedState *)
2040  MemoryContextAlloc(rcontext, snumber * sizeof(TrgmPackedState));
2041  packedArcs = (TrgmPackedArc *)
2042  MemoryContextAlloc(rcontext, arcsCount * sizeof(TrgmPackedArc));
2043  j = 0;
2044  for (i = 0; i < snumber; i++)
2045  {
2046  int cnt = 0;
2047 
2048  result->states[i].arcs = &packedArcs[j];
2049  while (j < arcsCount && arcs[j].sourceState == i)
2050  {
2051  packedArcs[j].targetState = arcs[j].targetState;
2052  packedArcs[j].colorTrgm = arcs[j].colorTrgm;
2053  cnt++;
2054  j++;
2055  }
2056  result->states[i].arcsCount = cnt;
2057  }
2058 
2059  /* Allocate working memory for trigramsMatchGraph() */
2060  result->colorTrigramsActive = (bool *)
2061  MemoryContextAlloc(rcontext, sizeof(bool) * result->colorTrigramsCount);
2062  result->statesActive = (bool *)
2063  MemoryContextAlloc(rcontext, sizeof(bool) * result->statesCount);
2064  result->statesQueue = (int *)
2065  MemoryContextAlloc(rcontext, sizeof(int) * result->statesCount);
2066 
2067  return result;
2068 }
2069 
2070 /*
2071  * Comparison function for sorting TrgmPackArcInfos.
2072  *
2073  * Compares arcs in following order: sourceState, colorTrgm, targetState.
2074  */
2075 static int
2076 packArcInfoCmp(const void *a1, const void *a2)
2077 {
2078  const TrgmPackArcInfo *p1 = (const TrgmPackArcInfo *) a1;
2079  const TrgmPackArcInfo *p2 = (const TrgmPackArcInfo *) a2;
2080 
2081  if (p1->sourceState < p2->sourceState)
2082  return -1;
2083  if (p1->sourceState > p2->sourceState)
2084  return 1;
2085  if (p1->colorTrgm < p2->colorTrgm)
2086  return -1;
2087  if (p1->colorTrgm > p2->colorTrgm)
2088  return 1;
2089  if (p1->targetState < p2->targetState)
2090  return -1;
2091  if (p1->targetState > p2->targetState)
2092  return 1;
2093  return 0;
2094 }
2095 
2096 
2097 /*---------------------
2098  * Debugging functions
2099  *
2100  * These are designed to emit GraphViz files.
2101  *---------------------
2102  */
2103 
2104 #ifdef TRGM_REGEXP_DEBUG
2105 
2106 /*
2107  * Print initial NFA, in regexp library's representation
2108  */
2109 static void
2110 printSourceNFA(regex_t *regex, TrgmColorInfo *colors, int ncolors)
2111 {
2113  int nstates = pg_reg_getnumstates(regex);
2114  int state;
2115  int i;
2116 
2117  initStringInfo(&buf);
2118 
2119  appendStringInfoString(&buf, "\ndigraph sourceNFA {\n");
2120 
2121  for (state = 0; state < nstates; state++)
2122  {
2123  regex_arc_t *arcs;
2124  int i,
2125  arcsCount;
2126 
2127  appendStringInfo(&buf, "s%d", state);
2128  if (pg_reg_getfinalstate(regex) == state)
2129  appendStringInfoString(&buf, " [shape = doublecircle]");
2130  appendStringInfoString(&buf, ";\n");
2131 
2132  arcsCount = pg_reg_getnumoutarcs(regex, state);
2133  arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount);
2134  pg_reg_getoutarcs(regex, state, arcs, arcsCount);
2135 
2136  for (i = 0; i < arcsCount; i++)
2137  {
2138  appendStringInfo(&buf, " s%d -> s%d [label = \"%d\"];\n",
2139  state, arcs[i].to, arcs[i].co);
2140  }
2141 
2142  pfree(arcs);
2143  }
2144 
2145  appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
2146  appendStringInfo(&buf, " initial -> s%d;\n",
2147  pg_reg_getinitialstate(regex));
2148 
2149  /* Print colors */
2150  appendStringInfoString(&buf, " { rank = sink;\n");
2151  appendStringInfoString(&buf, " Colors [shape = none, margin=0, label=<\n");
2152 
2153  for (i = 0; i < ncolors; i++)
2154  {
2155  TrgmColorInfo *color = &colors[i];
2156  int j;
2157 
2158  appendStringInfo(&buf, "<br/>Color %d: ", i);
2159  if (color->expandable)
2160  {
2161  for (j = 0; j < color->wordCharsCount; j++)
2162  {
2163  char s[MAX_MULTIBYTE_CHAR_LEN + 1];
2164 
2165  memcpy(s, color->wordChars[j].bytes, MAX_MULTIBYTE_CHAR_LEN);
2166  s[MAX_MULTIBYTE_CHAR_LEN] = '\0';
2167  appendStringInfoString(&buf, s);
2168  }
2169  }
2170  else
2171  appendStringInfoString(&buf, "not expandable");
2172  appendStringInfoChar(&buf, '\n');
2173  }
2174 
2175  appendStringInfoString(&buf, " >];\n");
2176  appendStringInfoString(&buf, " }\n");
2177  appendStringInfoString(&buf, "}\n");
2178 
2179  {
2180  /* dot -Tpng -o /tmp/source.png < /tmp/source.gv */
2181  FILE *fp = fopen("/tmp/source.gv", "w");
2182 
2183  fprintf(fp, "%s", buf.data);
2184  fclose(fp);
2185  }
2186 
2187  pfree(buf.data);
2188 }
2189 
2190 /*
2191  * Print expanded graph.
2192  */
2193 static void
2194 printTrgmNFA(TrgmNFA *trgmNFA)
2195 {
2197  HASH_SEQ_STATUS scan_status;
2198  TrgmState *state;
2199  TrgmState *initstate = NULL;
2200 
2201  initStringInfo(&buf);
2202 
2203  appendStringInfoString(&buf, "\ndigraph transformedNFA {\n");
2204 
2205  hash_seq_init(&scan_status, trgmNFA->states);
2206  while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
2207  {
2208  ListCell *cell;
2209 
2210  appendStringInfo(&buf, "s%d", -state->snumber);
2211  if (state->flags & TSTATE_FIN)
2212  appendStringInfoString(&buf, " [shape = doublecircle]");
2213  if (state->flags & TSTATE_INIT)
2214  initstate = state;
2215  appendStringInfo(&buf, " [label = \"%d\"]", state->stateKey.nstate);
2216  appendStringInfoString(&buf, ";\n");
2217 
2218  foreach(cell, state->arcs)
2219  {
2220  TrgmArc *arc = (TrgmArc *) lfirst(cell);
2221 
2222  appendStringInfo(&buf, " s%d -> s%d [label = \"",
2223  -state->snumber, -arc->target->snumber);
2224  printTrgmColor(&buf, arc->ctrgm.colors[0]);
2225  appendStringInfoChar(&buf, ' ');
2226  printTrgmColor(&buf, arc->ctrgm.colors[1]);
2227  appendStringInfoChar(&buf, ' ');
2228  printTrgmColor(&buf, arc->ctrgm.colors[2]);
2229  appendStringInfoString(&buf, "\"];\n");
2230  }
2231  }
2232 
2233  if (initstate)
2234  {
2235  appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
2236  appendStringInfo(&buf, " initial -> s%d;\n", -initstate->snumber);
2237  }
2238 
2239  appendStringInfoString(&buf, "}\n");
2240 
2241  {
2242  /* dot -Tpng -o /tmp/transformed.png < /tmp/transformed.gv */
2243  FILE *fp = fopen("/tmp/transformed.gv", "w");
2244 
2245  fprintf(fp, "%s", buf.data);
2246  fclose(fp);
2247  }
2248 
2249  pfree(buf.data);
2250 }
2251 
2252 /*
2253  * Print a TrgmColor readably.
2254  */
2255 static void
2256 printTrgmColor(StringInfo buf, TrgmColor co)
2257 {
2258  if (co == COLOR_UNKNOWN)
2259  appendStringInfoChar(buf, 'u');
2260  else if (co == COLOR_BLANK)
2261  appendStringInfoChar(buf, 'b');
2262  else
2263  appendStringInfo(buf, "%d", (int) co);
2264 }
2265 
2266 /*
2267  * Print final packed representation of trigram-based expanded graph.
2268  */
2269 static void
2270 printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams)
2271 {
2273  trgm *p;
2274  int i;
2275 
2276  initStringInfo(&buf);
2277 
2278  appendStringInfoString(&buf, "\ndigraph packedGraph {\n");
2279 
2280  for (i = 0; i < packedGraph->statesCount; i++)
2281  {
2282  TrgmPackedState *state = &packedGraph->states[i];
2283  int j;
2284 
2285  appendStringInfo(&buf, " s%d", i);
2286  if (i == 1)
2287  appendStringInfoString(&buf, " [shape = doublecircle]");
2288 
2289  appendStringInfo(&buf, " [label = <s%d>];\n", i);
2290 
2291  for (j = 0; j < state->arcsCount; j++)
2292  {
2293  TrgmPackedArc *arc = &state->arcs[j];
2294 
2295  appendStringInfo(&buf, " s%d -> s%d [label = \"trigram %d\"];\n",
2296  i, arc->targetState, arc->colorTrgm);
2297  }
2298  }
2299 
2300  appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
2301  appendStringInfo(&buf, " initial -> s%d;\n", 0);
2302 
2303  /* Print trigrams */
2304  appendStringInfoString(&buf, " { rank = sink;\n");
2305  appendStringInfoString(&buf, " Trigrams [shape = none, margin=0, label=<\n");
2306 
2307  p = GETARR(trigrams);
2308  for (i = 0; i < packedGraph->colorTrigramsCount; i++)
2309  {
2310  int count = packedGraph->colorTrigramGroups[i];
2311  int j;
2312 
2313  appendStringInfo(&buf, "<br/>Trigram %d: ", i);
2314 
2315  for (j = 0; j < count; j++)
2316  {
2317  if (j > 0)
2318  appendStringInfoString(&buf, ", ");
2319 
2320  /*
2321  * XXX This representation is nice only for all-ASCII trigrams.
2322  */
2323  appendStringInfo(&buf, "\"%c%c%c\"", (*p)[0], (*p)[1], (*p)[2]);
2324  p++;
2325  }
2326  }
2327 
2328  appendStringInfoString(&buf, " >];\n");
2329  appendStringInfoString(&buf, " }\n");
2330  appendStringInfoString(&buf, "}\n");
2331 
2332  {
2333  /* dot -Tpng -o /tmp/packed.png < /tmp/packed.gv */
2334  FILE *fp = fopen("/tmp/packed.gv", "w");
2335 
2336  fprintf(fp, "%s", buf.data);
2337  fclose(fp);
2338  }
2339 
2340  pfree(buf.data);
2341 }
2342 
2343 #endif /* TRGM_REGEXP_DEBUG */
#define MAX_EXPANDED_ARCS
Definition: trgm_regexp.c:221
#define NIL
Definition: pg_list.h:65
static void mergeStates(TrgmState *state1, TrgmState *state2)
Definition: trgm_regexp.c:1866
bool * colorTrigramsActive
Definition: trgm_regexp.c:463
int pg_reg_getnumstates(const regex_t *regex)
Definition: regexport.c:36
void pg_reg_getoutarcs(const regex_t *regex, int st, regex_arc_t *arcs, int arcs_len)
Definition: regexport.c:155
struct TrgmState TrgmState
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:211
#define AllocSetContextCreate
Definition: memutils.h:170
static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
Definition: trgm_regexp.c:778
ColorTrgm ctrgm
Definition: trgm_regexp.c:350
#define VARDATA_ANY(PTR)
Definition: postgres.h:348
static bool prefixContains(TrgmPrefix *prefix1, TrgmPrefix *prefix2)
Definition: trgm_regexp.c:1405
#define HASH_CONTEXT
Definition: hsearch.h:93
#define HASH_ELEM
Definition: hsearch.h:87
struct TrgmState * tentParent
Definition: trgm_regexp.c:342
MemoryContext hcxt
Definition: hsearch.h:78
static void addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key, TrgmColor co, TrgmStateKey *destKey)
Definition: trgm_regexp.c:1277
int pg_reg_getfinalstate(const regex_t *regex)
Definition: regexport.c:64
TrgmColorInfo * colorInfo
Definition: trgm_regexp.c:405
static TRGM * createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph, MemoryContext rcontext)
Definition: trgm_regexp.c:580
int pg_reg_getnumcharacters(const regex_t *regex, int co)
Definition: regexport.c:229
regex_t * regex
Definition: trgm_regexp.c:404
float4 penalty
Definition: trgm_regexp.c:379
Definition: regguts.h:276
def bytes(source, encoding='ascii', errors='strict')
short color
Definition: regguts.h:134
TrgmColor colors[3]
Definition: trgm_regexp.c:301
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:109
HTAB * states
Definition: trgm_regexp.c:409
static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key)
Definition: trgm_regexp.c:1012
Size entrysize
Definition: hsearch.h:73
void pg_reg_getcharacters(const regex_t *regex, int co, pg_wchar *chars, int chars_len)
Definition: regexport.c:265
#define ISWORDCHR(c)
Definition: trgm.h:54
#define WISH_TRGM_PENALTY
Definition: trgm_regexp.c:223
#define ARRKEY
Definition: trgm.h:97
List * list_concat(List *list1, const List *list2)
Definition: list.c:516
int errcode(int sqlerrcode)
Definition: elog.c:570
#define MemSet(start, val, len)
Definition: c.h:955
long hash_get_num_entries(HTAB *hashp)
Definition: dynahash.c:1335
int pg_reg_colorisbegin(const regex_t *regex, int co)
Definition: regexport.c:191
TrgmPackedState * states
Definition: trgm_regexp.c:460
TrgmState * initState
Definition: trgm_regexp.c:410
#define TSTATE_FIN
Definition: trgm_regexp.c:331
char * lowerstr(const char *str)
Definition: ts_locale.c:239
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:906
unsigned int Oid
Definition: postgres_ext.h:31
#define TSTATE_INIT
Definition: trgm_regexp.c:330
int totalTrgmCount
Definition: trgm_regexp.c:422
#define fprintf
Definition: port.h:196
bool overflowed
Definition: trgm_regexp.c:417
#define REG_ICASE
Definition: regex.h:106
int pg_regcomp(regex_t *re, const chr *string, size_t len, int flags, Oid collation)
Definition: regcomp.c:313
int pg_reg_getnumcolors(const regex_t *regex)
Definition: regexport.c:174
int nstates
Definition: trgm_regexp.c:411
#define foreach_delete_current(lst, cell)
Definition: pg_list.h:368
bool containsNonWord
Definition: trgm_regexp.c:263
static bool convertPgWchar(pg_wchar c, trgm_mb_char *result)
Definition: trgm_regexp.c:841
#define list_make1(x1)
Definition: pg_list.h:227
TrgmPackedArc * arcs
Definition: trgm_regexp.c:437
static bool selectColorTrigrams(TrgmNFA *trgmNFA)
Definition: trgm_regexp.c:1447
static int packArcInfoCmp(const void *a1, const void *a2)
Definition: trgm_regexp.c:2076
#define GETARR(x)
Definition: trgm.h:107
static const FormData_pg_attribute a2
Definition: heap.c:166
Definition: dynahash.c:208
bool * statesActive
Definition: trgm_regexp.c:464
void pfree(void *pointer)
Definition: mcxt.c:1056
#define REG_OKAY
Definition: regex.h:137
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition: stringinfo.c:78
#define linitial(l)
Definition: pg_list.h:195
TrgmColor colors[2]
Definition: trgm_regexp.c:292
#define ERROR
Definition: elog.h:43
Definition: trgm.h:66
int pg_reg_getinitialstate(const regex_t *regex)
Definition: regexport.c:50
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:192
void appendStringInfoString(StringInfo str, const char *s)
Definition: stringinfo.c:163
char * c
static char * buf
Definition: pg_test_fsync.c:68
List * enterKeys
Definition: trgm_regexp.c:337
TrgmState * target
Definition: trgm_regexp.c:362
ColorTrgmInfo * colorTrgms
Definition: trgm_regexp.c:420
size_t pg_regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
Definition: regerror.c:60
bool trigramsMatchGraph(TrgmPackedGraph *graph, bool *check)
Definition: trgm_regexp.c:641
MemoryContext CurrentMemoryContext
Definition: mcxt.c:38
#define COLOR_COUNT_LIMIT
Definition: trgm_regexp.c:224
#define REG_ADVANCED
Definition: regex.h:103
TrgmState * target
Definition: trgm_regexp.c:351
List * arcs
Definition: trgm_regexp.c:336
#define TRGMHDRSIZE
Definition: trgm.h:73
#define ereport(elevel, rest)
Definition: elog.h:141
unsigned int pg_wchar
Definition: mbprint.c:31
TrgmState * source
Definition: trgm_regexp.c:361
List * lappend(List *list, void *datum)
Definition: list.c:322
#define MAX_MULTIBYTE_CHAR_LEN
Definition: pg_wchar.h:30
int arcsCount
Definition: trgm_regexp.c:416
void appendStringInfoChar(StringInfo str, char ch)
Definition: stringinfo.c:175
void initStringInfo(StringInfo str)
Definition: stringinfo.c:46
#define COLOR_BLANK
Definition: trgm_regexp.c:288
float float4
Definition: c.h:490
List * queue
Definition: trgm_regexp.c:414
#define HASH_BLOBS
Definition: hsearch.h:88
static TrgmPackedGraph * packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
Definition: trgm_regexp.c:1921
void * palloc0(Size size)
Definition: mcxt.c:980
HTAB * hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
Definition: dynahash.c:316
TRGM * createTrgmNFA(text *text_re, Oid collation, TrgmPackedGraph **graph, MemoryContext rcontext)
Definition: trgm_regexp.c:523
static void addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key)
Definition: trgm_regexp.c:1173
Size keysize
Definition: hsearch.h:72
uint8 flag
Definition: trgm.h:69
int pg_reg_colorisend(const regex_t *regex, int co)
Definition: regexport.c:208
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:839
static int colorTrgmInfoPenaltyCmp(const void *p1, const void *p2)
Definition: trgm_regexp.c:1896
int colorTrgmsCount
Definition: trgm_regexp.c:421
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
Definition: mbutils.c:765
void compact_trigram(trgm *tptr, char *str, int bytelen)
Definition: trgm_op.c:218
int pg_reg_getnumoutarcs(const regex_t *regex, int st)
Definition: regexport.c:134
char trgm[3]
Definition: trgm.h:41
#define PG_CATCH()
Definition: elog.h:310
#define MAX_TRGM_COUNT
Definition: trgm_regexp.c:222
#define CALCGTSIZE(flag)
Definition: hstore_gist.c:49
int tentFlags
Definition: trgm_regexp.c:341
#define Assert(condition)
Definition: c.h:732
#define lfirst(lc)
Definition: pg_list.h:190
Definition: regguts.h:298
char bytes[MAX_MULTIBYTE_CHAR_LEN]
Definition: trgm_regexp.c:244
TrgmStateKey stateKey
Definition: trgm_regexp.c:335
static TRGM * expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext)
Definition: trgm_regexp.c:1766
static void RE_compile(regex_t *regex, text *text_re, int cflags, Oid collation)
Definition: trgm_regexp.c:734
static bool validArcLabel(TrgmStateKey *key, TrgmColor co)
Definition: trgm_regexp.c:1319
static int list_length(const List *l)
Definition: pg_list.h:169
static TrgmState * getState(TrgmNFA *trgmNFA, TrgmStateKey *key)
Definition: trgm_regexp.c:1374
#define PG_RE_THROW()
Definition: elog.h:331
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1389
static void addArcs(TrgmNFA *trgmNFA, TrgmState *state)
Definition: trgm_regexp.c:1185
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1379
TrgmPrefix prefix
Definition: trgm_regexp.c:311
static const float4 penalties[8]
Definition: trgm_regexp.c:230
#define MAX_EXPANDED_STATES
Definition: trgm_regexp.c:220
static int colorTrgmInfoCmp(const void *p1, const void *p2)
Definition: trgm_regexp.c:1883
struct TrgmState * parent
Definition: trgm_regexp.c:340
#define COLOR_UNKNOWN
Definition: trgm_regexp.c:287
static void transformGraph(TrgmNFA *trgmNFA)
Definition: trgm_regexp.c:907
#define VARSIZE_ANY_EXHDR(PTR)
Definition: postgres.h:341
void * palloc(Size size)
Definition: mcxt.c:949
int errmsg(const char *fmt,...)
Definition: elog.c:784
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:796
int i
List * keysQueue
Definition: trgm_regexp.c:415
ColorTrgm ctrgm
Definition: trgm_regexp.c:376
Definition: c.h:549
#define SET_VARSIZE(PTR, len)
Definition: postgres.h:329
static void processState(TrgmNFA *trgmNFA, TrgmState *state)
Definition: trgm_regexp.c:969
int pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
Definition: mbutils.c:787
static const FormData_pg_attribute a1
Definition: heap.c:152
#define qsort(a, b, c, d)
Definition: port.h:492
trgm_mb_char * wordChars
Definition: trgm_regexp.c:265
#define PG_TRY()
Definition: elog.h:301
void pg_regfree(regex_t *re)
Definition: regfree.c:49
Definition: pg_list.h:50
static char chars[TZ_MAX_CHARS]
Definition: zic.c:395
static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3])
Definition: trgm_regexp.c:1834
Definition: regex.h:55
#define PG_END_TRY()
Definition: elog.h:317
List * list_delete_first(List *list)
Definition: list.c:861
int * colorTrigramGroups
Definition: trgm_regexp.c:453
int ncolors
Definition: trgm_regexp.c:406
int TrgmColor
Definition: trgm_regexp.c:284