PostgreSQL Source Code  git master
trgm_regexp.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * trgm_regexp.c
4  * Regular expression matching using trigrams.
5  *
6  * The general idea of trigram index support for a regular expression (regex)
7  * search is to transform the regex into a logical expression on trigrams.
8  * For example:
9  *
10  * (ab|cd)efg => ((abe & bef) | (cde & def)) & efg
11  *
12  * If a string matches the regex, then it must match the logical expression on
13  * trigrams. The opposite is not necessarily true, however: a string that
14  * matches the logical expression might not match the original regex. Such
15  * false positives are removed via recheck, by running the regular regex match
16  * operator on the retrieved heap tuple.
17  *
18  * Since the trigram expression involves both AND and OR operators, we can't
19  * expect the core index machinery to evaluate it completely. Instead, the
20  * result of regex analysis is a list of trigrams to be sought in the index,
21  * plus a simplified graph that is used by trigramsMatchGraph() to determine
22  * whether a particular indexed value matches the expression.
23  *
24  * Converting a regex to a trigram expression is based on analysis of an
25  * automaton corresponding to the regex. The algorithm consists of four
26  * stages:
27  *
28  * 1) Compile the regexp to NFA form. This is handled by the PostgreSQL
29  * regexp library, which provides accessors for its opaque regex_t struct
30  * to expose the NFA state graph and the "colors" (sets of equivalent
31  * characters) used as state transition labels.
32  *
33  * 2) Transform the original NFA into an expanded graph, where arcs
34  * are labeled with trigrams that must be present in order to move from
35  * one state to another via the arcs. The trigrams used in this stage
36  * consist of colors, not characters, as in the original NFA.
37  *
38  * 3) Expand the color trigrams into regular trigrams consisting of
39  * characters. If too many distinct trigrams are produced, trigrams are
40  * eliminated and the graph is simplified until it's simple enough.
41  *
42  * 4) Finally, the resulting graph is packed into a TrgmPackedGraph struct,
43  * and returned to the caller.
44  *
45  * 1) Compile the regexp to NFA form
46  * ---------------------------------
47  * The automaton returned by the regexp compiler is a graph where vertices
48  * are "states" and arcs are labeled with colors. Each color represents
49  * a set of characters, so that all characters assigned to the same color
50  * are interchangeable, so far as matching the regexp is concerned. There
51  * are two special states: "initial" and "final". A state can have multiple
52  * outgoing arcs labeled with the same color, which makes the automaton
53  * non-deterministic, because it can be in many states simultaneously.
54  *
55  * Note that this NFA is already lossy compared to the original regexp,
56  * since it ignores some regex features such as lookahead constraints and
57  * backref matching. This is OK for our purposes since it's still the case
58  * that only strings matching the NFA can possibly satisfy the regexp.
59  *
60  * 2) Transform the original NFA into an expanded graph
61  * ----------------------------------------------------
62  * In the 2nd stage, the automaton is transformed into a graph based on the
63  * original NFA. Each state in the expanded graph represents a state from
64  * the original NFA, plus a prefix identifying the last two characters
65  * (colors, to be precise) seen before entering the state. There can be
66  * multiple states in the expanded graph for each state in the original NFA,
67  * depending on what characters can precede it. A prefix position can be
68  * "unknown" if it's uncertain what the preceding character was, or "blank"
69  * if the character was a non-word character (we don't need to distinguish
70  * which non-word character it was, so just think of all of them as blanks).
71  *
72  * For convenience in description, call an expanded-state identifier
73  * (two prefix colors plus a state number from the original NFA) an
74  * "enter key".
75  *
76  * Each arc of the expanded graph is labeled with a trigram that must be
77  * present in the string to match. We can construct this from an out-arc of
78  * the underlying NFA state by combining the expanded state's prefix with the
79  * color label of the underlying out-arc, if neither prefix position is
80  * "unknown". But note that some of the colors in the trigram might be
81  * "blank". This is OK since we want to generate word-boundary trigrams as
82  * the regular trigram machinery would, if we know that some word characters
83  * must be adjacent to a word boundary in all strings matching the NFA.
84  *
85  * The expanded graph can also have fewer states than the original NFA,
86  * because we don't bother to make a separate state entry unless the state
87  * is reachable by a valid arc. When an enter key is reachable from a state
88  * of the expanded graph, but we do not know a complete trigram associated
89  * with that transition, we cannot make a valid arc; instead we insert the
90  * enter key into the enterKeys list of the source state. This effectively
91  * means that the two expanded states are not reliably distinguishable based
92  * on examining trigrams.
93  *
94  * So the expanded graph resembles the original NFA, but the arcs are
95  * labeled with trigrams instead of individual characters, and there may be
96  * more or fewer states. It is a lossy representation of the original NFA:
97  * any string that matches the original regexp must match the expanded graph,
98  * but the reverse is not true.
99  *
100  * We build the expanded graph through a breadth-first traversal of states
101  * reachable from the initial state. At each reachable state, we identify the
102  * states reachable from it without traversing a predictable trigram, and add
103  * those states' enter keys to the current state. Then we generate all
104  * out-arcs leading out of this collection of states that have predictable
105  * trigrams, adding their target states to the queue of states to examine.
106  *
107  * When building the graph, if the number of states or arcs exceed pre-defined
108  * limits, we give up and simply mark any states not yet processed as final
109  * states. Roughly speaking, that means that we make use of some portion from
110  * the beginning of the regexp. Also, any colors that have too many member
111  * characters are treated as "unknown", so that we can't derive trigrams
112  * from them.
113  *
114  * 3) Expand the color trigrams into regular trigrams
115  * --------------------------------------------------
116  * The trigrams in the expanded graph are "color trigrams", consisting
117  * of three consecutive colors that must be present in the string. But for
118  * search, we need regular trigrams consisting of characters. In the 3rd
119  * stage, the color trigrams are expanded into regular trigrams. Since each
120  * color can represent many characters, the total number of regular trigrams
121  * after expansion could be very large. Because searching the index for
122  * thousands of trigrams would be slow, and would likely produce so many
123  * false positives that we would have to traverse a large fraction of the
124  * index, the graph is simplified further in a lossy fashion by removing
125  * color trigrams. When a color trigram is removed, the states connected by
126  * any arcs labeled with that trigram are merged.
127  *
128  * Trigrams do not all have equivalent value for searching: some of them are
129  * more frequent and some of them are less frequent. Ideally, we would like
130  * to know the distribution of trigrams, but we don't. But because of padding
131  * we know for sure that the empty character is more frequent than others,
132  * so we can penalize trigrams according to presence of whitespace. The
133  * penalty assigned to each color trigram is the number of simple trigrams
134  * it would produce, times the penalties[] multiplier associated with its
135  * whitespace content. (The penalties[] constants were calculated by analysis
136  * of some real-life text.) We eliminate color trigrams starting with the
137  * highest-penalty one, until we get to a total penalty of no more than
138  * WISH_TRGM_PENALTY. However, we cannot remove a color trigram if that would
139  * lead to merging the initial and final states, so we may not be able to
140  * reach WISH_TRGM_PENALTY. It's still okay so long as we have no more than
141  * MAX_TRGM_COUNT simple trigrams in total, otherwise we fail.
142  *
143  * 4) Pack the graph into a compact representation
144  * -----------------------------------------------
145  * The 2nd and 3rd stages might have eliminated or merged many of the states
146  * and trigrams created earlier, so in this final stage, the graph is
147  * compacted and packed into a simpler struct that contains only the
148  * information needed to evaluate it.
149  *
150  * ALGORITHM EXAMPLE:
151  *
152  * Consider the example regex "ab[cd]". This regex is transformed into the
153  * following NFA (for simplicity we show colors as their single members):
154  *
155  * 4#
156  * c/
157  * a b /
158  * 1* --- 2 ---- 3
159  * \
160  * d\
161  * 5#
162  *
163  * We use * to mark initial state and # to mark final state. It's not depicted,
164  * but states 1, 4, 5 have self-referencing arcs for all possible characters,
165  * because this pattern can match to any part of a string.
166  *
167  * As the result of stage 2 we will have the following graph:
168  *
169  * abc abd
170  * 2# <---- 1* ----> 3#
171  *
172  * The process for generating this graph is:
173  * 1) Create state 1 with enter key (UNKNOWN, UNKNOWN, 1).
174  * 2) Add key (UNKNOWN, "a", 2) to state 1.
175  * 3) Add key ("a", "b", 3) to state 1.
176  * 4) Create new state 2 with enter key ("b", "c", 4). Add an arc
177  * from state 1 to state 2 with label trigram "abc".
178  * 5) Mark state 2 final because state 4 of source NFA is marked as final.
179  * 6) Create new state 3 with enter key ("b", "d", 5). Add an arc
180  * from state 1 to state 3 with label trigram "abd".
181  * 7) Mark state 3 final because state 5 of source NFA is marked as final.
182  *
183  *
184  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
185  * Portions Copyright (c) 1994, Regents of the University of California
186  *
187  * IDENTIFICATION
188  * contrib/pg_trgm/trgm_regexp.c
189  *
190  *-------------------------------------------------------------------------
191  */
192 #include "postgres.h"
193 
194 #include "regex/regexport.h"
195 #include "trgm.h"
196 #include "tsearch/ts_locale.h"
197 #include "utils/hsearch.h"
198 #include "utils/memutils.h"
199 #include "varatt.h"
200 
201 /*
202  * Uncomment (or use -DTRGM_REGEXP_DEBUG) to print debug info,
203  * for exploring and debugging the algorithm implementation.
204  * This produces three graph files in /tmp, in Graphviz .gv format.
205  * Some progress information is also printed to postmaster stderr.
206  */
207 /* #define TRGM_REGEXP_DEBUG */
208 
209 /*
210  * These parameters are used to limit the amount of work done.
211  * Otherwise regex processing could be too slow and memory-consuming.
212  *
213  * MAX_EXPANDED_STATES - How many states we allow in expanded graph
214  * MAX_EXPANDED_ARCS - How many arcs we allow in expanded graph
215  * MAX_TRGM_COUNT - How many simple trigrams we allow to be extracted
216  * WISH_TRGM_PENALTY - Maximum desired sum of color trigram penalties
217  * COLOR_COUNT_LIMIT - Maximum number of characters per color
218  */
219 #define MAX_EXPANDED_STATES 128
220 #define MAX_EXPANDED_ARCS 1024
221 #define MAX_TRGM_COUNT 256
222 #define WISH_TRGM_PENALTY 16
223 #define COLOR_COUNT_LIMIT 256
224 
225 /*
226  * Penalty multipliers for trigram counts depending on whitespace contents.
227  * Numbers based on analysis of real-life texts.
228  */
229 static const float4 penalties[8] = {
230  1.0f, /* "aaa" */
231  3.5f, /* "aa " */
232  0.0f, /* "a a" (impossible) */
233  0.0f, /* "a " (impossible) */
234  4.2f, /* " aa" */
235  2.1f, /* " a " */
236  25.0f, /* " a" */
237  0.0f /* " " (impossible) */
238 };
239 
240 /* Struct representing a single pg_wchar, converted back to multibyte form */
241 typedef struct
242 {
244 } trgm_mb_char;
245 
246 /*
247  * Attributes of NFA colors:
248  *
249  * expandable - we know the character expansion of this color
250  * containsNonWord - color contains non-word characters
251  * (which will not be extracted into trigrams)
252  * wordCharsCount - count of word characters in color
253  * wordChars - array of this color's word characters
254  * (which can be extracted into trigrams)
255  *
256  * When expandable is false, the other attributes don't matter; we just
257  * assume this color represents unknown character(s).
258  */
259 typedef struct
260 {
265 } TrgmColorInfo;
266 
267 /*
268  * A "prefix" is information about the colors of the last two characters read
269  * before reaching a specific NFA state. These colors can have special values
270  * COLOR_UNKNOWN and COLOR_BLANK. COLOR_UNKNOWN means that we have no
271  * information, for example because we read some character of an unexpandable
272  * color. COLOR_BLANK means that we read a non-word character.
273  *
274  * We call a prefix ambiguous if at least one of its colors is unknown. It's
275  * fully ambiguous if both are unknown, partially ambiguous if only the first
276  * is unknown. (The case of first color known, second unknown is not valid.)
277  *
278  * Wholly- or partly-blank prefixes are mostly handled the same as regular
279  * color prefixes. This allows us to generate appropriate partly-blank
280  * trigrams when the NFA requires word character(s) to appear adjacent to
281  * non-word character(s).
282  */
283 typedef int TrgmColor;
284 
285 /* We assume that colors returned by the regexp engine cannot be these: */
286 #define COLOR_UNKNOWN (-3)
287 #define COLOR_BLANK (-4)
288 
289 typedef struct
290 {
291  TrgmColor colors[2];
292 } TrgmPrefix;
293 
294 /*
295  * Color-trigram data type. Note that some elements of the trigram can be
296  * COLOR_BLANK, but we don't allow COLOR_UNKNOWN.
297  */
298 typedef struct
299 {
300  TrgmColor colors[3];
301 } ColorTrgm;
302 
303 /*
304  * Key identifying a state of our expanded graph: color prefix, and number
305  * of the corresponding state in the underlying regex NFA. The color prefix
306  * shows how we reached the regex state (to the extent that we know it).
307  */
308 typedef struct
309 {
311  int nstate;
312 } TrgmStateKey;
313 
314 /*
315  * One state of the expanded graph.
316  *
317  * stateKey - ID of this state
318  * arcs - outgoing arcs of this state (List of TrgmArc)
319  * enterKeys - enter keys reachable from this state without reading any
320  * predictable trigram (List of TrgmStateKey)
321  * flags - flag bits
322  * snumber - number of this state (initially assigned as -1, -2, etc,
323  * for debugging purposes only; then at the packaging stage,
324  * surviving states are renumbered with positive numbers)
325  * parent - parent state, if this state has been merged into another
326  * tentFlags - flags this state would acquire via planned merges
327  * tentParent - planned parent state, if considering a merge
328  */
329 #define TSTATE_INIT 0x01 /* flag indicating this state is initial */
330 #define TSTATE_FIN 0x02 /* flag indicating this state is final */
331 
332 typedef struct TrgmState
333 {
334  TrgmStateKey stateKey; /* hashtable key: must be first field */
337  int flags;
338  int snumber;
339  struct TrgmState *parent;
343 
344 /*
345  * One arc in the expanded graph.
346  */
347 typedef struct
348 {
349  ColorTrgm ctrgm; /* trigram needed to traverse arc */
350  TrgmState *target; /* next state */
351 } TrgmArc;
352 
353 /*
354  * Information about arc of specific color trigram (used in stage 3)
355  *
356  * Contains pointers to the source and target states.
357  */
358 typedef struct
359 {
362 } TrgmArcInfo;
363 
364 /*
365  * Information about color trigram (used in stage 3)
366  *
367  * ctrgm - trigram itself
368  * cnumber - number of this trigram (used in the packaging stage)
369  * count - number of simple trigrams created from this color trigram
370  * expanded - indicates this color trigram is expanded into simple trigrams
371  * arcs - list of all arcs labeled with this color trigram.
372  */
373 typedef struct
374 {
376  int cnumber;
377  int count;
379  bool expanded;
381 } ColorTrgmInfo;
382 
383 /*
384  * Data structure representing all the data we need during regex processing.
385  *
386  * regex - compiled regex
387  * colorInfo - extracted information about regex's colors
388  * ncolors - number of colors in colorInfo[]
389  * states - hashtable of TrgmStates (states of expanded graph)
390  * initState - pointer to initial state of expanded graph
391  * queue - queue of to-be-processed TrgmStates
392  * keysQueue - queue of to-be-processed TrgmStateKeys
393  * arcsCount - total number of arcs of expanded graph (for resource
394  * limiting)
395  * overflowed - we have exceeded resource limit for transformation
396  * colorTrgms - array of all color trigrams present in graph
397  * colorTrgmsCount - count of those color trigrams
398  * totalTrgmCount - total count of extracted simple trigrams
399  */
400 typedef struct
401 {
402  /* Source regexp, and color information extracted from it (stage 1) */
405  int ncolors;
406 
407  /* Expanded graph (stage 2) */
410  int nstates;
411 
412  /* Workspace for stage 2 */
417 
418  /* Information about distinct color trigrams in the graph (stage 3) */
422 } TrgmNFA;
423 
424 /*
425  * Final, compact representation of expanded graph.
426  */
427 typedef struct
428 {
429  int targetState; /* index of target state (zero-based) */
430  int colorTrgm; /* index of color trigram for transition */
431 } TrgmPackedArc;
432 
433 typedef struct
434 {
435  int arcsCount; /* number of out-arcs for this state */
436  TrgmPackedArc *arcs; /* array of arcsCount packed arcs */
438 
439 /* "typedef struct TrgmPackedGraph TrgmPackedGraph" appears in trgm.h */
441 {
442  /*
443  * colorTrigramsCount and colorTrigramGroups contain information about how
444  * trigrams are grouped into color trigrams. "colorTrigramsCount" is the
445  * count of color trigrams and "colorTrigramGroups" contains number of
446  * simple trigrams for each color trigram. The array of simple trigrams
447  * (stored separately from this struct) is ordered so that the simple
448  * trigrams for each color trigram are consecutive, and they're in order
449  * by color trigram number.
450  */
452  int *colorTrigramGroups; /* array of size colorTrigramsCount */
453 
454  /*
455  * The states of the simplified NFA. State number 0 is always initial
456  * state and state number 1 is always final state.
457  */
459  TrgmPackedState *states; /* array of size statesCount */
460 
461  /* Temporary work space for trigramsMatchGraph() */
462  bool *colorTrigramsActive; /* array of size colorTrigramsCount */
463  bool *statesActive; /* array of size statesCount */
464  int *statesQueue; /* array of size statesCount */
465 };
466 
467 /*
468  * Temporary structure for representing an arc during packaging.
469  */
470 typedef struct
471 {
476 
477 
478 /* prototypes for private functions */
479 static TRGM *createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
480  MemoryContext rcontext);
481 static void RE_compile(regex_t *regex, text *text_re,
482  int cflags, Oid collation);
483 static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA);
484 static bool convertPgWchar(pg_wchar c, trgm_mb_char *result);
485 static void transformGraph(TrgmNFA *trgmNFA);
486 static void processState(TrgmNFA *trgmNFA, TrgmState *state);
487 static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key);
488 static void addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key);
489 static void addArcs(TrgmNFA *trgmNFA, TrgmState *state);
490 static void addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key,
491  TrgmColor co, TrgmStateKey *destKey);
492 static bool validArcLabel(TrgmStateKey *key, TrgmColor co);
493 static TrgmState *getState(TrgmNFA *trgmNFA, TrgmStateKey *key);
494 static bool prefixContains(TrgmPrefix *prefix1, TrgmPrefix *prefix2);
495 static bool selectColorTrigrams(TrgmNFA *trgmNFA);
496 static TRGM *expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext);
497 static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3]);
498 static void mergeStates(TrgmState *state1, TrgmState *state2);
499 static int colorTrgmInfoCmp(const void *p1, const void *p2);
500 static int colorTrgmInfoPenaltyCmp(const void *p1, const void *p2);
501 static TrgmPackedGraph *packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext);
502 static int packArcInfoCmp(const void *a1, const void *a2);
503 
504 #ifdef TRGM_REGEXP_DEBUG
505 static void printSourceNFA(regex_t *regex, TrgmColorInfo *colors, int ncolors);
506 static void printTrgmNFA(TrgmNFA *trgmNFA);
507 static void printTrgmColor(StringInfo buf, TrgmColor co);
508 static void printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams);
509 #endif
510 
511 
512 /*
513  * Main entry point to process a regular expression.
514  *
515  * Returns an array of trigrams required by the regular expression, or NULL if
516  * the regular expression was too complex to analyze. In addition, a packed
517  * graph representation of the regex is returned into *graph. The results
518  * must be allocated in rcontext (which might or might not be the current
519  * context).
520  */
521 TRGM *
522 createTrgmNFA(text *text_re, Oid collation,
523  TrgmPackedGraph **graph, MemoryContext rcontext)
524 {
525  TRGM *trg;
526  regex_t regex;
527  MemoryContext tmpcontext;
528  MemoryContext oldcontext;
529 
530  /*
531  * This processing generates a great deal of cruft, which we'd like to
532  * clean up before returning (since this function may be called in a
533  * query-lifespan memory context). Make a temp context we can work in so
534  * that cleanup is easy.
535  */
537  "createTrgmNFA temporary context",
539  oldcontext = MemoryContextSwitchTo(tmpcontext);
540 
541  /*
542  * Stage 1: Compile the regexp into a NFA, using the regexp library.
543  */
544 #ifdef IGNORECASE
545  RE_compile(&regex, text_re,
546  REG_ADVANCED | REG_NOSUB | REG_ICASE, collation);
547 #else
548  RE_compile(&regex, text_re,
549  REG_ADVANCED | REG_NOSUB, collation);
550 #endif
551 
552  /*
553  * Since the regexp library allocates its internal data structures with
554  * malloc, we need to use a PG_TRY block to ensure that pg_regfree() gets
555  * done even if there's an error.
556  */
557  PG_TRY();
558  {
559  trg = createTrgmNFAInternal(&regex, graph, rcontext);
560  }
561  PG_FINALLY();
562  {
563  pg_regfree(&regex);
564  }
565  PG_END_TRY();
566 
567  /* Clean up all the cruft we created */
568  MemoryContextSwitchTo(oldcontext);
569  MemoryContextDelete(tmpcontext);
570 
571  return trg;
572 }
573 
574 /*
575  * Body of createTrgmNFA, exclusive of regex compilation/freeing.
576  */
577 static TRGM *
579  MemoryContext rcontext)
580 {
581  TRGM *trg;
582  TrgmNFA trgmNFA;
583 
584  trgmNFA.regex = regex;
585 
586  /* Collect color information from the regex */
587  getColorInfo(regex, &trgmNFA);
588 
589 #ifdef TRGM_REGEXP_DEBUG
590  printSourceNFA(regex, trgmNFA.colorInfo, trgmNFA.ncolors);
591 #endif
592 
593  /*
594  * Stage 2: Create an expanded graph from the source NFA.
595  */
596  transformGraph(&trgmNFA);
597 
598 #ifdef TRGM_REGEXP_DEBUG
599  printTrgmNFA(&trgmNFA);
600 #endif
601 
602  /*
603  * Fail if we were unable to make a nontrivial graph, ie it is possible to
604  * get from the initial state to the final state without reading any
605  * predictable trigram.
606  */
607  if (trgmNFA.initState->flags & TSTATE_FIN)
608  return NULL;
609 
610  /*
611  * Stage 3: Select color trigrams to expand. Fail if too many trigrams.
612  */
613  if (!selectColorTrigrams(&trgmNFA))
614  return NULL;
615 
616  /*
617  * Stage 4: Expand color trigrams and pack graph into final
618  * representation.
619  */
620  trg = expandColorTrigrams(&trgmNFA, rcontext);
621 
622  *graph = packGraph(&trgmNFA, rcontext);
623 
624 #ifdef TRGM_REGEXP_DEBUG
625  printTrgmPackedGraph(*graph, trg);
626 #endif
627 
628  return trg;
629 }
630 
631 /*
632  * Main entry point for evaluating a graph during index scanning.
633  *
634  * The check[] array is indexed by trigram number (in the array of simple
635  * trigrams returned by createTrgmNFA), and holds true for those trigrams
636  * that are present in the index entry being checked.
637  */
638 bool
640 {
641  int i,
642  j,
643  k,
644  queueIn,
645  queueOut;
646 
647  /*
648  * Reset temporary working areas.
649  */
650  memset(graph->colorTrigramsActive, 0,
651  sizeof(bool) * graph->colorTrigramsCount);
652  memset(graph->statesActive, 0, sizeof(bool) * graph->statesCount);
653 
654  /*
655  * Check which color trigrams were matched. A match for any simple
656  * trigram associated with a color trigram counts as a match of the color
657  * trigram.
658  */
659  j = 0;
660  for (i = 0; i < graph->colorTrigramsCount; i++)
661  {
662  int cnt = graph->colorTrigramGroups[i];
663 
664  for (k = j; k < j + cnt; k++)
665  {
666  if (check[k])
667  {
668  /*
669  * Found one matched trigram in the group. Can skip the rest
670  * of them and go to the next group.
671  */
672  graph->colorTrigramsActive[i] = true;
673  break;
674  }
675  }
676  j = j + cnt;
677  }
678 
679  /*
680  * Initialize the statesQueue to hold just the initial state. Note:
681  * statesQueue has room for statesCount entries, which is certainly enough
682  * since no state will be put in the queue more than once. The
683  * statesActive array marks which states have been queued.
684  */
685  graph->statesActive[0] = true;
686  graph->statesQueue[0] = 0;
687  queueIn = 0;
688  queueOut = 1;
689 
690  /* Process queued states as long as there are any. */
691  while (queueIn < queueOut)
692  {
693  int stateno = graph->statesQueue[queueIn++];
694  TrgmPackedState *state = &graph->states[stateno];
695  int cnt = state->arcsCount;
696 
697  /* Loop over state's out-arcs */
698  for (i = 0; i < cnt; i++)
699  {
700  TrgmPackedArc *arc = &state->arcs[i];
701 
702  /*
703  * If corresponding color trigram is present then activate the
704  * corresponding state. We're done if that's the final state,
705  * otherwise queue the state if it's not been queued already.
706  */
707  if (graph->colorTrigramsActive[arc->colorTrgm])
708  {
709  int nextstate = arc->targetState;
710 
711  if (nextstate == 1)
712  return true; /* success: final state is reachable */
713 
714  if (!graph->statesActive[nextstate])
715  {
716  graph->statesActive[nextstate] = true;
717  graph->statesQueue[queueOut++] = nextstate;
718  }
719  }
720  }
721  }
722 
723  /* Queue is empty, so match fails. */
724  return false;
725 }
726 
727 /*
728  * Compile regex string into struct at *regex.
729  * NB: pg_regfree must be applied to regex if this completes successfully.
730  */
731 static void
732 RE_compile(regex_t *regex, text *text_re, int cflags, Oid collation)
733 {
734  int text_re_len = VARSIZE_ANY_EXHDR(text_re);
735  char *text_re_val = VARDATA_ANY(text_re);
736  pg_wchar *pattern;
737  int pattern_len;
738  int regcomp_result;
739  char errMsg[100];
740 
741  /* Convert pattern string to wide characters */
742  pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar));
743  pattern_len = pg_mb2wchar_with_len(text_re_val,
744  pattern,
745  text_re_len);
746 
747  /* Compile regex */
748  regcomp_result = pg_regcomp(regex,
749  pattern,
750  pattern_len,
751  cflags,
752  collation);
753 
754  pfree(pattern);
755 
756  if (regcomp_result != REG_OKAY)
757  {
758  /* re didn't compile (no need for pg_regfree, if so) */
759  pg_regerror(regcomp_result, regex, errMsg, sizeof(errMsg));
760  ereport(ERROR,
761  (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
762  errmsg("invalid regular expression: %s", errMsg)));
763  }
764 }
765 
766 
767 /*---------------------
768  * Subroutines for pre-processing the color map (stage 1).
769  *---------------------
770  */
771 
772 /*
773  * Fill TrgmColorInfo structure for each color using regex export functions.
774  */
775 static void
776 getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
777 {
778  int colorsCount = pg_reg_getnumcolors(regex);
779  int i;
780 
781  trgmNFA->ncolors = colorsCount;
782  trgmNFA->colorInfo = (TrgmColorInfo *)
783  palloc0(colorsCount * sizeof(TrgmColorInfo));
784 
785  /*
786  * Loop over colors, filling TrgmColorInfo about each. Note we include
787  * WHITE (0) even though we know it'll be reported as non-expandable.
788  */
789  for (i = 0; i < colorsCount; i++)
790  {
791  TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[i];
792  int charsCount = pg_reg_getnumcharacters(regex, i);
793  pg_wchar *chars;
794  int j;
795 
796  if (charsCount < 0 || charsCount > COLOR_COUNT_LIMIT)
797  {
798  /* Non expandable, or too large to work with */
799  colorInfo->expandable = false;
800  continue;
801  }
802 
803  colorInfo->expandable = true;
804  colorInfo->containsNonWord = false;
805  colorInfo->wordChars = (trgm_mb_char *)
806  palloc(sizeof(trgm_mb_char) * charsCount);
807  colorInfo->wordCharsCount = 0;
808 
809  /* Extract all the chars in this color */
810  chars = (pg_wchar *) palloc(sizeof(pg_wchar) * charsCount);
811  pg_reg_getcharacters(regex, i, chars, charsCount);
812 
813  /*
814  * Convert characters back to multibyte form, and save only those that
815  * are word characters. Set "containsNonWord" if any non-word
816  * character. (Note: it'd probably be nicer to keep the chars in
817  * pg_wchar format for now, but ISWORDCHR wants to see multibyte.)
818  */
819  for (j = 0; j < charsCount; j++)
820  {
821  trgm_mb_char c;
822 
823  if (!convertPgWchar(chars[j], &c))
824  continue; /* ok to ignore it altogether */
825  if (ISWORDCHR(c.bytes))
826  colorInfo->wordChars[colorInfo->wordCharsCount++] = c;
827  else
828  colorInfo->containsNonWord = true;
829  }
830 
831  pfree(chars);
832  }
833 }
834 
835 /*
836  * Convert pg_wchar to multibyte format.
837  * Returns false if the character should be ignored completely.
838  */
839 static bool
841 {
842  /* "s" has enough space for a multibyte character and a trailing NUL */
843  char s[MAX_MULTIBYTE_CHAR_LEN + 1];
844 
845  /*
846  * We can ignore the NUL character, since it can never appear in a PG text
847  * string. This avoids the need for various special cases when
848  * reconstructing trigrams.
849  */
850  if (c == 0)
851  return false;
852 
853  /* Do the conversion, making sure the result is NUL-terminated */
854  memset(s, 0, sizeof(s));
855  pg_wchar2mb_with_len(&c, s, 1);
856 
857  /*
858  * In IGNORECASE mode, we can ignore uppercase characters. We assume that
859  * the regex engine generated both uppercase and lowercase equivalents
860  * within each color, since we used the REG_ICASE option; so there's no
861  * need to process the uppercase version.
862  *
863  * XXX this code is dependent on the assumption that lowerstr() works the
864  * same as the regex engine's internal case folding machinery. Might be
865  * wiser to expose pg_wc_tolower and test whether c == pg_wc_tolower(c).
866  * On the other hand, the trigrams in the index were created using
867  * lowerstr(), so we're probably screwed if there's any incompatibility
868  * anyway.
869  */
870 #ifdef IGNORECASE
871  {
872  char *lowerCased = lowerstr(s);
873 
874  if (strcmp(lowerCased, s) != 0)
875  {
876  pfree(lowerCased);
877  return false;
878  }
879  pfree(lowerCased);
880  }
881 #endif
882 
883  /* Fill result with exactly MAX_MULTIBYTE_CHAR_LEN bytes */
884  memcpy(result->bytes, s, MAX_MULTIBYTE_CHAR_LEN);
885  return true;
886 }
887 
888 
889 /*---------------------
890  * Subroutines for expanding original NFA graph into a trigram graph (stage 2).
891  *---------------------
892  */
893 
894 /*
895  * Transform the graph, given a regex and extracted color information.
896  *
897  * We create and process a queue of expanded-graph states until all the states
898  * are processed.
899  *
900  * This algorithm may be stopped due to resource limitation. In this case we
901  * force every unprocessed branch to immediately finish with matching (this
902  * can give us false positives but no false negatives) by marking all
903  * unprocessed states as final.
904  */
905 static void
907 {
908  HASHCTL hashCtl;
909  TrgmStateKey initkey;
910  TrgmState *initstate;
911  ListCell *lc;
912 
913  /* Initialize this stage's workspace in trgmNFA struct */
914  trgmNFA->queue = NIL;
915  trgmNFA->keysQueue = NIL;
916  trgmNFA->arcsCount = 0;
917  trgmNFA->overflowed = false;
918 
919  /* Create hashtable for states */
920  hashCtl.keysize = sizeof(TrgmStateKey);
921  hashCtl.entrysize = sizeof(TrgmState);
922  hashCtl.hcxt = CurrentMemoryContext;
923  trgmNFA->states = hash_create("Trigram NFA",
924  1024,
925  &hashCtl,
927  trgmNFA->nstates = 0;
928 
929  /* Create initial state: ambiguous prefix, NFA's initial state */
930  MemSet(&initkey, 0, sizeof(initkey));
931  initkey.prefix.colors[0] = COLOR_UNKNOWN;
932  initkey.prefix.colors[1] = COLOR_UNKNOWN;
933  initkey.nstate = pg_reg_getinitialstate(trgmNFA->regex);
934 
935  initstate = getState(trgmNFA, &initkey);
936  initstate->flags |= TSTATE_INIT;
937  trgmNFA->initState = initstate;
938 
939  /*
940  * Recursively build the expanded graph by processing queue of states
941  * (breadth-first search). getState already put initstate in the queue.
942  * Note that getState will append new states to the queue within the loop,
943  * too; this works as long as we don't do repeat fetches using the "lc"
944  * pointer.
945  */
946  foreach(lc, trgmNFA->queue)
947  {
948  TrgmState *state = (TrgmState *) lfirst(lc);
949 
950  /*
951  * If we overflowed then just mark state as final. Otherwise do
952  * actual processing.
953  */
954  if (trgmNFA->overflowed)
955  state->flags |= TSTATE_FIN;
956  else
957  processState(trgmNFA, state);
958 
959  /* Did we overflow? */
960  if (trgmNFA->arcsCount > MAX_EXPANDED_ARCS ||
962  trgmNFA->overflowed = true;
963  }
964 }
965 
966 /*
967  * Process one state: add enter keys and then add outgoing arcs.
968  */
969 static void
971 {
972  ListCell *lc;
973 
974  /* keysQueue should be NIL already, but make sure */
975  trgmNFA->keysQueue = NIL;
976 
977  /*
978  * Add state's own key, and then process all keys added to keysQueue until
979  * queue is finished. But we can quit if the state gets marked final.
980  */
981  addKey(trgmNFA, state, &state->stateKey);
982  foreach(lc, trgmNFA->keysQueue)
983  {
984  TrgmStateKey *key = (TrgmStateKey *) lfirst(lc);
985 
986  if (state->flags & TSTATE_FIN)
987  break;
988  addKey(trgmNFA, state, key);
989  }
990 
991  /* Release keysQueue to clean up for next cycle */
992  list_free(trgmNFA->keysQueue);
993  trgmNFA->keysQueue = NIL;
994 
995  /*
996  * Add outgoing arcs only if state isn't final (we have no interest in
997  * outgoing arcs if we already match)
998  */
999  if (!(state->flags & TSTATE_FIN))
1000  addArcs(trgmNFA, state);
1001 }
1002 
1003 /*
1004  * Add the given enter key into the state's enterKeys list, and determine
1005  * whether this should result in any further enter keys being added.
1006  * If so, add those keys to keysQueue so that processState will handle them.
1007  *
1008  * If the enter key is for the NFA's final state, mark state as TSTATE_FIN.
1009  * This situation means that we can reach the final state from this expanded
1010  * state without reading any predictable trigram, so we must consider this
1011  * state as an accepting one.
1012  *
1013  * The given key could be a duplicate of one already in enterKeys, or be
1014  * redundant with some enterKeys. So we check that before doing anything.
1015  *
1016  * Note that we don't generate any actual arcs here. addArcs will do that
1017  * later, after we have identified all the enter keys for this state.
1018  */
1019 static void
1021 {
1022  regex_arc_t *arcs;
1023  TrgmStateKey destKey;
1024  ListCell *cell;
1025  int i,
1026  arcsCount;
1027 
1028  /*
1029  * Ensure any pad bytes in destKey are zero, since it may get used as a
1030  * hashtable key by getState.
1031  */
1032  MemSet(&destKey, 0, sizeof(destKey));
1033 
1034  /*
1035  * Compare key to each existing enter key of the state to check for
1036  * redundancy. We can drop either old key(s) or the new key if we find
1037  * redundancy.
1038  */
1039  foreach(cell, state->enterKeys)
1040  {
1041  TrgmStateKey *existingKey = (TrgmStateKey *) lfirst(cell);
1042 
1043  if (existingKey->nstate == key->nstate)
1044  {
1045  if (prefixContains(&existingKey->prefix, &key->prefix))
1046  {
1047  /* This old key already covers the new key. Nothing to do */
1048  return;
1049  }
1050  if (prefixContains(&key->prefix, &existingKey->prefix))
1051  {
1052  /*
1053  * The new key covers this old key. Remove the old key, it's
1054  * no longer needed once we add this key to the list.
1055  */
1056  state->enterKeys = foreach_delete_current(state->enterKeys,
1057  cell);
1058  }
1059  }
1060  }
1061 
1062  /* No redundancy, so add this key to the state's list */
1063  state->enterKeys = lappend(state->enterKeys, key);
1064 
1065  /* If state is now known final, mark it and we're done */
1066  if (key->nstate == pg_reg_getfinalstate(trgmNFA->regex))
1067  {
1068  state->flags |= TSTATE_FIN;
1069  return;
1070  }
1071 
1072  /*
1073  * Loop through all outgoing arcs of the corresponding state in the
1074  * original NFA.
1075  */
1076  arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate);
1077  arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount);
1078  pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount);
1079 
1080  for (i = 0; i < arcsCount; i++)
1081  {
1082  regex_arc_t *arc = &arcs[i];
1083 
1084  if (pg_reg_colorisbegin(trgmNFA->regex, arc->co))
1085  {
1086  /*
1087  * Start of line/string (^). Trigram extraction treats start of
1088  * line same as start of word: double space prefix is added.
1089  * Hence, make an enter key showing we can reach the arc
1090  * destination with all-blank prefix.
1091  */
1092  destKey.prefix.colors[0] = COLOR_BLANK;
1093  destKey.prefix.colors[1] = COLOR_BLANK;
1094  destKey.nstate = arc->to;
1095 
1096  /* Add enter key to this state */
1097  addKeyToQueue(trgmNFA, &destKey);
1098  }
1099  else if (pg_reg_colorisend(trgmNFA->regex, arc->co))
1100  {
1101  /*
1102  * End of line/string ($). We must consider this arc as a
1103  * transition that doesn't read anything. The reason for adding
1104  * this enter key to the state is that if the arc leads to the
1105  * NFA's final state, we must mark this expanded state as final.
1106  */
1107  destKey.prefix.colors[0] = COLOR_UNKNOWN;
1108  destKey.prefix.colors[1] = COLOR_UNKNOWN;
1109  destKey.nstate = arc->to;
1110 
1111  /* Add enter key to this state */
1112  addKeyToQueue(trgmNFA, &destKey);
1113  }
1114  else if (arc->co >= 0)
1115  {
1116  /* Regular color (including WHITE) */
1117  TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[arc->co];
1118 
1119  if (colorInfo->expandable)
1120  {
1121  if (colorInfo->containsNonWord &&
1123  {
1124  /*
1125  * We can reach the arc destination after reading a
1126  * non-word character, but the prefix is not something
1127  * that addArc will accept with COLOR_BLANK, so no trigram
1128  * arc can get made for this transition. We must make an
1129  * enter key to show that the arc destination is
1130  * reachable. Set it up with an all-blank prefix, since
1131  * that corresponds to what the trigram extraction code
1132  * will do at a word starting boundary.
1133  */
1134  destKey.prefix.colors[0] = COLOR_BLANK;
1135  destKey.prefix.colors[1] = COLOR_BLANK;
1136  destKey.nstate = arc->to;
1137  addKeyToQueue(trgmNFA, &destKey);
1138  }
1139 
1140  if (colorInfo->wordCharsCount > 0 &&
1141  !validArcLabel(key, arc->co))
1142  {
1143  /*
1144  * We can reach the arc destination after reading a word
1145  * character, but the prefix is not something that addArc
1146  * will accept, so no trigram arc can get made for this
1147  * transition. We must make an enter key to show that the
1148  * arc destination is reachable. The prefix for the enter
1149  * key should reflect the info we have for this arc.
1150  */
1151  destKey.prefix.colors[0] = key->prefix.colors[1];
1152  destKey.prefix.colors[1] = arc->co;
1153  destKey.nstate = arc->to;
1154  addKeyToQueue(trgmNFA, &destKey);
1155  }
1156  }
1157  else
1158  {
1159  /*
1160  * Unexpandable color. Add enter key with ambiguous prefix,
1161  * showing we can reach the destination from this state, but
1162  * the preceding colors will be uncertain. (We do not set the
1163  * first prefix color to key->prefix.colors[1], because a
1164  * prefix of known followed by unknown is invalid.)
1165  */
1166  destKey.prefix.colors[0] = COLOR_UNKNOWN;
1167  destKey.prefix.colors[1] = COLOR_UNKNOWN;
1168  destKey.nstate = arc->to;
1169  addKeyToQueue(trgmNFA, &destKey);
1170  }
1171  }
1172  else
1173  {
1174  /* RAINBOW: treat as unexpandable color */
1175  destKey.prefix.colors[0] = COLOR_UNKNOWN;
1176  destKey.prefix.colors[1] = COLOR_UNKNOWN;
1177  destKey.nstate = arc->to;
1178  addKeyToQueue(trgmNFA, &destKey);
1179  }
1180  }
1181 
1182  pfree(arcs);
1183 }
1184 
1185 /*
1186  * Add copy of given key to keysQueue for later processing.
1187  */
1188 static void
1190 {
1191  TrgmStateKey *keyCopy = (TrgmStateKey *) palloc(sizeof(TrgmStateKey));
1192 
1193  memcpy(keyCopy, key, sizeof(TrgmStateKey));
1194  trgmNFA->keysQueue = lappend(trgmNFA->keysQueue, keyCopy);
1195 }
1196 
1197 /*
1198  * Add outgoing arcs from given state, whose enter keys are all now known.
1199  */
1200 static void
1202 {
1203  TrgmStateKey destKey;
1204  ListCell *cell;
1205  regex_arc_t *arcs;
1206  int arcsCount,
1207  i;
1208 
1209  /*
1210  * Ensure any pad bytes in destKey are zero, since it may get used as a
1211  * hashtable key by getState.
1212  */
1213  MemSet(&destKey, 0, sizeof(destKey));
1214 
1215  /*
1216  * Iterate over enter keys associated with this expanded-graph state. This
1217  * includes both the state's own stateKey, and any enter keys we added to
1218  * it during addKey (which represent expanded-graph states that are not
1219  * distinguishable from this one by means of trigrams). For each such
1220  * enter key, examine all the out-arcs of the key's underlying NFA state,
1221  * and try to make a trigram arc leading to where the out-arc leads.
1222  * (addArc will deal with whether the arc is valid or not.)
1223  */
1224  foreach(cell, state->enterKeys)
1225  {
1226  TrgmStateKey *key = (TrgmStateKey *) lfirst(cell);
1227 
1228  arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate);
1229  arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount);
1230  pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount);
1231 
1232  for (i = 0; i < arcsCount; i++)
1233  {
1234  regex_arc_t *arc = &arcs[i];
1235  TrgmColorInfo *colorInfo;
1236 
1237  /*
1238  * Ignore non-expandable colors; addKey already handled the case.
1239  *
1240  * We need no special check for WHITE or begin/end pseudocolors
1241  * here. We don't need to do any processing for them, and they
1242  * will be marked non-expandable since the regex engine will have
1243  * reported them that way. We do have to watch out for RAINBOW,
1244  * which has a negative color number.
1245  */
1246  if (arc->co < 0)
1247  continue;
1248  Assert(arc->co < trgmNFA->ncolors);
1249 
1250  colorInfo = &trgmNFA->colorInfo[arc->co];
1251  if (!colorInfo->expandable)
1252  continue;
1253 
1254  if (colorInfo->containsNonWord)
1255  {
1256  /*
1257  * Color includes non-word character(s).
1258  *
1259  * Generate an arc, treating this transition as occurring on
1260  * BLANK. This allows word-ending trigrams to be manufactured
1261  * if possible.
1262  */
1263  destKey.prefix.colors[0] = key->prefix.colors[1];
1264  destKey.prefix.colors[1] = COLOR_BLANK;
1265  destKey.nstate = arc->to;
1266 
1267  addArc(trgmNFA, state, key, COLOR_BLANK, &destKey);
1268  }
1269 
1270  if (colorInfo->wordCharsCount > 0)
1271  {
1272  /*
1273  * Color includes word character(s).
1274  *
1275  * Generate an arc. Color is pushed into prefix of target
1276  * state.
1277  */
1278  destKey.prefix.colors[0] = key->prefix.colors[1];
1279  destKey.prefix.colors[1] = arc->co;
1280  destKey.nstate = arc->to;
1281 
1282  addArc(trgmNFA, state, key, arc->co, &destKey);
1283  }
1284  }
1285 
1286  pfree(arcs);
1287  }
1288 }
1289 
1290 /*
1291  * Generate an out-arc of the expanded graph, if it's valid and not redundant.
1292  *
1293  * state: expanded-graph state we want to add an out-arc to
1294  * key: provides prefix colors (key->nstate is not used)
1295  * co: transition color
1296  * destKey: identifier for destination state of expanded graph
1297  */
1298 static void
1300  TrgmColor co, TrgmStateKey *destKey)
1301 {
1302  TrgmArc *arc;
1303  ListCell *cell;
1304 
1305  /* Do nothing if this wouldn't be a valid arc label trigram */
1306  if (!validArcLabel(key, co))
1307  return;
1308 
1309  /*
1310  * Check if we are going to reach key which is covered by a key which is
1311  * already listed in this state. If so arc is useless: the NFA can bypass
1312  * it through a path that doesn't require any predictable trigram, so
1313  * whether the arc's trigram is present or not doesn't really matter.
1314  */
1315  foreach(cell, state->enterKeys)
1316  {
1317  TrgmStateKey *existingKey = (TrgmStateKey *) lfirst(cell);
1318 
1319  if (existingKey->nstate == destKey->nstate &&
1320  prefixContains(&existingKey->prefix, &destKey->prefix))
1321  return;
1322  }
1323 
1324  /* Checks were successful, add new arc */
1325  arc = (TrgmArc *) palloc(sizeof(TrgmArc));
1326  arc->target = getState(trgmNFA, destKey);
1327  arc->ctrgm.colors[0] = key->prefix.colors[0];
1328  arc->ctrgm.colors[1] = key->prefix.colors[1];
1329  arc->ctrgm.colors[2] = co;
1330 
1331  state->arcs = lappend(state->arcs, arc);
1332  trgmNFA->arcsCount++;
1333 }
1334 
1335 /*
1336  * Can we make a valid trigram arc label from the given prefix and arc color?
1337  *
1338  * This is split out so that tests in addKey and addArc will stay in sync.
1339  */
1340 static bool
1342 {
1343  /*
1344  * We have to know full trigram in order to add outgoing arc. So we can't
1345  * do it if prefix is ambiguous.
1346  */
1347  if (key->prefix.colors[0] == COLOR_UNKNOWN)
1348  return false;
1349 
1350  /* If key->prefix.colors[0] isn't unknown, its second color isn't either */
1351  Assert(key->prefix.colors[1] != COLOR_UNKNOWN);
1352  /* And we should not be called with an unknown arc color anytime */
1353  Assert(co != COLOR_UNKNOWN);
1354 
1355  /*
1356  * We don't bother with making arcs representing three non-word
1357  * characters, since that's useless for trigram extraction.
1358  */
1359  if (key->prefix.colors[0] == COLOR_BLANK &&
1360  key->prefix.colors[1] == COLOR_BLANK &&
1361  co == COLOR_BLANK)
1362  return false;
1363 
1364  /*
1365  * We also reject nonblank-blank-anything. The nonblank-blank-nonblank
1366  * case doesn't correspond to any trigram the trigram extraction code
1367  * would make. The nonblank-blank-blank case is also not possible with
1368  * RPADDING = 1. (Note that in many cases we'd fail to generate such a
1369  * trigram even if it were valid, for example processing "foo bar" will
1370  * not result in considering the trigram "o ". So if you want to support
1371  * RPADDING = 2, there's more to do than just twiddle this test.)
1372  */
1373  if (key->prefix.colors[0] != COLOR_BLANK &&
1374  key->prefix.colors[1] == COLOR_BLANK)
1375  return false;
1376 
1377  /*
1378  * Other combinations involving blank are valid, in particular we assume
1379  * blank-blank-nonblank is valid, which presumes that LPADDING is 2.
1380  *
1381  * Note: Using again the example "foo bar", we will not consider the
1382  * trigram " b", though this trigram would be found by the trigram
1383  * extraction code. Since we will find " ba", it doesn't seem worth
1384  * trying to hack the algorithm to generate the additional trigram.
1385  */
1386 
1387  /* arc label is valid */
1388  return true;
1389 }
1390 
1391 /*
1392  * Get state of expanded graph for given state key,
1393  * and queue the state for processing if it didn't already exist.
1394  */
1395 static TrgmState *
1397 {
1398  TrgmState *state;
1399  bool found;
1400 
1401  state = (TrgmState *) hash_search(trgmNFA->states, key, HASH_ENTER,
1402  &found);
1403  if (!found)
1404  {
1405  /* New state: initialize and queue it */
1406  state->arcs = NIL;
1407  state->enterKeys = NIL;
1408  state->flags = 0;
1409  /* states are initially given negative numbers */
1410  state->snumber = -(++trgmNFA->nstates);
1411  state->parent = NULL;
1412  state->tentFlags = 0;
1413  state->tentParent = NULL;
1414 
1415  trgmNFA->queue = lappend(trgmNFA->queue, state);
1416  }
1417  return state;
1418 }
1419 
1420 /*
1421  * Check if prefix1 "contains" prefix2.
1422  *
1423  * "contains" means that any exact prefix (with no ambiguity) that satisfies
1424  * prefix2 also satisfies prefix1.
1425  */
1426 static bool
1428 {
1429  if (prefix1->colors[1] == COLOR_UNKNOWN)
1430  {
1431  /* Fully ambiguous prefix contains everything */
1432  return true;
1433  }
1434  else if (prefix1->colors[0] == COLOR_UNKNOWN)
1435  {
1436  /*
1437  * Prefix with only first unknown color contains every prefix with
1438  * same second color.
1439  */
1440  if (prefix1->colors[1] == prefix2->colors[1])
1441  return true;
1442  else
1443  return false;
1444  }
1445  else
1446  {
1447  /* Exact prefix contains only the exact same prefix */
1448  if (prefix1->colors[0] == prefix2->colors[0] &&
1449  prefix1->colors[1] == prefix2->colors[1])
1450  return true;
1451  else
1452  return false;
1453  }
1454 }
1455 
1456 
1457 /*---------------------
1458  * Subroutines for expanding color trigrams into regular trigrams (stage 3).
1459  *---------------------
1460  */
1461 
1462 /*
1463  * Get vector of all color trigrams in graph and select which of them
1464  * to expand into simple trigrams.
1465  *
1466  * Returns true if OK, false if exhausted resource limits.
1467  */
1468 static bool
1470 {
1471  HASH_SEQ_STATUS scan_status;
1472  int arcsCount = trgmNFA->arcsCount,
1473  i;
1474  TrgmState *state;
1475  ColorTrgmInfo *colorTrgms;
1476  int64 totalTrgmCount;
1477  float4 totalTrgmPenalty;
1478  int cnumber;
1479 
1480  /* Collect color trigrams from all arcs */
1481  colorTrgms = (ColorTrgmInfo *) palloc0(sizeof(ColorTrgmInfo) * arcsCount);
1482  trgmNFA->colorTrgms = colorTrgms;
1483 
1484  i = 0;
1485  hash_seq_init(&scan_status, trgmNFA->states);
1486  while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
1487  {
1488  ListCell *cell;
1489 
1490  foreach(cell, state->arcs)
1491  {
1492  TrgmArc *arc = (TrgmArc *) lfirst(cell);
1493  TrgmArcInfo *arcInfo = (TrgmArcInfo *) palloc(sizeof(TrgmArcInfo));
1494  ColorTrgmInfo *trgmInfo = &colorTrgms[i];
1495 
1496  arcInfo->source = state;
1497  arcInfo->target = arc->target;
1498  trgmInfo->ctrgm = arc->ctrgm;
1499  trgmInfo->cnumber = -1;
1500  /* count and penalty will be set below */
1501  trgmInfo->expanded = true;
1502  trgmInfo->arcs = list_make1(arcInfo);
1503  i++;
1504  }
1505  }
1506  Assert(i == arcsCount);
1507 
1508  /* Remove duplicates, merging their arcs lists */
1509  if (arcsCount >= 2)
1510  {
1511  ColorTrgmInfo *p1,
1512  *p2;
1513 
1514  /* Sort trigrams to ease duplicate detection */
1515  qsort(colorTrgms, arcsCount, sizeof(ColorTrgmInfo), colorTrgmInfoCmp);
1516 
1517  /* p1 is probe point, p2 is last known non-duplicate. */
1518  p2 = colorTrgms;
1519  for (p1 = colorTrgms + 1; p1 < colorTrgms + arcsCount; p1++)
1520  {
1521  if (colorTrgmInfoCmp(p1, p2) > 0)
1522  {
1523  p2++;
1524  *p2 = *p1;
1525  }
1526  else
1527  {
1528  p2->arcs = list_concat(p2->arcs, p1->arcs);
1529  }
1530  }
1531  trgmNFA->colorTrgmsCount = (p2 - colorTrgms) + 1;
1532  }
1533  else
1534  {
1535  trgmNFA->colorTrgmsCount = arcsCount;
1536  }
1537 
1538  /*
1539  * Count number of simple trigrams generated by each color trigram, and
1540  * also compute a penalty value, which is the number of simple trigrams
1541  * times a multiplier that depends on its whitespace content.
1542  *
1543  * Note: per-color-trigram counts cannot overflow an int so long as
1544  * COLOR_COUNT_LIMIT is not more than the cube root of INT_MAX, ie about
1545  * 1290. However, the grand total totalTrgmCount might conceivably
1546  * overflow an int, so we use int64 for that within this routine. Also,
1547  * penalties are calculated in float4 arithmetic to avoid any overflow
1548  * worries.
1549  */
1550  totalTrgmCount = 0;
1551  totalTrgmPenalty = 0.0f;
1552  for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1553  {
1554  ColorTrgmInfo *trgmInfo = &colorTrgms[i];
1555  int j,
1556  count = 1,
1557  typeIndex = 0;
1558 
1559  for (j = 0; j < 3; j++)
1560  {
1561  TrgmColor c = trgmInfo->ctrgm.colors[j];
1562 
1563  typeIndex *= 2;
1564  if (c == COLOR_BLANK)
1565  typeIndex++;
1566  else
1567  count *= trgmNFA->colorInfo[c].wordCharsCount;
1568  }
1569  trgmInfo->count = count;
1570  totalTrgmCount += count;
1571  trgmInfo->penalty = penalties[typeIndex] * (float4) count;
1572  totalTrgmPenalty += trgmInfo->penalty;
1573  }
1574 
1575  /* Sort color trigrams in descending order of their penalties */
1576  qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
1578 
1579  /*
1580  * Remove color trigrams from the graph so long as total penalty of color
1581  * trigrams exceeds WISH_TRGM_PENALTY. (If we fail to get down to
1582  * WISH_TRGM_PENALTY, it's OK so long as total count is no more than
1583  * MAX_TRGM_COUNT.) We prefer to remove color trigrams with higher
1584  * penalty, since those are the most promising for reducing the total
1585  * penalty. When removing a color trigram we have to merge states
1586  * connected by arcs labeled with that trigram. It's necessary to not
1587  * merge initial and final states, because our graph becomes useless if
1588  * that happens; so we cannot always remove the trigram we'd prefer to.
1589  */
1590  for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1591  {
1592  ColorTrgmInfo *trgmInfo = &colorTrgms[i];
1593  bool canRemove = true;
1594  ListCell *cell;
1595 
1596  /* Done if we've reached the target */
1597  if (totalTrgmPenalty <= WISH_TRGM_PENALTY)
1598  break;
1599 
1600 #ifdef TRGM_REGEXP_DEBUG
1601  fprintf(stderr, "considering ctrgm %d %d %d, penalty %f, %d arcs\n",
1602  trgmInfo->ctrgm.colors[0],
1603  trgmInfo->ctrgm.colors[1],
1604  trgmInfo->ctrgm.colors[2],
1605  trgmInfo->penalty,
1606  list_length(trgmInfo->arcs));
1607 #endif
1608 
1609  /*
1610  * Does any arc of this color trigram connect initial and final
1611  * states? If so we can't remove it.
1612  */
1613  foreach(cell, trgmInfo->arcs)
1614  {
1615  TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
1616  TrgmState *source = arcInfo->source,
1617  *target = arcInfo->target;
1618  int source_flags,
1619  target_flags;
1620 
1621 #ifdef TRGM_REGEXP_DEBUG
1622  fprintf(stderr, "examining arc to s%d (%x) from s%d (%x)\n",
1623  -target->snumber, target->flags,
1624  -source->snumber, source->flags);
1625 #endif
1626 
1627  /* examine parent states, if any merging has already happened */
1628  while (source->parent)
1629  source = source->parent;
1630  while (target->parent)
1631  target = target->parent;
1632 
1633 #ifdef TRGM_REGEXP_DEBUG
1634  fprintf(stderr, " ... after completed merges: to s%d (%x) from s%d (%x)\n",
1635  -target->snumber, target->flags,
1636  -source->snumber, source->flags);
1637 #endif
1638 
1639  /* we must also consider merges we are planning right now */
1640  source_flags = source->flags | source->tentFlags;
1641  while (source->tentParent)
1642  {
1643  source = source->tentParent;
1644  source_flags |= source->flags | source->tentFlags;
1645  }
1646  target_flags = target->flags | target->tentFlags;
1647  while (target->tentParent)
1648  {
1649  target = target->tentParent;
1650  target_flags |= target->flags | target->tentFlags;
1651  }
1652 
1653 #ifdef TRGM_REGEXP_DEBUG
1654  fprintf(stderr, " ... after tentative merges: to s%d (%x) from s%d (%x)\n",
1655  -target->snumber, target_flags,
1656  -source->snumber, source_flags);
1657 #endif
1658 
1659  /* would fully-merged state have both INIT and FIN set? */
1660  if (((source_flags | target_flags) & (TSTATE_INIT | TSTATE_FIN)) ==
1661  (TSTATE_INIT | TSTATE_FIN))
1662  {
1663  canRemove = false;
1664  break;
1665  }
1666 
1667  /* ok so far, so remember planned merge */
1668  if (source != target)
1669  {
1670 #ifdef TRGM_REGEXP_DEBUG
1671  fprintf(stderr, " ... tentatively merging s%d into s%d\n",
1672  -target->snumber, -source->snumber);
1673 #endif
1674  target->tentParent = source;
1675  source->tentFlags |= target_flags;
1676  }
1677  }
1678 
1679  /*
1680  * We must reset all the tentFlags/tentParent fields before
1681  * continuing. tentFlags could only have become set in states that
1682  * are the source or parent or tentative parent of one of the current
1683  * arcs; likewise tentParent could only have become set in states that
1684  * are the target or parent or tentative parent of one of the current
1685  * arcs. There might be some overlap between those sets, but if we
1686  * clear tentFlags in target states as well as source states, we
1687  * should be okay even if we visit a state as target before visiting
1688  * it as a source.
1689  */
1690  foreach(cell, trgmInfo->arcs)
1691  {
1692  TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
1693  TrgmState *source = arcInfo->source,
1694  *target = arcInfo->target;
1695  TrgmState *ttarget;
1696 
1697  /* no need to touch previously-merged states */
1698  while (source->parent)
1699  source = source->parent;
1700  while (target->parent)
1701  target = target->parent;
1702 
1703  while (source)
1704  {
1705  source->tentFlags = 0;
1706  source = source->tentParent;
1707  }
1708 
1709  while ((ttarget = target->tentParent) != NULL)
1710  {
1711  target->tentParent = NULL;
1712  target->tentFlags = 0; /* in case it was also a source */
1713  target = ttarget;
1714  }
1715  }
1716 
1717  /* Now, move on if we can't drop this trigram */
1718  if (!canRemove)
1719  {
1720 #ifdef TRGM_REGEXP_DEBUG
1721  fprintf(stderr, " ... not ok to merge\n");
1722 #endif
1723  continue;
1724  }
1725 
1726  /* OK, merge states linked by each arc labeled by the trigram */
1727  foreach(cell, trgmInfo->arcs)
1728  {
1729  TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
1730  TrgmState *source = arcInfo->source,
1731  *target = arcInfo->target;
1732 
1733  while (source->parent)
1734  source = source->parent;
1735  while (target->parent)
1736  target = target->parent;
1737  if (source != target)
1738  {
1739 #ifdef TRGM_REGEXP_DEBUG
1740  fprintf(stderr, "merging s%d into s%d\n",
1741  -target->snumber, -source->snumber);
1742 #endif
1743  mergeStates(source, target);
1744  /* Assert we didn't merge initial and final states */
1745  Assert((source->flags & (TSTATE_INIT | TSTATE_FIN)) !=
1746  (TSTATE_INIT | TSTATE_FIN));
1747  }
1748  }
1749 
1750  /* Mark trigram unexpanded, and update totals */
1751  trgmInfo->expanded = false;
1752  totalTrgmCount -= trgmInfo->count;
1753  totalTrgmPenalty -= trgmInfo->penalty;
1754  }
1755 
1756  /* Did we succeed in fitting into MAX_TRGM_COUNT? */
1757  if (totalTrgmCount > MAX_TRGM_COUNT)
1758  return false;
1759 
1760  trgmNFA->totalTrgmCount = (int) totalTrgmCount;
1761 
1762  /*
1763  * Sort color trigrams by colors (will be useful for bsearch in packGraph)
1764  * and enumerate the color trigrams that are expanded.
1765  */
1766  cnumber = 0;
1767  qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
1769  for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1770  {
1771  if (colorTrgms[i].expanded)
1772  {
1773  colorTrgms[i].cnumber = cnumber;
1774  cnumber++;
1775  }
1776  }
1777 
1778  return true;
1779 }
1780 
1781 /*
1782  * Expand selected color trigrams into regular trigrams.
1783  *
1784  * Returns the TRGM array to be passed to the index machinery.
1785  * The array must be allocated in rcontext.
1786  */
1787 static TRGM *
1789 {
1790  TRGM *trg;
1791  trgm *p;
1792  int i;
1793  TrgmColorInfo blankColor;
1794  trgm_mb_char blankChar;
1795 
1796  /* Set up "blank" color structure containing a single zero character */
1797  memset(blankChar.bytes, 0, sizeof(blankChar.bytes));
1798  blankColor.wordCharsCount = 1;
1799  blankColor.wordChars = &blankChar;
1800 
1801  /* Construct the trgm array */
1802  trg = (TRGM *)
1803  MemoryContextAllocZero(rcontext,
1804  TRGMHDRSIZE +
1805  trgmNFA->totalTrgmCount * sizeof(trgm));
1806  trg->flag = ARRKEY;
1807  SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, trgmNFA->totalTrgmCount));
1808  p = GETARR(trg);
1809  for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1810  {
1811  ColorTrgmInfo *colorTrgm = &trgmNFA->colorTrgms[i];
1812  TrgmColorInfo *c[3];
1813  trgm_mb_char s[3];
1814  int j,
1815  i1,
1816  i2,
1817  i3;
1818 
1819  /* Ignore any unexpanded trigrams ... */
1820  if (!colorTrgm->expanded)
1821  continue;
1822 
1823  /* Get colors, substituting the dummy struct for COLOR_BLANK */
1824  for (j = 0; j < 3; j++)
1825  {
1826  if (colorTrgm->ctrgm.colors[j] != COLOR_BLANK)
1827  c[j] = &trgmNFA->colorInfo[colorTrgm->ctrgm.colors[j]];
1828  else
1829  c[j] = &blankColor;
1830  }
1831 
1832  /* Iterate over all possible combinations of colors' characters */
1833  for (i1 = 0; i1 < c[0]->wordCharsCount; i1++)
1834  {
1835  s[0] = c[0]->wordChars[i1];
1836  for (i2 = 0; i2 < c[1]->wordCharsCount; i2++)
1837  {
1838  s[1] = c[1]->wordChars[i2];
1839  for (i3 = 0; i3 < c[2]->wordCharsCount; i3++)
1840  {
1841  s[2] = c[2]->wordChars[i3];
1842  fillTrgm(p, s);
1843  p++;
1844  }
1845  }
1846  }
1847  }
1848 
1849  return trg;
1850 }
1851 
1852 /*
1853  * Convert trigram into trgm datatype.
1854  */
1855 static void
1857 {
1858  char str[3 * MAX_MULTIBYTE_CHAR_LEN],
1859  *p;
1860  int i,
1861  j;
1862 
1863  /* Write multibyte string into "str" (we don't need null termination) */
1864  p = str;
1865 
1866  for (i = 0; i < 3; i++)
1867  {
1868  if (s[i].bytes[0] != 0)
1869  {
1870  for (j = 0; j < MAX_MULTIBYTE_CHAR_LEN && s[i].bytes[j]; j++)
1871  *p++ = s[i].bytes[j];
1872  }
1873  else
1874  {
1875  /* Emit a space in place of COLOR_BLANK */
1876  *p++ = ' ';
1877  }
1878  }
1879 
1880  /* Convert "str" to a standard trigram (possibly hashing it) */
1881  compact_trigram(ptrgm, str, p - str);
1882 }
1883 
1884 /*
1885  * Merge two states of graph.
1886  */
1887 static void
1889 {
1890  Assert(state1 != state2);
1891  Assert(!state1->parent);
1892  Assert(!state2->parent);
1893 
1894  /* state1 absorbs state2's flags */
1895  state1->flags |= state2->flags;
1896 
1897  /* state2, and indirectly all its children, become children of state1 */
1898  state2->parent = state1;
1899 }
1900 
1901 /*
1902  * Compare function for sorting of color trigrams by their colors.
1903  */
1904 static int
1905 colorTrgmInfoCmp(const void *p1, const void *p2)
1906 {
1907  const ColorTrgmInfo *c1 = (const ColorTrgmInfo *) p1;
1908  const ColorTrgmInfo *c2 = (const ColorTrgmInfo *) p2;
1909 
1910  return memcmp(&c1->ctrgm, &c2->ctrgm, sizeof(ColorTrgm));
1911 }
1912 
1913 /*
1914  * Compare function for sorting color trigrams in descending order of
1915  * their penalty fields.
1916  */
1917 static int
1918 colorTrgmInfoPenaltyCmp(const void *p1, const void *p2)
1919 {
1920  float4 penalty1 = ((const ColorTrgmInfo *) p1)->penalty;
1921  float4 penalty2 = ((const ColorTrgmInfo *) p2)->penalty;
1922 
1923  if (penalty1 < penalty2)
1924  return 1;
1925  else if (penalty1 == penalty2)
1926  return 0;
1927  else
1928  return -1;
1929 }
1930 
1931 
1932 /*---------------------
1933  * Subroutines for packing the graph into final representation (stage 4).
1934  *---------------------
1935  */
1936 
1937 /*
1938  * Pack expanded graph into final representation.
1939  *
1940  * The result data must be allocated in rcontext.
1941  */
1942 static TrgmPackedGraph *
1943 packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
1944 {
1945  int snumber = 2,
1946  arcIndex,
1947  arcsCount;
1948  HASH_SEQ_STATUS scan_status;
1949  TrgmState *state;
1950  TrgmPackArcInfo *arcs;
1951  TrgmPackedArc *packedArcs;
1952  TrgmPackedGraph *result;
1953  int i,
1954  j;
1955 
1956  /* Enumerate surviving states, giving init and fin reserved numbers */
1957  hash_seq_init(&scan_status, trgmNFA->states);
1958  while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
1959  {
1960  while (state->parent)
1961  state = state->parent;
1962 
1963  if (state->snumber < 0)
1964  {
1965  if (state->flags & TSTATE_INIT)
1966  state->snumber = 0;
1967  else if (state->flags & TSTATE_FIN)
1968  state->snumber = 1;
1969  else
1970  {
1971  state->snumber = snumber;
1972  snumber++;
1973  }
1974  }
1975  }
1976 
1977  /* Collect array of all arcs */
1978  arcs = (TrgmPackArcInfo *)
1979  palloc(sizeof(TrgmPackArcInfo) * trgmNFA->arcsCount);
1980  arcIndex = 0;
1981  hash_seq_init(&scan_status, trgmNFA->states);
1982  while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
1983  {
1984  TrgmState *source = state;
1985  ListCell *cell;
1986 
1987  while (source->parent)
1988  source = source->parent;
1989 
1990  foreach(cell, state->arcs)
1991  {
1992  TrgmArc *arc = (TrgmArc *) lfirst(cell);
1993  TrgmState *target = arc->target;
1994 
1995  while (target->parent)
1996  target = target->parent;
1997 
1998  if (source->snumber != target->snumber)
1999  {
2000  ColorTrgmInfo *ctrgm;
2001 
2002  ctrgm = (ColorTrgmInfo *) bsearch(&arc->ctrgm,
2003  trgmNFA->colorTrgms,
2004  trgmNFA->colorTrgmsCount,
2005  sizeof(ColorTrgmInfo),
2007  Assert(ctrgm != NULL);
2008  Assert(ctrgm->expanded);
2009 
2010  arcs[arcIndex].sourceState = source->snumber;
2011  arcs[arcIndex].targetState = target->snumber;
2012  arcs[arcIndex].colorTrgm = ctrgm->cnumber;
2013  arcIndex++;
2014  }
2015  }
2016  }
2017 
2018  /* Sort arcs to ease duplicate detection */
2019  qsort(arcs, arcIndex, sizeof(TrgmPackArcInfo), packArcInfoCmp);
2020 
2021  /* We could have duplicates because states were merged. Remove them. */
2022  if (arcIndex > 1)
2023  {
2024  /* p1 is probe point, p2 is last known non-duplicate. */
2025  TrgmPackArcInfo *p1,
2026  *p2;
2027 
2028  p2 = arcs;
2029  for (p1 = arcs + 1; p1 < arcs + arcIndex; p1++)
2030  {
2031  if (packArcInfoCmp(p1, p2) > 0)
2032  {
2033  p2++;
2034  *p2 = *p1;
2035  }
2036  }
2037  arcsCount = (p2 - arcs) + 1;
2038  }
2039  else
2040  arcsCount = arcIndex;
2041 
2042  /* Create packed representation */
2043  result = (TrgmPackedGraph *)
2044  MemoryContextAlloc(rcontext, sizeof(TrgmPackedGraph));
2045 
2046  /* Pack color trigrams information */
2047  result->colorTrigramsCount = 0;
2048  for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
2049  {
2050  if (trgmNFA->colorTrgms[i].expanded)
2051  result->colorTrigramsCount++;
2052  }
2053  result->colorTrigramGroups = (int *)
2054  MemoryContextAlloc(rcontext, sizeof(int) * result->colorTrigramsCount);
2055  j = 0;
2056  for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
2057  {
2058  if (trgmNFA->colorTrgms[i].expanded)
2059  {
2060  result->colorTrigramGroups[j] = trgmNFA->colorTrgms[i].count;
2061  j++;
2062  }
2063  }
2064 
2065  /* Pack states and arcs information */
2066  result->statesCount = snumber;
2067  result->states = (TrgmPackedState *)
2068  MemoryContextAlloc(rcontext, snumber * sizeof(TrgmPackedState));
2069  packedArcs = (TrgmPackedArc *)
2070  MemoryContextAlloc(rcontext, arcsCount * sizeof(TrgmPackedArc));
2071  j = 0;
2072  for (i = 0; i < snumber; i++)
2073  {
2074  int cnt = 0;
2075 
2076  result->states[i].arcs = &packedArcs[j];
2077  while (j < arcsCount && arcs[j].sourceState == i)
2078  {
2079  packedArcs[j].targetState = arcs[j].targetState;
2080  packedArcs[j].colorTrgm = arcs[j].colorTrgm;
2081  cnt++;
2082  j++;
2083  }
2084  result->states[i].arcsCount = cnt;
2085  }
2086 
2087  /* Allocate working memory for trigramsMatchGraph() */
2088  result->colorTrigramsActive = (bool *)
2089  MemoryContextAlloc(rcontext, sizeof(bool) * result->colorTrigramsCount);
2090  result->statesActive = (bool *)
2091  MemoryContextAlloc(rcontext, sizeof(bool) * result->statesCount);
2092  result->statesQueue = (int *)
2093  MemoryContextAlloc(rcontext, sizeof(int) * result->statesCount);
2094 
2095  return result;
2096 }
2097 
2098 /*
2099  * Comparison function for sorting TrgmPackArcInfos.
2100  *
2101  * Compares arcs in following order: sourceState, colorTrgm, targetState.
2102  */
2103 static int
2104 packArcInfoCmp(const void *a1, const void *a2)
2105 {
2106  const TrgmPackArcInfo *p1 = (const TrgmPackArcInfo *) a1;
2107  const TrgmPackArcInfo *p2 = (const TrgmPackArcInfo *) a2;
2108 
2109  if (p1->sourceState < p2->sourceState)
2110  return -1;
2111  if (p1->sourceState > p2->sourceState)
2112  return 1;
2113  if (p1->colorTrgm < p2->colorTrgm)
2114  return -1;
2115  if (p1->colorTrgm > p2->colorTrgm)
2116  return 1;
2117  if (p1->targetState < p2->targetState)
2118  return -1;
2119  if (p1->targetState > p2->targetState)
2120  return 1;
2121  return 0;
2122 }
2123 
2124 
2125 /*---------------------
2126  * Debugging functions
2127  *
2128  * These are designed to emit GraphViz files.
2129  *---------------------
2130  */
2131 
2132 #ifdef TRGM_REGEXP_DEBUG
2133 
2134 /*
2135  * Print initial NFA, in regexp library's representation
2136  */
2137 static void
2138 printSourceNFA(regex_t *regex, TrgmColorInfo *colors, int ncolors)
2139 {
2141  int nstates = pg_reg_getnumstates(regex);
2142  int state;
2143  int i;
2144 
2145  initStringInfo(&buf);
2146 
2147  appendStringInfoString(&buf, "\ndigraph sourceNFA {\n");
2148 
2149  for (state = 0; state < nstates; state++)
2150  {
2151  regex_arc_t *arcs;
2152  int i,
2153  arcsCount;
2154 
2155  appendStringInfo(&buf, "s%d", state);
2156  if (pg_reg_getfinalstate(regex) == state)
2157  appendStringInfoString(&buf, " [shape = doublecircle]");
2158  appendStringInfoString(&buf, ";\n");
2159 
2160  arcsCount = pg_reg_getnumoutarcs(regex, state);
2161  arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount);
2162  pg_reg_getoutarcs(regex, state, arcs, arcsCount);
2163 
2164  for (i = 0; i < arcsCount; i++)
2165  {
2166  appendStringInfo(&buf, " s%d -> s%d [label = \"%d\"];\n",
2167  state, arcs[i].to, arcs[i].co);
2168  }
2169 
2170  pfree(arcs);
2171  }
2172 
2173  appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
2174  appendStringInfo(&buf, " initial -> s%d;\n",
2175  pg_reg_getinitialstate(regex));
2176 
2177  /* Print colors */
2178  appendStringInfoString(&buf, " { rank = sink;\n");
2179  appendStringInfoString(&buf, " Colors [shape = none, margin=0, label=<\n");
2180 
2181  for (i = 0; i < ncolors; i++)
2182  {
2183  TrgmColorInfo *color = &colors[i];
2184  int j;
2185 
2186  appendStringInfo(&buf, "<br/>Color %d: ", i);
2187  if (color->expandable)
2188  {
2189  for (j = 0; j < color->wordCharsCount; j++)
2190  {
2191  char s[MAX_MULTIBYTE_CHAR_LEN + 1];
2192 
2193  memcpy(s, color->wordChars[j].bytes, MAX_MULTIBYTE_CHAR_LEN);
2194  s[MAX_MULTIBYTE_CHAR_LEN] = '\0';
2196  }
2197  }
2198  else
2199  appendStringInfoString(&buf, "not expandable");
2200  appendStringInfoChar(&buf, '\n');
2201  }
2202 
2203  appendStringInfoString(&buf, " >];\n");
2204  appendStringInfoString(&buf, " }\n");
2205  appendStringInfoString(&buf, "}\n");
2206 
2207  {
2208  /* dot -Tpng -o /tmp/source.png < /tmp/source.gv */
2209  FILE *fp = fopen("/tmp/source.gv", "w");
2210 
2211  fprintf(fp, "%s", buf.data);
2212  fclose(fp);
2213  }
2214 
2215  pfree(buf.data);
2216 }
2217 
2218 /*
2219  * Print expanded graph.
2220  */
2221 static void
2222 printTrgmNFA(TrgmNFA *trgmNFA)
2223 {
2225  HASH_SEQ_STATUS scan_status;
2226  TrgmState *state;
2227  TrgmState *initstate = NULL;
2228 
2229  initStringInfo(&buf);
2230 
2231  appendStringInfoString(&buf, "\ndigraph transformedNFA {\n");
2232 
2233  hash_seq_init(&scan_status, trgmNFA->states);
2234  while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
2235  {
2236  ListCell *cell;
2237 
2238  appendStringInfo(&buf, "s%d", -state->snumber);
2239  if (state->flags & TSTATE_FIN)
2240  appendStringInfoString(&buf, " [shape = doublecircle]");
2241  if (state->flags & TSTATE_INIT)
2242  initstate = state;
2243  appendStringInfo(&buf, " [label = \"%d\"]", state->stateKey.nstate);
2244  appendStringInfoString(&buf, ";\n");
2245 
2246  foreach(cell, state->arcs)
2247  {
2248  TrgmArc *arc = (TrgmArc *) lfirst(cell);
2249 
2250  appendStringInfo(&buf, " s%d -> s%d [label = \"",
2251  -state->snumber, -arc->target->snumber);
2252  printTrgmColor(&buf, arc->ctrgm.colors[0]);
2253  appendStringInfoChar(&buf, ' ');
2254  printTrgmColor(&buf, arc->ctrgm.colors[1]);
2255  appendStringInfoChar(&buf, ' ');
2256  printTrgmColor(&buf, arc->ctrgm.colors[2]);
2257  appendStringInfoString(&buf, "\"];\n");
2258  }
2259  }
2260 
2261  if (initstate)
2262  {
2263  appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
2264  appendStringInfo(&buf, " initial -> s%d;\n", -initstate->snumber);
2265  }
2266 
2267  appendStringInfoString(&buf, "}\n");
2268 
2269  {
2270  /* dot -Tpng -o /tmp/transformed.png < /tmp/transformed.gv */
2271  FILE *fp = fopen("/tmp/transformed.gv", "w");
2272 
2273  fprintf(fp, "%s", buf.data);
2274  fclose(fp);
2275  }
2276 
2277  pfree(buf.data);
2278 }
2279 
2280 /*
2281  * Print a TrgmColor readably.
2282  */
2283 static void
2284 printTrgmColor(StringInfo buf, TrgmColor co)
2285 {
2286  if (co == COLOR_UNKNOWN)
2287  appendStringInfoChar(buf, 'u');
2288  else if (co == COLOR_BLANK)
2289  appendStringInfoChar(buf, 'b');
2290  else
2291  appendStringInfo(buf, "%d", (int) co);
2292 }
2293 
2294 /*
2295  * Print final packed representation of trigram-based expanded graph.
2296  */
2297 static void
2298 printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams)
2299 {
2301  trgm *p;
2302  int i;
2303 
2304  initStringInfo(&buf);
2305 
2306  appendStringInfoString(&buf, "\ndigraph packedGraph {\n");
2307 
2308  for (i = 0; i < packedGraph->statesCount; i++)
2309  {
2310  TrgmPackedState *state = &packedGraph->states[i];
2311  int j;
2312 
2313  appendStringInfo(&buf, " s%d", i);
2314  if (i == 1)
2315  appendStringInfoString(&buf, " [shape = doublecircle]");
2316 
2317  appendStringInfo(&buf, " [label = <s%d>];\n", i);
2318 
2319  for (j = 0; j < state->arcsCount; j++)
2320  {
2321  TrgmPackedArc *arc = &state->arcs[j];
2322 
2323  appendStringInfo(&buf, " s%d -> s%d [label = \"trigram %d\"];\n",
2324  i, arc->targetState, arc->colorTrgm);
2325  }
2326  }
2327 
2328  appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
2329  appendStringInfo(&buf, " initial -> s%d;\n", 0);
2330 
2331  /* Print trigrams */
2332  appendStringInfoString(&buf, " { rank = sink;\n");
2333  appendStringInfoString(&buf, " Trigrams [shape = none, margin=0, label=<\n");
2334 
2335  p = GETARR(trigrams);
2336  for (i = 0; i < packedGraph->colorTrigramsCount; i++)
2337  {
2338  int count = packedGraph->colorTrigramGroups[i];
2339  int j;
2340 
2341  appendStringInfo(&buf, "<br/>Trigram %d: ", i);
2342 
2343  for (j = 0; j < count; j++)
2344  {
2345  if (j > 0)
2346  appendStringInfoString(&buf, ", ");
2347 
2348  /*
2349  * XXX This representation is nice only for all-ASCII trigrams.
2350  */
2351  appendStringInfo(&buf, "\"%c%c%c\"", (*p)[0], (*p)[1], (*p)[2]);
2352  p++;
2353  }
2354  }
2355 
2356  appendStringInfoString(&buf, " >];\n");
2357  appendStringInfoString(&buf, " }\n");
2358  appendStringInfoString(&buf, "}\n");
2359 
2360  {
2361  /* dot -Tpng -o /tmp/packed.png < /tmp/packed.gv */
2362  FILE *fp = fopen("/tmp/packed.gv", "w");
2363 
2364  fprintf(fp, "%s", buf.data);
2365  fclose(fp);
2366  }
2367 
2368  pfree(buf.data);
2369 }
2370 
2371 #endif /* TRGM_REGEXP_DEBUG */
float float4
Definition: c.h:613
#define MemSet(start, val, len)
Definition: c.h:1004
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:953
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:350
long hash_get_num_entries(HTAB *hashp)
Definition: dynahash.c:1377
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1431
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1421
int errcode(int sqlerrcode)
Definition: elog.c:858
int errmsg(const char *fmt,...)
Definition: elog.c:1069
#define PG_TRY(...)
Definition: elog.h:370
#define PG_END_TRY(...)
Definition: elog.h:395
#define ERROR
Definition: elog.h:39
#define PG_FINALLY(...)
Definition: elog.h:387
#define ereport(elevel,...)
Definition: elog.h:149
static const FormData_pg_attribute a1
Definition: heap.c:141
static const FormData_pg_attribute a2
Definition: heap.c:155
@ HASH_ENTER
Definition: hsearch.h:114
#define HASH_CONTEXT
Definition: hsearch.h:102
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
#define CALCGTSIZE(flag, siglen)
Definition: hstore_gist.c:59
int j
Definition: isn.c:74
int i
Definition: isn.c:73
Assert(fmt[strlen(fmt) - 1] !='\n')
List * lappend(List *list, void *datum)
Definition: list.c:338
void list_free(List *list)
Definition: list.c:1545
List * list_concat(List *list1, const List *list2)
Definition: list.c:560
unsigned int pg_wchar
Definition: mbprint.c:31
int pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
Definition: mbutils.c:1009
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
Definition: mbutils.c:987
void pfree(void *pointer)
Definition: mcxt.c:1436
void * palloc0(Size size)
Definition: mcxt.c:1241
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1048
MemoryContext CurrentMemoryContext
Definition: mcxt.c:135
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1005
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:387
void * palloc(Size size)
Definition: mcxt.c:1210
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:153
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:138
#define lfirst(lc)
Definition: pg_list.h:172
static int list_length(const List *l)
Definition: pg_list.h:152
#define NIL
Definition: pg_list.h:68
#define list_make1(x1)
Definition: pg_list.h:212
#define foreach_delete_current(lst, cell)
Definition: pg_list.h:390
static rewind_source * source
Definition: pg_rewind.c:87
static char * buf
Definition: pg_test_fsync.c:67
#define MAX_MULTIBYTE_CHAR_LEN
Definition: pg_wchar.h:32
#define fprintf
Definition: port.h:242
#define qsort(a, b, c, d)
Definition: port.h:445
unsigned int Oid
Definition: postgres_ext.h:31
char * c
int pg_regcomp(regex_t *re, const chr *string, size_t len, int flags, Oid collation)
Definition: regcomp.c:372
size_t pg_regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
Definition: regerror.c:60
#define REG_ICASE
Definition: regex.h:106
#define REG_ADVANCED
Definition: regex.h:103
#define REG_OKAY
Definition: regex.h:137
#define REG_NOSUB
Definition: regex.h:107
int pg_reg_getnumoutarcs(const regex_t *regex, int st)
Definition: regexport.c:134
int pg_reg_getnumcolors(const regex_t *regex)
Definition: regexport.c:174
int pg_reg_getnumstates(const regex_t *regex)
Definition: regexport.c:36
int pg_reg_getfinalstate(const regex_t *regex)
Definition: regexport.c:64
int pg_reg_getinitialstate(const regex_t *regex)
Definition: regexport.c:50
void pg_reg_getoutarcs(const regex_t *regex, int st, regex_arc_t *arcs, int arcs_len)
Definition: regexport.c:155
int pg_reg_colorisend(const regex_t *regex, int co)
Definition: regexport.c:208
void pg_reg_getcharacters(const regex_t *regex, int co, pg_wchar *chars, int chars_len)
Definition: regexport.c:266
int pg_reg_getnumcharacters(const regex_t *regex, int co)
Definition: regexport.c:230
int pg_reg_colorisbegin(const regex_t *regex, int co)
Definition: regexport.c:191
void pg_regfree(regex_t *re)
Definition: regfree.c:49
short color
Definition: regguts.h:150
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition: stringinfo.c:91
void appendStringInfoString(StringInfo str, const char *s)
Definition: stringinfo.c:176
void appendStringInfoChar(StringInfo str, char ch)
Definition: stringinfo.c:188
void initStringInfo(StringInfo str)
Definition: stringinfo.c:59
ColorTrgm ctrgm
Definition: trgm_regexp.c:375
float4 penalty
Definition: trgm_regexp.c:378
TrgmColor colors[3]
Definition: trgm_regexp.c:300
Size keysize
Definition: hsearch.h:75
Size entrysize
Definition: hsearch.h:76
MemoryContext hcxt
Definition: hsearch.h:86
Definition: dynahash.c:220
Definition: pg_list.h:54
Definition: trgm.h:67
uint8 flag
Definition: trgm.h:69
TrgmState * source
Definition: trgm_regexp.c:360
TrgmState * target
Definition: trgm_regexp.c:361
ColorTrgm ctrgm
Definition: trgm_regexp.c:349
TrgmState * target
Definition: trgm_regexp.c:350
trgm_mb_char * wordChars
Definition: trgm_regexp.c:264
bool containsNonWord
Definition: trgm_regexp.c:262
bool overflowed
Definition: trgm_regexp.c:416
int nstates
Definition: trgm_regexp.c:410
int arcsCount
Definition: trgm_regexp.c:415
ColorTrgmInfo * colorTrgms
Definition: trgm_regexp.c:419
List * queue
Definition: trgm_regexp.c:413
int colorTrgmsCount
Definition: trgm_regexp.c:420
TrgmColorInfo * colorInfo
Definition: trgm_regexp.c:404
int totalTrgmCount
Definition: trgm_regexp.c:421
List * keysQueue
Definition: trgm_regexp.c:414
int ncolors
Definition: trgm_regexp.c:405
TrgmState * initState
Definition: trgm_regexp.c:409
regex_t * regex
Definition: trgm_regexp.c:403
HTAB * states
Definition: trgm_regexp.c:408
TrgmPackedState * states
Definition: trgm_regexp.c:459
bool * statesActive
Definition: trgm_regexp.c:463
bool * colorTrigramsActive
Definition: trgm_regexp.c:462
int * colorTrigramGroups
Definition: trgm_regexp.c:452
TrgmPackedArc * arcs
Definition: trgm_regexp.c:436
TrgmColor colors[2]
Definition: trgm_regexp.c:291
TrgmPrefix prefix
Definition: trgm_regexp.c:310
List * arcs
Definition: trgm_regexp.c:335
List * enterKeys
Definition: trgm_regexp.c:336
struct TrgmState * parent
Definition: trgm_regexp.c:339
TrgmStateKey stateKey
Definition: trgm_regexp.c:334
int tentFlags
Definition: trgm_regexp.c:340
struct TrgmState * tentParent
Definition: trgm_regexp.c:341
Definition: regguts.h:291
color co
Definition: regguts.h:293
struct state * to
Definition: regguts.h:295
Definition: regex.h:56
Definition: regguts.h:318
char bytes[MAX_MULTIBYTE_CHAR_LEN]
Definition: trgm_regexp.c:243
Definition: c.h:671
#define ISWORDCHR(c)
Definition: trgm.h:55
void compact_trigram(trgm *tptr, char *str, int bytelen)
Definition: trgm_op.c:198
char trgm[3]
Definition: trgm.h:42
#define GETARR(x)
Definition: trgm.h:106
#define ARRKEY
Definition: trgm.h:96
#define TRGMHDRSIZE
Definition: trgm.h:73
int TrgmColor
Definition: trgm_regexp.c:283
static bool convertPgWchar(pg_wchar c, trgm_mb_char *result)
Definition: trgm_regexp.c:840
static bool validArcLabel(TrgmStateKey *key, TrgmColor co)
Definition: trgm_regexp.c:1341
static TrgmState * getState(TrgmNFA *trgmNFA, TrgmStateKey *key)
Definition: trgm_regexp.c:1396
static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
Definition: trgm_regexp.c:776
#define COLOR_BLANK
Definition: trgm_regexp.c:287
static TRGM * createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph, MemoryContext rcontext)
Definition: trgm_regexp.c:578
#define TSTATE_INIT
Definition: trgm_regexp.c:329
static void addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key, TrgmColor co, TrgmStateKey *destKey)
Definition: trgm_regexp.c:1299
#define MAX_EXPANDED_STATES
Definition: trgm_regexp.c:219
static int colorTrgmInfoPenaltyCmp(const void *p1, const void *p2)
Definition: trgm_regexp.c:1918
#define MAX_TRGM_COUNT
Definition: trgm_regexp.c:221
#define MAX_EXPANDED_ARCS
Definition: trgm_regexp.c:220
struct TrgmState TrgmState
static bool selectColorTrigrams(TrgmNFA *trgmNFA)
Definition: trgm_regexp.c:1469
static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3])
Definition: trgm_regexp.c:1856
static TRGM * expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext)
Definition: trgm_regexp.c:1788
static void transformGraph(TrgmNFA *trgmNFA)
Definition: trgm_regexp.c:906
bool trigramsMatchGraph(TrgmPackedGraph *graph, bool *check)
Definition: trgm_regexp.c:639
static const float4 penalties[8]
Definition: trgm_regexp.c:229
static void addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key)
Definition: trgm_regexp.c:1189
static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key)
Definition: trgm_regexp.c:1020
static TrgmPackedGraph * packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
Definition: trgm_regexp.c:1943
static int packArcInfoCmp(const void *a1, const void *a2)
Definition: trgm_regexp.c:2104
static void processState(TrgmNFA *trgmNFA, TrgmState *state)
Definition: trgm_regexp.c:970
TRGM * createTrgmNFA(text *text_re, Oid collation, TrgmPackedGraph **graph, MemoryContext rcontext)
Definition: trgm_regexp.c:522
#define COLOR_COUNT_LIMIT
Definition: trgm_regexp.c:223
#define TSTATE_FIN
Definition: trgm_regexp.c:330
#define WISH_TRGM_PENALTY
Definition: trgm_regexp.c:222
static void mergeStates(TrgmState *state1, TrgmState *state2)
Definition: trgm_regexp.c:1888
#define COLOR_UNKNOWN
Definition: trgm_regexp.c:286
static int colorTrgmInfoCmp(const void *p1, const void *p2)
Definition: trgm_regexp.c:1905
static void RE_compile(regex_t *regex, text *text_re, int cflags, Oid collation)
Definition: trgm_regexp.c:732
static void addArcs(TrgmNFA *trgmNFA, TrgmState *state)
Definition: trgm_regexp.c:1201
static bool prefixContains(TrgmPrefix *prefix1, TrgmPrefix *prefix2)
Definition: trgm_regexp.c:1427
char * lowerstr(const char *str)
Definition: ts_locale.c:255
#define VARDATA_ANY(PTR)
Definition: varatt.h:324
#define SET_VARSIZE(PTR, len)
Definition: varatt.h:305
#define VARSIZE_ANY_EXHDR(PTR)
Definition: varatt.h:317
static char chars[TZ_MAX_CHARS]
Definition: zic.c:401