PostgreSQL Source Code git master
Loading...
Searching...
No Matches
trgm_regexp.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * trgm_regexp.c
4 * Regular expression matching using trigrams.
5 *
6 * The general idea of trigram index support for a regular expression (regex)
7 * search is to transform the regex into a logical expression on trigrams.
8 * For example:
9 *
10 * (ab|cd)efg => ((abe & bef) | (cde & def)) & efg
11 *
12 * If a string matches the regex, then it must match the logical expression on
13 * trigrams. The opposite is not necessarily true, however: a string that
14 * matches the logical expression might not match the original regex. Such
15 * false positives are removed via recheck, by running the regular regex match
16 * operator on the retrieved heap tuple.
17 *
18 * Since the trigram expression involves both AND and OR operators, we can't
19 * expect the core index machinery to evaluate it completely. Instead, the
20 * result of regex analysis is a list of trigrams to be sought in the index,
21 * plus a simplified graph that is used by trigramsMatchGraph() to determine
22 * whether a particular indexed value matches the expression.
23 *
24 * Converting a regex to a trigram expression is based on analysis of an
25 * automaton corresponding to the regex. The algorithm consists of four
26 * stages:
27 *
28 * 1) Compile the regexp to NFA form. This is handled by the PostgreSQL
29 * regexp library, which provides accessors for its opaque regex_t struct
30 * to expose the NFA state graph and the "colors" (sets of equivalent
31 * characters) used as state transition labels.
32 *
33 * 2) Transform the original NFA into an expanded graph, where arcs
34 * are labeled with trigrams that must be present in order to move from
35 * one state to another via the arcs. The trigrams used in this stage
36 * consist of colors, not characters, as in the original NFA.
37 *
38 * 3) Expand the color trigrams into regular trigrams consisting of
39 * characters. If too many distinct trigrams are produced, trigrams are
40 * eliminated and the graph is simplified until it's simple enough.
41 *
42 * 4) Finally, the resulting graph is packed into a TrgmPackedGraph struct,
43 * and returned to the caller.
44 *
45 * 1) Compile the regexp to NFA form
46 * ---------------------------------
47 * The automaton returned by the regexp compiler is a graph where vertices
48 * are "states" and arcs are labeled with colors. Each color represents
49 * a set of characters, so that all characters assigned to the same color
50 * are interchangeable, so far as matching the regexp is concerned. There
51 * are two special states: "initial" and "final". A state can have multiple
52 * outgoing arcs labeled with the same color, which makes the automaton
53 * non-deterministic, because it can be in many states simultaneously.
54 *
55 * Note that this NFA is already lossy compared to the original regexp,
56 * since it ignores some regex features such as lookahead constraints and
57 * backref matching. This is OK for our purposes since it's still the case
58 * that only strings matching the NFA can possibly satisfy the regexp.
59 *
60 * 2) Transform the original NFA into an expanded graph
61 * ----------------------------------------------------
62 * In the 2nd stage, the automaton is transformed into a graph based on the
63 * original NFA. Each state in the expanded graph represents a state from
64 * the original NFA, plus a prefix identifying the last two characters
65 * (colors, to be precise) seen before entering the state. There can be
66 * multiple states in the expanded graph for each state in the original NFA,
67 * depending on what characters can precede it. A prefix position can be
68 * "unknown" if it's uncertain what the preceding character was, or "blank"
69 * if the character was a non-word character (we don't need to distinguish
70 * which non-word character it was, so just think of all of them as blanks).
71 *
72 * For convenience in description, call an expanded-state identifier
73 * (two prefix colors plus a state number from the original NFA) an
74 * "enter key".
75 *
76 * Each arc of the expanded graph is labeled with a trigram that must be
77 * present in the string to match. We can construct this from an out-arc of
78 * the underlying NFA state by combining the expanded state's prefix with the
79 * color label of the underlying out-arc, if neither prefix position is
80 * "unknown". But note that some of the colors in the trigram might be
81 * "blank". This is OK since we want to generate word-boundary trigrams as
82 * the regular trigram machinery would, if we know that some word characters
83 * must be adjacent to a word boundary in all strings matching the NFA.
84 *
85 * The expanded graph can also have fewer states than the original NFA,
86 * because we don't bother to make a separate state entry unless the state
87 * is reachable by a valid arc. When an enter key is reachable from a state
88 * of the expanded graph, but we do not know a complete trigram associated
89 * with that transition, we cannot make a valid arc; instead we insert the
90 * enter key into the enterKeys list of the source state. This effectively
91 * means that the two expanded states are not reliably distinguishable based
92 * on examining trigrams.
93 *
94 * So the expanded graph resembles the original NFA, but the arcs are
95 * labeled with trigrams instead of individual characters, and there may be
96 * more or fewer states. It is a lossy representation of the original NFA:
97 * any string that matches the original regexp must match the expanded graph,
98 * but the reverse is not true.
99 *
100 * We build the expanded graph through a breadth-first traversal of states
101 * reachable from the initial state. At each reachable state, we identify the
102 * states reachable from it without traversing a predictable trigram, and add
103 * those states' enter keys to the current state. Then we generate all
104 * out-arcs leading out of this collection of states that have predictable
105 * trigrams, adding their target states to the queue of states to examine.
106 *
107 * When building the graph, if the number of states or arcs exceed pre-defined
108 * limits, we give up and simply mark any states not yet processed as final
109 * states. Roughly speaking, that means that we make use of some portion from
110 * the beginning of the regexp. Also, any colors that have too many member
111 * characters are treated as "unknown", so that we can't derive trigrams
112 * from them.
113 *
114 * 3) Expand the color trigrams into regular trigrams
115 * --------------------------------------------------
116 * The trigrams in the expanded graph are "color trigrams", consisting
117 * of three consecutive colors that must be present in the string. But for
118 * search, we need regular trigrams consisting of characters. In the 3rd
119 * stage, the color trigrams are expanded into regular trigrams. Since each
120 * color can represent many characters, the total number of regular trigrams
121 * after expansion could be very large. Because searching the index for
122 * thousands of trigrams would be slow, and would likely produce so many
123 * false positives that we would have to traverse a large fraction of the
124 * index, the graph is simplified further in a lossy fashion by removing
125 * color trigrams. When a color trigram is removed, the states connected by
126 * any arcs labeled with that trigram are merged.
127 *
128 * Trigrams do not all have equivalent value for searching: some of them are
129 * more frequent and some of them are less frequent. Ideally, we would like
130 * to know the distribution of trigrams, but we don't. But because of padding
131 * we know for sure that the empty character is more frequent than others,
132 * so we can penalize trigrams according to presence of whitespace. The
133 * penalty assigned to each color trigram is the number of simple trigrams
134 * it would produce, times the penalties[] multiplier associated with its
135 * whitespace content. (The penalties[] constants were calculated by analysis
136 * of some real-life text.) We eliminate color trigrams starting with the
137 * highest-penalty one, until we get to a total penalty of no more than
138 * WISH_TRGM_PENALTY. However, we cannot remove a color trigram if that would
139 * lead to merging the initial and final states, so we may not be able to
140 * reach WISH_TRGM_PENALTY. It's still okay so long as we have no more than
141 * MAX_TRGM_COUNT simple trigrams in total, otherwise we fail.
142 *
143 * 4) Pack the graph into a compact representation
144 * -----------------------------------------------
145 * The 2nd and 3rd stages might have eliminated or merged many of the states
146 * and trigrams created earlier, so in this final stage, the graph is
147 * compacted and packed into a simpler struct that contains only the
148 * information needed to evaluate it.
149 *
150 * ALGORITHM EXAMPLE:
151 *
152 * Consider the example regex "ab[cd]". This regex is transformed into the
153 * following NFA (for simplicity we show colors as their single members):
154 *
155 * 4#
156 * c/
157 * a b /
158 * 1* --- 2 ---- 3
159 * \
160 * d\
161 * 5#
162 *
163 * We use * to mark initial state and # to mark final state. It's not depicted,
164 * but states 1, 4, 5 have self-referencing arcs for all possible characters,
165 * because this pattern can match to any part of a string.
166 *
167 * As the result of stage 2 we will have the following graph:
168 *
169 * abc abd
170 * 2# <---- 1* ----> 3#
171 *
172 * The process for generating this graph is:
173 * 1) Create state 1 with enter key (UNKNOWN, UNKNOWN, 1).
174 * 2) Add key (UNKNOWN, "a", 2) to state 1.
175 * 3) Add key ("a", "b", 3) to state 1.
176 * 4) Create new state 2 with enter key ("b", "c", 4). Add an arc
177 * from state 1 to state 2 with label trigram "abc".
178 * 5) Mark state 2 final because state 4 of source NFA is marked as final.
179 * 6) Create new state 3 with enter key ("b", "d", 5). Add an arc
180 * from state 1 to state 3 with label trigram "abd".
181 * 7) Mark state 3 final because state 5 of source NFA is marked as final.
182 *
183 *
184 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
185 * Portions Copyright (c) 1994, Regents of the University of California
186 *
187 * IDENTIFICATION
188 * contrib/pg_trgm/trgm_regexp.c
189 *
190 *-------------------------------------------------------------------------
191 */
192#include "postgres.h"
193
194#include "catalog/pg_collation_d.h"
195#include "regex/regexport.h"
196#include "trgm.h"
197#include "tsearch/ts_locale.h"
198#include "utils/formatting.h"
199#include "utils/hsearch.h"
200#include "utils/memutils.h"
201#include "varatt.h"
202
203/*
204 * Uncomment (or use -DTRGM_REGEXP_DEBUG) to print debug info,
205 * for exploring and debugging the algorithm implementation.
206 * This produces three graph files in /tmp, in Graphviz .gv format.
207 * Some progress information is also printed to postmaster stderr.
208 */
209/* #define TRGM_REGEXP_DEBUG */
210
211/*
212 * These parameters are used to limit the amount of work done.
213 * Otherwise regex processing could be too slow and memory-consuming.
214 *
215 * MAX_EXPANDED_STATES - How many states we allow in expanded graph
216 * MAX_EXPANDED_ARCS - How many arcs we allow in expanded graph
217 * MAX_TRGM_COUNT - How many simple trigrams we allow to be extracted
218 * WISH_TRGM_PENALTY - Maximum desired sum of color trigram penalties
219 * COLOR_COUNT_LIMIT - Maximum number of characters per color
220 */
221#define MAX_EXPANDED_STATES 128
222#define MAX_EXPANDED_ARCS 1024
223#define MAX_TRGM_COUNT 256
224#define WISH_TRGM_PENALTY 16
225#define COLOR_COUNT_LIMIT 256
226
227/*
228 * Penalty multipliers for trigram counts depending on whitespace contents.
229 * Numbers based on analysis of real-life texts.
230 */
231static const float4 penalties[8] = {
232 1.0f, /* "aaa" */
233 3.5f, /* "aa " */
234 0.0f, /* "a a" (impossible) */
235 0.0f, /* "a " (impossible) */
236 4.2f, /* " aa" */
237 2.1f, /* " a " */
238 25.0f, /* " a" */
239 0.0f /* " " (impossible) */
240};
241
242/* Struct representing a single pg_wchar, converted back to multibyte form */
243typedef struct
244{
247
248/*
249 * Attributes of NFA colors:
250 *
251 * expandable - we know the character expansion of this color
252 * containsNonWord - color contains non-word characters
253 * (which will not be extracted into trigrams)
254 * wordCharsCount - count of word characters in color
255 * wordChars - array of this color's word characters
256 * (which can be extracted into trigrams)
257 *
258 * When expandable is false, the other attributes don't matter; we just
259 * assume this color represents unknown character(s).
260 */
268
269/*
270 * A "prefix" is information about the colors of the last two characters read
271 * before reaching a specific NFA state. These colors can have special values
272 * COLOR_UNKNOWN and COLOR_BLANK. COLOR_UNKNOWN means that we have no
273 * information, for example because we read some character of an unexpandable
274 * color. COLOR_BLANK means that we read a non-word character.
275 *
276 * We call a prefix ambiguous if at least one of its colors is unknown. It's
277 * fully ambiguous if both are unknown, partially ambiguous if only the first
278 * is unknown. (The case of first color known, second unknown is not valid.)
279 *
280 * Wholly- or partly-blank prefixes are mostly handled the same as regular
281 * color prefixes. This allows us to generate appropriate partly-blank
282 * trigrams when the NFA requires word character(s) to appear adjacent to
283 * non-word character(s).
284 */
285typedef int TrgmColor;
286
287/* We assume that colors returned by the regexp engine cannot be these: */
288#define COLOR_UNKNOWN (-3)
289#define COLOR_BLANK (-4)
290
291typedef struct
292{
293 TrgmColor colors[2];
294} TrgmPrefix;
295
296/*
297 * Color-trigram data type. Note that some elements of the trigram can be
298 * COLOR_BLANK, but we don't allow COLOR_UNKNOWN.
299 */
300typedef struct
301{
302 TrgmColor colors[3];
303} ColorTrgm;
304
305/*
306 * Key identifying a state of our expanded graph: color prefix, and number
307 * of the corresponding state in the underlying regex NFA. The color prefix
308 * shows how we reached the regex state (to the extent that we know it).
309 */
310typedef struct
311{
315
316/*
317 * One state of the expanded graph.
318 *
319 * stateKey - ID of this state
320 * arcs - outgoing arcs of this state (List of TrgmArc)
321 * enterKeys - enter keys reachable from this state without reading any
322 * predictable trigram (List of TrgmStateKey)
323 * flags - flag bits
324 * snumber - number of this state (initially assigned as -1, -2, etc,
325 * for debugging purposes only; then at the packaging stage,
326 * surviving states are renumbered with positive numbers)
327 * parent - parent state, if this state has been merged into another
328 * tentFlags - flags this state would acquire via planned merges
329 * tentParent - planned parent state, if considering a merge
330 */
331#define TSTATE_INIT 0x01 /* flag indicating this state is initial */
332#define TSTATE_FIN 0x02 /* flag indicating this state is final */
333
334typedef struct TrgmState
335{
336 TrgmStateKey stateKey; /* hashtable key: must be first field */
339 int flags;
345
346/*
347 * One arc in the expanded graph.
348 */
349typedef struct
350{
351 ColorTrgm ctrgm; /* trigram needed to traverse arc */
352 TrgmState *target; /* next state */
353} TrgmArc;
354
355/*
356 * Information about arc of specific color trigram (used in stage 3)
357 *
358 * Contains pointers to the source and target states.
359 */
365
366/*
367 * Information about color trigram (used in stage 3)
368 *
369 * ctrgm - trigram itself
370 * cnumber - number of this trigram (used in the packaging stage)
371 * count - number of simple trigrams created from this color trigram
372 * expanded - indicates this color trigram is expanded into simple trigrams
373 * arcs - list of all arcs labeled with this color trigram.
374 */
384
385/*
386 * Data structure representing all the data we need during regex processing.
387 *
388 * regex - compiled regex
389 * colorInfo - extracted information about regex's colors
390 * ncolors - number of colors in colorInfo[]
391 * states - hashtable of TrgmStates (states of expanded graph)
392 * initState - pointer to initial state of expanded graph
393 * queue - queue of to-be-processed TrgmStates
394 * keysQueue - queue of to-be-processed TrgmStateKeys
395 * arcsCount - total number of arcs of expanded graph (for resource
396 * limiting)
397 * overflowed - we have exceeded resource limit for transformation
398 * colorTrgms - array of all color trigrams present in graph
399 * colorTrgmsCount - count of those color trigrams
400 * totalTrgmCount - total count of extracted simple trigrams
401 */
402typedef struct
403{
404 /* Source regexp, and color information extracted from it (stage 1) */
408
409 /* Expanded graph (stage 2) */
413
414 /* Workspace for stage 2 */
419
420 /* Information about distinct color trigrams in the graph (stage 3) */
424} TrgmNFA;
425
426/*
427 * Final, compact representation of expanded graph.
428 */
429typedef struct
430{
431 int targetState; /* index of target state (zero-based) */
432 int colorTrgm; /* index of color trigram for transition */
434
435typedef struct
436{
437 int arcsCount; /* number of out-arcs for this state */
438 TrgmPackedArc *arcs; /* array of arcsCount packed arcs */
440
441/* "typedef struct TrgmPackedGraph TrgmPackedGraph" appears in trgm.h */
443{
444 /*
445 * colorTrigramsCount and colorTrigramGroups contain information about how
446 * trigrams are grouped into color trigrams. "colorTrigramsCount" is the
447 * count of color trigrams and "colorTrigramGroups" contains number of
448 * simple trigrams for each color trigram. The array of simple trigrams
449 * (stored separately from this struct) is ordered so that the simple
450 * trigrams for each color trigram are consecutive, and they're in order
451 * by color trigram number.
452 */
454 int *colorTrigramGroups; /* array of size colorTrigramsCount */
455
456 /*
457 * The states of the simplified NFA. State number 0 is always initial
458 * state and state number 1 is always final state.
459 */
461 TrgmPackedState *states; /* array of size statesCount */
462
463 /* Temporary work space for trigramsMatchGraph() */
464 bool *colorTrigramsActive; /* array of size colorTrigramsCount */
465 bool *statesActive; /* array of size statesCount */
466 int *statesQueue; /* array of size statesCount */
467};
468
469/*
470 * Temporary structure for representing an arc during packaging.
471 */
472typedef struct
473{
478
479
480/* prototypes for private functions */
481static TRGM *createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
483static void RE_compile(regex_t *regex, text *text_re,
484 int cflags, Oid collation);
485static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA);
486static int convertPgWchar(pg_wchar c, trgm_mb_char *result);
487static void transformGraph(TrgmNFA *trgmNFA);
489static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key);
490static void addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key);
491static void addArcs(TrgmNFA *trgmNFA, TrgmState *state);
492static void addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key,
494static bool validArcLabel(TrgmStateKey *key, TrgmColor co);
499static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3]);
501static int colorTrgmInfoCmp(const void *p1, const void *p2);
502static int colorTrgmInfoPenaltyCmp(const void *p1, const void *p2);
504static int packArcInfoCmp(const void *a1, const void *a2);
505
506#ifdef TRGM_REGEXP_DEBUG
507static void printSourceNFA(regex_t *regex, TrgmColorInfo *colors, int ncolors);
508static void printTrgmNFA(TrgmNFA *trgmNFA);
509static void printTrgmColor(StringInfo buf, TrgmColor co);
511#endif
512
513
514/*
515 * Main entry point to process a regular expression.
516 *
517 * Returns an array of trigrams required by the regular expression, or NULL if
518 * the regular expression was too complex to analyze. In addition, a packed
519 * graph representation of the regex is returned into *graph. The results
520 * must be allocated in rcontext (which might or might not be the current
521 * context).
522 */
523TRGM *
526{
527 TRGM *trg;
528 regex_t regex;
529 MemoryContext tmpcontext;
530 MemoryContext oldcontext;
531
532 /*
533 * This processing generates a great deal of cruft, which we'd like to
534 * clean up before returning (since this function may be called in a
535 * query-lifespan memory context). Make a temp context we can work in so
536 * that cleanup is easy.
537 */
539 "createTrgmNFA temporary context",
541 oldcontext = MemoryContextSwitchTo(tmpcontext);
542
543 /*
544 * Stage 1: Compile the regexp into a NFA, using the regexp library.
545 */
546#ifdef IGNORECASE
547 RE_compile(&regex, text_re,
548 REG_ADVANCED | REG_NOSUB | REG_ICASE, collation);
549#else
550 RE_compile(&regex, text_re,
551 REG_ADVANCED | REG_NOSUB, collation);
552#endif
553
554 trg = createTrgmNFAInternal(&regex, graph, rcontext);
555
556 /* Clean up all the cruft we created (including regex) */
557 MemoryContextSwitchTo(oldcontext);
558 MemoryContextDelete(tmpcontext);
559
560 return trg;
561}
562
563/*
564 * Body of createTrgmNFA, exclusive of regex compilation/freeing.
565 */
566static TRGM *
569{
570 TRGM *trg;
572
573 trgmNFA.regex = regex;
574
575 /* Collect color information from the regex */
576 getColorInfo(regex, &trgmNFA);
577
578#ifdef TRGM_REGEXP_DEBUG
579 printSourceNFA(regex, trgmNFA.colorInfo, trgmNFA.ncolors);
580#endif
581
582 /*
583 * Stage 2: Create an expanded graph from the source NFA.
584 */
586
587#ifdef TRGM_REGEXP_DEBUG
589#endif
590
591 /*
592 * Fail if we were unable to make a nontrivial graph, ie it is possible to
593 * get from the initial state to the final state without reading any
594 * predictable trigram.
595 */
596 if (trgmNFA.initState->flags & TSTATE_FIN)
597 return NULL;
598
599 /*
600 * Stage 3: Select color trigrams to expand. Fail if too many trigrams.
601 */
603 return NULL;
604
605 /*
606 * Stage 4: Expand color trigrams and pack graph into final
607 * representation.
608 */
610
611 *graph = packGraph(&trgmNFA, rcontext);
612
613#ifdef TRGM_REGEXP_DEBUG
614 printTrgmPackedGraph(*graph, trg);
615#endif
616
617 return trg;
618}
619
620/*
621 * Main entry point for evaluating a graph during index scanning.
622 *
623 * The check[] array is indexed by trigram number (in the array of simple
624 * trigrams returned by createTrgmNFA), and holds true for those trigrams
625 * that are present in the index entry being checked.
626 */
627bool
629{
630 int i,
631 j,
632 k,
633 queueIn,
634 queueOut;
635
636 /*
637 * Reset temporary working areas.
638 */
639 memset(graph->colorTrigramsActive, 0,
640 sizeof(bool) * graph->colorTrigramsCount);
641 memset(graph->statesActive, 0, sizeof(bool) * graph->statesCount);
642
643 /*
644 * Check which color trigrams were matched. A match for any simple
645 * trigram associated with a color trigram counts as a match of the color
646 * trigram.
647 */
648 j = 0;
649 for (i = 0; i < graph->colorTrigramsCount; i++)
650 {
651 int cnt = graph->colorTrigramGroups[i];
652
653 for (k = j; k < j + cnt; k++)
654 {
655 if (check[k])
656 {
657 /*
658 * Found one matched trigram in the group. Can skip the rest
659 * of them and go to the next group.
660 */
661 graph->colorTrigramsActive[i] = true;
662 break;
663 }
664 }
665 j = j + cnt;
666 }
667
668 /*
669 * Initialize the statesQueue to hold just the initial state. Note:
670 * statesQueue has room for statesCount entries, which is certainly enough
671 * since no state will be put in the queue more than once. The
672 * statesActive array marks which states have been queued.
673 */
674 graph->statesActive[0] = true;
675 graph->statesQueue[0] = 0;
676 queueIn = 0;
677 queueOut = 1;
678
679 /* Process queued states as long as there are any. */
680 while (queueIn < queueOut)
681 {
682 int stateno = graph->statesQueue[queueIn++];
684 int cnt = state->arcsCount;
685
686 /* Loop over state's out-arcs */
687 for (i = 0; i < cnt; i++)
688 {
689 TrgmPackedArc *arc = &state->arcs[i];
690
691 /*
692 * If corresponding color trigram is present then activate the
693 * corresponding state. We're done if that's the final state,
694 * otherwise queue the state if it's not been queued already.
695 */
696 if (graph->colorTrigramsActive[arc->colorTrgm])
697 {
698 int nextstate = arc->targetState;
699
700 if (nextstate == 1)
701 return true; /* success: final state is reachable */
702
703 if (!graph->statesActive[nextstate])
704 {
705 graph->statesActive[nextstate] = true;
706 graph->statesQueue[queueOut++] = nextstate;
707 }
708 }
709 }
710 }
711
712 /* Queue is empty, so match fails. */
713 return false;
714}
715
716/*
717 * Compile regex string into struct at *regex.
718 * NB: pg_regfree must be applied to regex if this completes successfully.
719 */
720static void
721RE_compile(regex_t *regex, text *text_re, int cflags, Oid collation)
722{
725 pg_wchar *pattern;
726 int pattern_len;
727 int regcomp_result;
728 char errMsg[100];
729
730 /* Convert pattern string to wide characters */
731 pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar));
733 pattern,
735
736 /* Compile regex */
738 pattern,
740 cflags,
741 collation);
742
743 pfree(pattern);
744
746 {
747 /* re didn't compile (no need for pg_regfree, if so) */
748 pg_regerror(regcomp_result, regex, errMsg, sizeof(errMsg));
751 errmsg("invalid regular expression: %s", errMsg)));
752 }
753}
754
755
756/*---------------------
757 * Subroutines for pre-processing the color map (stage 1).
758 *---------------------
759 */
760
761/*
762 * Fill TrgmColorInfo structure for each color using regex export functions.
763 */
764static void
766{
767 int colorsCount = pg_reg_getnumcolors(regex);
768 int i;
769
770 trgmNFA->ncolors = colorsCount;
771 trgmNFA->colorInfo = (TrgmColorInfo *)
773
774 /*
775 * Loop over colors, filling TrgmColorInfo about each. Note we include
776 * WHITE (0) even though we know it'll be reported as non-expandable.
777 */
778 for (i = 0; i < colorsCount; i++)
779 {
780 TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[i];
783 int j;
784
786 {
787 /* Non expandable, or too large to work with */
788 colorInfo->expandable = false;
789 continue;
790 }
791
792 colorInfo->expandable = true;
793 colorInfo->containsNonWord = false;
795 colorInfo->wordCharsCount = 0;
796
797 /* Extract all the chars in this color */
800
801 /*
802 * Convert characters back to multibyte form, and save only those that
803 * are word characters. Set "containsNonWord" if any non-word
804 * character. (Note: it'd probably be nicer to keep the chars in
805 * pg_wchar format for now, but ISWORDCHR wants to see multibyte.)
806 */
807 for (j = 0; j < charsCount; j++)
808 {
810 int clen = convertPgWchar(chars[j], &c);
811
812 if (!clen)
813 continue; /* ok to ignore it altogether */
814 if (ISWORDCHR(c.bytes, clen))
815 colorInfo->wordChars[colorInfo->wordCharsCount++] = c;
816 else
817 colorInfo->containsNonWord = true;
818 }
819
820 pfree(chars);
821 }
822}
823
824/*
825 * Convert pg_wchar to multibyte format.
826 * Returns 0 if the character should be ignored completely, else returns its
827 * byte length.
828 */
829static int
831{
832 /* "s" has enough space for a multibyte character and a trailing NUL */
833 char s[MAX_MULTIBYTE_CHAR_LEN + 1];
834 int clen;
835
836 /*
837 * We can ignore the NUL character, since it can never appear in a PG text
838 * string. This avoids the need for various special cases when
839 * reconstructing trigrams.
840 */
841 if (c == 0)
842 return 0;
843
844 /* Do the conversion, making sure the result is NUL-terminated */
845 memset(s, 0, sizeof(s));
846 clen = pg_wchar2mb_with_len(&c, s, 1);
847
848 /*
849 * In IGNORECASE mode, we can ignore uppercase characters. We assume that
850 * the regex engine generated both uppercase and lowercase equivalents
851 * within each color, since we used the REG_ICASE option; so there's no
852 * need to process the uppercase version.
853 *
854 * XXX this code is dependent on the assumption that str_tolower() works
855 * the same as the regex engine's internal case folding machinery. Might
856 * be wiser to expose pg_wc_tolower and test whether c ==
857 * pg_wc_tolower(c). On the other hand, the trigrams in the index were
858 * created using str_tolower(), so we're probably screwed if there's any
859 * incompatibility anyway.
860 */
861#ifdef IGNORECASE
862 {
864
865 if (strcmp(lowerCased, s) != 0)
866 {
868 return 0;
869 }
871 }
872#endif
873
874 /* Fill result with exactly MAX_MULTIBYTE_CHAR_LEN bytes */
876 return clen;
877}
878
879
880/*---------------------
881 * Subroutines for expanding original NFA graph into a trigram graph (stage 2).
882 *---------------------
883 */
884
885/*
886 * Transform the graph, given a regex and extracted color information.
887 *
888 * We create and process a queue of expanded-graph states until all the states
889 * are processed.
890 *
891 * This algorithm may be stopped due to resource limitation. In this case we
892 * force every unprocessed branch to immediately finish with matching (this
893 * can give us false positives but no false negatives) by marking all
894 * unprocessed states as final.
895 */
896static void
898{
901 TrgmState *initstate;
902 ListCell *lc;
903
904 /* Initialize this stage's workspace in trgmNFA struct */
905 trgmNFA->queue = NIL;
906 trgmNFA->keysQueue = NIL;
907 trgmNFA->arcsCount = 0;
908 trgmNFA->overflowed = false;
909
910 /* Create hashtable for states */
911 hashCtl.keysize = sizeof(TrgmStateKey);
912 hashCtl.entrysize = sizeof(TrgmState);
914 trgmNFA->states = hash_create("Trigram NFA",
915 1024,
916 &hashCtl,
918 trgmNFA->nstates = 0;
919
920 /* Create initial state: ambiguous prefix, NFA's initial state */
921 MemSet(&initkey, 0, sizeof(initkey));
922 initkey.prefix.colors[0] = COLOR_UNKNOWN;
923 initkey.prefix.colors[1] = COLOR_UNKNOWN;
924 initkey.nstate = pg_reg_getinitialstate(trgmNFA->regex);
925
926 initstate = getState(trgmNFA, &initkey);
927 initstate->flags |= TSTATE_INIT;
928 trgmNFA->initState = initstate;
929
930 /*
931 * Recursively build the expanded graph by processing queue of states
932 * (breadth-first search). getState already put initstate in the queue.
933 * Note that getState will append new states to the queue within the loop,
934 * too; this works as long as we don't do repeat fetches using the "lc"
935 * pointer.
936 */
937 foreach(lc, trgmNFA->queue)
938 {
940
941 /*
942 * If we overflowed then just mark state as final. Otherwise do
943 * actual processing.
944 */
945 if (trgmNFA->overflowed)
946 state->flags |= TSTATE_FIN;
947 else
949
950 /* Did we overflow? */
951 if (trgmNFA->arcsCount > MAX_EXPANDED_ARCS ||
953 trgmNFA->overflowed = true;
954 }
955}
956
957/*
958 * Process one state: add enter keys and then add outgoing arcs.
959 */
960static void
962{
963 ListCell *lc;
964
965 /* keysQueue should be NIL already, but make sure */
966 trgmNFA->keysQueue = NIL;
967
968 /*
969 * Add state's own key, and then process all keys added to keysQueue until
970 * queue is finished. But we can quit if the state gets marked final.
971 */
972 addKey(trgmNFA, state, &state->stateKey);
973 foreach(lc, trgmNFA->keysQueue)
974 {
976
977 if (state->flags & TSTATE_FIN)
978 break;
979 addKey(trgmNFA, state, key);
980 }
981
982 /* Release keysQueue to clean up for next cycle */
983 list_free(trgmNFA->keysQueue);
984 trgmNFA->keysQueue = NIL;
985
986 /*
987 * Add outgoing arcs only if state isn't final (we have no interest in
988 * outgoing arcs if we already match)
989 */
990 if (!(state->flags & TSTATE_FIN))
992}
993
994/*
995 * Add the given enter key into the state's enterKeys list, and determine
996 * whether this should result in any further enter keys being added.
997 * If so, add those keys to keysQueue so that processState will handle them.
998 *
999 * If the enter key is for the NFA's final state, mark state as TSTATE_FIN.
1000 * This situation means that we can reach the final state from this expanded
1001 * state without reading any predictable trigram, so we must consider this
1002 * state as an accepting one.
1003 *
1004 * The given key could be a duplicate of one already in enterKeys, or be
1005 * redundant with some enterKeys. So we check that before doing anything.
1006 *
1007 * Note that we don't generate any actual arcs here. addArcs will do that
1008 * later, after we have identified all the enter keys for this state.
1009 */
1010static void
1012{
1013 regex_arc_t *arcs;
1015 ListCell *cell;
1016 int i,
1017 arcsCount;
1018
1019 /*
1020 * Ensure any pad bytes in destKey are zero, since it may get used as a
1021 * hashtable key by getState.
1022 */
1023 MemSet(&destKey, 0, sizeof(destKey));
1024
1025 /*
1026 * Compare key to each existing enter key of the state to check for
1027 * redundancy. We can drop either old key(s) or the new key if we find
1028 * redundancy.
1029 */
1030 foreach(cell, state->enterKeys)
1031 {
1033
1034 if (existingKey->nstate == key->nstate)
1035 {
1036 if (prefixContains(&existingKey->prefix, &key->prefix))
1037 {
1038 /* This old key already covers the new key. Nothing to do */
1039 return;
1040 }
1041 if (prefixContains(&key->prefix, &existingKey->prefix))
1042 {
1043 /*
1044 * The new key covers this old key. Remove the old key, it's
1045 * no longer needed once we add this key to the list.
1046 */
1047 state->enterKeys = foreach_delete_current(state->enterKeys,
1048 cell);
1049 }
1050 }
1051 }
1052
1053 /* No redundancy, so add this key to the state's list */
1054 state->enterKeys = lappend(state->enterKeys, key);
1055
1056 /* If state is now known final, mark it and we're done */
1057 if (key->nstate == pg_reg_getfinalstate(trgmNFA->regex))
1058 {
1059 state->flags |= TSTATE_FIN;
1060 return;
1061 }
1062
1063 /*
1064 * Loop through all outgoing arcs of the corresponding state in the
1065 * original NFA.
1066 */
1067 arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate);
1068 arcs = palloc_array(regex_arc_t, arcsCount);
1069 pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount);
1070
1071 for (i = 0; i < arcsCount; i++)
1072 {
1073 regex_arc_t *arc = &arcs[i];
1074
1075 if (pg_reg_colorisbegin(trgmNFA->regex, arc->co))
1076 {
1077 /*
1078 * Start of line/string (^). Trigram extraction treats start of
1079 * line same as start of word: double space prefix is added.
1080 * Hence, make an enter key showing we can reach the arc
1081 * destination with all-blank prefix.
1082 */
1083 destKey.prefix.colors[0] = COLOR_BLANK;
1084 destKey.prefix.colors[1] = COLOR_BLANK;
1085 destKey.nstate = arc->to;
1086
1087 /* Add enter key to this state */
1089 }
1090 else if (pg_reg_colorisend(trgmNFA->regex, arc->co))
1091 {
1092 /*
1093 * End of line/string ($). We must consider this arc as a
1094 * transition that doesn't read anything. The reason for adding
1095 * this enter key to the state is that if the arc leads to the
1096 * NFA's final state, we must mark this expanded state as final.
1097 */
1098 destKey.prefix.colors[0] = COLOR_UNKNOWN;
1099 destKey.prefix.colors[1] = COLOR_UNKNOWN;
1100 destKey.nstate = arc->to;
1101
1102 /* Add enter key to this state */
1104 }
1105 else if (arc->co >= 0)
1106 {
1107 /* Regular color (including WHITE) */
1108 TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[arc->co];
1109
1110 if (colorInfo->expandable)
1111 {
1112 if (colorInfo->containsNonWord &&
1114 {
1115 /*
1116 * We can reach the arc destination after reading a
1117 * non-word character, but the prefix is not something
1118 * that addArc will accept with COLOR_BLANK, so no trigram
1119 * arc can get made for this transition. We must make an
1120 * enter key to show that the arc destination is
1121 * reachable. Set it up with an all-blank prefix, since
1122 * that corresponds to what the trigram extraction code
1123 * will do at a word starting boundary.
1124 */
1125 destKey.prefix.colors[0] = COLOR_BLANK;
1126 destKey.prefix.colors[1] = COLOR_BLANK;
1127 destKey.nstate = arc->to;
1129 }
1130
1131 if (colorInfo->wordCharsCount > 0 &&
1132 !validArcLabel(key, arc->co))
1133 {
1134 /*
1135 * We can reach the arc destination after reading a word
1136 * character, but the prefix is not something that addArc
1137 * will accept, so no trigram arc can get made for this
1138 * transition. We must make an enter key to show that the
1139 * arc destination is reachable. The prefix for the enter
1140 * key should reflect the info we have for this arc.
1141 */
1142 destKey.prefix.colors[0] = key->prefix.colors[1];
1143 destKey.prefix.colors[1] = arc->co;
1144 destKey.nstate = arc->to;
1146 }
1147 }
1148 else
1149 {
1150 /*
1151 * Unexpandable color. Add enter key with ambiguous prefix,
1152 * showing we can reach the destination from this state, but
1153 * the preceding colors will be uncertain. (We do not set the
1154 * first prefix color to key->prefix.colors[1], because a
1155 * prefix of known followed by unknown is invalid.)
1156 */
1157 destKey.prefix.colors[0] = COLOR_UNKNOWN;
1158 destKey.prefix.colors[1] = COLOR_UNKNOWN;
1159 destKey.nstate = arc->to;
1161 }
1162 }
1163 else
1164 {
1165 /* RAINBOW: treat as unexpandable color */
1166 destKey.prefix.colors[0] = COLOR_UNKNOWN;
1167 destKey.prefix.colors[1] = COLOR_UNKNOWN;
1168 destKey.nstate = arc->to;
1170 }
1171 }
1172
1173 pfree(arcs);
1174}
1175
1176/*
1177 * Add copy of given key to keysQueue for later processing.
1178 */
1179static void
1181{
1183
1184 memcpy(keyCopy, key, sizeof(TrgmStateKey));
1185 trgmNFA->keysQueue = lappend(trgmNFA->keysQueue, keyCopy);
1186}
1187
1188/*
1189 * Add outgoing arcs from given state, whose enter keys are all now known.
1190 */
1191static void
1193{
1195 ListCell *cell;
1196 regex_arc_t *arcs;
1197 int arcsCount,
1198 i;
1199
1200 /*
1201 * Ensure any pad bytes in destKey are zero, since it may get used as a
1202 * hashtable key by getState.
1203 */
1204 MemSet(&destKey, 0, sizeof(destKey));
1205
1206 /*
1207 * Iterate over enter keys associated with this expanded-graph state. This
1208 * includes both the state's own stateKey, and any enter keys we added to
1209 * it during addKey (which represent expanded-graph states that are not
1210 * distinguishable from this one by means of trigrams). For each such
1211 * enter key, examine all the out-arcs of the key's underlying NFA state,
1212 * and try to make a trigram arc leading to where the out-arc leads.
1213 * (addArc will deal with whether the arc is valid or not.)
1214 */
1215 foreach(cell, state->enterKeys)
1216 {
1217 TrgmStateKey *key = (TrgmStateKey *) lfirst(cell);
1218
1219 arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate);
1220 arcs = palloc_array(regex_arc_t, arcsCount);
1221 pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount);
1222
1223 for (i = 0; i < arcsCount; i++)
1224 {
1225 regex_arc_t *arc = &arcs[i];
1226 TrgmColorInfo *colorInfo;
1227
1228 /*
1229 * Ignore non-expandable colors; addKey already handled the case.
1230 *
1231 * We need no special check for WHITE or begin/end pseudocolors
1232 * here. We don't need to do any processing for them, and they
1233 * will be marked non-expandable since the regex engine will have
1234 * reported them that way. We do have to watch out for RAINBOW,
1235 * which has a negative color number.
1236 */
1237 if (arc->co < 0)
1238 continue;
1239 Assert(arc->co < trgmNFA->ncolors);
1240
1241 colorInfo = &trgmNFA->colorInfo[arc->co];
1242 if (!colorInfo->expandable)
1243 continue;
1244
1245 if (colorInfo->containsNonWord)
1246 {
1247 /*
1248 * Color includes non-word character(s).
1249 *
1250 * Generate an arc, treating this transition as occurring on
1251 * BLANK. This allows word-ending trigrams to be manufactured
1252 * if possible.
1253 */
1254 destKey.prefix.colors[0] = key->prefix.colors[1];
1255 destKey.prefix.colors[1] = COLOR_BLANK;
1256 destKey.nstate = arc->to;
1257
1259 }
1260
1261 if (colorInfo->wordCharsCount > 0)
1262 {
1263 /*
1264 * Color includes word character(s).
1265 *
1266 * Generate an arc. Color is pushed into prefix of target
1267 * state.
1268 */
1269 destKey.prefix.colors[0] = key->prefix.colors[1];
1270 destKey.prefix.colors[1] = arc->co;
1271 destKey.nstate = arc->to;
1272
1273 addArc(trgmNFA, state, key, arc->co, &destKey);
1274 }
1275 }
1276
1277 pfree(arcs);
1278 }
1279}
1280
1281/*
1282 * Generate an out-arc of the expanded graph, if it's valid and not redundant.
1283 *
1284 * state: expanded-graph state we want to add an out-arc to
1285 * key: provides prefix colors (key->nstate is not used)
1286 * co: transition color
1287 * destKey: identifier for destination state of expanded graph
1288 */
1289static void
1292{
1293 TrgmArc *arc;
1294 ListCell *cell;
1295
1296 /* Do nothing if this wouldn't be a valid arc label trigram */
1297 if (!validArcLabel(key, co))
1298 return;
1299
1300 /*
1301 * Check if we are going to reach key which is covered by a key which is
1302 * already listed in this state. If so arc is useless: the NFA can bypass
1303 * it through a path that doesn't require any predictable trigram, so
1304 * whether the arc's trigram is present or not doesn't really matter.
1305 */
1306 foreach(cell, state->enterKeys)
1307 {
1309
1310 if (existingKey->nstate == destKey->nstate &&
1311 prefixContains(&existingKey->prefix, &destKey->prefix))
1312 return;
1313 }
1314
1315 /* Checks were successful, add new arc */
1317 arc->target = getState(trgmNFA, destKey);
1318 arc->ctrgm.colors[0] = key->prefix.colors[0];
1319 arc->ctrgm.colors[1] = key->prefix.colors[1];
1320 arc->ctrgm.colors[2] = co;
1321
1322 state->arcs = lappend(state->arcs, arc);
1323 trgmNFA->arcsCount++;
1324}
1325
1326/*
1327 * Can we make a valid trigram arc label from the given prefix and arc color?
1328 *
1329 * This is split out so that tests in addKey and addArc will stay in sync.
1330 */
1331static bool
1333{
1334 /*
1335 * We have to know full trigram in order to add outgoing arc. So we can't
1336 * do it if prefix is ambiguous.
1337 */
1338 if (key->prefix.colors[0] == COLOR_UNKNOWN)
1339 return false;
1340
1341 /* If key->prefix.colors[0] isn't unknown, its second color isn't either */
1342 Assert(key->prefix.colors[1] != COLOR_UNKNOWN);
1343 /* And we should not be called with an unknown arc color anytime */
1344 Assert(co != COLOR_UNKNOWN);
1345
1346 /*
1347 * We don't bother with making arcs representing three non-word
1348 * characters, since that's useless for trigram extraction.
1349 */
1350 if (key->prefix.colors[0] == COLOR_BLANK &&
1351 key->prefix.colors[1] == COLOR_BLANK &&
1352 co == COLOR_BLANK)
1353 return false;
1354
1355 /*
1356 * We also reject nonblank-blank-anything. The nonblank-blank-nonblank
1357 * case doesn't correspond to any trigram the trigram extraction code
1358 * would make. The nonblank-blank-blank case is also not possible with
1359 * RPADDING = 1. (Note that in many cases we'd fail to generate such a
1360 * trigram even if it were valid, for example processing "foo bar" will
1361 * not result in considering the trigram "o ". So if you want to support
1362 * RPADDING = 2, there's more to do than just twiddle this test.)
1363 */
1364 if (key->prefix.colors[0] != COLOR_BLANK &&
1365 key->prefix.colors[1] == COLOR_BLANK)
1366 return false;
1367
1368 /*
1369 * Other combinations involving blank are valid, in particular we assume
1370 * blank-blank-nonblank is valid, which presumes that LPADDING is 2.
1371 *
1372 * Note: Using again the example "foo bar", we will not consider the
1373 * trigram " b", though this trigram would be found by the trigram
1374 * extraction code. Since we will find " ba", it doesn't seem worth
1375 * trying to hack the algorithm to generate the additional trigram.
1376 */
1377
1378 /* arc label is valid */
1379 return true;
1380}
1381
1382/*
1383 * Get state of expanded graph for given state key,
1384 * and queue the state for processing if it didn't already exist.
1385 */
1386static TrgmState *
1388{
1390 bool found;
1391
1392 state = (TrgmState *) hash_search(trgmNFA->states, key, HASH_ENTER,
1393 &found);
1394 if (!found)
1395 {
1396 /* New state: initialize and queue it */
1397 state->arcs = NIL;
1398 state->enterKeys = NIL;
1399 state->flags = 0;
1400 /* states are initially given negative numbers */
1401 state->snumber = -(++trgmNFA->nstates);
1402 state->parent = NULL;
1403 state->tentFlags = 0;
1404 state->tentParent = NULL;
1405
1406 trgmNFA->queue = lappend(trgmNFA->queue, state);
1407 }
1408 return state;
1409}
1410
1411/*
1412 * Check if prefix1 "contains" prefix2.
1413 *
1414 * "contains" means that any exact prefix (with no ambiguity) that satisfies
1415 * prefix2 also satisfies prefix1.
1416 */
1417static bool
1419{
1420 if (prefix1->colors[1] == COLOR_UNKNOWN)
1421 {
1422 /* Fully ambiguous prefix contains everything */
1423 return true;
1424 }
1425 else if (prefix1->colors[0] == COLOR_UNKNOWN)
1426 {
1427 /*
1428 * Prefix with only first unknown color contains every prefix with
1429 * same second color.
1430 */
1431 if (prefix1->colors[1] == prefix2->colors[1])
1432 return true;
1433 else
1434 return false;
1435 }
1436 else
1437 {
1438 /* Exact prefix contains only the exact same prefix */
1439 if (prefix1->colors[0] == prefix2->colors[0] &&
1440 prefix1->colors[1] == prefix2->colors[1])
1441 return true;
1442 else
1443 return false;
1444 }
1445}
1446
1447
1448/*---------------------
1449 * Subroutines for expanding color trigrams into regular trigrams (stage 3).
1450 *---------------------
1451 */
1452
1453/*
1454 * Get vector of all color trigrams in graph and select which of them
1455 * to expand into simple trigrams.
1456 *
1457 * Returns true if OK, false if exhausted resource limits.
1458 */
1459static bool
1461{
1463 int arcsCount = trgmNFA->arcsCount,
1464 i;
1466 ColorTrgmInfo *colorTrgms;
1467 int64 totalTrgmCount;
1469 int cnumber;
1470
1471 /* Collect color trigrams from all arcs */
1472 colorTrgms = palloc0_array(ColorTrgmInfo, arcsCount);
1473 trgmNFA->colorTrgms = colorTrgms;
1474
1475 i = 0;
1477 while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
1478 {
1479 ListCell *cell;
1480
1481 foreach(cell, state->arcs)
1482 {
1483 TrgmArc *arc = (TrgmArc *) lfirst(cell);
1485 ColorTrgmInfo *trgmInfo = &colorTrgms[i];
1486
1487 arcInfo->source = state;
1488 arcInfo->target = arc->target;
1489 trgmInfo->ctrgm = arc->ctrgm;
1490 trgmInfo->cnumber = -1;
1491 /* count and penalty will be set below */
1492 trgmInfo->expanded = true;
1493 trgmInfo->arcs = list_make1(arcInfo);
1494 i++;
1495 }
1496 }
1497 Assert(i == arcsCount);
1498
1499 /* Remove duplicates, merging their arcs lists */
1500 if (arcsCount >= 2)
1501 {
1503 *p2;
1504
1505 /* Sort trigrams to ease duplicate detection */
1506 qsort(colorTrgms, arcsCount, sizeof(ColorTrgmInfo), colorTrgmInfoCmp);
1507
1508 /* p1 is probe point, p2 is last known non-duplicate. */
1509 p2 = colorTrgms;
1510 for (p1 = colorTrgms + 1; p1 < colorTrgms + arcsCount; p1++)
1511 {
1512 if (colorTrgmInfoCmp(p1, p2) > 0)
1513 {
1514 p2++;
1515 *p2 = *p1;
1516 }
1517 else
1518 {
1519 p2->arcs = list_concat(p2->arcs, p1->arcs);
1520 }
1521 }
1522 trgmNFA->colorTrgmsCount = (p2 - colorTrgms) + 1;
1523 }
1524 else
1525 {
1526 trgmNFA->colorTrgmsCount = arcsCount;
1527 }
1528
1529 /*
1530 * Count number of simple trigrams generated by each color trigram, and
1531 * also compute a penalty value, which is the number of simple trigrams
1532 * times a multiplier that depends on its whitespace content.
1533 *
1534 * Note: per-color-trigram counts cannot overflow an int so long as
1535 * COLOR_COUNT_LIMIT is not more than the cube root of INT_MAX, ie about
1536 * 1290. However, the grand total totalTrgmCount might conceivably
1537 * overflow an int, so we use int64 for that within this routine. Also,
1538 * penalties are calculated in float4 arithmetic to avoid any overflow
1539 * worries.
1540 */
1541 totalTrgmCount = 0;
1542 totalTrgmPenalty = 0.0f;
1543 for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1544 {
1545 ColorTrgmInfo *trgmInfo = &colorTrgms[i];
1546 int j,
1547 count = 1,
1548 typeIndex = 0;
1549
1550 for (j = 0; j < 3; j++)
1551 {
1553
1554 typeIndex *= 2;
1555 if (c == COLOR_BLANK)
1556 typeIndex++;
1557 else
1558 count *= trgmNFA->colorInfo[c].wordCharsCount;
1559 }
1560 trgmInfo->count = count;
1561 totalTrgmCount += count;
1562 trgmInfo->penalty = penalties[typeIndex] * (float4) count;
1563 totalTrgmPenalty += trgmInfo->penalty;
1564 }
1565
1566 /* Sort color trigrams in descending order of their penalties */
1567 qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
1569
1570 /*
1571 * Remove color trigrams from the graph so long as total penalty of color
1572 * trigrams exceeds WISH_TRGM_PENALTY. (If we fail to get down to
1573 * WISH_TRGM_PENALTY, it's OK so long as total count is no more than
1574 * MAX_TRGM_COUNT.) We prefer to remove color trigrams with higher
1575 * penalty, since those are the most promising for reducing the total
1576 * penalty. When removing a color trigram we have to merge states
1577 * connected by arcs labeled with that trigram. It's necessary to not
1578 * merge initial and final states, because our graph becomes useless if
1579 * that happens; so we cannot always remove the trigram we'd prefer to.
1580 */
1581 for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1582 {
1583 ColorTrgmInfo *trgmInfo = &colorTrgms[i];
1584 bool canRemove = true;
1585 ListCell *cell;
1586
1587 /* Done if we've reached the target */
1589 break;
1590
1591#ifdef TRGM_REGEXP_DEBUG
1592 fprintf(stderr, "considering ctrgm %d %d %d, penalty %f, %d arcs\n",
1593 trgmInfo->ctrgm.colors[0],
1594 trgmInfo->ctrgm.colors[1],
1595 trgmInfo->ctrgm.colors[2],
1596 trgmInfo->penalty,
1597 list_length(trgmInfo->arcs));
1598#endif
1599
1600 /*
1601 * Does any arc of this color trigram connect initial and final
1602 * states? If so we can't remove it.
1603 */
1604 foreach(cell, trgmInfo->arcs)
1605 {
1607 TrgmState *source = arcInfo->source,
1608 *target = arcInfo->target;
1609 int source_flags,
1611
1612#ifdef TRGM_REGEXP_DEBUG
1613 fprintf(stderr, "examining arc to s%d (%x) from s%d (%x)\n",
1614 -target->snumber, target->flags,
1615 -source->snumber, source->flags);
1616#endif
1617
1618 /* examine parent states, if any merging has already happened */
1619 while (source->parent)
1620 source = source->parent;
1621 while (target->parent)
1622 target = target->parent;
1623
1624#ifdef TRGM_REGEXP_DEBUG
1625 fprintf(stderr, " ... after completed merges: to s%d (%x) from s%d (%x)\n",
1626 -target->snumber, target->flags,
1627 -source->snumber, source->flags);
1628#endif
1629
1630 /* we must also consider merges we are planning right now */
1631 source_flags = source->flags | source->tentFlags;
1632 while (source->tentParent)
1633 {
1634 source = source->tentParent;
1635 source_flags |= source->flags | source->tentFlags;
1636 }
1637 target_flags = target->flags | target->tentFlags;
1638 while (target->tentParent)
1639 {
1640 target = target->tentParent;
1641 target_flags |= target->flags | target->tentFlags;
1642 }
1643
1644#ifdef TRGM_REGEXP_DEBUG
1645 fprintf(stderr, " ... after tentative merges: to s%d (%x) from s%d (%x)\n",
1646 -target->snumber, target_flags,
1647 -source->snumber, source_flags);
1648#endif
1649
1650 /* would fully-merged state have both INIT and FIN set? */
1653 {
1654 canRemove = false;
1655 break;
1656 }
1657
1658 /* ok so far, so remember planned merge */
1659 if (source != target)
1660 {
1661#ifdef TRGM_REGEXP_DEBUG
1662 fprintf(stderr, " ... tentatively merging s%d into s%d\n",
1663 -target->snumber, -source->snumber);
1664#endif
1665 target->tentParent = source;
1666 source->tentFlags |= target_flags;
1667 }
1668 }
1669
1670 /*
1671 * We must reset all the tentFlags/tentParent fields before
1672 * continuing. tentFlags could only have become set in states that
1673 * are the source or parent or tentative parent of one of the current
1674 * arcs; likewise tentParent could only have become set in states that
1675 * are the target or parent or tentative parent of one of the current
1676 * arcs. There might be some overlap between those sets, but if we
1677 * clear tentFlags in target states as well as source states, we
1678 * should be okay even if we visit a state as target before visiting
1679 * it as a source.
1680 */
1681 foreach(cell, trgmInfo->arcs)
1682 {
1684 TrgmState *source = arcInfo->source,
1685 *target = arcInfo->target;
1687
1688 /* no need to touch previously-merged states */
1689 while (source->parent)
1690 source = source->parent;
1691 while (target->parent)
1692 target = target->parent;
1693
1694 while (source)
1695 {
1696 source->tentFlags = 0;
1697 source = source->tentParent;
1698 }
1699
1700 while ((ttarget = target->tentParent) != NULL)
1701 {
1702 target->tentParent = NULL;
1703 target->tentFlags = 0; /* in case it was also a source */
1704 target = ttarget;
1705 }
1706 }
1707
1708 /* Now, move on if we can't drop this trigram */
1709 if (!canRemove)
1710 {
1711#ifdef TRGM_REGEXP_DEBUG
1712 fprintf(stderr, " ... not ok to merge\n");
1713#endif
1714 continue;
1715 }
1716
1717 /* OK, merge states linked by each arc labeled by the trigram */
1718 foreach(cell, trgmInfo->arcs)
1719 {
1721 TrgmState *source = arcInfo->source,
1722 *target = arcInfo->target;
1723
1724 while (source->parent)
1725 source = source->parent;
1726 while (target->parent)
1727 target = target->parent;
1728 if (source != target)
1729 {
1730#ifdef TRGM_REGEXP_DEBUG
1731 fprintf(stderr, "merging s%d into s%d\n",
1732 -target->snumber, -source->snumber);
1733#endif
1734 mergeStates(source, target);
1735 /* Assert we didn't merge initial and final states */
1736 Assert((source->flags & (TSTATE_INIT | TSTATE_FIN)) !=
1738 }
1739 }
1740
1741 /* Mark trigram unexpanded, and update totals */
1742 trgmInfo->expanded = false;
1743 totalTrgmCount -= trgmInfo->count;
1744 totalTrgmPenalty -= trgmInfo->penalty;
1745 }
1746
1747 /* Did we succeed in fitting into MAX_TRGM_COUNT? */
1748 if (totalTrgmCount > MAX_TRGM_COUNT)
1749 return false;
1750
1751 trgmNFA->totalTrgmCount = (int) totalTrgmCount;
1752
1753 /*
1754 * Sort color trigrams by colors (will be useful for bsearch in packGraph)
1755 * and enumerate the color trigrams that are expanded.
1756 */
1757 cnumber = 0;
1758 qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
1760 for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1761 {
1762 if (colorTrgms[i].expanded)
1763 {
1764 colorTrgms[i].cnumber = cnumber;
1765 cnumber++;
1766 }
1767 }
1768
1769 return true;
1770}
1771
1772/*
1773 * Expand selected color trigrams into regular trigrams.
1774 *
1775 * Returns the TRGM array to be passed to the index machinery.
1776 * The array must be allocated in rcontext.
1777 */
1778static TRGM *
1780{
1781 TRGM *trg;
1782 trgm *p;
1783 int i;
1786
1787 /* Set up "blank" color structure containing a single zero character */
1788 memset(blankChar.bytes, 0, sizeof(blankChar.bytes));
1789 blankColor.wordCharsCount = 1;
1790 blankColor.wordChars = &blankChar;
1791
1792 /* Construct the trgm array */
1793 trg = (TRGM *)
1795 TRGMHDRSIZE +
1796 trgmNFA->totalTrgmCount * sizeof(trgm));
1797 trg->flag = ARRKEY;
1798 SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, trgmNFA->totalTrgmCount));
1799 p = GETARR(trg);
1800 for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1801 {
1802 ColorTrgmInfo *colorTrgm = &trgmNFA->colorTrgms[i];
1803 TrgmColorInfo *c[3];
1804 trgm_mb_char s[3];
1805 int j,
1806 i1,
1807 i2,
1808 i3;
1809
1810 /* Ignore any unexpanded trigrams ... */
1811 if (!colorTrgm->expanded)
1812 continue;
1813
1814 /* Get colors, substituting the dummy struct for COLOR_BLANK */
1815 for (j = 0; j < 3; j++)
1816 {
1817 if (colorTrgm->ctrgm.colors[j] != COLOR_BLANK)
1818 c[j] = &trgmNFA->colorInfo[colorTrgm->ctrgm.colors[j]];
1819 else
1820 c[j] = &blankColor;
1821 }
1822
1823 /* Iterate over all possible combinations of colors' characters */
1824 for (i1 = 0; i1 < c[0]->wordCharsCount; i1++)
1825 {
1826 s[0] = c[0]->wordChars[i1];
1827 for (i2 = 0; i2 < c[1]->wordCharsCount; i2++)
1828 {
1829 s[1] = c[1]->wordChars[i2];
1830 for (i3 = 0; i3 < c[2]->wordCharsCount; i3++)
1831 {
1832 s[2] = c[2]->wordChars[i3];
1833 fillTrgm(p, s);
1834 p++;
1835 }
1836 }
1837 }
1838 }
1839
1840 return trg;
1841}
1842
1843/*
1844 * Convert trigram into trgm datatype.
1845 */
1846static void
1848{
1849 char str[3 * MAX_MULTIBYTE_CHAR_LEN],
1850 *p;
1851 int i,
1852 j;
1853
1854 /* Write multibyte string into "str" (we don't need null termination) */
1855 p = str;
1856
1857 for (i = 0; i < 3; i++)
1858 {
1859 if (s[i].bytes[0] != 0)
1860 {
1861 for (j = 0; j < MAX_MULTIBYTE_CHAR_LEN && s[i].bytes[j]; j++)
1862 *p++ = s[i].bytes[j];
1863 }
1864 else
1865 {
1866 /* Emit a space in place of COLOR_BLANK */
1867 *p++ = ' ';
1868 }
1869 }
1870
1871 /* Convert "str" to a standard trigram (possibly hashing it) */
1873}
1874
1875/*
1876 * Merge two states of graph.
1877 */
1878static void
1880{
1881 Assert(state1 != state2);
1882 Assert(!state1->parent);
1883 Assert(!state2->parent);
1884
1885 /* state1 absorbs state2's flags */
1886 state1->flags |= state2->flags;
1887
1888 /* state2, and indirectly all its children, become children of state1 */
1889 state2->parent = state1;
1890}
1891
1892/*
1893 * Compare function for sorting of color trigrams by their colors.
1894 */
1895static int
1896colorTrgmInfoCmp(const void *p1, const void *p2)
1897{
1898 const ColorTrgmInfo *c1 = (const ColorTrgmInfo *) p1;
1899 const ColorTrgmInfo *c2 = (const ColorTrgmInfo *) p2;
1900
1901 return memcmp(&c1->ctrgm, &c2->ctrgm, sizeof(ColorTrgm));
1902}
1903
1904/*
1905 * Compare function for sorting color trigrams in descending order of
1906 * their penalty fields.
1907 */
1908static int
1909colorTrgmInfoPenaltyCmp(const void *p1, const void *p2)
1910{
1911 float4 penalty1 = ((const ColorTrgmInfo *) p1)->penalty;
1912 float4 penalty2 = ((const ColorTrgmInfo *) p2)->penalty;
1913
1914 if (penalty1 < penalty2)
1915 return 1;
1916 else if (penalty1 == penalty2)
1917 return 0;
1918 else
1919 return -1;
1920}
1921
1922
1923/*---------------------
1924 * Subroutines for packing the graph into final representation (stage 4).
1925 *---------------------
1926 */
1927
1928/*
1929 * Pack expanded graph into final representation.
1930 *
1931 * The result data must be allocated in rcontext.
1932 */
1933static TrgmPackedGraph *
1935{
1936 int snumber = 2,
1937 arcIndex,
1938 arcsCount;
1941 TrgmPackArcInfo *arcs;
1943 TrgmPackedGraph *result;
1944 int i,
1945 j;
1946
1947 /* Enumerate surviving states, giving init and fin reserved numbers */
1949 while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
1950 {
1951 while (state->parent)
1952 state = state->parent;
1953
1954 if (state->snumber < 0)
1955 {
1956 if (state->flags & TSTATE_INIT)
1957 state->snumber = 0;
1958 else if (state->flags & TSTATE_FIN)
1959 state->snumber = 1;
1960 else
1961 {
1962 state->snumber = snumber;
1963 snumber++;
1964 }
1965 }
1966 }
1967
1968 /* Collect array of all arcs */
1969 arcs = palloc_array(TrgmPackArcInfo, trgmNFA->arcsCount);
1970 arcIndex = 0;
1972 while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
1973 {
1975 ListCell *cell;
1976
1977 while (source->parent)
1978 source = source->parent;
1979
1980 foreach(cell, state->arcs)
1981 {
1982 TrgmArc *arc = (TrgmArc *) lfirst(cell);
1983 TrgmState *target = arc->target;
1984
1985 while (target->parent)
1986 target = target->parent;
1987
1988 if (source->snumber != target->snumber)
1989 {
1990 ColorTrgmInfo *ctrgm;
1991
1992 ctrgm = (ColorTrgmInfo *) bsearch(&arc->ctrgm,
1993 trgmNFA->colorTrgms,
1994 trgmNFA->colorTrgmsCount,
1995 sizeof(ColorTrgmInfo),
1997 Assert(ctrgm != NULL);
1998 Assert(ctrgm->expanded);
1999
2000 arcs[arcIndex].sourceState = source->snumber;
2001 arcs[arcIndex].targetState = target->snumber;
2002 arcs[arcIndex].colorTrgm = ctrgm->cnumber;
2003 arcIndex++;
2004 }
2005 }
2006 }
2007
2008 /* Sort arcs to ease duplicate detection */
2010
2011 /* We could have duplicates because states were merged. Remove them. */
2012 if (arcIndex > 1)
2013 {
2014 /* p1 is probe point, p2 is last known non-duplicate. */
2016 *p2;
2017
2018 p2 = arcs;
2019 for (p1 = arcs + 1; p1 < arcs + arcIndex; p1++)
2020 {
2021 if (packArcInfoCmp(p1, p2) > 0)
2022 {
2023 p2++;
2024 *p2 = *p1;
2025 }
2026 }
2027 arcsCount = (p2 - arcs) + 1;
2028 }
2029 else
2030 arcsCount = arcIndex;
2031
2032 /* Create packed representation */
2033 result = (TrgmPackedGraph *)
2035
2036 /* Pack color trigrams information */
2037 result->colorTrigramsCount = 0;
2038 for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
2039 {
2040 if (trgmNFA->colorTrgms[i].expanded)
2041 result->colorTrigramsCount++;
2042 }
2043 result->colorTrigramGroups = (int *)
2044 MemoryContextAlloc(rcontext, sizeof(int) * result->colorTrigramsCount);
2045 j = 0;
2046 for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
2047 {
2048 if (trgmNFA->colorTrgms[i].expanded)
2049 {
2050 result->colorTrigramGroups[j] = trgmNFA->colorTrgms[i].count;
2051 j++;
2052 }
2053 }
2054
2055 /* Pack states and arcs information */
2056 result->statesCount = snumber;
2057 result->states = (TrgmPackedState *)
2058 MemoryContextAlloc(rcontext, snumber * sizeof(TrgmPackedState));
2060 MemoryContextAlloc(rcontext, arcsCount * sizeof(TrgmPackedArc));
2061 j = 0;
2062 for (i = 0; i < snumber; i++)
2063 {
2064 int cnt = 0;
2065
2066 result->states[i].arcs = &packedArcs[j];
2067 while (j < arcsCount && arcs[j].sourceState == i)
2068 {
2070 packedArcs[j].colorTrgm = arcs[j].colorTrgm;
2071 cnt++;
2072 j++;
2073 }
2074 result->states[i].arcsCount = cnt;
2075 }
2076
2077 /* Allocate working memory for trigramsMatchGraph() */
2078 result->colorTrigramsActive = (bool *)
2079 MemoryContextAlloc(rcontext, sizeof(bool) * result->colorTrigramsCount);
2080 result->statesActive = (bool *)
2081 MemoryContextAlloc(rcontext, sizeof(bool) * result->statesCount);
2082 result->statesQueue = (int *)
2083 MemoryContextAlloc(rcontext, sizeof(int) * result->statesCount);
2084
2085 return result;
2086}
2087
2088/*
2089 * Comparison function for sorting TrgmPackArcInfos.
2090 *
2091 * Compares arcs in following order: sourceState, colorTrgm, targetState.
2092 */
2093static int
2094packArcInfoCmp(const void *a1, const void *a2)
2095{
2096 const TrgmPackArcInfo *p1 = (const TrgmPackArcInfo *) a1;
2097 const TrgmPackArcInfo *p2 = (const TrgmPackArcInfo *) a2;
2098
2099 if (p1->sourceState < p2->sourceState)
2100 return -1;
2101 if (p1->sourceState > p2->sourceState)
2102 return 1;
2103 if (p1->colorTrgm < p2->colorTrgm)
2104 return -1;
2105 if (p1->colorTrgm > p2->colorTrgm)
2106 return 1;
2107 if (p1->targetState < p2->targetState)
2108 return -1;
2109 if (p1->targetState > p2->targetState)
2110 return 1;
2111 return 0;
2112}
2113
2114
2115/*---------------------
2116 * Debugging functions
2117 *
2118 * These are designed to emit GraphViz files.
2119 *---------------------
2120 */
2121
2122#ifdef TRGM_REGEXP_DEBUG
2123
2124/*
2125 * Print initial NFA, in regexp library's representation
2126 */
2127static void
2128printSourceNFA(regex_t *regex, TrgmColorInfo *colors, int ncolors)
2129{
2131 int nstates = pg_reg_getnumstates(regex);
2132 int state;
2133 int i;
2134
2136
2137 appendStringInfoString(&buf, "\ndigraph sourceNFA {\n");
2138
2139 for (state = 0; state < nstates; state++)
2140 {
2141 regex_arc_t *arcs;
2142 int i,
2143 arcsCount;
2144
2145 appendStringInfo(&buf, "s%d", state);
2146 if (pg_reg_getfinalstate(regex) == state)
2147 appendStringInfoString(&buf, " [shape = doublecircle]");
2148 appendStringInfoString(&buf, ";\n");
2149
2150 arcsCount = pg_reg_getnumoutarcs(regex, state);
2151 arcs = palloc_array(regex_arc_t, arcsCount);
2152 pg_reg_getoutarcs(regex, state, arcs, arcsCount);
2153
2154 for (i = 0; i < arcsCount; i++)
2155 {
2156 appendStringInfo(&buf, " s%d -> s%d [label = \"%d\"];\n",
2157 state, arcs[i].to, arcs[i].co);
2158 }
2159
2160 pfree(arcs);
2161 }
2162
2163 appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
2164 appendStringInfo(&buf, " initial -> s%d;\n",
2165 pg_reg_getinitialstate(regex));
2166
2167 /* Print colors */
2168 appendStringInfoString(&buf, " { rank = sink;\n");
2169 appendStringInfoString(&buf, " Colors [shape = none, margin=0, label=<\n");
2170
2171 for (i = 0; i < ncolors; i++)
2172 {
2173 TrgmColorInfo *color = &colors[i];
2174 int j;
2175
2176 appendStringInfo(&buf, "<br/>Color %d: ", i);
2177 if (color->expandable)
2178 {
2179 for (j = 0; j < color->wordCharsCount; j++)
2180 {
2181 char s[MAX_MULTIBYTE_CHAR_LEN + 1];
2182
2183 memcpy(s, color->wordChars[j].bytes, MAX_MULTIBYTE_CHAR_LEN);
2184 s[MAX_MULTIBYTE_CHAR_LEN] = '\0';
2186 }
2187 }
2188 else
2189 appendStringInfoString(&buf, "not expandable");
2190 appendStringInfoChar(&buf, '\n');
2191 }
2192
2193 appendStringInfoString(&buf, " >];\n");
2194 appendStringInfoString(&buf, " }\n");
2195 appendStringInfoString(&buf, "}\n");
2196
2197 {
2198 /* dot -Tpng -o /tmp/source.png < /tmp/source.gv */
2199 FILE *fp = fopen("/tmp/source.gv", "w");
2200
2201 fprintf(fp, "%s", buf.data);
2202 fclose(fp);
2203 }
2204
2205 pfree(buf.data);
2206}
2207
2208/*
2209 * Print expanded graph.
2210 */
2211static void
2213{
2217 TrgmState *initstate = NULL;
2218
2220
2221 appendStringInfoString(&buf, "\ndigraph transformedNFA {\n");
2222
2224 while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
2225 {
2226 ListCell *cell;
2227
2228 appendStringInfo(&buf, "s%d", -state->snumber);
2229 if (state->flags & TSTATE_FIN)
2230 appendStringInfoString(&buf, " [shape = doublecircle]");
2231 if (state->flags & TSTATE_INIT)
2232 initstate = state;
2233 appendStringInfo(&buf, " [label = \"%d\"]", state->stateKey.nstate);
2234 appendStringInfoString(&buf, ";\n");
2235
2236 foreach(cell, state->arcs)
2237 {
2238 TrgmArc *arc = (TrgmArc *) lfirst(cell);
2239
2240 appendStringInfo(&buf, " s%d -> s%d [label = \"",
2241 -state->snumber, -arc->target->snumber);
2242 printTrgmColor(&buf, arc->ctrgm.colors[0]);
2244 printTrgmColor(&buf, arc->ctrgm.colors[1]);
2246 printTrgmColor(&buf, arc->ctrgm.colors[2]);
2247 appendStringInfoString(&buf, "\"];\n");
2248 }
2249 }
2250
2251 if (initstate)
2252 {
2253 appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
2254 appendStringInfo(&buf, " initial -> s%d;\n", -initstate->snumber);
2255 }
2256
2257 appendStringInfoString(&buf, "}\n");
2258
2259 {
2260 /* dot -Tpng -o /tmp/transformed.png < /tmp/transformed.gv */
2261 FILE *fp = fopen("/tmp/transformed.gv", "w");
2262
2263 fprintf(fp, "%s", buf.data);
2264 fclose(fp);
2265 }
2266
2267 pfree(buf.data);
2268}
2269
2270/*
2271 * Print a TrgmColor readably.
2272 */
2273static void
2275{
2276 if (co == COLOR_UNKNOWN)
2278 else if (co == COLOR_BLANK)
2280 else
2281 appendStringInfo(buf, "%d", (int) co);
2282}
2283
2284/*
2285 * Print final packed representation of trigram-based expanded graph.
2286 */
2287static void
2289{
2291 trgm *p;
2292 int i;
2293
2295
2296 appendStringInfoString(&buf, "\ndigraph packedGraph {\n");
2297
2298 for (i = 0; i < packedGraph->statesCount; i++)
2299 {
2300 TrgmPackedState *state = &packedGraph->states[i];
2301 int j;
2302
2303 appendStringInfo(&buf, " s%d", i);
2304 if (i == 1)
2305 appendStringInfoString(&buf, " [shape = doublecircle]");
2306
2307 appendStringInfo(&buf, " [label = <s%d>];\n", i);
2308
2309 for (j = 0; j < state->arcsCount; j++)
2310 {
2311 TrgmPackedArc *arc = &state->arcs[j];
2312
2313 appendStringInfo(&buf, " s%d -> s%d [label = \"trigram %d\"];\n",
2314 i, arc->targetState, arc->colorTrgm);
2315 }
2316 }
2317
2318 appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
2319 appendStringInfo(&buf, " initial -> s%d;\n", 0);
2320
2321 /* Print trigrams */
2322 appendStringInfoString(&buf, " { rank = sink;\n");
2323 appendStringInfoString(&buf, " Trigrams [shape = none, margin=0, label=<\n");
2324
2325 p = GETARR(trigrams);
2326 for (i = 0; i < packedGraph->colorTrigramsCount; i++)
2327 {
2328 int count = packedGraph->colorTrigramGroups[i];
2329 int j;
2330
2331 appendStringInfo(&buf, "<br/>Trigram %d: ", i);
2332
2333 for (j = 0; j < count; j++)
2334 {
2335 if (j > 0)
2337
2338 /*
2339 * XXX This representation is nice only for all-ASCII trigrams.
2340 */
2341 appendStringInfo(&buf, "\"%c%c%c\"", (*p)[0], (*p)[1], (*p)[2]);
2342 p++;
2343 }
2344 }
2345
2346 appendStringInfoString(&buf, " >];\n");
2347 appendStringInfoString(&buf, " }\n");
2348 appendStringInfoString(&buf, "}\n");
2349
2350 {
2351 /* dot -Tpng -o /tmp/packed.png < /tmp/packed.gv */
2352 FILE *fp = fopen("/tmp/packed.gv", "w");
2353
2354 fprintf(fp, "%s", buf.data);
2355 fclose(fp);
2356 }
2357
2358 pfree(buf.data);
2359}
2360
2361#endif /* TRGM_REGEXP_DEBUG */
#define Assert(condition)
Definition c.h:885
int64_t int64
Definition c.h:555
float float4
Definition c.h:655
#define MemSet(start, val, len)
Definition c.h:1035
#define fprintf(file, fmt, msg)
Definition cubescan.l:21
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition dynahash.c:952
HTAB * hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
Definition dynahash.c:358
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition dynahash.c:1415
int64 hash_get_num_entries(HTAB *hashp)
Definition dynahash.c:1336
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition dynahash.c:1380
int errcode(int sqlerrcode)
Definition elog.c:874
int errmsg(const char *fmt,...)
Definition elog.c:1093
#define ERROR
Definition elog.h:39
#define ereport(elevel,...)
Definition elog.h:150
#define palloc_object(type)
Definition fe_memutils.h:74
#define palloc_array(type, count)
Definition fe_memutils.h:76
#define palloc0_array(type, count)
Definition fe_memutils.h:77
char * str_tolower(const char *buff, size_t nbytes, Oid collid)
const char * str
static const FormData_pg_attribute a1
Definition heap.c:144
static const FormData_pg_attribute a2
Definition heap.c:157
@ HASH_ENTER
Definition hsearch.h:114
#define HASH_CONTEXT
Definition hsearch.h:102
#define HASH_ELEM
Definition hsearch.h:95
#define HASH_BLOBS
Definition hsearch.h:97
#define CALCGTSIZE(flag, siglen)
Definition hstore_gist.c:60
int j
Definition isn.c:78
int i
Definition isn.c:77
List * lappend(List *list, void *datum)
Definition list.c:339
List * list_concat(List *list1, const List *list2)
Definition list.c:561
void list_free(List *list)
Definition list.c:1546
unsigned int pg_wchar
Definition mbprint.c:31
int pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
Definition mbutils.c:1019
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
Definition mbutils.c:997
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition mcxt.c:1232
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition mcxt.c:1266
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc0(Size size)
Definition mcxt.c:1417
void * palloc(Size size)
Definition mcxt.c:1387
MemoryContext CurrentMemoryContext
Definition mcxt.c:160
void MemoryContextDelete(MemoryContext context)
Definition mcxt.c:472
#define AllocSetContextCreate
Definition memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition memutils.h:160
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition palloc.h:124
#define lfirst(lc)
Definition pg_list.h:172
static int list_length(const List *l)
Definition pg_list.h:152
#define NIL
Definition pg_list.h:68
#define foreach_delete_current(lst, var_or_cell)
Definition pg_list.h:391
#define list_make1(x1)
Definition pg_list.h:212
static rewind_source * source
Definition pg_rewind.c:89
static char buf[DEFAULT_XLOG_SEG_SIZE]
#define MAX_MULTIBYTE_CHAR_LEN
Definition pg_wchar.h:33
#define qsort(a, b, c, d)
Definition port.h:495
unsigned int Oid
char * c
static int fb(int x)
int pg_regcomp(regex_t *re, const chr *string, size_t len, int flags, Oid collation)
Definition regcomp.c:372
size_t pg_regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
Definition regerror.c:60
#define REG_ICASE
Definition regex.h:184
#define REG_ADVANCED
Definition regex.h:181
#define REG_OKAY
Definition regex.h:215
#define REG_NOSUB
Definition regex.h:185
#define regex_t
Definition regex.h:245
int pg_reg_getnumoutarcs(const regex_t *regex, int st)
Definition regexport.c:134
int pg_reg_getnumcolors(const regex_t *regex)
Definition regexport.c:174
int pg_reg_getnumstates(const regex_t *regex)
Definition regexport.c:36
int pg_reg_getfinalstate(const regex_t *regex)
Definition regexport.c:64
int pg_reg_getinitialstate(const regex_t *regex)
Definition regexport.c:50
void pg_reg_getoutarcs(const regex_t *regex, int st, regex_arc_t *arcs, int arcs_len)
Definition regexport.c:155
int pg_reg_colorisend(const regex_t *regex, int co)
Definition regexport.c:208
void pg_reg_getcharacters(const regex_t *regex, int co, pg_wchar *chars, int chars_len)
Definition regexport.c:266
int pg_reg_getnumcharacters(const regex_t *regex, int co)
Definition regexport.c:230
int pg_reg_colorisbegin(const regex_t *regex, int co)
Definition regexport.c:191
short color
Definition regguts.h:155
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition stringinfo.c:145
void appendStringInfoString(StringInfo str, const char *s)
Definition stringinfo.c:230
void appendStringInfoChar(StringInfo str, char ch)
Definition stringinfo.c:242
void initStringInfo(StringInfo str)
Definition stringinfo.c:97
ColorTrgm ctrgm
TrgmColor colors[3]
Definition pg_list.h:54
Definition trgm.h:58
uint8 flag
Definition trgm.h:60
TrgmState * source
TrgmState * target
ColorTrgm ctrgm
TrgmState * target
trgm_mb_char * wordChars
bool overflowed
int nstates
int arcsCount
ColorTrgmInfo * colorTrgms
List * queue
int colorTrgmsCount
TrgmColorInfo * colorInfo
int totalTrgmCount
List * keysQueue
int ncolors
TrgmState * initState
regex_t * regex
HTAB * states
TrgmPackedState * states
bool * colorTrigramsActive
int * colorTrigramGroups
TrgmPackedArc * arcs
TrgmPrefix prefix
List * arcs
List * enterKeys
struct TrgmState * parent
TrgmStateKey stateKey
struct TrgmState * tentParent
Definition regguts.h:296
color co
Definition regguts.h:298
struct state * to
Definition regguts.h:300
char bytes[MAX_MULTIBYTE_CHAR_LEN]
Definition c.h:718
#define ISWORDCHR(c, len)
Definition trgm.h:50
void compact_trigram(trgm *tptr, char *str, int bytelen)
Definition trgm_op.c:329
char trgm[3]
Definition trgm.h:41
#define GETARR(x)
Definition trgm.h:97
#define ARRKEY
Definition trgm.h:87
#define TRGMHDRSIZE
Definition trgm.h:64
int TrgmColor
static bool validArcLabel(TrgmStateKey *key, TrgmColor co)
static TrgmState * getState(TrgmNFA *trgmNFA, TrgmStateKey *key)
static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
#define COLOR_BLANK
static TRGM * createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph, MemoryContext rcontext)
#define TSTATE_INIT
static void addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key, TrgmColor co, TrgmStateKey *destKey)
#define MAX_EXPANDED_STATES
static int convertPgWchar(pg_wchar c, trgm_mb_char *result)
static int colorTrgmInfoPenaltyCmp(const void *p1, const void *p2)
#define MAX_TRGM_COUNT
#define MAX_EXPANDED_ARCS
static bool selectColorTrigrams(TrgmNFA *trgmNFA)
static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3])
static TRGM * expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext)
TRGM * createTrgmNFA(text *text_re, Oid collation, TrgmPackedGraph **graph, MemoryContext rcontext)
static void transformGraph(TrgmNFA *trgmNFA)
bool trigramsMatchGraph(TrgmPackedGraph *graph, bool *check)
static const float4 penalties[8]
static void addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key)
static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key)
static TrgmPackedGraph * packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
static int packArcInfoCmp(const void *a1, const void *a2)
static void processState(TrgmNFA *trgmNFA, TrgmState *state)
#define COLOR_COUNT_LIMIT
#define TSTATE_FIN
#define WISH_TRGM_PENALTY
static void mergeStates(TrgmState *state1, TrgmState *state2)
#define COLOR_UNKNOWN
static int colorTrgmInfoCmp(const void *p1, const void *p2)
static void RE_compile(regex_t *regex, text *text_re, int cflags, Oid collation)
static void addArcs(TrgmNFA *trgmNFA, TrgmState *state)
static bool prefixContains(TrgmPrefix *prefix1, TrgmPrefix *prefix2)
static Size VARSIZE_ANY_EXHDR(const void *PTR)
Definition varatt.h:472
static char * VARDATA_ANY(const void *PTR)
Definition varatt.h:486
static void SET_VARSIZE(void *PTR, Size len)
Definition varatt.h:432
static char chars[TZ_MAX_CHARS]
Definition zic.c:404